diff --git a/src/GridText.c b/src/GridText.c
index 04e9a4a..d9a1665 100644
--- a/src/GridText.c
+++ b/src/GridText.c
@@ -453,7 +453,11 @@ struct _HText {
     HTList *hidden_links;       /* Content-less links ... */
     int hiddenlinkflag;         /* ... and how to treat them */
     BOOL no_cache;              /* Always refresh? */
+#ifdef EXP_JAPANESE_SPACES
+    char LastChars[7];          /* last char as NUL-terminated utf-8 bytes */
+#else
     char LastChar;              /* For absorbing white space */
+#endif
 
 /* For Internal use: */
     HTStyle *style;             /* Current style */
@@ -1134,7 +1138,11 @@ HText *HText_new(HTParentAnchor *anchor)
                               anchor->post_data)
                              ? YES
                              : NO);
+#ifdef EXP_JAPANESE_SPACES
+    memset(self->LastChars, 0, sizeof(self->LastChars));
+#else
     self->LastChar = '\0';
+#endif
 
 #ifndef USE_PRETTYSRC
     if (HTOutputFormat == WWW_SOURCE)
@@ -2867,7 +2875,7 @@ static void split_line(HText *text, unsigned split)
 #ifdef EXP_WCWIDTH_SUPPORT
     utfxtracells_on_this_line = 0;
 #endif
-    text->LastChar = ' ';
+    HText_setLastChar(text, ' ');
 
 #ifdef DEBUG_APPCH
     CTRACE((tfp, "GridText: split_line(%p,%d) called\n", text, split));
@@ -4648,7 +4656,20 @@ void HText_setLastChar(HText *text, int ch)
     if (!text)
         return;
 
+#ifdef EXP_JAPANESE_SPACES
+    if (IS_UTF_EXTRA(ch) && IS_UTF_FIRST(text->LastChars[0])) {
+        int i;  /* slot for ch; capped so the NUL at i + 1 stays inside the array */
+        for (i = 1; i < (int) sizeof(text->LastChars) - 2 && text->LastChars[i] != '\0'; i++)
+            ;   /* skip bytes already stored; surplus bytes overwrite the last slot */
+        text->LastChars[i] = (char) ch;
+        text->LastChars[i + 1] = '\0';
+        return;
+    }
+    memset(text->LastChars, 0, sizeof(text->LastChars));
+    text->LastChars[0] = (char) ch;     /* start a new (possibly 1-byte) char */
+#else
     text->LastChar = (char) ch;
+#endif
 }
 
 /* Get LastChar element in the text object.
@@ -4659,8 +4680,37 @@ char HText_getLastChar(HText *text)
     if (!text)
         return ('\0');
 
+#ifdef EXP_JAPANESE_SPACES
+    if (IS_UTF_FIRST(text->LastChars[0])) {
+        int i;  /* ends one past the last stored byte of the sequence */
+        for (i = 1; i < (int) sizeof(text->LastChars) && text->LastChars[i] != '\0'; i++)
+            ;   /* bound is tested first so a full buffer is never over-read */
+        return ((char) text->LastChars[i - 1]);
+    }
+    return ((char) text->LastChars[0]);
+#else
     return ((char) text->LastChar);
+#endif
+}
+
+#ifdef EXP_JAPANESE_SPACES
+BOOL HText_checkLastChar_needSpaceOnJoinLines(HText *text)
+{
+    if (!text)
+        return YES;
+
+    if (IS_UTF_FIRST(text->LastChars[0]) && isUTF8CJChar(text->LastChars))
+        return NO;      /* last char is a UTF-8 CJ char: join without a space */
+    if ((HTCJK == CHINESE || HTCJK == JAPANESE) && is8bits(text->LastChars[0])) {
+        /* TODO: support 2nd byte of some SJIS kanji (!is8bits && IS_SJIS_LO) */
+        return NO;
+    }
+    if (text->LastChars[0] != ' ')
+        return YES;
+    return NO;          /* already ends in a space */
 }
+#endif
+
 
 /* Simple table handling - private
  * -------------------------------
@@ -5204,7 +5254,7 @@ static void add_link_number(HText *text, TextAnchor *a, int save_position)
         && (text->source ? !psrcview_no_anchor_numbering : 1)
 #endif
         && links_are_numbered()) {
-        char saved_lastchar = text->LastChar;
+        char saved_lastchar = HText_getLastChar(text);
         int saved_linenum = text->Lines;
         HTAnchor *link_dest;
         char *link_text;
@@ -5222,7 +5272,7 @@ static void add_link_number(HText *text, TextAnchor *a, int save_position)
             HText_appendText(text, marker);
         }
         if (saved_linenum && text->Lines && saved_lastchar != ' ')
-            text->LastChar = ']';       /* if marker not after space caused split */
+            HText_setLastChar(text, ']');       /* if marker not after space caused split */
         if (save_position) {
             a->line_num = text->Lines;
             a->line_pos = (short) text->last_line->size;
@@ -14973,6 +15023,14 @@ static void permit_split_after_CJchar(HText *text, const char *s, unsigned short
 {
     /* Can split after almost any CJ char (Korean uses space) */
     /* TODO: UAX#14 Unicode Line Breaking Algorithm (use ICU4C?) */
+    if (isUTF8CJChar(s))
+        text->permissible_split = pos;
+}
+#endif /* EXP_WCWIDTH_SUPPORT */
+
+#if defined(EXP_WCWIDTH_SUPPORT) || defined(EXP_JAPANESE_SPACES)
+BOOL isUTF8CJChar(const char *s)
+{
     UCode_t u = UCGetUniFromUtf8String(&s);
     if (u >= 0x4e00 && u <= 0x9fff ||   /* CJK Unified Ideographs */
         u >= 0x3000 && u <= 0x30ff ||   /* CJK Symbols and Punctuation, Hiragana, Katakana */
@@ -14981,6 +15039,7 @@ static void permit_split_after_CJchar(HText *text, const char *s, unsigned short
         u >= 0x3400 && u <= 0x4dbf ||   /* CJK Unified Ideographs Extension A */
         u >= 0xf900 && u <= 0xfaff ||   /* CJK Compatibility Ideographs */
         u >= 0x20000 && u <= 0x3ffff)   /* {Supplementary,Tertiary} Ideographic Plane */
-        text->permissible_split = pos;
+        return YES;
+    return NO;
 }
-#endif
+#endif /* EXP_WCWIDTH_SUPPORT || EXP_JAPANESE_SPACES */
diff --git a/src/GridText.h b/src/GridText.h
index 911de26..40b17b1 100644
--- a/src/GridText.h
+++ b/src/GridText.h
@@ -93,6 +93,9 @@ US-ASCII control characters <32 which are not defined in Unicode standard
 
     extern void HText_setLastChar(HText *text, int ch);
     extern char HText_getLastChar(HText *text);
+#ifdef EXP_JAPANESE_SPACES
+    extern BOOL HText_checkLastChar_needSpaceOnJoinLines(HText *text);
+#endif
 
     extern int HText_sourceAnchors(HText *text);
     extern void HText_setStale(HText *text);
@@ -289,6 +292,10 @@ US-ASCII control characters <32 which are not defined in Unicode standard
     extern HTkcode HText_getSpecifiedKcode(HText *text);
     extern void HText_updateSpecifiedKcode(HText *text, HTkcode kcode);
 
+#if defined(EXP_WCWIDTH_SUPPORT) || defined(EXP_JAPANESE_SPACES)
+    extern BOOL isUTF8CJChar(const char *s);
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/HTML.c b/src/HTML.c
index a012466..cf2e18b 100644
--- a/src/HTML.c
+++ b/src/HTML.c
@@ -275,18 +275,6 @@ void LYShowBadHTML(const char *message)
  * A C T I O N R O U T I N E S
  */
 
-/* FIXME: this should be amended to do the substitution only when not in a
- * multibyte stream.
- */
-#ifdef EXP_JAPANESE_SPACES
-#define FIX_JAPANESE_SPACES \
-        (HTCJK == CHINESE || HTCJK == JAPANESE || HTCJK == TAIPEI)
-        /* don't replace '\n' with ' ' if Chinese or Japanese - HN
-         */
-#else
-#define FIX_JAPANESE_SPACES 0
-#endif
-
 /* Character handling
  * ------------------
  */
@@ -333,12 +321,25 @@ void HTML_put_character(HTStructured * me, int c)
             return;
         if (c != '\n' && c != '\t' && c != '\r') {
             HTChunkPutc(&me->title, uc);
-        } else if (FIX_JAPANESE_SPACES) {
-            if (c == '\t') {
-                HTChunkPutc(&me->title, ' ');
-            } else {
+#ifdef EXP_JAPANESE_SPACES
+        } else if (c == '\t') {
+            HTChunkPutc(&me->title, ' ');
+            /* don't replace '\n' with ' ' if Chinese or Japanese - HN
+             */
+        } else if (me->title.size > 0 && is8bits(me->title.data[me->title.size - 1])) {
+            if (HTCJK == CHINESE || HTCJK == JAPANESE) {
+                /* TODO: support 2nd byte of SJIS (!is8bits && IS_SJIS_LO) */
                 return;
+            } else if (IS_UTF8_TTY) {
+                /* find start position of UTF-8 sequence */
+                int i = me->title.size - 1;
+                while (i > 0 && (me->title.data[i] & 0xc0) == 0x80)     /* UTF_EXTRA */
+                    i--;
+                if (isUTF8CJChar(&(me->title.data[i])))
+                    return;
             }
+            HTChunkPutc(&me->title, ' ');
+#endif
         } else {
             HTChunkPutc(&me->title, ' ');
         }
@@ -453,15 +454,17 @@ void HTML_put_character(HTStructured * me, int c)
                 UPDATE_STYLE;
             }
             if (c == '\n') {
-                if (!FIX_JAPANESE_SPACES) {
-                    if (me->in_word) {
-                        if (HText_getLastChar(me->text) != ' ') {
-                            me->inP = TRUE;
-                            me->inLABEL = FALSE;
-                            HText_appendCharacter(me->text, ' ');
-                        }
-                        me->in_word = NO;
+                if (me->in_word) {
+#ifdef EXP_JAPANESE_SPACES
+                    if (HText_checkLastChar_needSpaceOnJoinLines(me->text)) {
+#else
+                    if (HText_getLastChar(me->text) != ' ') {
+#endif
+                        me->inP = TRUE;
+                        me->inLABEL = FALSE;
+                        HText_appendCharacter(me->text, ' ');
                     }
+                    me->in_word = NO;
                 }
 
             } else if (c == ' ' || c == '\t') {
@@ -607,12 +610,14 @@ void HTML_put_string(HTStructured * me, const char *s)
                     UPDATE_STYLE;
                 }
                 if (c == '\n') {
-                    if (!FIX_JAPANESE_SPACES) {
-                        if (me->in_word) {
-                            if (HText_getLastChar(me->text) != ' ')
-                                HText_appendCharacter(me->text, ' ');
-                            me->in_word = NO;
-                        }
+                    if (me->in_word) {
+#ifdef EXP_JAPANESE_SPACES
+                        if (HText_checkLastChar_needSpaceOnJoinLines(me->text))
+#else
+                        if (HText_getLastChar(me->text) != ' ')
+#endif
+                            HText_appendCharacter(me->text, ' ');
+                        me->in_word = NO;
                     }
 
                 } else if (c == ' ' || c == '\t') {