From cfdf68a7acde16597fbd896674af68c42361102c Mon Sep 17 00:00:00 2001 From: bfredl Date: Thu, 8 Aug 2024 10:42:08 +0200 Subject: [PATCH] feat(mbyte): support extended grapheme clusters including more emoji Use the grapheme break algorithm from utf8proc to support grapheme clusters from recent unicode versions. Handle variant selector VS16 turning some codepoints into double-width emoji. This means we need to use ptr2cells rather than char2cells when possible. --- runtime/doc/mbyte.txt | 6 + runtime/doc/news.txt | 6 + runtime/doc/options.txt | 9 +- runtime/lua/vim/_meta/options.lua | 9 +- src/nvim/api/extmark.c | 2 +- src/nvim/api/ui.c | 2 +- src/nvim/change.c | 7 +- src/nvim/digraph.c | 2 +- src/nvim/drawline.c | 22 +- src/nvim/edit.c | 12 +- src/nvim/ex_cmds.c | 2 +- src/nvim/ex_getln.c | 6 +- src/nvim/grid.c | 37 ++- src/nvim/mbyte.c | 288 +++++++++++++++++------- src/nvim/mbyte.h | 32 +-- src/nvim/message.c | 4 +- src/nvim/normal.c | 5 +- src/nvim/options.lua | 9 +- src/nvim/plines.c | 28 ++- src/nvim/plines.h | 2 +- src/nvim/regexp.c | 29 +-- src/nvim/search.c | 2 +- src/nvim/sign.c | 2 +- src/nvim/spellsuggest.c | 10 +- src/nvim/textformat.c | 2 +- src/nvim/tui/tui.c | 17 +- src/nvim/tui/tui_defs.h | 1 + test/functional/api/vim_spec.lua | 22 ++ test/functional/ui/decorations_spec.lua | 21 ++ test/functional/ui/messages_spec.lua | 35 +++ test/functional/ui/multibyte_spec.lua | 122 +++++++++- test/old/testdir/test_functions.vim | 2 +- test/old/testdir/test_normal.vim | 4 +- test/unit/mbyte_spec.lua | 119 +++++++--- 34 files changed, 657 insertions(+), 221 deletions(-) diff --git a/runtime/doc/mbyte.txt b/runtime/doc/mbyte.txt index a8c5670352..47fd4f3343 100644 --- a/runtime/doc/mbyte.txt +++ b/runtime/doc/mbyte.txt @@ -646,6 +646,12 @@ widespread as file format. A composing or combining character is used to change the meaning of the character before it. The combining characters are drawn on top of the preceding character. 
+ +Nvim largely follows the definition of extended grapheme clusters in UAX#29 +in the Unicode standard, with some modifications: An ASCII char will always +start a new cluster. In addition 'arabicshape' enables the combining of some +Arabic letters, when they are shaped to be displayed together in a single cell. + Too big combined characters cannot be displayed, but they can still be inspected using the |g8| and |ga| commands described below. When editing text a composing character is mostly considered part of the diff --git a/runtime/doc/news.txt b/runtime/doc/news.txt index 80511ccb87..b7e1e0c84f 100644 --- a/runtime/doc/news.txt +++ b/runtime/doc/news.txt @@ -200,6 +200,12 @@ These existing features changed their behavior. top lines are calculated using screen line numbers which take virtual lines into account. +• The implementation of grapheme clusters (or combining chars |mbyte-combining|) + was upgraded to closely follow extended grapheme clusters as defined by UAX#29 + in the Unicode standard. Noteworthily, this enables proper display of many + more emoji characters than before, including those encoded with multiple + emoji codepoints combined with ZWJ (zero width joiner) codepoints. + ============================================================================== REMOVED FEATURES *news-removed* diff --git a/runtime/doc/options.txt b/runtime/doc/options.txt index f44e0954a5..4945a1b46d 100644 --- a/runtime/doc/options.txt +++ b/runtime/doc/options.txt @@ -2217,9 +2217,12 @@ A jump table for the options with a short description can be found at |Q_op|. global When on all Unicode emoji characters are considered to be full width. This excludes "text emoji" characters, which are normally displayed as - single width. Unfortunately there is no good specification for this - and it has been determined on trial-and-error basis. Use the - |setcellwidths()| function to change the behavior. + single width. 
However, such "text emoji" are treated as full-width + emoji if they are followed by the U+FE0F variant selector. + + Unfortunately there is no good specification for this and it has been + determined on trial-and-error basis. Use the |setcellwidths()| + function to change the behavior. *'encoding'* *'enc'* 'encoding' 'enc' string (default "utf-8") diff --git a/runtime/lua/vim/_meta/options.lua b/runtime/lua/vim/_meta/options.lua index b4ac478b61..05c9b89d77 100644 --- a/runtime/lua/vim/_meta/options.lua +++ b/runtime/lua/vim/_meta/options.lua @@ -1829,9 +1829,12 @@ vim.go.ead = vim.go.eadirection --- When on all Unicode emoji characters are considered to be full width. --- This excludes "text emoji" characters, which are normally displayed as ---- single width. Unfortunately there is no good specification for this ---- and it has been determined on trial-and-error basis. Use the ---- `setcellwidths()` function to change the behavior. +--- single width. However, such "text emoji" are treated as full-width +--- emoji if they are followed by the U+FE0F variant selector. +--- +--- Unfortunately there is no good specification for this and it has been +--- determined on trial-and-error basis. Use the `setcellwidths()` +--- function to change the behavior. 
--- --- @type boolean vim.o.emoji = true diff --git a/src/nvim/api/extmark.c b/src/nvim/api/extmark.c index 1673519479..d694b64f66 100644 --- a/src/nvim/api/extmark.c +++ b/src/nvim/api/extmark.c @@ -571,7 +571,7 @@ Integer nvim_buf_set_extmark(Buffer buffer, Integer ns_id, Integer line, Integer String c = opts->conceal; if (c.size > 0) { int ch; - hl.conceal_char = utfc_ptr2schar_len(c.data, (int)c.size, &ch); + hl.conceal_char = utfc_ptr2schar(c.data, &ch); if (!hl.conceal_char || !vim_isprintc(ch)) { api_set_error(err, kErrorTypeValidation, "conceal char has to be printable"); goto error; diff --git a/src/nvim/api/ui.c b/src/nvim/api/ui.c index 82a5ff5f8e..a99d97acb8 100644 --- a/src/nvim/api/ui.c +++ b/src/nvim/api/ui.c @@ -847,7 +847,7 @@ void remote_ui_raw_line(RemoteUI *ui, Integer grid, Integer row, Integer startco char sc_buf[MAX_SCHAR_SIZE]; schar_get(sc_buf, chunk[i]); remote_ui_put(ui, sc_buf); - if (utf_ambiguous_width(utf_ptr2char(sc_buf))) { + if (utf_ambiguous_width(sc_buf)) { ui->client_col = -1; // force cursor update } } diff --git a/src/nvim/change.c b/src/nvim/change.c index 6e9fab5a9b..47a9f0ce92 100644 --- a/src/nvim/change.c +++ b/src/nvim/change.c @@ -896,14 +896,15 @@ int del_bytes(colnr_T count, bool fixpos_arg, bool use_delcombine) // delete the last combining character. if (p_deco && use_delcombine && utfc_ptr2len(oldp + col) >= count) { char *p0 = oldp + col; - if (utf_composinglike(p0, p0 + utf_ptr2len(p0))) { + GraphemeState state = GRAPHEME_STATE_INIT; + if (utf_composinglike(p0, p0 + utf_ptr2len(p0), &state)) { // Find the last composing char, there can be several. 
int n = col; do { col = n; count = utf_ptr2len(oldp + n); n += count; - } while (utf_composinglike(oldp + col, oldp + n)); + } while (utf_composinglike(oldp + col, oldp + n, &state)); fixpos = false; } } @@ -1694,7 +1695,7 @@ bool open_line(int dir, int flags, int second_line_indent, bool *did_do_comment) } if (curbuf->b_p_ai || (flags & OPENLINE_DELSPACES)) { while ((*p_extra == ' ' || *p_extra == '\t') - && !utf_iscomposing(utf_ptr2char(p_extra + 1))) { + && !utf_iscomposing_first(utf_ptr2char(p_extra + 1))) { if (REPLACE_NORMAL(State)) { replace_push(*p_extra); } diff --git a/src/nvim/digraph.c b/src/nvim/digraph.c index 8149c5964f..7413d33fe4 100644 --- a/src/nvim/digraph.c +++ b/src/nvim/digraph.c @@ -1865,7 +1865,7 @@ static void printdigraph(const digr_T *dp, result_T *previous) p = buf; // add a space to draw a composing char on - if (utf_iscomposing(dp->result)) { + if (utf_iscomposing_first(dp->result)) { *p++ = ' '; } p += utf_char2bytes(dp->result, p); diff --git a/src/nvim/drawline.c b/src/nvim/drawline.c index 8a948716e5..b5273a54ca 100644 --- a/src/nvim/drawline.c +++ b/src/nvim/drawline.c @@ -1826,7 +1826,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s // If a double-width char doesn't fit display a '>' in the last column. // Don't advance the pointer but put the character at the start of the next line. - if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) { + if (wlv.col >= grid->cols - 1 && schar_cells(mb_schar) == 2) { mb_c = '>'; mb_l = 1; (void)mb_l; @@ -1922,7 +1922,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s // If a double-width char doesn't fit display a '>' in the // last column; the character is displayed at the start of the // next line. 
- if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) { + if (wlv.col >= grid->cols - 1 && schar_cells(mb_schar) == 2) { mb_schar = schar_from_ascii('>'); mb_c = '>'; mb_l = 1; @@ -2393,6 +2393,12 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s || (decor_conceal && decor_state.conceal_char) || wp->w_p_cole == 1) && wp->w_p_cole != 3) { + if (schar_cells(mb_schar) > 1) { + // When the first char to be concealed is double-width, + // need to advance one more virtual column. + wlv.n_extra++; + } + // First time at this concealed item: display one // character. if (has_match_conc && match_conc) { @@ -2410,12 +2416,6 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s mb_schar = schar_from_ascii(' '); } - if (utf_char2cells(mb_c) > 1) { - // When the first char to be concealed is double-width, - // need to advance one more virtual column. - wlv.n_extra++; - } - mb_c = schar_get_first_codepoint(mb_schar); prev_syntax_id = syntax_seqnr; @@ -2484,7 +2484,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s && mb_schar != NUL) { mb_schar = wp->w_p_lcs_chars.prec; lcs_prec_todo = NUL; - if (utf_char2cells(mb_c) > 1) { + if (schar_cells(mb_schar) > 1) { // Double-width character being overwritten by the "precedes" // character, need to fill up half the character. wlv.sc_extra = schar_from_ascii(MB_FILLER_CHAR); @@ -2725,7 +2725,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s linebuf_vcol[wlv.off] = wlv.vcol; - if (utf_char2cells(mb_c) > 1) { + if (schar_cells(mb_schar) > 1) { // Need to fill two screen columns. 
wlv.off++; wlv.col++; @@ -2744,7 +2744,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s wlv.off++; wlv.col++; } else if (wp->w_p_cole > 0 && is_concealing) { - bool concealed_wide = utf_char2cells(mb_c) > 1; + bool concealed_wide = schar_cells(mb_schar) > 1; wlv.skip_cells--; wlv.vcol_off_co++; diff --git a/src/nvim/edit.c b/src/nvim/edit.c index 00ce38c4b1..f8723f9680 100644 --- a/src/nvim/edit.c +++ b/src/nvim/edit.c @@ -2832,6 +2832,8 @@ int replace_push_mb(char *p) { int l = utfc_ptr2len(p); + // TODO(bfredl): stop doing this insantity and instead use utf_head_off() when popping. + // or just keep a secondary array with char byte lenghts for (int j = l - 1; j >= 0; j--) { replace_push(p[j]); } @@ -2911,7 +2913,9 @@ static void mb_replace_pop_ins(int cc) for (int i = 1; i < n; i++) { buf[i] = (uint8_t)replace_pop(); } - if (utf_iscomposing(utf_ptr2char((char *)buf))) { + // TODO(bfredl): by fixing replace_push_mb, upgrade to use + // the new composing algorithm + if (utf_iscomposing_legacy(utf_ptr2char((char *)buf))) { ins_bytes_len((char *)buf, (size_t)n); } else { // Not a composing char, put it back. @@ -3843,7 +3847,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p) space_sci = sci; space_vcol = vcol; } - vcol += charsize_nowrap(curbuf, use_ts, vcol, sci.chr.value); + vcol += charsize_nowrap(curbuf, sci.ptr, use_ts, vcol, sci.chr.value); sci = utfc_next(sci); prev_space = cur_space; } @@ -3859,7 +3863,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p) // Find the position to stop backspacing. // Use charsize_nowrap() so that virtual text and wrapping are ignored. 
while (true) { - int size = charsize_nowrap(curbuf, use_ts, space_vcol, space_sci.chr.value); + int size = charsize_nowrap(curbuf, space_sci.ptr, use_ts, space_vcol, space_sci.chr.value); if (space_vcol + size > want_vcol) { break; } @@ -3930,7 +3934,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p) bool has_composing = false; if (p_deco) { char *p0 = get_cursor_pos_ptr(); - has_composing = utf_composinglike(p0, p0 + utf_ptr2len(p0)); + has_composing = utf_composinglike(p0, p0 + utf_ptr2len(p0), NULL); } del_char(false); // If there are combining characters and 'delcombine' is set diff --git a/src/nvim/ex_cmds.c b/src/nvim/ex_cmds.c index 6ac73527ee..1b6861f750 100644 --- a/src/nvim/ex_cmds.c +++ b/src/nvim/ex_cmds.c @@ -204,7 +204,7 @@ void do_ascii(exarg_T *eap) IObuff[iobuff_len++] = ' '; } IObuff[iobuff_len++] = '<'; - if (utf_iscomposing(c)) { + if (utf_iscomposing_first(c)) { IObuff[iobuff_len++] = ' '; // Draw composing char on top of a space. } iobuff_len += (size_t)utf_char2bytes(c, IObuff + iobuff_len); diff --git a/src/nvim/ex_getln.c b/src/nvim/ex_getln.c index 8a34e03d91..722a857f03 100644 --- a/src/nvim/ex_getln.c +++ b/src/nvim/ex_getln.c @@ -2118,7 +2118,7 @@ static int command_line_handle_key(CommandLineState *s) s->do_abbr = false; // don't do abbreviation now ccline.special_char = NUL; // may need to remove ^ when composing char was typed - if (utf_iscomposing(s->c) && !cmd_silent) { + if (utf_iscomposing_first(s->c) && !cmd_silent) { if (ui_has(kUICmdline)) { // TODO(bfredl): why not make unputcmdline also work with true? unputcmdline(); @@ -3585,7 +3585,9 @@ void put_on_cmdline(const char *str, int len, bool redraw) // backup to the character before it. There could be two of them. 
int i = 0; int c = utf_ptr2char(ccline.cmdbuff + ccline.cmdpos); - while (ccline.cmdpos > 0 && utf_iscomposing(c)) { + // TODO(bfredl): this can be corrected/simplified as utf_head_off implements the + // correct grapheme cluster breaks + while (ccline.cmdpos > 0 && utf_iscomposing_legacy(c)) { i = utf_head_off(ccline.cmdbuff, ccline.cmdbuff + ccline.cmdpos - 1) + 1; ccline.cmdpos -= i; len += i; diff --git a/src/nvim/grid.c b/src/nvim/grid.c index 56246bf001..acb336c725 100644 --- a/src/nvim/grid.c +++ b/src/nvim/grid.c @@ -186,6 +186,24 @@ size_t schar_len(schar_T sc) } } +int schar_cells(schar_T sc) +{ + // hot path +#ifdef ORDER_BIG_ENDIAN + if (!(sc & 0x80FFFFFF)) { + return 1; + } +#else + if (sc < 0x80) { + return 1; + } +#endif + + char sc_buf[MAX_SCHAR_SIZE]; + schar_get(sc_buf, sc); + return utf_ptr2cells(sc_buf); +} + /// gets first raw UTF-8 byte of an schar static char schar_get_first_byte(schar_T sc) { @@ -428,14 +446,19 @@ int grid_line_puts(int col, const char *text, int textlen, int attr) const int max_col = grid_line_maxcol; while (col < max_col && (len < 0 || (int)(ptr - text) < len) && *ptr != NUL) { // check if this is the first byte of a multibyte - int mbyte_blen = len > 0 - ? utfc_ptr2len_len(ptr, (int)((text + len) - ptr)) - : utfc_ptr2len(ptr); + int mbyte_blen; + if (len >= 0) { + int maxlen = (int)((text + len) - ptr); + mbyte_blen = utfc_ptr2len_len(ptr, maxlen); + if (mbyte_blen > maxlen) { + mbyte_blen = 1; + } + } else { + mbyte_blen = utfc_ptr2len(ptr); + } int firstc; - schar_T schar = len >= 0 - ? 
utfc_ptr2schar_len(ptr, (int)((text + len) - ptr), &firstc) - : utfc_ptr2schar(ptr, &firstc); - int mbyte_cells = utf_char2cells(firstc); + schar_T schar = utfc_ptrlen2schar(ptr, mbyte_blen, &firstc); + int mbyte_cells = utf_ptr2cells_len(ptr, mbyte_blen); if (mbyte_cells > 2 || schar == 0) { mbyte_cells = 1; schar = schar_from_char(0xFFFD); diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c index 0c1b537f3a..666a904fc5 100644 --- a/src/nvim/mbyte.c +++ b/src/nvim/mbyte.c @@ -511,20 +511,30 @@ int utf_char2cells(int c) /// Return the number of display cells character at "*p" occupies. /// This doesn't take care of unprintable characters, use ptr2cells() for that. -int utf_ptr2cells(const char *p) +int utf_ptr2cells(const char *p_in) { + const uint8_t *p = (const uint8_t *)p_in; // Need to convert to a character number. - if ((uint8_t)(*p) >= 0x80) { - int c = utf_ptr2char(p); + if ((*p) >= 0x80) { + int len = utf8len_tab[*p]; + int32_t c = utf_ptr2CharInfo_impl(p, (uintptr_t)len); // An illegal byte is displayed as . - if (utf_ptr2len(p) == 1 || c == NUL) { + if (c <= 0) { return 4; } // If the char is ASCII it must be an overlong sequence. if (c < 0x80) { return char2cells(c); } - return utf_char2cells(c); + int cells = utf_char2cells(c); + if (cells == 1 && p_emoji + && intable(emoji_all, ARRAY_SIZE(emoji_all), c)) { + int c2 = utf_ptr2char(p_in + len); + if (c2 == 0xFE0F) { + return 2; // emoji presentation + } + } + return cells; } return 1; } @@ -603,7 +613,8 @@ int utf_ptr2cells_len(const char *p, int size) { // Need to convert to a wide character. 
if (size > 0 && (uint8_t)(*p) >= 0x80) { - if (utf_ptr2len_len(p, size) < utf8len_tab[(uint8_t)(*p)]) { + int len = utf_ptr2len_len(p, size); + if (len < utf8len_tab[(uint8_t)(*p)]) { return 1; // truncated } int c = utf_ptr2char(p); @@ -615,7 +626,16 @@ int utf_ptr2cells_len(const char *p, int size) if (c < 0x80) { return char2cells(c); } - return utf_char2cells(c); + int cells = utf_char2cells(c); + if (cells == 1 && p_emoji && size > len + && intable(emoji_all, ARRAY_SIZE(emoji_all), c) + && utf_ptr2len_len(p + len, size - len) == utf8len_tab[(uint8_t)p[len]]) { + int c2 = utf_ptr2char(p + len); + if (c2 == 0xFE0F) { + return 2; // emoji presentation + } + } + return cells; } return 1; } @@ -648,8 +668,8 @@ size_t mb_string2cells_len(const char *str, size_t size) size_t clen = 0; for (const char *p = str; *p != NUL && p < str + size; - p += utfc_ptr2len_len(p, (int)size + (int)(p - str))) { - clen += (size_t)utf_ptr2cells(p); + p += utfc_ptr2len_len(p, (int)size - (int)(p - str))) { + clen += (size_t)utf_ptr2cells_len(p, (int)size - (int)(p - str)); } return clen; @@ -793,29 +813,48 @@ int mb_cptr2char_adv(const char **pp) return c; } -/// Check if the character pointed to by "p2" is a composing character when it -/// comes after "p1". For Arabic sometimes "ab" is replaced with "c", which -/// behaves like a composing character. -bool utf_composinglike(const char *p1, const char *p2) +/// When "c" is the first char of a string, determine if it needs to be prefixed +/// by a space byte to be drawn correctly, and not merge with the space left of +/// the string. +bool utf_iscomposing_first(int c) { - int c2 = utf_ptr2char(p2); - if (utf_iscomposing(c2)) { - return true; - } - if (!arabic_maycombine(c2)) { - return false; - } - return arabic_combine(utf_ptr2char(p1), c2); + return c >= 128 && !utf8proc_grapheme_break(' ', c); } -/// Check if the next character is a composing character when it -/// comes after the first. 
For Arabic sometimes "ab" is replaced with "c", which -/// behaves like a composing character. -/// returns false for negative values -bool utf_char_composinglike(int32_t const first, int32_t const next) - FUNC_ATTR_PURE +/// Check if the character pointed to by "p2" is a composing character when it +/// comes after "p1". +/// +/// We use the definition in UAX#29 as implemented by utf8proc with the following +/// exceptions: +/// +/// - ASCII chars always begin a new cluster. This is a long assumed invariant +/// in the code base and very useful for performance (we can exit early for ASCII +/// all over the place, branch predictor go brrr in ASCII-only text). +/// As of Unicode 15.1 this will only break BOUNDCLASS_PREPEND followed by ASCII, +/// which should be exceedingly rare (these PREPEND chars are expected to be +/// followed by multibyte chars within the same script family) +/// +/// - When 'arabicshape' is active, some pairs of Arabic letters "ab" are replaced with +/// "c" taking one single cell, which behaves like a cluster. 
+/// +/// @param "state" should be set to GRAPHEME_STATE_INIT before first call +/// it is allowed to be null, but will then not handle some longer +/// sequences, like ZWJ based emoji +bool utf_composinglike(const char *p1, const char *p2, GraphemeState *state) + FUNC_ATTR_NONNULL_ARG(1, 2) { - return utf_iscomposing(next) || arabic_combine(first, next); + if ((uint8_t)(*p2) < 128) { + return false; + } + + int first = utf_ptr2char(p1); + int second = utf_ptr2char(p2); + + if (!utf8proc_grapheme_break_stateful(first, second, state)) { + return true; + } + + return arabic_combine(first, second); } /// Get the screen char at the beginning of a string @@ -834,7 +873,7 @@ schar_T utfc_ptr2schar(const char *p, int *firstc) { int c = utf_ptr2char(p); *firstc = c; // NOT optional, you are gonna need it - bool first_compose = utf_iscomposing(c); + bool first_compose = utf_iscomposing_first(c); size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose; size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen); @@ -845,16 +884,13 @@ schar_T utfc_ptr2schar(const char *p, int *firstc) return schar_from_buf_first(p, len, first_compose); } -/// Get the screen char at the beginning of a string with length +/// Get the screen char from a char with a known length /// /// Like utfc_ptr2schar but use no more than p[maxlen]. 
-schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc) +schar_T utfc_ptrlen2schar(const char *p, int len, int *firstc) FUNC_ATTR_NONNULL_ALL { - assert(maxlen > 0); - - size_t len = (size_t)utf_ptr2len_len(p, maxlen); - if (len > (size_t)maxlen || (len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) { + if ((len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) { // invalid or truncated sequence *firstc = (uint8_t)(*p); return 0; @@ -862,11 +898,13 @@ schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc) int c = utf_ptr2char(p); *firstc = c; - bool first_compose = utf_iscomposing(c); - maxlen = MIN(maxlen, MAX_SCHAR_SIZE - 1 - first_compose); - len = (size_t)utfc_ptr2len_len(p, maxlen); + bool first_compose = utf_iscomposing_first(c); + int maxlen = MAX_SCHAR_SIZE - 1 - first_compose; + if (len > maxlen) { + len = utfc_ptr2len_len(p, maxlen); + } - return schar_from_buf_first(p, len, first_compose); + return schar_from_buf_first(p, (size_t)len, first_compose); } /// Caller must ensure there is space for `first_compose` @@ -964,8 +1002,9 @@ int utfc_ptr2len(const char *const p) // Check for composing characters. int prevlen = 0; + GraphemeState state = GRAPHEME_STATE_INIT; while (true) { - if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) { + if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len, &state)) { return len; } @@ -996,9 +1035,10 @@ int utfc_ptr2len_len(const char *p, int size) return 1; } - // Check for composing characters. We can handle only the first six, but + // Check for composing characters. We can only display a limited amount, but // skip all of them (otherwise the cursor would get stuck). 
int prevlen = 0; + GraphemeState state = GRAPHEME_STATE_INIT; while (len < size) { if ((uint8_t)p[len] < 0x80) { break; @@ -1011,7 +1051,7 @@ int utfc_ptr2len_len(const char *p, int size) break; } - if (!utf_composinglike(p + prevlen, p + len)) { + if (!utf_composinglike(p + prevlen, p + len, &state)) { break; } @@ -1084,11 +1124,18 @@ int utf_char2bytes(const int c, char *const buf) } } -/// Return true if "c" is a composing UTF-8 character. -/// This means it will be drawn on top of the preceding character. +/// Return true if "c" is a legacy composing UTF-8 character. +/// +/// This is deprecated in favour of utf_composinglike() which uses the modern +/// stateful algorithm to determine grapheme clusters. Still available +/// to support some legacy code which hasn't been refactored yet. +/// +/// To check if a char would combine with a preceeding space, use +/// utf_iscomposing_first() instead. +/// /// Based on code from Markus Kuhn. /// Returns false for negative values. -bool utf_iscomposing(int c) +bool utf_iscomposing_legacy(int c) { return intable(combining, ARRAY_SIZE(combining), c); } @@ -1278,8 +1325,9 @@ int utf_class_tab(const int c, const uint64_t *const chartab) return 2; } -bool utf_ambiguous_width(int c) +bool utf_ambiguous_width(const char *p) { + int c = utf_ptr2char(p); return c >= 0x80 && (intable(ambiguous, ARRAY_SIZE(ambiguous), c) || intable(emoji_all, ARRAY_SIZE(emoji_all), c)); } @@ -1666,6 +1714,26 @@ void show_utf8(void) msg(IObuff, 0); } +/// @return true if boundclass bc always starts a new cluster regardless of what's before +/// false negatives are allowed (perf cost, not correctness) +static bool always_break(int bc) +{ + return (bc == UTF8PROC_BOUNDCLASS_CONTROL); +} + +/// @return true if bc2 always starts a cluster after bc1 +/// false negatives are allowed (perf cost, not correctness) +static bool always_break_two(int bc1, int bc2) +{ + // don't check for UTF8PROC_BOUNDCLASS_CONTROL for bc2 as it either has been checked by + // 
"always_break" on first iteration or when it was bc1 in the previous iteration + return ((bc1 != UTF8PROC_BOUNDCLASS_PREPEND && bc2 == UTF8PROC_BOUNDCLASS_OTHER) + || (bc1 >= UTF8PROC_BOUNDCLASS_CR && bc1 <= UTF8PROC_BOUNDCLASS_CONTROL) + || (bc2 == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC + && (bc1 == UTF8PROC_BOUNDCLASS_OTHER + || bc1 == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC))); +} + /// Return offset from "p" to the start of a character, including composing characters. /// "base" must be the start of the string, which must be NUL terminated. /// If "p" points to the NUL at the end of the string return 0. @@ -1679,50 +1747,108 @@ int utf_head_off(const char *base_in, const char *p_in) const uint8_t *base = (uint8_t *)base_in; const uint8_t *p = (uint8_t *)p_in; - // Skip backwards over trailing bytes: 10xx.xxxx - // Skip backwards again if on a composing char. - const uint8_t *q; - for (q = p;; q--) { - // Move s to the last byte of this char. - const uint8_t *s; - for (s = q; (s[1] & 0xc0) == 0x80; s++) {} + const uint8_t *start = p; - // Move q to the first byte of this char. - while (q > base && (*q & 0xc0) == 0x80) { - q--; - } - // Check for illegal sequence. Do allow an illegal byte after where we - // started. - int len = utf8len_tab[*q]; - if (len != (int)(s - q + 1) && len != (int)(p - q + 1)) { - return 0; + // move start to the first byte of this codepoint + // might stop on a continuation byte if overlong, handled by utf_ptr2CharInfo_impl + while (start > base && (*start & 0xc0) == 0x80 && (p - start) < 6) { + start--; + } + + uint8_t cur_len = utf8len_tab[*start]; + int32_t cur_code = utf_ptr2CharInfo_impl(start, (uintptr_t)cur_len); + if (cur_code < 0) { + return 0; // p must be part of an illegal sequence + } + const uint8_t * const safe_end = start + cur_len; + + int cur_bc = utf8proc_get_property(cur_code)->boundclass; + if (always_break(cur_bc)) { + return (int)(p - start); + } + + // backtrack to find the start of a cluster. 
we might go too far, checked in the next loop + const uint8_t *cur_pos = start; + const uint8_t *const p_start = start; + + if (start == base) { + return (int)(p - start); + } + + start--; + while (*start >= 0x80) { // stop on ascii, we are done + while (start > base && (*start & 0xc0) == 0x80 && (cur_pos - start) < 6) { + start--; } - if (q <= base) { + int32_t prev_code = utf_ptr2CharInfo_impl(start, (uintptr_t)utf8len_tab[*start]); + if (prev_code < 0) { + start = cur_pos; // start at valid sequence after invalid bytes break; } - int c = utf_ptr2char((char *)q); - if (utf_iscomposing(c)) { - continue; + int prev_bc = utf8proc_get_property(prev_code)->boundclass; + if (always_break_two(prev_bc, cur_bc) && !arabic_combine(prev_code, cur_code)) { + start = cur_pos; // prev_code cannot be a part of this cluster + break; + } else if (start == base) { + break; } + cur_pos = start; + cur_bc = prev_bc; + cur_code = prev_code; - if (arabic_maycombine(c)) { - // Advance to get a sneak-peak at the next char - const uint8_t *j = q; - j--; - // Move j to the first byte of this char. - while (j > base && (*j & 0xc0) == 0x80) { - j--; - } - if (arabic_combine(utf_ptr2char((char *)j), c)) { - continue; - } - } - break; + start--; } - return (int)(p - q); + // hot path: we are already on the first codepoint of a sequence + if (start == p_start) { + return (int)(p - start); + } + + const uint8_t *q = start; + while (q < p) { + // don't need to find end of cluster. once we reached the codepoint of p, we are done + int len = utfc_ptr2len_len((const char *)q, (int)(safe_end - q)); + + if (q + len > p) { + return (int)(p - q); + } + + q += len; + } + + return 0; +} + +/// Assumes caller already handles ascii. 
see `utfc_next` +StrCharInfo utfc_next_impl(StrCharInfo cur) +{ + int32_t prev_code = cur.chr.value; + uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len); + GraphemeState state = GRAPHEME_STATE_INIT; + assert(*next >= 0x80); + + while (true) { + uint8_t const next_len = utf8len_tab[*next]; + int32_t const next_code = utf_ptr2CharInfo_impl(next, (uintptr_t)next_len); + if (utf8proc_grapheme_break_stateful(prev_code, next_code, &state) + && !arabic_combine(prev_code, next_code)) { + return (StrCharInfo){ + .ptr = (char *)next, + .chr = (CharInfo){ .value = next_code, .len = (next_code < 0 ? 1 : next_len) }, + }; + } + + prev_code = next_code; + next += next_len; + if (EXPECT(*next < 0x80U, true)) { + return (StrCharInfo){ + .ptr = (char *)next, + .chr = (CharInfo){ .value = *next, .len = 1 }, + }; + } + } } // Whether space is NOT allowed before/after 'c'. @@ -2681,7 +2807,7 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si c = 0x100; break; // not in latin9 } } - if (!utf_iscomposing(c)) { // skip composing chars + if (!utf_iscomposing_legacy(c)) { // skip composing chars if (c < 0x100) { *d++ = (uint8_t)c; } else if (vcp->vc_fail) { diff --git a/src/nvim/mbyte.h b/src/nvim/mbyte.h index 6cbfbcbc3c..2da051fca2 100644 --- a/src/nvim/mbyte.h +++ b/src/nvim/mbyte.h @@ -3,6 +3,7 @@ #include #include #include // IWYU pragma: keep +#include #include // IWYU pragma: keep #include "nvim/cmdexpand_defs.h" // IWYU pragma: keep @@ -11,6 +12,9 @@ #include "nvim/mbyte_defs.h" // IWYU pragma: keep #include "nvim/types_defs.h" // IWYU pragma: keep +typedef utf8proc_int32_t GraphemeState; +#define GRAPHEME_STATE_INIT 0 + #ifdef INCLUDE_GENERATED_DECLARATIONS # include "mbyte.h.generated.h" # include "mbyte.h.inline.generated.h" @@ -92,28 +96,16 @@ static inline CharInfo utf_ptr2CharInfo(char const *const p_in) static inline StrCharInfo utfc_next(StrCharInfo cur) FUNC_ATTR_NONNULL_ALL FUNC_ATTR_ALWAYS_INLINE FUNC_ATTR_PURE { - int32_t prev_code = 
cur.chr.value; + // handle ASCII case inline uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len); - - while (true) { - if (EXPECT(*next < 0x80U, true)) { - return (StrCharInfo){ - .ptr = (char *)next, - .chr = (CharInfo){ .value = *next, .len = 1 }, - }; - } - uint8_t const next_len = utf8len_tab[*next]; - int32_t const next_code = utf_ptr2CharInfo_impl(next, (uintptr_t)next_len); - if (!utf_char_composinglike(prev_code, next_code)) { - return (StrCharInfo){ - .ptr = (char *)next, - .chr = (CharInfo){ .value = next_code, .len = (next_code < 0 ? 1 : next_len) }, - }; - } - - prev_code = next_code; - next += next_len; + if (EXPECT(*next < 0x80U, true)) { + return (StrCharInfo){ + .ptr = (char *)next, + .chr = (CharInfo){ .value = *next, .len = 1 }, + }; } + + return utfc_next_impl(cur); } static inline StrCharInfo utf_ptr2StrCharInfo(char *ptr) diff --git a/src/nvim/message.c b/src/nvim/message.c index 53e5511a5a..79e6bc8be7 100644 --- a/src/nvim/message.c +++ b/src/nvim/message.c @@ -446,9 +446,7 @@ void trunc_string(const char *s, char *buf, int room_in, int buflen) // Last part: End of the string. half = i = (int)strlen(s); while (true) { - do { - half = half - utf_head_off(s, s + half - 1) - 1; - } while (half > 0 && utf_iscomposing(utf_ptr2char(s + half))); + half = half - utf_head_off(s, s + half - 1) - 1; n = ptr2cells(s + half); if (len + n > room || half == 0) { break; diff --git a/src/nvim/normal.c b/src/nvim/normal.c index f44a64af21..f3bdea9a85 100644 --- a/src/nvim/normal.c +++ b/src/nvim/normal.c @@ -837,7 +837,10 @@ static void normal_get_additional_char(NormalState *s) while ((s->c = vpeekc()) > 0 && (s->c >= 0x100 || MB_BYTE2LEN(vpeekc()) > 1)) { s->c = plain_vgetc(); - if (!utf_iscomposing(s->c)) { + // TODO(bfredl): only allowing up to two composing chars is cringe af. 
+ // Could reuse/abuse schar_T to at least allow us to input anything we are able + // to display and use the stateful utf8proc algorithm like utf_composinglike + if (!utf_iscomposing_legacy(s->c)) { vungetc(s->c); // it wasn't, put it back break; } else if (s->ca.ncharC1 == 0) { diff --git a/src/nvim/options.lua b/src/nvim/options.lua index 3612a80fb8..1c17b0fc9f 100644 --- a/src/nvim/options.lua +++ b/src/nvim/options.lua @@ -2326,9 +2326,12 @@ return { desc = [=[ When on all Unicode emoji characters are considered to be full width. This excludes "text emoji" characters, which are normally displayed as - single width. Unfortunately there is no good specification for this - and it has been determined on trial-and-error basis. Use the - |setcellwidths()| function to change the behavior. + single width. However, such "text emoji" are treated as full-width + emoji if they are followed by the U+FE0F variant selector. + + Unfortunately there is no good specification for this and it has been + determined on trial-and-error basis. Use the |setcellwidths()| + function to change the behavior. 
]=], full_name = 'emoji', redraw = { 'all_windows', 'ui_option' }, diff --git a/src/nvim/plines.c b/src/nvim/plines.c index e51e9bf8c3..408fe26bf3 100644 --- a/src/nvim/plines.c +++ b/src/nvim/plines.c @@ -146,7 +146,7 @@ CharSize charsize_regular(CharsizeArg *csarg, char *const cur, colnr_T const vco } else if (cur_char < 0) { size = kInvalidByteCells; } else { - size = char2cells(cur_char); + size = ptr2cells(cur); is_doublewidth = size == 2 && cur_char > 0x80; } @@ -337,8 +337,8 @@ CharSize charsize_regular(CharsizeArg *csarg, char *const cur, colnr_T const vco /// /// @see charsize_regular /// @see charsize_fast -static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, colnr_T const vcol, - int32_t const cur_char) +static inline CharSize charsize_fast_impl(win_T *const wp, const char *cur, bool use_tabstop, + colnr_T const vcol, int32_t const cur_char) FUNC_ATTR_PURE FUNC_ATTR_ALWAYS_INLINE { // A tab gets expanded, depending on the current column @@ -352,7 +352,11 @@ static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, col if (cur_char < 0) { width = kInvalidByteCells; } else { - width = char2cells(cur_char); + // TODO(bfredl): perf: often cur_char is enough at this point to determine width. + // we likely want a specialized version of utf_ptr2StrCharInfo also determining + // the ptr2cells width at the same time without any extra decoding. (also applies + // to charsize_regular and charsize_nowrap) + width = ptr2cells(cur); } // If a double-width char doesn't fit at the end of a line, it wraps to the next line, @@ -371,23 +375,23 @@ static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, col /// Can be used if CSType is kCharsizeFast. 
/// /// @see charsize_regular -CharSize charsize_fast(CharsizeArg *csarg, colnr_T const vcol, int32_t const cur_char) +CharSize charsize_fast(CharsizeArg *csarg, const char *cur, colnr_T vcol, int32_t cur_char) FUNC_ATTR_PURE { - return charsize_fast_impl(csarg->win, csarg->use_tabstop, vcol, cur_char); + return charsize_fast_impl(csarg->win, cur, csarg->use_tabstop, vcol, cur_char); } /// Get the number of cells taken up on the screen at given virtual column. /// /// @see win_chartabsize() -int charsize_nowrap(buf_T *buf, bool use_tabstop, colnr_T vcol, int32_t cur_char) +int charsize_nowrap(buf_T *buf, const char *cur, bool use_tabstop, colnr_T vcol, int32_t cur_char) { if (cur_char == TAB && use_tabstop) { return tabstop_padding(vcol, buf->b_p_ts, buf->b_p_vts_array); } else if (cur_char < 0) { return kInvalidByteCells; } else { - return char2cells(cur_char); + return ptr2cells(cur); } } @@ -467,7 +471,7 @@ int linesize_fast(CharsizeArg const *const csarg, int vcol_arg, colnr_T const le StrCharInfo ci = utf_ptr2StrCharInfo(line); while (ci.ptr - line < len && *ci.ptr != NUL) { - vcol += charsize_fast_impl(wp, use_tabstop, vcol_arg, ci.chr.value).width; + vcol += charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol_arg, ci.chr.value).width; ci = utfc_next(ci); if (vcol > MAXCOL) { vcol_arg = MAXCOL; @@ -530,7 +534,7 @@ void getvcol(win_T *wp, pos_T *pos, colnr_T *start, colnr_T *cursor, colnr_T *en char_size = (CharSize){ .width = 1 }; break; } - char_size = charsize_fast_impl(wp, use_tabstop, vcol, ci.chr.value); + char_size = charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol, ci.chr.value); StrCharInfo const next = utfc_next(ci); if (next.ptr - line > end_col) { break; @@ -627,7 +631,7 @@ void getvvcol(win_T *wp, pos_T *pos, colnr_T *start, colnr_T *cursor, colnr_T *e if (pos->col < ml_get_buf_len(wp->w_buffer, pos->lnum)) { int c = utf_ptr2char(ptr + pos->col); if ((c != TAB) && vim_isprintc(c)) { - endadd = (colnr_T)(char2cells(c) - 1); + endadd = 
(colnr_T)(ptr2cells(ptr + pos->col) - 1); if (coladd > endadd) { // past end of line endadd = 0; @@ -824,7 +828,7 @@ int plines_win_col(win_T *wp, linenr_T lnum, long column) if (cstype == kCharsizeFast) { bool const use_tabstop = csarg.use_tabstop; while (*ci.ptr != NUL && --column >= 0) { - vcol += charsize_fast_impl(wp, use_tabstop, vcol, ci.chr.value).width; + vcol += charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol, ci.chr.value).width; ci = utfc_next(ci); } } else { diff --git a/src/nvim/plines.h b/src/nvim/plines.h index 7128e37237..50310b8ce1 100644 --- a/src/nvim/plines.h +++ b/src/nvim/plines.h @@ -54,7 +54,7 @@ static inline CharSize win_charsize(CSType cstype, int vcol, char *ptr, int32_t FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_ALWAYS_INLINE { if (cstype == kCharsizeFast) { - return charsize_fast(csarg, vcol, chr); + return charsize_fast(csarg, ptr, vcol, chr); } else { return charsize_regular(csarg, ptr, vcol, chr); } diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c index 7dbbb19545..c91c112c3c 100644 --- a/src/nvim/regexp.c +++ b/src/nvim/regexp.c @@ -3031,7 +3031,7 @@ static bool use_multibytecode(int c) { return utf_char2len(c) > 1 && (re_multi_type(peekchr()) != NOT_MULTI - || utf_iscomposing(c)); + || utf_iscomposing_legacy(c)); } // Emit (if appropriate) a byte of code @@ -4326,7 +4326,7 @@ static uint8_t *regatom(int *flagp) } // When '.' is followed by a composing char ignore the dot, so that // the composing char is matched here. - if (c == Magic('.') && utf_iscomposing(peekchr())) { + if (c == Magic('.') && utf_iscomposing_legacy(peekchr())) { c = getchr(); goto do_multibyte; } @@ -5001,9 +5001,10 @@ do_multibyte: int l; // Need to get composing character too. 
+ GraphemeState state = GRAPHEME_STATE_INIT; while (true) { l = utf_ptr2len(regparse); - if (!utf_composinglike(regparse, regparse + l)) { + if (!utf_composinglike(regparse, regparse + l, &state)) { break; } regmbc(utf_ptr2char(regparse)); @@ -6569,7 +6570,7 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out) // Check for following composing character, unless %C // follows (skips over all composing chars). if (status != RA_NOMATCH - && utf_composinglike((char *)rex.input, (char *)rex.input + len) + && utf_composinglike((char *)rex.input, (char *)rex.input + len, NULL) && !rex.reg_icombine && OP(next) != RE_COMPOSING) { // raaron: This code makes a composing character get @@ -6624,14 +6625,14 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out) break; } const int opndc = utf_ptr2char((char *)opnd); - if (utf_iscomposing(opndc)) { + if (utf_iscomposing_legacy(opndc)) { // When only a composing char is given match at any // position where that composing char appears. status = RA_NOMATCH; for (i = 0; rex.input[i] != NUL; i += utf_ptr2len((char *)rex.input + i)) { const int inpc = utf_ptr2char((char *)rex.input + i); - if (!utf_iscomposing(inpc)) { + if (!utf_iscomposing_legacy(inpc)) { if (i > 0) { break; } @@ -6654,7 +6655,7 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out) case RE_COMPOSING: // Skip composing characters. - while (utf_iscomposing(utf_ptr2char((char *)rex.input))) { + while (utf_iscomposing_legacy(utf_ptr2char((char *)rex.input))) { rex.input += utf_ptr2len((char *)rex.input); } break; @@ -10070,7 +10071,7 @@ static int nfa_regatom(void) } // When '.' is followed by a composing char ignore the dot, so that // the composing char is matched here. 
- if (c == Magic('.') && utf_iscomposing(peekchr())) { + if (c == Magic('.') && utf_iscomposing_legacy(peekchr())) { old_regparse = (uint8_t *)regparse; c = getchr(); goto nfa_do_multibyte; @@ -10705,7 +10706,7 @@ collection: nfa_do_multibyte: // plen is length of current char with composing chars if (utf_char2len(c) != (plen = utfc_ptr2len((char *)old_regparse)) - || utf_iscomposing(c)) { + || utf_iscomposing_legacy(c)) { int i = 0; // A base character plus composing characters, or just one @@ -14033,7 +14034,7 @@ static int find_match_text(colnr_T *startcol, int regstart, uint8_t *match_text) } if (match // check that no composing char follows - && !utf_iscomposing(utf_ptr2char((char *)s2))) { + && !utf_iscomposing_legacy(utf_ptr2char((char *)s2))) { cleanup_subexpr(); if (REG_MULTI) { rex.reg_startpos[0].lnum = rex.lnum; @@ -14278,7 +14279,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm // is not really a match. if (!rex.reg_icombine && rex.input != rex.line - && utf_iscomposing(curc)) { + && utf_iscomposing_legacy(curc)) { break; } nfa_match = true; @@ -14622,7 +14623,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm sta = t->state->out; len = 0; - if (utf_iscomposing(sta->c)) { + if (utf_iscomposing_legacy(sta->c)) { // Only match composing character(s), ignore base // character. Used for ".{composing}" and "{composing}" // (no preceding character). @@ -14724,7 +14725,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm int j; sta = t->state->out->out; - if (utf_iscomposing(sta->c)) { + if (utf_iscomposing_legacy(sta->c)) { // Only match composing character(s), ignore base // character. Used for ".{composing}" and "{composing}" // (no preceding character). @@ -14846,7 +14847,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm case NFA_ANY_COMPOSING: // On a composing character skip over it. Otherwise do // nothing. 
Always matches. - if (utf_iscomposing(curc)) { + if (utf_iscomposing_legacy(curc)) { add_off = clen; } else { add_here = true; diff --git a/src/nvim/search.c b/src/nvim/search.c index 9e00664d86..ff6e135df1 100644 --- a/src/nvim/search.c +++ b/src/nvim/search.c @@ -1260,7 +1260,7 @@ int do_search(oparg_T *oap, int dirc, int search_delim, char *pat, size_t patlen // empty for the search_stat feature. if (!cmd_silent) { msgbuf[0] = (char)dirc; - if (utf_iscomposing(utf_ptr2char(p))) { + if (utf_iscomposing_first(utf_ptr2char(p))) { // Use a space to draw the composing char on. msgbuf[1] = ' '; memmove(msgbuf + 2, p, plen); diff --git a/src/nvim/sign.c b/src/nvim/sign.c index 9b2516ed83..b4ba7833e9 100644 --- a/src/nvim/sign.c +++ b/src/nvim/sign.c @@ -376,7 +376,7 @@ int init_sign_text(sign_T *sp, schar_T *sign_text, char *text) if (!vim_isprintc(c)) { break; } - int width = utf_char2cells(c); + int width = utf_ptr2cells(s); if (width == 2) { sign_text[cells + 1] = 0; } diff --git a/src/nvim/spellsuggest.c b/src/nvim/spellsuggest.c index d6053a533e..b37f01e769 100644 --- a/src/nvim/spellsuggest.c +++ b/src/nvim/spellsuggest.c @@ -1792,10 +1792,8 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun // For changing a composing character adjust // the score from SCORE_SUBST to // SCORE_SUBCOMP. 
- if (utf_iscomposing(utf_ptr2char(tword + sp->ts_twordlen - - sp->ts_tcharlen)) - && utf_iscomposing(utf_ptr2char(fword - + sp->ts_fcharstart))) { + if (utf_iscomposing_legacy(utf_ptr2char(tword + sp->ts_twordlen - sp->ts_tcharlen)) + && utf_iscomposing_legacy(utf_ptr2char(fword + sp->ts_fcharstart))) { sp->ts_score -= SCORE_SUBST - SCORE_SUBCOMP; } else if (!soundfold && slang->sl_has_map @@ -1811,7 +1809,7 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun && sp->ts_twordlen > sp->ts_tcharlen) { p = tword + sp->ts_twordlen - sp->ts_tcharlen; c = utf_ptr2char(p); - if (utf_iscomposing(c)) { + if (utf_iscomposing_legacy(c)) { // Inserting a composing char doesn't // count that much. sp->ts_score -= SCORE_INS - SCORE_INSCOMP; @@ -1876,7 +1874,7 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun c = utf_ptr2char(fword + sp->ts_fidx); stack[depth].ts_fidx = (uint8_t)(stack[depth].ts_fidx + utfc_ptr2len(fword + sp->ts_fidx)); - if (utf_iscomposing(c)) { + if (utf_iscomposing_legacy(c)) { stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP; } else if (c == utf_ptr2char(fword + stack[depth].ts_fidx)) { stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP; diff --git a/src/nvim/textformat.c b/src/nvim/textformat.c index 96907362dd..30c7d0ee92 100644 --- a/src/nvim/textformat.c +++ b/src/nvim/textformat.c @@ -47,7 +47,7 @@ static bool did_add_space = false; ///< auto_format() added an extra space ///< under the cursor #define WHITECHAR(cc) (ascii_iswhite(cc) \ - && !utf_iscomposing(utf_ptr2char((char *)get_cursor_pos_ptr() + 1))) + && !utf_iscomposing_first(utf_ptr2char((char *)get_cursor_pos_ptr() + 1))) /// Return true if format option 'x' is in effect. /// Take care of no formatting when 'paste' is set. 
diff --git a/src/nvim/tui/tui.c b/src/nvim/tui/tui.c index 1866a4a592..7e1068ed56 100644 --- a/src/nvim/tui/tui.c +++ b/src/nvim/tui/tui.c @@ -109,6 +109,7 @@ struct TUIData { bool set_cursor_color_as_str; bool cursor_color_changed; bool is_starting; + bool did_set_grapheme_cluster_mode; FILE *screenshot; cursorentry_T cursor_shapes[SHAPE_IDX_COUNT]; HlAttrs clear_attrs; @@ -220,6 +221,7 @@ static void tui_set_term_mode(TUIData *tui, TermMode mode, bool set) void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state) FUNC_ATTR_NONNULL_ALL { + bool is_set = false; switch (state) { case kTermModeNotRecognized: case kTermModePermanentlySet: @@ -228,6 +230,8 @@ void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state) // then there is nothing to do break; case kTermModeSet: + is_set = true; + FALLTHROUGH; case kTermModeReset: // The terminal supports changing the given mode switch (mode) { @@ -240,6 +244,12 @@ void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state) signal_watcher_stop(&tui->winch_handle); tui_set_term_mode(tui, mode, true); break; + case kTermModeGraphemeClusters: + if (!is_set) { + tui_set_term_mode(tui, mode, true); + tui->did_set_grapheme_cluster_mode = true; + } + break; } } } @@ -434,6 +444,7 @@ static void terminfo_start(TUIData *tui) if (!nsterm) { tui_request_term_mode(tui, kTermModeSynchronizedOutput); tui_request_term_mode(tui, kTermModeResizeEvents); + tui_request_term_mode(tui, kTermModeGraphemeClusters); } // Don't use DECRQSS in screen or tmux, as they behave strangely when receiving it. @@ -494,7 +505,9 @@ static void terminfo_stop(TUIData *tui) // Disable resize events tui_set_term_mode(tui, kTermModeResizeEvents, false); - + if (tui->did_set_grapheme_cluster_mode) { + tui_set_term_mode(tui, kTermModeGraphemeClusters, false); + } // May restore old title before exiting alternate screen. 
tui_set_title(tui, NULL_STRING); if (ui_client_exit_status == 0) { @@ -1010,7 +1023,7 @@ static void print_cell_at_pos(TUIData *tui, int row, int col, UCell *cell, bool char buf[MAX_SCHAR_SIZE]; schar_get(buf, cell->data); int c = utf_ptr2char(buf); - bool is_ambiwidth = utf_ambiguous_width(c); + bool is_ambiwidth = utf_ambiguous_width(buf); if (is_doublewidth && (is_ambiwidth || utf_char2cells(c) == 1)) { // If the server used setcellwidths() to treat a single-width char as double-width, // it needs to be treated like an ambiguous-width char. diff --git a/src/nvim/tui/tui_defs.h b/src/nvim/tui/tui_defs.h index 46913e07a2..bd99d6b0ad 100644 --- a/src/nvim/tui/tui_defs.h +++ b/src/nvim/tui/tui_defs.h @@ -4,6 +4,7 @@ typedef struct TUIData TUIData; typedef enum { kTermModeSynchronizedOutput = 2026, + kTermModeGraphemeClusters = 2027, kTermModeResizeEvents = 2048, } TermMode; diff --git a/test/functional/api/vim_spec.lua b/test/functional/api/vim_spec.lua index 4210b7ecf0..074d3ac0a3 100644 --- a/test/functional/api/vim_spec.lua +++ b/test/functional/api/vim_spec.lua @@ -1435,6 +1435,28 @@ describe('API', function() it('cannot handle NULs', function() eq(0, api.nvim_strwidth('\0abc')) end) + + it('can handle emoji with variant selectors and ZWJ', function() + local selector = '❀️' + eq(2, fn.strchars(selector)) + eq(1, fn.strcharlen(selector)) + eq(2, api.nvim_strwidth(selector)) + + local no_selector = '❀' + eq(1, fn.strchars(no_selector)) + eq(1, fn.strcharlen(no_selector)) + eq(1, api.nvim_strwidth(no_selector)) + + local selector_zwj_selector = 'πŸ³οΈβ€βš§οΈ' + eq(5, fn.strchars(selector_zwj_selector)) + eq(1, fn.strcharlen(selector_zwj_selector)) + eq(2, api.nvim_strwidth(selector_zwj_selector)) + + local emoji_zwj_emoji = 'πŸ§‘β€πŸŒΎ' + eq(3, fn.strchars(emoji_zwj_emoji)) + eq(1, fn.strcharlen(emoji_zwj_emoji)) + eq(2, api.nvim_strwidth(emoji_zwj_emoji)) + end) end) describe('nvim_get_current_line, nvim_set_current_line', function() diff --git 
a/test/functional/ui/decorations_spec.lua b/test/functional/ui/decorations_spec.lua index 1709819575..61a5e1d6f7 100644 --- a/test/functional/ui/decorations_spec.lua +++ b/test/functional/ui/decorations_spec.lua @@ -5620,6 +5620,27 @@ l5 ]] }) end) + + it('supports emoji as signs', function() + insert(example_test3) + feed 'gg' + api.nvim_buf_set_extmark(0, ns, 1, 0, {sign_text='πŸ§‘β€πŸŒΎ'}) + -- VS16 can change width of character + api.nvim_buf_set_extmark(0, ns, 2, 0, {sign_text='❀️'}) + api.nvim_buf_set_extmark(0, ns, 3, 0, {sign_text='❀'}) + api.nvim_buf_set_extmark(0, ns, 4, 0, {sign_text='❀x'}) + screen:expect([[ + {7: }^l1 | + πŸ§‘β€πŸŒΎl2 | + ❀️l3 | + ❀ l4 | + ❀xl5 | + {7: } | + {1:~ }|*3 + | + ]]) + eq("Invalid 'sign_text'", pcall_err(api.nvim_buf_set_extmark, 0, ns, 5, 0, {sign_text='❀️x'})) + end) end) describe('decorations: virt_text', function() diff --git a/test/functional/ui/messages_spec.lua b/test/functional/ui/messages_spec.lua index 07192800e5..036b5ceefc 100644 --- a/test/functional/ui/messages_spec.lua +++ b/test/functional/ui/messages_spec.lua @@ -1436,6 +1436,41 @@ vimComment xxx match /\s"[^\-:.%#=*].*$/ms=s+1,lc=1 excludenl contains=@vim } end) + it('supports nvim_echo messages with emoji', function() + -- stylua: ignore + async_meths.nvim_echo( + { { 'wow, πŸ³οΈβ€βš§οΈπŸ§‘β€πŸŒΎβ€οΈπŸ˜‚πŸ΄β€β˜ οΈ\nvariant ❀️ one\nvariant ❀ two' } }, true, {} + ) + + screen:expect([[ + | + {1:~ }| + {3: }| + wow, πŸ³οΈβ€βš§οΈπŸ§‘β€πŸŒΎβ€οΈπŸ˜‚πŸ΄β€β˜ οΈ | + variant ❀️ one | + variant ❀ two | + {6:Press ENTER or type command to continue}^ | + ]]) + + feed '' + screen:expect([[ + ^ | + {1:~ }|*5 + | + ]]) + + feed ':messages' + screen:expect([[ + | + {1:~ }| + {3: }| + wow, πŸ³οΈβ€βš§οΈπŸ§‘β€πŸŒΎβ€οΈπŸ˜‚πŸ΄β€β˜ οΈ | + variant ❀️ one | + variant ❀ two | + {6:Press ENTER or type command to continue}^ | + ]]) + end) + it('prints lines in Ex mode correctly with a burst of carriage returns #19341', function() command('set number') 
api.nvim_buf_set_lines(0, 0, 0, true, { 'aaa', 'bbb', 'ccc' }) diff --git a/test/functional/ui/multibyte_spec.lua b/test/functional/ui/multibyte_spec.lua index dc25a09d0d..f16f750ea1 100644 --- a/test/functional/ui/multibyte_spec.lua +++ b/test/functional/ui/multibyte_spec.lua @@ -296,6 +296,86 @@ describe('multibyte rendering', function() ]], } end) + + it('supports emoji with variant selectors and ZWJ', function() + command('set ruler') + insert('πŸ³οΈβ€βš§οΈ') + screen:expect([[ + ^πŸ³οΈβ€βš§οΈ | + {1:~ }|*4 + 1,1 All | + ]]) + + feed('a word') + screen:expect([[ + πŸ³οΈβ€βš§οΈ wor^d | + {1:~ }|*4 + 1,21-7 All | + ]]) + + feed('0') + screen:expect([[ + ^πŸ³οΈβ€βš§οΈ word | + {1:~ }|*4 + 1,1 All | + ]]) + + feed('l') + screen:expect([[ + πŸ³οΈβ€βš§οΈ^ word | + {1:~ }|*4 + 1,17-3 All | + ]]) + + feed('h') + screen:expect([[ + ^πŸ³οΈβ€βš§οΈ word | + {1:~ }|*4 + 1,1 All | + ]]) + + feed('o❀️ variant selected') + screen:expect([[ + πŸ³οΈβ€βš§οΈ word | + ❀️ variant selecte^d | + {1:~ }|*3 + 2,23-19 All | + ]]) + + feed('0') + screen:expect([[ + πŸ³οΈβ€βš§οΈ word | + ^❀️ variant selected | + {1:~ }|*3 + 2,1 All | + ]]) + + feed('l') + screen:expect([[ + πŸ³οΈβ€βš§οΈ word | + ❀️^ variant selected | + {1:~ }|*3 + 2,7-3 All | + ]]) + + feed('h') + screen:expect([[ + πŸ³οΈβ€βš§οΈ word | + ^❀️ variant selected | + {1:~ }|*3 + 2,1 All | + ]]) + + -- without selector: single width (note column 18 and not 19) + feed('o❀ variant selected') + screen:expect([[ + πŸ³οΈβ€βš§οΈ word | + ❀️ variant selected | + ❀ variant selecte^d | + {1:~ }|*2 + 3,20-18 All | + ]]) + end) end) describe('multibyte rendering: statusline', function() @@ -348,11 +428,12 @@ describe('multibyte rendering: statusline', function() it('non-printable followed by MAX_MCO unicode combination points', function() command('set statusline=ΒŸα·°βƒ―ΜΈβƒβƒ§βƒ') -- U+9F + U+1DF0 + U+20EF + U+0338 + U+20D0 + U+20E7 + U+20DD + -- TODO: not ideal, better with plain ">" and 
then space+combining screen:expect([[ - ^ | - {1:~ }| - {3:<9f><1df0><20ef><0338><20d0><20e7><20dd>}| - | + ^ | + {1:~ }| + {3:<9f≯⃯ᷰ⃐⃧⃝ }| + | ]]) end) @@ -368,9 +449,20 @@ describe('multibyte rendering: statusline', function() } end) - it('unprintable chars in filename with default stl', function() + it('emoji with ZWJ in filename with default stl', function() command('file πŸ§‘β€πŸ’»') - -- TODO: this is wrong but avoids a crash + screen:expect { + grid = [[ + ^ | + {1:~ }| + {3:πŸ§‘β€πŸ’» }| + | + ]], + } + end) + + it('unprintable chars in filename with default stl', function() + command('file πŸ§‘β€‹πŸ’»') screen:expect { grid = [[ ^ | @@ -381,15 +473,27 @@ describe('multibyte rendering: statusline', function() } end) - it('unprintable chars in filename with custom stl', function() + it('emoji with ZWJ in filename with custom stl', function() command('set statusline=xx%#ErrorMsg#%f%##yy') command('file πŸ§‘β€πŸ’»') - -- TODO: this is also wrong but also avoids a crash screen:expect { grid = [[ ^ | {1:~ }| - {3:xx}{9:πŸ§‘<200d>πŸ’»}{3:yy }| + {3:xx}{9:πŸ§‘β€πŸ’»}{3:yy }| + | + ]], + } + end) + + it('unprintable chars in filename with custom stl', function() + command('set statusline=xx%#ErrorMsg#%f%##yy') + command('file πŸ§‘β€‹πŸ’»') + screen:expect { + grid = [[ + ^ | + {1:~ }| + {3:xx}{9:πŸ§‘<200b>πŸ’»}{3:yy }| | ]], } diff --git a/test/old/testdir/test_functions.vim b/test/old/testdir/test_functions.vim index 7047a62017..ffe7f3fb39 100644 --- a/test/old/testdir/test_functions.vim +++ b/test/old/testdir/test_functions.vim @@ -3663,7 +3663,7 @@ func Test_string_reverse() call assert_equal('', reverse(v:_null_string)) for [s1, s2] in [['', ''], ['a', 'a'], ['ab', 'ba'], ['abc', 'cba'], \ ['abcd', 'dcba'], ['Β«-Β«-Β»-Β»', 'Β»-Β»-Β«-Β«'], - \ ['πŸ‡¦', 'πŸ‡¦'], ['πŸ‡¦πŸ‡§', 'πŸ‡§πŸ‡¦'], ['πŸ‡¦πŸ‡§πŸ‡¨', 'πŸ‡¨πŸ‡§πŸ‡¦'], + \ ['πŸ‡¦', 'πŸ‡¦'], ['πŸ‡¦πŸ‡§', 'πŸ‡¦πŸ‡§'], ['πŸ‡¦πŸ‡§πŸ‡¨', 'πŸ‡¨πŸ‡¦πŸ‡§'], \ ['πŸ‡¦Β«πŸ‡§-πŸ‡¨Β»πŸ‡©', 'πŸ‡©Β»πŸ‡¨-πŸ‡§Β«πŸ‡¦']] call 
assert_equal(s2, reverse(s1)) endfor diff --git a/test/old/testdir/test_normal.vim b/test/old/testdir/test_normal.vim index 8088b1fc57..3ebc9a69a4 100644 --- a/test/old/testdir/test_normal.vim +++ b/test/old/testdir/test_normal.vim @@ -3897,9 +3897,9 @@ func Test_normal_count_after_operator() bw! endfunc -func Test_normal_gj_on_extra_wide_char() +func Test_normal_gj_on_6_cell_wide_unprintable_char() new | 25vsp - let text='1 foooooooo ar e ins‍zwe1 foooooooo ins‍zwei' . + let text='1 foooooooo ar e ins​zwe1 foooooooo ins​zwei' . \ ' i drei vier fΓΌnf sechs sieben acht un zehn elf zwΓΆfl' . \ ' dreizehn v ierzehn fΓΌnfzehn' put =text diff --git a/test/unit/mbyte_spec.lua b/test/unit/mbyte_spec.lua index 8fcc67d20b..787a8862ae 100644 --- a/test/unit/mbyte_spec.lua +++ b/test/unit/mbyte_spec.lua @@ -3,8 +3,15 @@ local itp = t.gen_itp(it) local ffi = t.ffi local eq = t.eq +local to_cstr = t.to_cstr +local ok = t.ok -local lib = t.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h') +local lib = t.cimport( + './src/nvim/mbyte.h', + './src/nvim/charset.h', + './src/nvim/grid.h', + './src/nvim/option_vars.h' +) describe('mbyte', function() -- Convert from bytes to string @@ -45,12 +52,21 @@ describe('mbyte', function() end) end - describe('utfc_ptr2schar_len', function() + describe('utfc_ptr2schar', function() local function test_seq(seq) local firstc = ffi.new('int[1]') local buf = ffi.new('char[32]') - lib.schar_get(buf, lib.utfc_ptr2schar_len(to_string(seq), #seq, firstc)) - return { ffi.string(buf), firstc[0] } + lib.schar_get(buf, lib.utfc_ptr2schar(to_string(seq), firstc)) + local str = ffi.string(buf) + if 1 > 2 then -- for debugging + local tabel = {} + for i = 1, #str do + table.insert(tabel, string.format('0x%02x', string.byte(str, i))) + end + print('{ ' .. table.concat(tabel, ', ') .. 
' }') + io.stdout:flush() + end + return { str, firstc[0] } end local function byte(val) @@ -88,7 +104,9 @@ describe('mbyte', function() eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0x80 }) -- Combining character is U+0300 - eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 }) + eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80 }) + -- invalid start byte for combining + eq({ '\x7f', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 }) -- No UTF-8 sequence eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc }) @@ -102,18 +120,21 @@ describe('mbyte', function() itp('4-byte sequences', function() -- No following combining character eq(byte(0x7f), test_seq { 0x7f, 0x7f, 0xcc, 0x80 }) + eq(byte(0x29), test_seq { 0x29, 0x29, 0xcc, 0x80 }) -- No second UTF-8 character eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80 }) -- Combining character U+0300 - eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc }) + eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc }) -- No UTF-8 sequence eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80 }) -- No following UTF-8 character eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc }) -- Combining character U+0301 - eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 }) + eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81 }) + -- U+0080 : not a valid start char + eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 }) -- One UTF-8 character eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80 }) @@ -126,36 +147,36 @@ describe('mbyte', function() eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80, 0x80 }) -- Combining character U+0300 - eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x00 }) + eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x00 }) -- Combining characters U+0300 and U+0301 - eq({ '\x7f\xcc\x80\xcc\x81', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81 }) + eq({ '\x29\xcc\x80\xcc\x81', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 
0xcc, 0x81 }) -- Combining characters U+0300, U+0301, U+0302 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f }, - test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 } + { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 }, + test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 } ) -- Combining characters U+0300, U+0301, U+0302, U+0303 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x7f }, - test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 } + { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x29 }, + test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 } ) -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x7f }, - test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 } + { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x29 }, + test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 } ) -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x7f }, - test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 } + { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x29 }, + test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 } ) -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x7f }, + { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x29 }, test_seq { - 0x7f, + 0x29, 0xcc, 0x80, 0xcc, @@ -175,18 +196,18 @@ describe('mbyte', function() -- Only three following combining characters U+0300, U+0301, U+0302 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f }, - test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 } + { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 }, + test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 } ) -- No 
UTF-8 sequence eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80, 0x80 }) -- No following UTF-8 character - eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc, 0x80 }) + eq({ '\xc2\xbc', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0xcc, 0x80 }) -- Combining character U+0301 - eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0x7f }) + eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0x7f }) -- Combining character U+0301 - eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0xcc }) + eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0xcc }) -- One UTF-8 character eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0x7f }) @@ -205,8 +226,6 @@ describe('mbyte', function() end) describe('utf_cp_bounds_len', function() - local to_cstr = t.to_cstr - local tests = { { name = 'for valid string', @@ -273,4 +292,52 @@ describe('mbyte', function() eq(expected_offsets, { b = b_offsets, e = e_offsets }) end) end) + + itp('utf_head_off', function() + local function check(str, expected_glyphs) + local len = #str + local cstr = to_cstr(str) + local breaks = { 0 } -- SOT + local pos = 0 + local mb_glyphs = {} + while pos < len do + local clen = lib.utfc_ptr2len(cstr + pos) + ok(clen > 0) -- otherwise we get stuck + if clen > 1 then + table.insert(mb_glyphs, string.sub(str, pos + 1, pos + clen)) + end + pos = pos + clen + table.insert(breaks, pos) + end + eq(breaks[#breaks], len) -- include EOT as break + -- we could also send in breaks, but this is more human readable + eq(mb_glyphs, expected_glyphs) + + for i = 1, #breaks - 1 do + local start, next = breaks[i], breaks[i + 1] + + for p = start, next - 1 do + eq(p - start, lib.utf_head_off(cstr, cstr + p)) + end + end + eq(0, lib.utf_head_off(cstr, cstr + len)) -- NUL byte is safe + end + -- stylua doesn't like ZWJ chars.. 
+ -- stylua: ignore start + check('hej och hΓ₯ πŸ§‘β€πŸŒΎ!', { 'Γ₯', 'πŸ§‘β€πŸŒΎ' }) + -- emoji only (various kinds of combinations, use g8 to see them) + check("πŸ³οΈβ€βš§οΈπŸ§‘β€πŸŒΎβ€οΈπŸ˜‚πŸ΄β€β˜ οΈ", {"πŸ³οΈβ€βš§οΈ", "πŸ§‘β€πŸŒΎ", "❀️", "πŸ˜‚", "πŸ΄β€β˜ οΈ"}) + check('πŸ³οΈβ€βš§οΈxyπŸ§‘β€πŸŒΎ\rβ€οΈπŸ˜‚Γ₯πŸ΄β€β˜ οΈΒ€', { 'πŸ³οΈβ€βš§οΈ', 'πŸ§‘β€πŸŒΎ', '❀️', 'πŸ˜‚', 'Γ₯', 'πŸ΄β€β˜ οΈ', 'Β€' }) + + check('πŸ‡¦πŸ…±οΈ πŸ‡¦πŸ‡½ πŸ‡¦πŸ‡¨πŸ‡¦ πŸ‡²πŸ‡½πŸ‡ΉπŸ‡±',{'πŸ‡¦', 'πŸ…±οΈ', 'πŸ‡¦πŸ‡½', 'πŸ‡¦πŸ‡¨', 'πŸ‡¦', 'πŸ‡²πŸ‡½', 'πŸ‡ΉπŸ‡±'}) + check('🏴󠁧󠁒󠁳󠁣󠁴󠁿🏴󠁧󠁒󠁷󠁬󠁳󠁿', {'🏴󠁧󠁒󠁳󠁣󠁴󠁿', '🏴󠁧󠁒󠁷󠁬󠁳󠁿'}) + + lib.p_arshape = true -- default + check('Ψ³Ω„Ψ§Ω…', { 'Ψ³', 'Ω„Ψ§', 'Ω…' }) + lib.p_arshape = false + check('Ψ³Ω„Ψ§Ω…', { 'Ψ³', 'Ω„', 'Ψ§', 'Ω…' }) + + check('LΜ“Μ‰Μ‘Μ’ΜŒΜšoΜŒΜ’Μ—Μ„Μ›Μ€rΜΜˆΜ•ΜˆΜŽΜè̇̅̄̄̐mΜ…Μ–ΜŸΜ„ΜŸΜš', {'LΜ“Μ‰Μ‘Μ’ΜŒΜš', 'oΜŒΜ’Μ—Μ„Μ›Μ€', 'rΜΜˆΜ•ΜˆΜŽΜ', 'è̇̅̄̄̐', 'mΜ…Μ–ΜŸΜ„ΜŸΜš'}) + -- stylua: ignore end + end) end)