diff --git a/runtime/doc/mbyte.txt b/runtime/doc/mbyte.txt index a8c5670352..47fd4f3343 100644 --- a/runtime/doc/mbyte.txt +++ b/runtime/doc/mbyte.txt @@ -646,6 +646,12 @@ widespread as file format. A composing or combining character is used to change the meaning of the character before it. The combining characters are drawn on top of the preceding character. + +Nvim largely follows the definition of extended grapheme clusters in UAX#29 +in the Unicode standard, with some modifications: An ascii char will always +start a new cluster. In addition 'arabicshape' enables the combining of some +arabic letters, when they are shaped to be displayed together in a single cell. + Too big combined characters cannot be displayed, but they can still be inspected using the |g8| and |ga| commands described below. When editing text a composing character is mostly considered part of the diff --git a/runtime/doc/news.txt b/runtime/doc/news.txt index 80511ccb87..b7e1e0c84f 100644 --- a/runtime/doc/news.txt +++ b/runtime/doc/news.txt @@ -200,6 +200,12 @@ These existing features changed their behavior. top lines are calculated using screen line numbers which take virtual lines into account. +β€’ The implementation of grapheme clusters (or combining chars |mbyte-combining|) + was upgraded to closely follow extended grapheme clusters as defined by UAX#29 + in the unicode standard. Noteworthily, this enables proper display of many + more emoji characters than before, including those encoded with multiple + emoji codepoints combined with ZWJ (zero width joiner) codepoints. + ============================================================================== REMOVED FEATURES *news-removed* diff --git a/runtime/doc/options.txt b/runtime/doc/options.txt index f44e0954a5..4945a1b46d 100644 --- a/runtime/doc/options.txt +++ b/runtime/doc/options.txt @@ -2217,9 +2217,12 @@ A jump table for the options with a short description can be found at |Q_op|. global When on all Unicode emoji characters are considered to be full width. This excludes "text emoji" characters, which are normally displayed as - single width. Unfortunately there is no good specification for this - and it has been determined on trial-and-error basis. Use the - |setcellwidths()| function to change the behavior. + single width. However, such "text emoji" are treated as full-width + emoji if they are followed by the U+FE0F variant selector. + + Unfortunately there is no good specification for this and it has been + determined on trial-and-error basis. Use the |setcellwidths()| + function to change the behavior. *'encoding'* *'enc'* 'encoding' 'enc' string (default "utf-8") diff --git a/runtime/lua/vim/_meta/options.lua b/runtime/lua/vim/_meta/options.lua index b4ac478b61..05c9b89d77 100644 --- a/runtime/lua/vim/_meta/options.lua +++ b/runtime/lua/vim/_meta/options.lua @@ -1829,9 +1829,12 @@ vim.go.ead = vim.go.eadirection --- When on all Unicode emoji characters are considered to be full width. --- This excludes "text emoji" characters, which are normally displayed as ---- single width. Unfortunately there is no good specification for this ---- and it has been determined on trial-and-error basis. Use the ---- `setcellwidths()` function to change the behavior. +--- single width. However, such "text emoji" are treated as full-width +--- emoji if they are followed by the U+FE0F variant selector. +--- +--- Unfortunately there is no good specification for this and it has been +--- determined on trial-and-error basis. Use the `setcellwidths()` +--- function to change the behavior. --- --- @type boolean vim.o.emoji = true diff --git a/src/nvim/api/extmark.c b/src/nvim/api/extmark.c index 1673519479..d694b64f66 100644 --- a/src/nvim/api/extmark.c +++ b/src/nvim/api/extmark.c @@ -571,7 +571,7 @@ Integer nvim_buf_set_extmark(Buffer buffer, Integer ns_id, Integer line, Integer String c = opts->conceal; if (c.size > 0) { int ch; - hl.conceal_char = utfc_ptr2schar_len(c.data, (int)c.size, &ch); + hl.conceal_char = utfc_ptr2schar(c.data, &ch); if (!hl.conceal_char || !vim_isprintc(ch)) { api_set_error(err, kErrorTypeValidation, "conceal char has to be printable"); goto error; diff --git a/src/nvim/api/ui.c b/src/nvim/api/ui.c index 82a5ff5f8e..a99d97acb8 100644 --- a/src/nvim/api/ui.c +++ b/src/nvim/api/ui.c @@ -847,7 +847,7 @@ void remote_ui_raw_line(RemoteUI *ui, Integer grid, Integer row, Integer startco char sc_buf[MAX_SCHAR_SIZE]; schar_get(sc_buf, chunk[i]); remote_ui_put(ui, sc_buf); - if (utf_ambiguous_width(utf_ptr2char(sc_buf))) { + if (utf_ambiguous_width(sc_buf)) { ui->client_col = -1; // force cursor update } } diff --git a/src/nvim/change.c b/src/nvim/change.c index 6e9fab5a9b..47a9f0ce92 100644 --- a/src/nvim/change.c +++ b/src/nvim/change.c @@ -896,14 +896,15 @@ int del_bytes(colnr_T count, bool fixpos_arg, bool use_delcombine) // delete the last combining character. if (p_deco && use_delcombine && utfc_ptr2len(oldp + col) >= count) { char *p0 = oldp + col; - if (utf_composinglike(p0, p0 + utf_ptr2len(p0))) { + GraphemeState state = GRAPHEME_STATE_INIT; + if (utf_composinglike(p0, p0 + utf_ptr2len(p0), &state)) { // Find the last composing char, there can be several. int n = col; do { col = n; count = utf_ptr2len(oldp + n); n += count; - } while (utf_composinglike(oldp + col, oldp + n)); + } while (utf_composinglike(oldp + col, oldp + n, &state)); fixpos = false; } } @@ -1694,7 +1695,7 @@ bool open_line(int dir, int flags, int second_line_indent, bool *did_do_comment) } if (curbuf->b_p_ai || (flags & OPENLINE_DELSPACES)) { while ((*p_extra == ' ' || *p_extra == '\t') - && !utf_iscomposing(utf_ptr2char(p_extra + 1))) { + && !utf_iscomposing_first(utf_ptr2char(p_extra + 1))) { if (REPLACE_NORMAL(State)) { replace_push(*p_extra); } diff --git a/src/nvim/digraph.c b/src/nvim/digraph.c index 8149c5964f..7413d33fe4 100644 --- a/src/nvim/digraph.c +++ b/src/nvim/digraph.c @@ -1865,7 +1865,7 @@ static void printdigraph(const digr_T *dp, result_T *previous) p = buf; // add a space to draw a composing char on - if (utf_iscomposing(dp->result)) { + if (utf_iscomposing_first(dp->result)) { *p++ = ' '; } p += utf_char2bytes(dp->result, p); diff --git a/src/nvim/drawline.c b/src/nvim/drawline.c index 8a948716e5..b5273a54ca 100644 --- a/src/nvim/drawline.c +++ b/src/nvim/drawline.c @@ -1826,7 +1826,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s // If a double-width char doesn't fit display a '>' in the last column. // Don't advance the pointer but put the character at the start of the next line. - if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) { + if (wlv.col >= grid->cols - 1 && schar_cells(mb_schar) == 2) { mb_c = '>'; mb_l = 1; (void)mb_l; @@ -1922,7 +1922,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s // If a double-width char doesn't fit display a '>' in the // last column; the character is displayed at the start of the // next line. - if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) { + if (wlv.col >= grid->cols - 1 && schar_cells(mb_schar) == 2) { mb_schar = schar_from_ascii('>'); mb_c = '>'; mb_l = 1; @@ -2393,6 +2393,12 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s || (decor_conceal && decor_state.conceal_char) || wp->w_p_cole == 1) && wp->w_p_cole != 3) { + if (schar_cells(mb_schar) > 1) { + // When the first char to be concealed is double-width, + // need to advance one more virtual column. + wlv.n_extra++; + } + // First time at this concealed item: display one // character. if (has_match_conc && match_conc) { @@ -2410,12 +2416,6 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s mb_schar = schar_from_ascii(' '); } - if (utf_char2cells(mb_c) > 1) { - // When the first char to be concealed is double-width, - // need to advance one more virtual column. - wlv.n_extra++; - } - mb_c = schar_get_first_codepoint(mb_schar); prev_syntax_id = syntax_seqnr; @@ -2484,7 +2484,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s && mb_schar != NUL) { mb_schar = wp->w_p_lcs_chars.prec; lcs_prec_todo = NUL; - if (utf_char2cells(mb_c) > 1) { + if (schar_cells(mb_schar) > 1) { // Double-width character being overwritten by the "precedes" // character, need to fill up half the character. wlv.sc_extra = schar_from_ascii(MB_FILLER_CHAR); @@ -2725,7 +2725,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s linebuf_vcol[wlv.off] = wlv.vcol; - if (utf_char2cells(mb_c) > 1) { + if (schar_cells(mb_schar) > 1) { // Need to fill two screen columns. wlv.off++; wlv.col++; @@ -2744,7 +2744,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s wlv.off++; wlv.col++; } else if (wp->w_p_cole > 0 && is_concealing) { - bool concealed_wide = utf_char2cells(mb_c) > 1; + bool concealed_wide = schar_cells(mb_schar) > 1; wlv.skip_cells--; wlv.vcol_off_co++; diff --git a/src/nvim/edit.c b/src/nvim/edit.c index 00ce38c4b1..f8723f9680 100644 --- a/src/nvim/edit.c +++ b/src/nvim/edit.c @@ -2832,6 +2832,8 @@ int replace_push_mb(char *p) { int l = utfc_ptr2len(p); + // TODO(bfredl): stop doing this insantity and instead use utf_head_off() when popping. + // or just keep a secondary array with char byte lenghts for (int j = l - 1; j >= 0; j--) { replace_push(p[j]); } @@ -2911,7 +2913,9 @@ static void mb_replace_pop_ins(int cc) for (int i = 1; i < n; i++) { buf[i] = (uint8_t)replace_pop(); } - if (utf_iscomposing(utf_ptr2char((char *)buf))) { + // TODO(bfredl): by fixing replace_push_mb, upgrade to use + // the new composing algorithm + if (utf_iscomposing_legacy(utf_ptr2char((char *)buf))) { ins_bytes_len((char *)buf, (size_t)n); } else { // Not a composing char, put it back. @@ -3843,7 +3847,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p) space_sci = sci; space_vcol = vcol; } - vcol += charsize_nowrap(curbuf, use_ts, vcol, sci.chr.value); + vcol += charsize_nowrap(curbuf, sci.ptr, use_ts, vcol, sci.chr.value); sci = utfc_next(sci); prev_space = cur_space; } @@ -3859,7 +3863,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p) // Find the position to stop backspacing. // Use charsize_nowrap() so that virtual text and wrapping are ignored. while (true) { - int size = charsize_nowrap(curbuf, use_ts, space_vcol, space_sci.chr.value); + int size = charsize_nowrap(curbuf, space_sci.ptr, use_ts, space_vcol, space_sci.chr.value); if (space_vcol + size > want_vcol) { break; } @@ -3930,7 +3934,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p) bool has_composing = false; if (p_deco) { char *p0 = get_cursor_pos_ptr(); - has_composing = utf_composinglike(p0, p0 + utf_ptr2len(p0)); + has_composing = utf_composinglike(p0, p0 + utf_ptr2len(p0), NULL); } del_char(false); // If there are combining characters and 'delcombine' is set diff --git a/src/nvim/ex_cmds.c b/src/nvim/ex_cmds.c index 6ac73527ee..1b6861f750 100644 --- a/src/nvim/ex_cmds.c +++ b/src/nvim/ex_cmds.c @@ -204,7 +204,7 @@ void do_ascii(exarg_T *eap) IObuff[iobuff_len++] = ' '; } IObuff[iobuff_len++] = '<'; - if (utf_iscomposing(c)) { + if (utf_iscomposing_first(c)) { IObuff[iobuff_len++] = ' '; // Draw composing char on top of a space. } iobuff_len += (size_t)utf_char2bytes(c, IObuff + iobuff_len); diff --git a/src/nvim/ex_getln.c b/src/nvim/ex_getln.c index 8a34e03d91..722a857f03 100644 --- a/src/nvim/ex_getln.c +++ b/src/nvim/ex_getln.c @@ -2118,7 +2118,7 @@ static int command_line_handle_key(CommandLineState *s) s->do_abbr = false; // don't do abbreviation now ccline.special_char = NUL; // may need to remove ^ when composing char was typed - if (utf_iscomposing(s->c) && !cmd_silent) { + if (utf_iscomposing_first(s->c) && !cmd_silent) { if (ui_has(kUICmdline)) { // TODO(bfredl): why not make unputcmdline also work with true? unputcmdline(); @@ -3585,7 +3585,9 @@ void put_on_cmdline(const char *str, int len, bool redraw) // backup to the character before it. There could be two of them. int i = 0; int c = utf_ptr2char(ccline.cmdbuff + ccline.cmdpos); - while (ccline.cmdpos > 0 && utf_iscomposing(c)) { + // TODO(bfredl): this can be corrected/simplified as utf_head_off implements the + // correct grapheme cluster breaks + while (ccline.cmdpos > 0 && utf_iscomposing_legacy(c)) { i = utf_head_off(ccline.cmdbuff, ccline.cmdbuff + ccline.cmdpos - 1) + 1; ccline.cmdpos -= i; len += i; diff --git a/src/nvim/grid.c b/src/nvim/grid.c index 56246bf001..acb336c725 100644 --- a/src/nvim/grid.c +++ b/src/nvim/grid.c @@ -186,6 +186,24 @@ size_t schar_len(schar_T sc) } } +int schar_cells(schar_T sc) +{ + // hot path +#ifdef ORDER_BIG_ENDIAN + if (!(sc & 0x80FFFFFF)) { + return 1; + } +#else + if (sc < 0x80) { + return 1; + } +#endif + + char sc_buf[MAX_SCHAR_SIZE]; + schar_get(sc_buf, sc); + return utf_ptr2cells(sc_buf); +} + /// gets first raw UTF-8 byte of an schar static char schar_get_first_byte(schar_T sc) { @@ -428,14 +446,19 @@ int grid_line_puts(int col, const char *text, int textlen, int attr) const int max_col = grid_line_maxcol; while (col < max_col && (len < 0 || (int)(ptr - text) < len) && *ptr != NUL) { // check if this is the first byte of a multibyte - int mbyte_blen = len > 0 - ? utfc_ptr2len_len(ptr, (int)((text + len) - ptr)) - : utfc_ptr2len(ptr); + int mbyte_blen; + if (len >= 0) { + int maxlen = (int)((text + len) - ptr); + mbyte_blen = utfc_ptr2len_len(ptr, maxlen); + if (mbyte_blen > maxlen) { + mbyte_blen = 1; + } + } else { + mbyte_blen = utfc_ptr2len(ptr); + } int firstc; - schar_T schar = len >= 0 - ? utfc_ptr2schar_len(ptr, (int)((text + len) - ptr), &firstc) - : utfc_ptr2schar(ptr, &firstc); - int mbyte_cells = utf_char2cells(firstc); + schar_T schar = utfc_ptrlen2schar(ptr, mbyte_blen, &firstc); + int mbyte_cells = utf_ptr2cells_len(ptr, mbyte_blen); if (mbyte_cells > 2 || schar == 0) { mbyte_cells = 1; schar = schar_from_char(0xFFFD); diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c index 0c1b537f3a..666a904fc5 100644 --- a/src/nvim/mbyte.c +++ b/src/nvim/mbyte.c @@ -511,20 +511,30 @@ int utf_char2cells(int c) /// Return the number of display cells character at "*p" occupies. /// This doesn't take care of unprintable characters, use ptr2cells() for that. -int utf_ptr2cells(const char *p) +int utf_ptr2cells(const char *p_in) { + const uint8_t *p = (const uint8_t *)p_in; // Need to convert to a character number. - if ((uint8_t)(*p) >= 0x80) { - int c = utf_ptr2char(p); + if ((*p) >= 0x80) { + int len = utf8len_tab[*p]; + int32_t c = utf_ptr2CharInfo_impl(p, (uintptr_t)len); // An illegal byte is displayed as . - if (utf_ptr2len(p) == 1 || c == NUL) { + if (c <= 0) { return 4; } // If the char is ASCII it must be an overlong sequence. if (c < 0x80) { return char2cells(c); } - return utf_char2cells(c); + int cells = utf_char2cells(c); + if (cells == 1 && p_emoji + && intable(emoji_all, ARRAY_SIZE(emoji_all), c)) { + int c2 = utf_ptr2char(p_in + len); + if (c2 == 0xFE0F) { + return 2; // emoji presentation + } + } + return cells; } return 1; } @@ -603,7 +613,8 @@ int utf_ptr2cells_len(const char *p, int size) { // Need to convert to a wide character. if (size > 0 && (uint8_t)(*p) >= 0x80) { - if (utf_ptr2len_len(p, size) < utf8len_tab[(uint8_t)(*p)]) { + int len = utf_ptr2len_len(p, size); + if (len < utf8len_tab[(uint8_t)(*p)]) { return 1; // truncated } int c = utf_ptr2char(p); @@ -615,7 +626,16 @@ int utf_ptr2cells_len(const char *p, int size) if (c < 0x80) { return char2cells(c); } - return utf_char2cells(c); + int cells = utf_char2cells(c); + if (cells == 1 && p_emoji && size > len + && intable(emoji_all, ARRAY_SIZE(emoji_all), c) + && utf_ptr2len_len(p + len, size - len) == utf8len_tab[(uint8_t)p[len]]) { + int c2 = utf_ptr2char(p + len); + if (c2 == 0xFE0F) { + return 2; // emoji presentation + } + } + return cells; } return 1; } @@ -648,8 +668,8 @@ size_t mb_string2cells_len(const char *str, size_t size) size_t clen = 0; for (const char *p = str; *p != NUL && p < str + size; - p += utfc_ptr2len_len(p, (int)size + (int)(p - str))) { - clen += (size_t)utf_ptr2cells(p); + p += utfc_ptr2len_len(p, (int)size - (int)(p - str))) { + clen += (size_t)utf_ptr2cells_len(p, (int)size - (int)(p - str)); } return clen; @@ -793,29 +813,48 @@ int mb_cptr2char_adv(const char **pp) return c; } -/// Check if the character pointed to by "p2" is a composing character when it -/// comes after "p1". For Arabic sometimes "ab" is replaced with "c", which -/// behaves like a composing character. -bool utf_composinglike(const char *p1, const char *p2) +/// When "c" is the first char of a string, determine if it needs to be prefixed +/// by a space byte to be drawn correctly, and not merge with the space left of +/// the string. +bool utf_iscomposing_first(int c) { - int c2 = utf_ptr2char(p2); - if (utf_iscomposing(c2)) { - return true; - } - if (!arabic_maycombine(c2)) { - return false; - } - return arabic_combine(utf_ptr2char(p1), c2); + return c >= 128 && !utf8proc_grapheme_break(' ', c); } -/// Check if the next character is a composing character when it -/// comes after the first. For Arabic sometimes "ab" is replaced with "c", which -/// behaves like a composing character. -/// returns false for negative values -bool utf_char_composinglike(int32_t const first, int32_t const next) - FUNC_ATTR_PURE +/// Check if the character pointed to by "p2" is a composing character when it +/// comes after "p1". +/// +/// We use the definition in UAX#29 as implemented by utf8proc with the following +/// exceptions: +/// +/// - ASCII chars always begin a new cluster. This is a long assumed invariant +/// in the code base and very useful for performance (we can exit early for ASCII +/// all over the place, branch predictor go brrr in ASCII-only text). +/// As of Unicode 15.1 this will only break BOUNDCLASS_UREPEND followed by ASCII, +/// which should be exceedingly rare (these PREPEND chars are expected to be +/// followed by multibyte chars within the same script family) +/// +/// - When 'arabicshape' is active, some pairs of arabic letters "ab" is replaced with +/// "c" taking one single cell, which behaves like a cluster. +/// +/// @param "state" should be set to GRAPHEME_STATE_INIT before first call +/// it is allowed to be null, but will then not handle some longer +/// sequences, like ZWJ based emoji +bool utf_composinglike(const char *p1, const char *p2, GraphemeState *state) + FUNC_ATTR_NONNULL_ARG(1, 2) { - return utf_iscomposing(next) || arabic_combine(first, next); + if ((uint8_t)(*p2) < 128) { + return false; + } + + int first = utf_ptr2char(p1); + int second = utf_ptr2char(p2); + + if (!utf8proc_grapheme_break_stateful(first, second, state)) { + return true; + } + + return arabic_combine(first, second); } /// Get the screen char at the beginning of a string @@ -834,7 +873,7 @@ schar_T utfc_ptr2schar(const char *p, int *firstc) { int c = utf_ptr2char(p); *firstc = c; // NOT optional, you are gonna need it - bool first_compose = utf_iscomposing(c); + bool first_compose = utf_iscomposing_first(c); size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose; size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen); @@ -845,16 +884,13 @@ schar_T utfc_ptr2schar(const char *p, int *firstc) return schar_from_buf_first(p, len, first_compose); } -/// Get the screen char at the beginning of a string with length +/// Get the screen char from a char with a known length /// /// Like utfc_ptr2schar but use no more than p[maxlen]. -schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc) +schar_T utfc_ptrlen2schar(const char *p, int len, int *firstc) FUNC_ATTR_NONNULL_ALL { - assert(maxlen > 0); - - size_t len = (size_t)utf_ptr2len_len(p, maxlen); - if (len > (size_t)maxlen || (len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) { + if ((len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) { // invalid or truncated sequence *firstc = (uint8_t)(*p); return 0; @@ -862,11 +898,13 @@ schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc) int c = utf_ptr2char(p); *firstc = c; - bool first_compose = utf_iscomposing(c); - maxlen = MIN(maxlen, MAX_SCHAR_SIZE - 1 - first_compose); - len = (size_t)utfc_ptr2len_len(p, maxlen); + bool first_compose = utf_iscomposing_first(c); + int maxlen = MAX_SCHAR_SIZE - 1 - first_compose; + if (len > maxlen) { + len = utfc_ptr2len_len(p, maxlen); + } - return schar_from_buf_first(p, len, first_compose); + return schar_from_buf_first(p, (size_t)len, first_compose); } /// Caller must ensure there is space for `first_compose` @@ -964,8 +1002,9 @@ int utfc_ptr2len(const char *const p) // Check for composing characters. int prevlen = 0; + GraphemeState state = GRAPHEME_STATE_INIT; while (true) { - if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) { + if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len, &state)) { return len; } @@ -996,9 +1035,10 @@ int utfc_ptr2len_len(const char *p, int size) return 1; } - // Check for composing characters. We can handle only the first six, but + // Check for composing characters. We can only display a limited amount, but // skip all of them (otherwise the cursor would get stuck). int prevlen = 0; + GraphemeState state = GRAPHEME_STATE_INIT; while (len < size) { if ((uint8_t)p[len] < 0x80) { break; @@ -1011,7 +1051,7 @@ int utfc_ptr2len_len(const char *p, int size) break; } - if (!utf_composinglike(p + prevlen, p + len)) { + if (!utf_composinglike(p + prevlen, p + len, &state)) { break; } @@ -1084,11 +1124,18 @@ int utf_char2bytes(const int c, char *const buf) } } -/// Return true if "c" is a composing UTF-8 character. -/// This means it will be drawn on top of the preceding character. +/// Return true if "c" is a legacy composing UTF-8 character. +/// +/// This is deprecated in favour of utf_composinglike() which uses the modern +/// stateful algorithm to determine grapheme clusters. Still available +/// to support some legacy code which hasn't been refactored yet. +/// +/// To check if a char would combine with a preceeding space, use +/// utf_iscomposing_first() instead. +/// /// Based on code from Markus Kuhn. /// Returns false for negative values. -bool utf_iscomposing(int c) +bool utf_iscomposing_legacy(int c) { return intable(combining, ARRAY_SIZE(combining), c); } @@ -1278,8 +1325,9 @@ int utf_class_tab(const int c, const uint64_t *const chartab) return 2; } -bool utf_ambiguous_width(int c) +bool utf_ambiguous_width(const char *p) { + int c = utf_ptr2char(p); return c >= 0x80 && (intable(ambiguous, ARRAY_SIZE(ambiguous), c) || intable(emoji_all, ARRAY_SIZE(emoji_all), c)); } @@ -1666,6 +1714,26 @@ void show_utf8(void) msg(IObuff, 0); } +/// @return true if boundclass bc always starts a new cluster regardless of what's before +/// false negatives are allowed (perf cost, not correctness) +static bool always_break(int bc) +{ + return (bc == UTF8PROC_BOUNDCLASS_CONTROL); +} + +/// @return true if bc2 always starts a cluster after bc1 +/// false negatives are allowed (perf cost, not correctness) +static bool always_break_two(int bc1, int bc2) +{ + // don't check for UTF8PROC_BOUNDCLASS_CONTROL for bc2 as it either has been checked by + // "always_break" on first iteration or when it was bc1 in the previous iteration + return ((bc1 != UTF8PROC_BOUNDCLASS_PREPEND && bc2 == UTF8PROC_BOUNDCLASS_OTHER) + || (bc1 >= UTF8PROC_BOUNDCLASS_CR && bc1 <= UTF8PROC_BOUNDCLASS_CONTROL) + || (bc2 == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC + && (bc1 == UTF8PROC_BOUNDCLASS_OTHER + || bc1 == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC))); +} + /// Return offset from "p" to the start of a character, including composing characters. /// "base" must be the start of the string, which must be NUL terminated. /// If "p" points to the NUL at the end of the string return 0. @@ -1679,50 +1747,108 @@ int utf_head_off(const char *base_in, const char *p_in) const uint8_t *base = (uint8_t *)base_in; const uint8_t *p = (uint8_t *)p_in; - // Skip backwards over trailing bytes: 10xx.xxxx - // Skip backwards again if on a composing char. - const uint8_t *q; - for (q = p;; q--) { - // Move s to the last byte of this char. - const uint8_t *s; - for (s = q; (s[1] & 0xc0) == 0x80; s++) {} + const uint8_t *start = p; - // Move q to the first byte of this char. - while (q > base && (*q & 0xc0) == 0x80) { - q--; - } - // Check for illegal sequence. Do allow an illegal byte after where we - // started. - int len = utf8len_tab[*q]; - if (len != (int)(s - q + 1) && len != (int)(p - q + 1)) { - return 0; + // move start to the first byte of this codepoint + // might stop on a continuation byte if overlong, handled by utf_ptr2CharInfo_impl + while (start > base && (*start & 0xc0) == 0x80 && (p - start) < 6) { + start--; + } + + uint8_t cur_len = utf8len_tab[*start]; + int32_t cur_code = utf_ptr2CharInfo_impl(start, (uintptr_t)cur_len); + if (cur_code < 0) { + return 0; // p must be part of an illegal sequence + } + const uint8_t * const safe_end = start + cur_len; + + int cur_bc = utf8proc_get_property(cur_code)->boundclass; + if (always_break(cur_bc)) { + return (int)(p - start); + } + + // backtrack to find the start of a cluster. we might go too far, checked in the next loop + const uint8_t *cur_pos = start; + const uint8_t *const p_start = start; + + if (start == base) { + return (int)(p - start); + } + + start--; + while (*start >= 0x80) { // stop on ascii, we are done + while (start > base && (*start & 0xc0) == 0x80 && (cur_pos - start) < 6) { + start--; } - if (q <= base) { + int32_t prev_code = utf_ptr2CharInfo_impl(start, (uintptr_t)utf8len_tab[*start]); + if (prev_code < 0) { + start = cur_pos; // start at valid sequence after invalid bytes break; } - int c = utf_ptr2char((char *)q); - if (utf_iscomposing(c)) { - continue; + int prev_bc = utf8proc_get_property(prev_code)->boundclass; + if (always_break_two(prev_bc, cur_bc) && !arabic_combine(prev_code, cur_code)) { + start = cur_pos; // prev_code cannot be a part of this cluster + break; + } else if (start == base) { + break; } + cur_pos = start; + cur_bc = prev_bc; + cur_code = prev_code; - if (arabic_maycombine(c)) { - // Advance to get a sneak-peak at the next char - const uint8_t *j = q; - j--; - // Move j to the first byte of this char. - while (j > base && (*j & 0xc0) == 0x80) { - j--; - } - if (arabic_combine(utf_ptr2char((char *)j), c)) { - continue; - } - } - break; + start--; } - return (int)(p - q); + // hot path: we are already on the first codepoint of a sequence + if (start == p_start) { + return (int)(p - start); + } + + const uint8_t *q = start; + while (q < p) { + // don't need to find end of cluster. once we reached the codepoint of p, we are done + int len = utfc_ptr2len_len((const char *)q, (int)(safe_end - q)); + + if (q + len > p) { + return (int)(p - q); + } + + q += len; + } + + return 0; +} + +/// Assumes caller already handles ascii. see `utfc_next` +StrCharInfo utfc_next_impl(StrCharInfo cur) +{ + int32_t prev_code = cur.chr.value; + uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len); + GraphemeState state = GRAPHEME_STATE_INIT; + assert(*next >= 0x80); + + while (true) { + uint8_t const next_len = utf8len_tab[*next]; + int32_t const next_code = utf_ptr2CharInfo_impl(next, (uintptr_t)next_len); + if (utf8proc_grapheme_break_stateful(prev_code, next_code, &state) + && !arabic_combine(prev_code, next_code)) { + return (StrCharInfo){ + .ptr = (char *)next, + .chr = (CharInfo){ .value = next_code, .len = (next_code < 0 ? 1 : next_len) }, + }; + } + + prev_code = next_code; + next += next_len; + if (EXPECT(*next < 0x80U, true)) { + return (StrCharInfo){ + .ptr = (char *)next, + .chr = (CharInfo){ .value = *next, .len = 1 }, + }; + } + } } // Whether space is NOT allowed before/after 'c'. @@ -2681,7 +2807,7 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si c = 0x100; break; // not in latin9 } } - if (!utf_iscomposing(c)) { // skip composing chars + if (!utf_iscomposing_legacy(c)) { // skip composing chars if (c < 0x100) { *d++ = (uint8_t)c; } else if (vcp->vc_fail) { diff --git a/src/nvim/mbyte.h b/src/nvim/mbyte.h index 6cbfbcbc3c..2da051fca2 100644 --- a/src/nvim/mbyte.h +++ b/src/nvim/mbyte.h @@ -3,6 +3,7 @@ #include #include #include // IWYU pragma: keep +#include #include // IWYU pragma: keep #include "nvim/cmdexpand_defs.h" // IWYU pragma: keep @@ -11,6 +12,9 @@ #include "nvim/mbyte_defs.h" // IWYU pragma: keep #include "nvim/types_defs.h" // IWYU pragma: keep +typedef utf8proc_int32_t GraphemeState; +#define GRAPHEME_STATE_INIT 0 + #ifdef INCLUDE_GENERATED_DECLARATIONS # include "mbyte.h.generated.h" # include "mbyte.h.inline.generated.h" @@ -92,28 +96,16 @@ static inline CharInfo utf_ptr2CharInfo(char const *const p_in) static inline StrCharInfo utfc_next(StrCharInfo cur) FUNC_ATTR_NONNULL_ALL FUNC_ATTR_ALWAYS_INLINE FUNC_ATTR_PURE { - int32_t prev_code = cur.chr.value; + // handle ASCII case inline uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len); - - while (true) { - if (EXPECT(*next < 0x80U, true)) { - return (StrCharInfo){ - .ptr = (char *)next, - .chr = (CharInfo){ .value = *next, .len = 1 }, - }; - } - uint8_t const next_len = utf8len_tab[*next]; - int32_t const next_code = utf_ptr2CharInfo_impl(next, (uintptr_t)next_len); - if (!utf_char_composinglike(prev_code, next_code)) { - return (StrCharInfo){ - .ptr = (char *)next, - .chr = (CharInfo){ .value = next_code, .len = (next_code < 0 ? 1 : next_len) }, - }; - } - - prev_code = next_code; - next += next_len; + if (EXPECT(*next < 0x80U, true)) { + return (StrCharInfo){ + .ptr = (char *)next, + .chr = (CharInfo){ .value = *next, .len = 1 }, + }; } + + return utfc_next_impl(cur); } static inline StrCharInfo utf_ptr2StrCharInfo(char *ptr) diff --git a/src/nvim/message.c b/src/nvim/message.c index 53e5511a5a..79e6bc8be7 100644 --- a/src/nvim/message.c +++ b/src/nvim/message.c @@ -446,9 +446,7 @@ void trunc_string(const char *s, char *buf, int room_in, int buflen) // Last part: End of the string. half = i = (int)strlen(s); while (true) { - do { - half = half - utf_head_off(s, s + half - 1) - 1; - } while (half > 0 && utf_iscomposing(utf_ptr2char(s + half))); + half = half - utf_head_off(s, s + half - 1) - 1; n = ptr2cells(s + half); if (len + n > room || half == 0) { break; diff --git a/src/nvim/normal.c b/src/nvim/normal.c index f44a64af21..f3bdea9a85 100644 --- a/src/nvim/normal.c +++ b/src/nvim/normal.c @@ -837,7 +837,10 @@ static void normal_get_additional_char(NormalState *s) while ((s->c = vpeekc()) > 0 && (s->c >= 0x100 || MB_BYTE2LEN(vpeekc()) > 1)) { s->c = plain_vgetc(); - if (!utf_iscomposing(s->c)) { + // TODO(bfredl): only allowing up to two composing chars is cringe af. + // Could reuse/abuse schar_T to at least allow us to input anything we are able + // to display and use the stateful utf8proc algorithm like utf_composinglike + if (!utf_iscomposing_legacy(s->c)) { vungetc(s->c); // it wasn't, put it back break; } else if (s->ca.ncharC1 == 0) { diff --git a/src/nvim/options.lua b/src/nvim/options.lua index 3612a80fb8..1c17b0fc9f 100644 --- a/src/nvim/options.lua +++ b/src/nvim/options.lua @@ -2326,9 +2326,12 @@ return { desc = [=[ When on all Unicode emoji characters are considered to be full width. This excludes "text emoji" characters, which are normally displayed as - single width. Unfortunately there is no good specification for this - and it has been determined on trial-and-error basis. Use the - |setcellwidths()| function to change the behavior. + single width. However, such "text emoji" are treated as full-width + emoji if they are followed by the U+FE0F variant selector. + + Unfortunately there is no good specification for this and it has been + determined on trial-and-error basis. Use the |setcellwidths()| + function to change the behavior. ]=], full_name = 'emoji', redraw = { 'all_windows', 'ui_option' }, diff --git a/src/nvim/plines.c b/src/nvim/plines.c index e51e9bf8c3..408fe26bf3 100644 --- a/src/nvim/plines.c +++ b/src/nvim/plines.c @@ -146,7 +146,7 @@ CharSize charsize_regular(CharsizeArg *csarg, char *const cur, colnr_T const vco } else if (cur_char < 0) { size = kInvalidByteCells; } else { - size = char2cells(cur_char); + size = ptr2cells(cur); is_doublewidth = size == 2 && cur_char > 0x80; } @@ -337,8 +337,8 @@ CharSize charsize_regular(CharsizeArg *csarg, char *const cur, colnr_T const vco /// /// @see charsize_regular /// @see charsize_fast -static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, colnr_T const vcol, - int32_t const cur_char) +static inline CharSize charsize_fast_impl(win_T *const wp, const char *cur, bool use_tabstop, + colnr_T const vcol, int32_t const cur_char) FUNC_ATTR_PURE FUNC_ATTR_ALWAYS_INLINE { // A tab gets expanded, depending on the current column @@ -352,7 +352,11 @@ static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, col if (cur_char < 0) { width = kInvalidByteCells; } else { - width = char2cells(cur_char); + // TODO(bfredl): perf: often cur_char is enough at this point to determine width. + // we likely want a specialized version of utf_ptr2StrCharInfo also determining + // the ptr2cells width at the same time without any extra decoding. (also applies + // to charsize_regular and charsize_nowrap) + width = ptr2cells(cur); } // If a double-width char doesn't fit at the end of a line, it wraps to the next line, @@ -371,23 +375,23 @@ static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, col /// Can be used if CSType is kCharsizeFast. /// /// @see charsize_regular -CharSize charsize_fast(CharsizeArg *csarg, colnr_T const vcol, int32_t const cur_char) +CharSize charsize_fast(CharsizeArg *csarg, const char *cur, colnr_T vcol, int32_t cur_char) FUNC_ATTR_PURE { - return charsize_fast_impl(csarg->win, csarg->use_tabstop, vcol, cur_char); + return charsize_fast_impl(csarg->win, cur, csarg->use_tabstop, vcol, cur_char); } /// Get the number of cells taken up on the screen at given virtual column. /// /// @see win_chartabsize() -int charsize_nowrap(buf_T *buf, bool use_tabstop, colnr_T vcol, int32_t cur_char) +int charsize_nowrap(buf_T *buf, const char *cur, bool use_tabstop, colnr_T vcol, int32_t cur_char) { if (cur_char == TAB && use_tabstop) { return tabstop_padding(vcol, buf->b_p_ts, buf->b_p_vts_array); } else if (cur_char < 0) { return kInvalidByteCells; } else { - return char2cells(cur_char); + return ptr2cells(cur); } } @@ -467,7 +471,7 @@ int linesize_fast(CharsizeArg const *const csarg, int vcol_arg, colnr_T const le StrCharInfo ci = utf_ptr2StrCharInfo(line); while (ci.ptr - line < len && *ci.ptr != NUL) { - vcol += charsize_fast_impl(wp, use_tabstop, vcol_arg, ci.chr.value).width; + vcol += charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol_arg, ci.chr.value).width; ci = utfc_next(ci); if (vcol > MAXCOL) { vcol_arg = MAXCOL; @@ -530,7 +534,7 @@ void getvcol(win_T *wp, pos_T *pos, colnr_T *start, colnr_T *cursor, colnr_T *en char_size = (CharSize){ .width = 1 }; break; } - char_size = charsize_fast_impl(wp, use_tabstop, vcol, ci.chr.value); + char_size = charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol, ci.chr.value); StrCharInfo const next = utfc_next(ci); if (next.ptr - line > end_col) { break; @@ -627,7 +631,7 @@ void getvvcol(win_T *wp, pos_T *pos, colnr_T *start, colnr_T *cursor, colnr_T *e if (pos->col < ml_get_buf_len(wp->w_buffer, pos->lnum)) { int c = utf_ptr2char(ptr + pos->col); if ((c != TAB) && vim_isprintc(c)) { - endadd = (colnr_T)(char2cells(c) - 1); + endadd = (colnr_T)(ptr2cells(ptr + pos->col) - 1); if (coladd > endadd) { // past end of line endadd = 0; @@ -824,7 +828,7 @@ int plines_win_col(win_T *wp, linenr_T lnum, long column) if (cstype == kCharsizeFast) { bool const use_tabstop = csarg.use_tabstop; while (*ci.ptr != NUL && --column >= 0) { - vcol += charsize_fast_impl(wp, use_tabstop, vcol, ci.chr.value).width; + vcol += charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol, ci.chr.value).width; ci = utfc_next(ci); } } else { diff --git a/src/nvim/plines.h b/src/nvim/plines.h index 7128e37237..50310b8ce1 100644 --- a/src/nvim/plines.h +++ b/src/nvim/plines.h @@ -54,7 +54,7 @@ static inline CharSize win_charsize(CSType cstype, int vcol, char *ptr, int32_t FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_ALWAYS_INLINE { if (cstype == kCharsizeFast) { - return charsize_fast(csarg, vcol, chr); + return charsize_fast(csarg, ptr, vcol, chr); } else { return charsize_regular(csarg, ptr, vcol, chr); } diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c index 7dbbb19545..c91c112c3c 100644 --- a/src/nvim/regexp.c +++ b/src/nvim/regexp.c @@ -3031,7 +3031,7 @@ static bool use_multibytecode(int c) { return utf_char2len(c) > 1 && (re_multi_type(peekchr()) != NOT_MULTI - || utf_iscomposing(c)); + || utf_iscomposing_legacy(c)); } // Emit (if appropriate) a byte of code @@ -4326,7 +4326,7 @@ static uint8_t *regatom(int *flagp) } // When '.' is followed by a composing char ignore the dot, so that // the composing char is matched here. - if (c == Magic('.') && utf_iscomposing(peekchr())) { + if (c == Magic('.') && utf_iscomposing_legacy(peekchr())) { c = getchr(); goto do_multibyte; } @@ -5001,9 +5001,10 @@ do_multibyte: int l; // Need to get composing character too. + GraphemeState state = GRAPHEME_STATE_INIT; while (true) { l = utf_ptr2len(regparse); - if (!utf_composinglike(regparse, regparse + l)) { + if (!utf_composinglike(regparse, regparse + l, &state)) { break; } regmbc(utf_ptr2char(regparse)); @@ -6569,7 +6570,7 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out) // Check for following composing character, unless %C // follows (skips over all composing chars). if (status != RA_NOMATCH - && utf_composinglike((char *)rex.input, (char *)rex.input + len) + && utf_composinglike((char *)rex.input, (char *)rex.input + len, NULL) && !rex.reg_icombine && OP(next) != RE_COMPOSING) { // raaron: This code makes a composing character get @@ -6624,14 +6625,14 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out) break; } const int opndc = utf_ptr2char((char *)opnd); - if (utf_iscomposing(opndc)) { + if (utf_iscomposing_legacy(opndc)) { // When only a composing char is given match at any // position where that composing char appears. status = RA_NOMATCH; for (i = 0; rex.input[i] != NUL; i += utf_ptr2len((char *)rex.input + i)) { const int inpc = utf_ptr2char((char *)rex.input + i); - if (!utf_iscomposing(inpc)) { + if (!utf_iscomposing_legacy(inpc)) { if (i > 0) { break; } @@ -6654,7 +6655,7 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out) case RE_COMPOSING: // Skip composing characters. - while (utf_iscomposing(utf_ptr2char((char *)rex.input))) { + while (utf_iscomposing_legacy(utf_ptr2char((char *)rex.input))) { rex.input += utf_ptr2len((char *)rex.input); } break; @@ -10070,7 +10071,7 @@ static int nfa_regatom(void) } // When '.' is followed by a composing char ignore the dot, so that // the composing char is matched here. - if (c == Magic('.') && utf_iscomposing(peekchr())) { + if (c == Magic('.') && utf_iscomposing_legacy(peekchr())) { old_regparse = (uint8_t *)regparse; c = getchr(); goto nfa_do_multibyte; @@ -10705,7 +10706,7 @@ collection: nfa_do_multibyte: // plen is length of current char with composing chars if (utf_char2len(c) != (plen = utfc_ptr2len((char *)old_regparse)) - || utf_iscomposing(c)) { + || utf_iscomposing_legacy(c)) { int i = 0; // A base character plus composing characters, or just one @@ -14033,7 +14034,7 @@ static int find_match_text(colnr_T *startcol, int regstart, uint8_t *match_text) } if (match // check that no composing char follows - && !utf_iscomposing(utf_ptr2char((char *)s2))) { + && !utf_iscomposing_legacy(utf_ptr2char((char *)s2))) { cleanup_subexpr(); if (REG_MULTI) { rex.reg_startpos[0].lnum = rex.lnum; @@ -14278,7 +14279,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm // is not really a match. if (!rex.reg_icombine && rex.input != rex.line - && utf_iscomposing(curc)) { + && utf_iscomposing_legacy(curc)) { break; } nfa_match = true; @@ -14622,7 +14623,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm sta = t->state->out; len = 0; - if (utf_iscomposing(sta->c)) { + if (utf_iscomposing_legacy(sta->c)) { // Only match composing character(s), ignore base // character. Used for ".{composing}" and "{composing}" // (no preceding character). @@ -14724,7 +14725,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm int j; sta = t->state->out->out; - if (utf_iscomposing(sta->c)) { + if (utf_iscomposing_legacy(sta->c)) { // Only match composing character(s), ignore base // character. Used for ".{composing}" and "{composing}" // (no preceding character). @@ -14846,7 +14847,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm case NFA_ANY_COMPOSING: // On a composing character skip over it. Otherwise do // nothing. Always matches. - if (utf_iscomposing(curc)) { + if (utf_iscomposing_legacy(curc)) { add_off = clen; } else { add_here = true; diff --git a/src/nvim/search.c b/src/nvim/search.c index 9e00664d86..ff6e135df1 100644 --- a/src/nvim/search.c +++ b/src/nvim/search.c @@ -1260,7 +1260,7 @@ int do_search(oparg_T *oap, int dirc, int search_delim, char *pat, size_t patlen // empty for the search_stat feature. if (!cmd_silent) { msgbuf[0] = (char)dirc; - if (utf_iscomposing(utf_ptr2char(p))) { + if (utf_iscomposing_first(utf_ptr2char(p))) { // Use a space to draw the composing char on. msgbuf[1] = ' '; memmove(msgbuf + 2, p, plen); diff --git a/src/nvim/sign.c b/src/nvim/sign.c index 9b2516ed83..b4ba7833e9 100644 --- a/src/nvim/sign.c +++ b/src/nvim/sign.c @@ -376,7 +376,7 @@ int init_sign_text(sign_T *sp, schar_T *sign_text, char *text) if (!vim_isprintc(c)) { break; } - int width = utf_char2cells(c); + int width = utf_ptr2cells(s); if (width == 2) { sign_text[cells + 1] = 0; } diff --git a/src/nvim/spellsuggest.c b/src/nvim/spellsuggest.c index d6053a533e..b37f01e769 100644 --- a/src/nvim/spellsuggest.c +++ b/src/nvim/spellsuggest.c @@ -1792,10 +1792,8 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun // For changing a composing character adjust // the score from SCORE_SUBST to // SCORE_SUBCOMP. - if (utf_iscomposing(utf_ptr2char(tword + sp->ts_twordlen - - sp->ts_tcharlen)) - && utf_iscomposing(utf_ptr2char(fword - + sp->ts_fcharstart))) { + if (utf_iscomposing_legacy(utf_ptr2char(tword + sp->ts_twordlen - sp->ts_tcharlen)) + && utf_iscomposing_legacy(utf_ptr2char(fword + sp->ts_fcharstart))) { sp->ts_score -= SCORE_SUBST - SCORE_SUBCOMP; } else if (!soundfold && slang->sl_has_map @@ -1811,7 +1809,7 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun && sp->ts_twordlen > sp->ts_tcharlen) { p = tword + sp->ts_twordlen - sp->ts_tcharlen; c = utf_ptr2char(p); - if (utf_iscomposing(c)) { + if (utf_iscomposing_legacy(c)) { // Inserting a composing char doesn't // count that much. sp->ts_score -= SCORE_INS - SCORE_INSCOMP; @@ -1876,7 +1874,7 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun c = utf_ptr2char(fword + sp->ts_fidx); stack[depth].ts_fidx = (uint8_t)(stack[depth].ts_fidx + utfc_ptr2len(fword + sp->ts_fidx)); - if (utf_iscomposing(c)) { + if (utf_iscomposing_legacy(c)) { stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP; } else if (c == utf_ptr2char(fword + stack[depth].ts_fidx)) { stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP; diff --git a/src/nvim/textformat.c b/src/nvim/textformat.c index 96907362dd..30c7d0ee92 100644 --- a/src/nvim/textformat.c +++ b/src/nvim/textformat.c @@ -47,7 +47,7 @@ static bool did_add_space = false; ///< auto_format() added an extra space ///< under the cursor #define WHITECHAR(cc) (ascii_iswhite(cc) \ - && !utf_iscomposing(utf_ptr2char((char *)get_cursor_pos_ptr() + 1))) + && !utf_iscomposing_first(utf_ptr2char((char *)get_cursor_pos_ptr() + 1))) /// Return true if format option 'x' is in effect. /// Take care of no formatting when 'paste' is set. diff --git a/src/nvim/tui/tui.c b/src/nvim/tui/tui.c index 1866a4a592..7e1068ed56 100644 --- a/src/nvim/tui/tui.c +++ b/src/nvim/tui/tui.c @@ -109,6 +109,7 @@ struct TUIData { bool set_cursor_color_as_str; bool cursor_color_changed; bool is_starting; + bool did_set_grapheme_cluster_mode; FILE *screenshot; cursorentry_T cursor_shapes[SHAPE_IDX_COUNT]; HlAttrs clear_attrs; @@ -220,6 +221,7 @@ static void tui_set_term_mode(TUIData *tui, TermMode mode, bool set) void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state) FUNC_ATTR_NONNULL_ALL { + bool is_set = false; switch (state) { case kTermModeNotRecognized: case kTermModePermanentlySet: @@ -228,6 +230,8 @@ void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state) // then there is nothing to do break; case kTermModeSet: + is_set = true; + FALLTHROUGH; case kTermModeReset: // The terminal supports changing the given mode switch (mode) { @@ -240,6 +244,12 @@ void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state) signal_watcher_stop(&tui->winch_handle); tui_set_term_mode(tui, mode, true); break; + case kTermModeGraphemeClusters: + if (!is_set) { + tui_set_term_mode(tui, mode, true); + tui->did_set_grapheme_cluster_mode = true; + } + break; } } } @@ -434,6 +444,7 @@ static void terminfo_start(TUIData *tui) if (!nsterm) { tui_request_term_mode(tui, kTermModeSynchronizedOutput); tui_request_term_mode(tui, kTermModeResizeEvents); + tui_request_term_mode(tui, kTermModeGraphemeClusters); } // Don't use DECRQSS in screen or tmux, as they behave strangely when receiving it. @@ -494,7 +505,9 @@ static void terminfo_stop(TUIData *tui) // Disable resize events tui_set_term_mode(tui, kTermModeResizeEvents, false); - + if (tui->did_set_grapheme_cluster_mode) { + tui_set_term_mode(tui, kTermModeGraphemeClusters, false); + } // May restore old title before exiting alternate screen. tui_set_title(tui, NULL_STRING); if (ui_client_exit_status == 0) { @@ -1010,7 +1023,7 @@ static void print_cell_at_pos(TUIData *tui, int row, int col, UCell *cell, bool char buf[MAX_SCHAR_SIZE]; schar_get(buf, cell->data); int c = utf_ptr2char(buf); - bool is_ambiwidth = utf_ambiguous_width(c); + bool is_ambiwidth = utf_ambiguous_width(buf); if (is_doublewidth && (is_ambiwidth || utf_char2cells(c) == 1)) { // If the server used setcellwidths() to treat a single-width char as double-width, // it needs to be treated like an ambiguous-width char. diff --git a/src/nvim/tui/tui_defs.h b/src/nvim/tui/tui_defs.h index 46913e07a2..bd99d6b0ad 100644 --- a/src/nvim/tui/tui_defs.h +++ b/src/nvim/tui/tui_defs.h @@ -4,6 +4,7 @@ typedef struct TUIData TUIData; typedef enum { kTermModeSynchronizedOutput = 2026, + kTermModeGraphemeClusters = 2027, kTermModeResizeEvents = 2048, } TermMode; diff --git a/test/functional/api/vim_spec.lua b/test/functional/api/vim_spec.lua index 4210b7ecf0..074d3ac0a3 100644 --- a/test/functional/api/vim_spec.lua +++ b/test/functional/api/vim_spec.lua @@ -1435,6 +1435,28 @@ describe('API', function() it('cannot handle NULs', function() eq(0, api.nvim_strwidth('\0abc')) end) + + it('can handle emoji with variant selectors and ZWJ', function() + local selector = '❀️' + eq(2, fn.strchars(selector)) + eq(1, fn.strcharlen(selector)) + eq(2, api.nvim_strwidth(selector)) + + local no_selector = '❀' + eq(1, fn.strchars(no_selector)) + eq(1, fn.strcharlen(no_selector)) + eq(1, api.nvim_strwidth(no_selector)) + + local selector_zwj_selector = 'πŸ³οΈβ€βš§οΈ' + eq(5, fn.strchars(selector_zwj_selector)) + eq(1, fn.strcharlen(selector_zwj_selector)) + eq(2, api.nvim_strwidth(selector_zwj_selector)) + + local emoji_zwj_emoji = 'πŸ§‘β€πŸŒΎ' + eq(3, fn.strchars(emoji_zwj_emoji)) + eq(1, fn.strcharlen(emoji_zwj_emoji)) + eq(2, api.nvim_strwidth(emoji_zwj_emoji)) + end) end) describe('nvim_get_current_line, nvim_set_current_line', function() diff --git a/test/functional/ui/decorations_spec.lua b/test/functional/ui/decorations_spec.lua index 1709819575..61a5e1d6f7 100644 --- a/test/functional/ui/decorations_spec.lua +++ b/test/functional/ui/decorations_spec.lua @@ -5620,6 +5620,27 @@ l5 ]] }) end) + + it('supports emoji as signs', function() + insert(example_test3) + feed 'gg' + api.nvim_buf_set_extmark(0, ns, 1, 0, {sign_text='πŸ§‘β€πŸŒΎ'}) + -- VS16 can change width of character + api.nvim_buf_set_extmark(0, ns, 2, 0, {sign_text='❀️'}) + api.nvim_buf_set_extmark(0, ns, 3, 0, {sign_text='❀'}) + api.nvim_buf_set_extmark(0, ns, 4, 0, {sign_text='❀x'}) + screen:expect([[ + {7: }^l1 | + πŸ§‘β€πŸŒΎl2 | + ❀️l3 | + ❀ l4 | + ❀xl5 | + {7: } | + {1:~ }|*3 + | + ]]) + eq("Invalid 'sign_text'", pcall_err(api.nvim_buf_set_extmark, 0, ns, 5, 0, {sign_text='❀️x'})) + end) end) describe('decorations: virt_text', function() diff --git a/test/functional/ui/messages_spec.lua b/test/functional/ui/messages_spec.lua index 07192800e5..036b5ceefc 100644 --- a/test/functional/ui/messages_spec.lua +++ b/test/functional/ui/messages_spec.lua @@ -1436,6 +1436,41 @@ vimComment xxx match /\s"[^\-:.%#=*].*$/ms=s+1,lc=1 excludenl contains=@vim } end) + it('supports nvim_echo messages with emoji', function() + -- stylua: ignore + async_meths.nvim_echo( + { { 'wow, πŸ³οΈβ€βš§οΈπŸ§‘β€πŸŒΎβ€οΈπŸ˜‚πŸ΄β€β˜ οΈ\nvariant ❀️ one\nvariant ❀ two' } }, true, {} + ) + + screen:expect([[ + | + {1:~ }| + {3: }| + wow, πŸ³οΈβ€βš§οΈπŸ§‘β€πŸŒΎβ€οΈπŸ˜‚πŸ΄β€β˜ οΈ | + variant ❀️ one | + variant ❀ two | + {6:Press ENTER or type command to continue}^ | + ]]) + + feed '' + screen:expect([[ + ^ | + {1:~ }|*5 + | + ]]) + + feed ':messages' + screen:expect([[ + | + {1:~ }| + {3: }| + wow, πŸ³οΈβ€βš§οΈπŸ§‘β€πŸŒΎβ€οΈπŸ˜‚πŸ΄β€β˜ οΈ | + variant ❀️ one | + variant ❀ two | + {6:Press ENTER or type command to continue}^ | + ]]) + end) + it('prints lines in Ex mode correctly with a burst of carriage returns #19341', function() command('set number') api.nvim_buf_set_lines(0, 0, 0, true, { 'aaa', 'bbb', 'ccc' }) diff --git a/test/functional/ui/multibyte_spec.lua b/test/functional/ui/multibyte_spec.lua index dc25a09d0d..f16f750ea1 100644 --- a/test/functional/ui/multibyte_spec.lua +++ b/test/functional/ui/multibyte_spec.lua @@ -296,6 +296,86 @@ describe('multibyte rendering', function() ]], } end) + + it('supports emoji with variant selectors and ZWJ', function() + command('set ruler') + insert('πŸ³οΈβ€βš§οΈ') + screen:expect([[ + ^πŸ³οΈβ€βš§οΈ | + {1:~ }|*4 + 1,1 All | + ]]) + + feed('a word') + screen:expect([[ + πŸ³οΈβ€βš§οΈ wor^d | + {1:~ }|*4 + 1,21-7 All | + ]]) + + feed('0') + screen:expect([[ + ^πŸ³οΈβ€βš§οΈ word | + {1:~ }|*4 + 1,1 All | + ]]) + + feed('l') + screen:expect([[ + πŸ³οΈβ€βš§οΈ^ word | + {1:~ }|*4 + 1,17-3 All | + ]]) + + feed('h') + screen:expect([[ + ^πŸ³οΈβ€βš§οΈ word | + {1:~ }|*4 + 1,1 All | + ]]) + + feed('o❀️ variant selected') + screen:expect([[ + πŸ³οΈβ€βš§οΈ word | + ❀️ variant selecte^d | + {1:~ }|*3 + 2,23-19 All | + ]]) + + feed('0') + screen:expect([[ + πŸ³οΈβ€βš§οΈ word | + ^❀️ variant selected | + {1:~ }|*3 + 2,1 All | + ]]) + + feed('l') + screen:expect([[ + πŸ³οΈβ€βš§οΈ word | + ❀️^ variant selected | + {1:~ }|*3 + 2,7-3 All | + ]]) + + feed('h') + screen:expect([[ + πŸ³οΈβ€βš§οΈ word | + ^❀️ variant selected | + {1:~ }|*3 + 2,1 All | + ]]) + + -- without selector: single width (note column 18 and not 19) + feed('o❀ variant selected') + screen:expect([[ + πŸ³οΈβ€βš§οΈ word | + ❀️ variant selected | + ❀ variant selecte^d | + {1:~ }|*2 + 3,20-18 All | + ]]) + end) end) describe('multibyte rendering: statusline', function() @@ -348,11 +428,12 @@ describe('multibyte rendering: statusline', function() it('non-printable followed by MAX_MCO unicode combination points', function() command('set statusline=ΒŸα·°βƒ―ΜΈβƒβƒ§βƒ') -- U+9F + U+1DF0 + U+20EF + U+0338 + U+20D0 + U+20E7 + U+20DD + -- TODO: not ideal, better with plain ">" and then space+combining screen:expect([[ - ^ | - {1:~ }| - {3:<9f><1df0><20ef><0338><20d0><20e7><20dd>}| - | + ^ | + {1:~ }| + {3:<9f≯⃯ᷰ⃐⃧⃝ }| + | ]]) end) @@ -368,9 +449,20 @@ describe('multibyte rendering: statusline', function() } end) - it('unprintable chars in filename with default stl', function() + it('emoji with ZWJ in filename with default stl', function() command('file πŸ§‘β€πŸ’»') - -- TODO: this is wrong but avoids a crash + screen:expect { + grid = [[ + ^ | + {1:~ }| + {3:πŸ§‘β€πŸ’» }| + | + ]], + } + end) + + it('unprintable chars in filename with default stl', function() + command('file πŸ§‘β€‹πŸ’»') screen:expect { grid = [[ ^ | @@ -381,15 +473,27 @@ describe('multibyte rendering: statusline', function() } end) - it('unprintable chars in filename with custom stl', function() + it('emoji with ZWJ in filename with custom stl', function() command('set statusline=xx%#ErrorMsg#%f%##yy') command('file πŸ§‘β€πŸ’»') - -- TODO: this is also wrong but also avoids a crash screen:expect { grid = [[ ^ | {1:~ }| - {3:xx}{9:πŸ§‘<200d>πŸ’»}{3:yy }| + {3:xx}{9:πŸ§‘β€πŸ’»}{3:yy }| + | + ]], + } + end) + + it('unprintable chars in filename with custom stl', function() + command('set statusline=xx%#ErrorMsg#%f%##yy') + command('file πŸ§‘β€‹πŸ’»') + screen:expect { + grid = [[ + ^ | + {1:~ }| + {3:xx}{9:πŸ§‘<200b>πŸ’»}{3:yy }| | ]], } diff --git a/test/old/testdir/test_functions.vim b/test/old/testdir/test_functions.vim index 7047a62017..ffe7f3fb39 100644 --- a/test/old/testdir/test_functions.vim +++ b/test/old/testdir/test_functions.vim @@ -3663,7 +3663,7 @@ func Test_string_reverse() call assert_equal('', reverse(v:_null_string)) for [s1, s2] in [['', ''], ['a', 'a'], ['ab', 'ba'], ['abc', 'cba'], \ ['abcd', 'dcba'], ['Β«-Β«-Β»-Β»', 'Β»-Β»-Β«-Β«'], - \ ['πŸ‡¦', 'πŸ‡¦'], ['πŸ‡¦πŸ‡§', 'πŸ‡§πŸ‡¦'], ['πŸ‡¦πŸ‡§πŸ‡¨', 'πŸ‡¨πŸ‡§πŸ‡¦'], + \ ['πŸ‡¦', 'πŸ‡¦'], ['πŸ‡¦πŸ‡§', 'πŸ‡¦πŸ‡§'], ['πŸ‡¦πŸ‡§πŸ‡¨', 'πŸ‡¨πŸ‡¦πŸ‡§'], \ ['πŸ‡¦Β«πŸ‡§-πŸ‡¨Β»πŸ‡©', 'πŸ‡©Β»πŸ‡¨-πŸ‡§Β«πŸ‡¦']] call assert_equal(s2, reverse(s1)) endfor diff --git a/test/old/testdir/test_normal.vim b/test/old/testdir/test_normal.vim index 8088b1fc57..3ebc9a69a4 100644 --- a/test/old/testdir/test_normal.vim +++ b/test/old/testdir/test_normal.vim @@ -3897,9 +3897,9 @@ func Test_normal_count_after_operator() bw! endfunc -func Test_normal_gj_on_extra_wide_char() +func Test_normal_gj_on_6_cell_wide_unprintable_char() new | 25vsp - let text='1 foooooooo ar e ins‍zwe1 foooooooo ins‍zwei' . + let text='1 foooooooo ar e ins​zwe1 foooooooo ins​zwei' . \ ' i drei vier fΓΌnf sechs sieben acht un zehn elf zwΓΆfl' . \ ' dreizehn v ierzehn fΓΌnfzehn' put =text diff --git a/test/unit/mbyte_spec.lua b/test/unit/mbyte_spec.lua index 8fcc67d20b..787a8862ae 100644 --- a/test/unit/mbyte_spec.lua +++ b/test/unit/mbyte_spec.lua @@ -3,8 +3,15 @@ local itp = t.gen_itp(it) local ffi = t.ffi local eq = t.eq +local to_cstr = t.to_cstr +local ok = t.ok -local lib = t.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h') +local lib = t.cimport( + './src/nvim/mbyte.h', + './src/nvim/charset.h', + './src/nvim/grid.h', + './src/nvim/option_vars.h' +) describe('mbyte', function() -- Convert from bytes to string @@ -45,12 +52,21 @@ describe('mbyte', function() end) end - describe('utfc_ptr2schar_len', function() + describe('utfc_ptr2schar', function() local function test_seq(seq) local firstc = ffi.new('int[1]') local buf = ffi.new('char[32]') - lib.schar_get(buf, lib.utfc_ptr2schar_len(to_string(seq), #seq, firstc)) - return { ffi.string(buf), firstc[0] } + lib.schar_get(buf, lib.utfc_ptr2schar(to_string(seq), firstc)) + local str = ffi.string(buf) + if 1 > 2 then -- for debugging + local tabel = {} + for i = 1, #str do + table.insert(tabel, string.format('0x%02x', string.byte(str, i))) + end + print('{ ' .. table.concat(tabel, ', ') .. ' }') + io.stdout:flush() + end + return { str, firstc[0] } end local function byte(val) @@ -88,7 +104,9 @@ describe('mbyte', function() eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0x80 }) -- Combining character is U+0300 - eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 }) + eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80 }) + -- invalid start byte for combining + eq({ '\x7f', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 }) -- No UTF-8 sequence eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc }) @@ -102,18 +120,21 @@ describe('mbyte', function() itp('4-byte sequences', function() -- No following combining character eq(byte(0x7f), test_seq { 0x7f, 0x7f, 0xcc, 0x80 }) + eq(byte(0x29), test_seq { 0x29, 0x29, 0xcc, 0x80 }) -- No second UTF-8 character eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80 }) -- Combining character U+0300 - eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc }) + eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc }) -- No UTF-8 sequence eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80 }) -- No following UTF-8 character eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc }) -- Combining character U+0301 - eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 }) + eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81 }) + -- U+0080 : not a valid start char + eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 }) -- One UTF-8 character eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80 }) @@ -126,36 +147,36 @@ describe('mbyte', function() eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80, 0x80 }) -- Combining character U+0300 - eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x00 }) + eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x00 }) -- Combining characters U+0300 and U+0301 - eq({ '\x7f\xcc\x80\xcc\x81', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81 }) + eq({ '\x29\xcc\x80\xcc\x81', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81 }) -- Combining characters U+0300, U+0301, U+0302 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f }, - test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 } + { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 }, + test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 } ) -- Combining characters U+0300, U+0301, U+0302, U+0303 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x7f }, - test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 } + { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x29 }, + test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 } ) -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x7f }, - test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 } + { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x29 }, + test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 } ) -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x7f }, - test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 } + { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x29 }, + test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 } ) -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x7f }, + { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x29 }, test_seq { - 0x7f, + 0x29, 0xcc, 0x80, 0xcc, @@ -175,18 +196,18 @@ describe('mbyte', function() -- Only three following combining characters U+0300, U+0301, U+0302 eq( - { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f }, - test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 } + { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 }, + test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 } ) -- No UTF-8 sequence eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80, 0x80 }) -- No following UTF-8 character - eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc, 0x80 }) + eq({ '\xc2\xbc', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0xcc, 0x80 }) -- Combining character U+0301 - eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0x7f }) + eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0x7f }) -- Combining character U+0301 - eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0xcc }) + eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0xcc }) -- One UTF-8 character eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0x7f }) @@ -205,8 +226,6 @@ describe('mbyte', function() end) describe('utf_cp_bounds_len', function() - local to_cstr = t.to_cstr - local tests = { { name = 'for valid string', @@ -273,4 +292,52 @@ describe('mbyte', function() eq(expected_offsets, { b = b_offsets, e = e_offsets }) end) end) + + itp('utf_head_off', function() + local function check(str, expected_glyphs) + local len = #str + local cstr = to_cstr(str) + local breaks = { 0 } -- SOT + local pos = 0 + local mb_glyphs = {} + while pos < len do + local clen = lib.utfc_ptr2len(cstr + pos) + ok(clen > 0) -- otherwise we get stuck + if clen > 1 then + table.insert(mb_glyphs, string.sub(str, pos + 1, pos + clen)) + end + pos = pos + clen + table.insert(breaks, pos) + end + eq(breaks[#breaks], len) -- include EOT as break + -- we could also send in breaks, but this is more human readable + eq(mb_glyphs, expected_glyphs) + + for i = 1, #breaks - 1 do + local start, next = breaks[i], breaks[i + 1] + + for p = start, next - 1 do + eq(p - start, lib.utf_head_off(cstr, cstr + p)) + end + end + eq(0, lib.utf_head_off(cstr, cstr + len)) -- NUL byte is safe + end + -- stylua doesn't like ZWJ chars.. + -- stylua: ignore start + check('hej och hΓ₯ πŸ§‘β€πŸŒΎ!', { 'Γ₯', 'πŸ§‘β€πŸŒΎ' }) + -- emoji only (various kinds of combinations, use g8 to see them) + check("πŸ³οΈβ€βš§οΈπŸ§‘β€πŸŒΎβ€οΈπŸ˜‚πŸ΄β€β˜ οΈ", {"πŸ³οΈβ€βš§οΈ", "πŸ§‘β€πŸŒΎ", "❀️", "πŸ˜‚", "πŸ΄β€β˜ οΈ"}) + check('πŸ³οΈβ€βš§οΈxyπŸ§‘β€πŸŒΎ\rβ€οΈπŸ˜‚Γ₯πŸ΄β€β˜ οΈΒ€', { 'πŸ³οΈβ€βš§οΈ', 'πŸ§‘β€πŸŒΎ', '❀️', 'πŸ˜‚', 'Γ₯', 'πŸ΄β€β˜ οΈ', 'Β€' }) + + check('πŸ‡¦πŸ…±οΈ πŸ‡¦πŸ‡½ πŸ‡¦πŸ‡¨πŸ‡¦ πŸ‡²πŸ‡½πŸ‡ΉπŸ‡±',{'πŸ‡¦', 'πŸ…±οΈ', 'πŸ‡¦πŸ‡½', 'πŸ‡¦πŸ‡¨', 'πŸ‡¦', 'πŸ‡²πŸ‡½', 'πŸ‡ΉπŸ‡±'}) + check('🏴󠁧󠁒󠁳󠁣󠁴󠁿🏴󠁧󠁒󠁷󠁬󠁳󠁿', {'🏴󠁧󠁒󠁳󠁣󠁴󠁿', '🏴󠁧󠁒󠁷󠁬󠁳󠁿'}) + + lib.p_arshape = true -- default + check('Ψ³Ω„Ψ§Ω…', { 'Ψ³', 'Ω„Ψ§', 'Ω…' }) + lib.p_arshape = false + check('Ψ³Ω„Ψ§Ω…', { 'Ψ³', 'Ω„', 'Ψ§', 'Ω…' }) + + check('LΜ“Μ‰Μ‘Μ’ΜŒΜšoΜŒΜ’Μ—Μ„Μ›Μ€rΜΜˆΜ•ΜˆΜŽΜè̇̅̄̄̐mΜ…Μ–ΜŸΜ„ΜŸΜš', {'LΜ“Μ‰Μ‘Μ’ΜŒΜš', 'oΜŒΜ’Μ—Μ„Μ›Μ€', 'rΜΜˆΜ•ΜˆΜŽΜ', 'è̇̅̄̄̐', 'mΜ…Μ–ΜŸΜ„ΜŸΜš'}) + -- stylua: ignore end + end) end)