feat(mbyte): support extended grapheme clusters including more emoji

Use the grapheme break algorithm from utf8proc to support grapheme clusters from recent unicode versions. Handle variant selector VS16 turning some codepoints into double-width emoji. This means we need to use ptr2cells rather than char2cells when possible.
2024-09-17 20:58:20 -04:00 · 2024-08-08 10:42:08 +02:00 · 2024-08-08 10:42:08 +02:00 · cfdf68a7ac
commit cfdf68a7ac
parent 4353996d0f
34 changed files with 657 additions and 221 deletions
--- a/runtime/doc/mbyte.txt
+++ b/runtime/doc/mbyte.txt
@ -646,6 +646,12 @@ widespread as file format.
 A composing or combining character is used to change the meaning of the
 character before it.  The combining characters are drawn on top of the
 preceding character.
+
+Nvim largely follows the definition of extended grapheme clusters in UAX#29
+in the Unicode standard, with some modifications: An ASCII char will always
+start a new cluster. In addition 'arabicshape' enables the combining of some
+Arabic letters, when they are shaped to be displayed together in a single cell.
+
 Too big combined characters cannot be displayed, but they can still be
 inspected using the |g8| and |ga| commands described below.
 When editing text a composing character is mostly considered part of the
--- a/runtime/doc/news.txt
+++ b/runtime/doc/news.txt
@ -200,6 +200,12 @@ These existing features changed their behavior.
  top lines are calculated using screen line numbers which take virtual lines
  into account.

+• The implementation of grapheme clusters (or combining chars |mbyte-combining|)
+  was upgraded to closely follow extended grapheme clusters as defined by UAX#29
+  in the Unicode standard. Notably, this enables proper display of many
+  more emoji characters than before, including those encoded with multiple
+  emoji codepoints combined with ZWJ (zero width joiner) codepoints.
+
 ==============================================================================
 REMOVED FEATURES                                                 *news-removed*

--- a/runtime/doc/options.txt
+++ b/runtime/doc/options.txt
@ -2217,9 +2217,12 @@ A jump table for the options with a short description can be found at |Q_op|.
 			global
 	When on all Unicode emoji characters are considered to be full width.
 	This excludes "text emoji" characters, which are normally displayed as
-	single width.  Unfortunately there is no good specification for this
-	and it has been determined on trial-and-error basis.  Use the
-	|setcellwidths()| function to change the behavior.
+	single width. However, such "text emoji" are treated as full-width
+	emoji if they are followed by the U+FE0F variant selector.
+
+	Unfortunately there is no good specification for this and it has been
+	determined on trial-and-error basis.  Use the |setcellwidths()|
+	function to change the behavior.

 						*'encoding'* *'enc'*
 'encoding' 'enc'	string	(default "utf-8")
--- a/runtime/lua/vim/_meta/options.lua
+++ b/runtime/lua/vim/_meta/options.lua
@ -1829,9 +1829,12 @@ vim.go.ead = vim.go.eadirection

 --- When on all Unicode emoji characters are considered to be full width.
 --- This excludes "text emoji" characters, which are normally displayed as
--- single width.  Unfortunately there is no good specification for this
--- and it has been determined on trial-and-error basis.  Use the
--- `setcellwidths()` function to change the behavior.
+--- single width. However, such "text emoji" are treated as full-width
+--- emoji if they are followed by the U+FE0F variant selector.
+---
+--- Unfortunately there is no good specification for this and it has been
+--- determined on trial-and-error basis.  Use the `setcellwidths()`
+--- function to change the behavior.
 ---
 --- @type boolean
 vim.o.emoji = true
--- a/src/nvim/api/extmark.c
+++ b/src/nvim/api/extmark.c
@ -571,7 +571,7 @@ Integer nvim_buf_set_extmark(Buffer buffer, Integer ns_id, Integer line, Integer
    String c = opts->conceal;
    if (c.size > 0) {
      int ch;
-      hl.conceal_char = utfc_ptr2schar_len(c.data, (int)c.size, &ch);
+      hl.conceal_char = utfc_ptr2schar(c.data, &ch);
      if (!hl.conceal_char || !vim_isprintc(ch)) {
        api_set_error(err, kErrorTypeValidation, "conceal char has to be printable");
        goto error;
--- a/src/nvim/api/ui.c
+++ b/src/nvim/api/ui.c
@ -847,7 +847,7 @@ void remote_ui_raw_line(RemoteUI *ui, Integer grid, Integer row, Integer startco
      char sc_buf[MAX_SCHAR_SIZE];
      schar_get(sc_buf, chunk[i]);
      remote_ui_put(ui, sc_buf);
-      if (utf_ambiguous_width(utf_ptr2char(sc_buf))) {
+      if (utf_ambiguous_width(sc_buf)) {
        ui->client_col = -1;  // force cursor update
      }
    }
--- a/src/nvim/change.c
+++ b/src/nvim/change.c
@ -896,14 +896,15 @@ int del_bytes(colnr_T count, bool fixpos_arg, bool use_delcombine)
  // delete the last combining character.
  if (p_deco && use_delcombine && utfc_ptr2len(oldp + col) >= count) {
    char *p0 = oldp + col;
-    if (utf_composinglike(p0, p0 + utf_ptr2len(p0))) {
+    GraphemeState state = GRAPHEME_STATE_INIT;
+    if (utf_composinglike(p0, p0 + utf_ptr2len(p0), &state)) {
      // Find the last composing char, there can be several.
      int n = col;
      do {
        col = n;
        count = utf_ptr2len(oldp + n);
        n += count;
-      } while (utf_composinglike(oldp + col, oldp + n));
+      } while (utf_composinglike(oldp + col, oldp + n, &state));
      fixpos = false;
    }
  }
@ -1694,7 +1695,7 @@ bool open_line(int dir, int flags, int second_line_indent, bool *did_do_comment)
    }
    if (curbuf->b_p_ai || (flags & OPENLINE_DELSPACES)) {
      while ((*p_extra == ' ' || *p_extra == '\t')
-             && !utf_iscomposing(utf_ptr2char(p_extra + 1))) {
+             && !utf_iscomposing_first(utf_ptr2char(p_extra + 1))) {
        if (REPLACE_NORMAL(State)) {
          replace_push(*p_extra);
        }
--- a/src/nvim/digraph.c
+++ b/src/nvim/digraph.c
@ -1865,7 +1865,7 @@ static void printdigraph(const digr_T *dp, result_T *previous)
  p = buf;

  // add a space to draw a composing char on
-  if (utf_iscomposing(dp->result)) {
+  if (utf_iscomposing_first(dp->result)) {
    *p++ = ' ';
  }
  p += utf_char2bytes(dp->result, p);
--- a/src/nvim/drawline.c
+++ b/src/nvim/drawline.c
@ -1826,7 +1826,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s

        // If a double-width char doesn't fit display a '>' in the last column.
        // Don't advance the pointer but put the character at the start of the next line.
-        if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) {
+        if (wlv.col >= grid->cols - 1 && schar_cells(mb_schar) == 2) {
          mb_c = '>';
          mb_l = 1;
          (void)mb_l;
@ -1922,7 +1922,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
      // If a double-width char doesn't fit display a '>' in the
      // last column; the character is displayed at the start of the
      // next line.
-      if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) {
+      if (wlv.col >= grid->cols - 1 && schar_cells(mb_schar) == 2) {
        mb_schar = schar_from_ascii('>');
        mb_c = '>';
        mb_l = 1;
@ -2393,6 +2393,12 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
                || (decor_conceal && decor_state.conceal_char)
                || wp->w_p_cole == 1)
            && wp->w_p_cole != 3) {
+          if (schar_cells(mb_schar) > 1) {
+            // When the first char to be concealed is double-width,
+            // need to advance one more virtual column.
+            wlv.n_extra++;
+          }
+
          // First time at this concealed item: display one
          // character.
          if (has_match_conc && match_conc) {
@ -2410,12 +2416,6 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
            mb_schar = schar_from_ascii(' ');
          }

-          if (utf_char2cells(mb_c) > 1) {
-            // When the first char to be concealed is double-width,
-            // need to advance one more virtual column.
-            wlv.n_extra++;
-          }
-
          mb_c = schar_get_first_codepoint(mb_schar);

          prev_syntax_id = syntax_seqnr;
@ -2484,7 +2484,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
        && mb_schar != NUL) {
      mb_schar = wp->w_p_lcs_chars.prec;
      lcs_prec_todo = NUL;
-      if (utf_char2cells(mb_c) > 1) {
+      if (schar_cells(mb_schar) > 1) {
        // Double-width character being overwritten by the "precedes"
        // character, need to fill up half the character.
        wlv.sc_extra = schar_from_ascii(MB_FILLER_CHAR);
@ -2725,7 +2725,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s

      linebuf_vcol[wlv.off] = wlv.vcol;

-      if (utf_char2cells(mb_c) > 1) {
+      if (schar_cells(mb_schar) > 1) {
        // Need to fill two screen columns.
        wlv.off++;
        wlv.col++;
@ -2744,7 +2744,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
      wlv.off++;
      wlv.col++;
    } else if (wp->w_p_cole > 0 && is_concealing) {
-      bool concealed_wide = utf_char2cells(mb_c) > 1;
+      bool concealed_wide = schar_cells(mb_schar) > 1;

      wlv.skip_cells--;
      wlv.vcol_off_co++;
--- a/src/nvim/edit.c
+++ b/src/nvim/edit.c
@ -2832,6 +2832,8 @@ int replace_push_mb(char *p)
 {
  int l = utfc_ptr2len(p);

+  // TODO(bfredl): stop doing this insanity and instead use utf_head_off() when popping.
+  // or just keep a secondary array with char byte lengths
  for (int j = l - 1; j >= 0; j--) {
    replace_push(p[j]);
  }
@ -2911,7 +2913,9 @@ static void mb_replace_pop_ins(int cc)
    for (int i = 1; i < n; i++) {
      buf[i] = (uint8_t)replace_pop();
    }
-    if (utf_iscomposing(utf_ptr2char((char *)buf))) {
+    // TODO(bfredl): by fixing replace_push_mb, upgrade to use
+    // the new composing algorithm
+    if (utf_iscomposing_legacy(utf_ptr2char((char *)buf))) {
      ins_bytes_len((char *)buf, (size_t)n);
    } else {
      // Not a composing char, put it back.
@ -3843,7 +3847,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
          space_sci = sci;
          space_vcol = vcol;
        }
-        vcol += charsize_nowrap(curbuf, use_ts, vcol, sci.chr.value);
+        vcol += charsize_nowrap(curbuf, sci.ptr, use_ts, vcol, sci.chr.value);
        sci = utfc_next(sci);
        prev_space = cur_space;
      }
@ -3859,7 +3863,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
      // Find the position to stop backspacing.
      // Use charsize_nowrap() so that virtual text and wrapping are ignored.
      while (true) {
-        int size = charsize_nowrap(curbuf, use_ts, space_vcol, space_sci.chr.value);
+        int size = charsize_nowrap(curbuf, space_sci.ptr, use_ts, space_vcol, space_sci.chr.value);
        if (space_vcol + size > want_vcol) {
          break;
        }
@ -3930,7 +3934,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
          bool has_composing = false;
          if (p_deco) {
            char *p0 = get_cursor_pos_ptr();
-            has_composing = utf_composinglike(p0, p0 + utf_ptr2len(p0));
+            has_composing = utf_composinglike(p0, p0 + utf_ptr2len(p0), NULL);
          }
          del_char(false);
          // If there are combining characters and 'delcombine' is set
--- a/src/nvim/ex_cmds.c
+++ b/src/nvim/ex_cmds.c
@ -204,7 +204,7 @@ void do_ascii(exarg_T *eap)
      IObuff[iobuff_len++] = ' ';
    }
    IObuff[iobuff_len++] = '<';
-    if (utf_iscomposing(c)) {
+    if (utf_iscomposing_first(c)) {
      IObuff[iobuff_len++] = ' ';  // Draw composing char on top of a space.
    }
    iobuff_len += (size_t)utf_char2bytes(c, IObuff + iobuff_len);
--- a/src/nvim/ex_getln.c
+++ b/src/nvim/ex_getln.c
@ -2118,7 +2118,7 @@ static int command_line_handle_key(CommandLineState *s)
    s->do_abbr = false;                   // don't do abbreviation now
    ccline.special_char = NUL;
    // may need to remove ^ when composing char was typed
-    if (utf_iscomposing(s->c) && !cmd_silent) {
+    if (utf_iscomposing_first(s->c) && !cmd_silent) {
      if (ui_has(kUICmdline)) {
        // TODO(bfredl): why not make unputcmdline also work with true?
        unputcmdline();
@ -3585,7 +3585,9 @@ void put_on_cmdline(const char *str, int len, bool redraw)
    // backup to the character before it.  There could be two of them.
    int i = 0;
    int c = utf_ptr2char(ccline.cmdbuff + ccline.cmdpos);
-    while (ccline.cmdpos > 0 && utf_iscomposing(c)) {
+    // TODO(bfredl): this can be corrected/simplified as utf_head_off implements the
+    // correct grapheme cluster breaks
+    while (ccline.cmdpos > 0 && utf_iscomposing_legacy(c)) {
      i = utf_head_off(ccline.cmdbuff, ccline.cmdbuff + ccline.cmdpos - 1) + 1;
      ccline.cmdpos -= i;
      len += i;
--- a/src/nvim/grid.c
+++ b/src/nvim/grid.c
@ -186,6 +186,24 @@ size_t schar_len(schar_T sc)
  }
 }

+int schar_cells(schar_T sc)
+{
+  // hot path
+#ifdef ORDER_BIG_ENDIAN
+  if (!(sc & 0x80FFFFFF)) {
+    return 1;
+  }
+#else
+  if (sc < 0x80) {
+    return 1;
+  }
+#endif
+
+  char sc_buf[MAX_SCHAR_SIZE];
+  schar_get(sc_buf, sc);
+  return utf_ptr2cells(sc_buf);
+}
+
 /// gets first raw UTF-8 byte of an schar
 static char schar_get_first_byte(schar_T sc)
 {
@ -428,14 +446,19 @@ int grid_line_puts(int col, const char *text, int textlen, int attr)
  const int max_col = grid_line_maxcol;
  while (col < max_col && (len < 0 || (int)(ptr - text) < len) && *ptr != NUL) {
    // check if this is the first byte of a multibyte
-    int mbyte_blen = len > 0
-                     ? utfc_ptr2len_len(ptr, (int)((text + len) - ptr))
-                     : utfc_ptr2len(ptr);
+    int mbyte_blen;
+    if (len >= 0) {
+      int maxlen = (int)((text + len) - ptr);
+      mbyte_blen = utfc_ptr2len_len(ptr, maxlen);
+      if (mbyte_blen > maxlen) {
+        mbyte_blen = 1;
+      }
+    } else {
+      mbyte_blen = utfc_ptr2len(ptr);
+    }
    int firstc;
-    schar_T schar = len >= 0
-                    ? utfc_ptr2schar_len(ptr, (int)((text + len) - ptr), &firstc)
-                    : utfc_ptr2schar(ptr, &firstc);
-    int mbyte_cells = utf_char2cells(firstc);
+    schar_T schar = utfc_ptrlen2schar(ptr, mbyte_blen, &firstc);
+    int mbyte_cells = utf_ptr2cells_len(ptr, mbyte_blen);
    if (mbyte_cells > 2 || schar == 0) {
      mbyte_cells = 1;
      schar = schar_from_char(0xFFFD);
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@ -511,20 +511,30 @@ int utf_char2cells(int c)

 /// Return the number of display cells character at "*p" occupies.
 /// This doesn't take care of unprintable characters, use ptr2cells() for that.
-int utf_ptr2cells(const char *p)
+int utf_ptr2cells(const char *p_in)
 {
+  const uint8_t *p = (const uint8_t *)p_in;
  // Need to convert to a character number.
-  if ((uint8_t)(*p) >= 0x80) {
-    int c = utf_ptr2char(p);
+  if ((*p) >= 0x80) {
+    int len = utf8len_tab[*p];
+    int32_t c = utf_ptr2CharInfo_impl(p, (uintptr_t)len);
    // An illegal byte is displayed as <xx>.
-    if (utf_ptr2len(p) == 1 || c == NUL) {
+    if (c <= 0) {
      return 4;
    }
    // If the char is ASCII it must be an overlong sequence.
    if (c < 0x80) {
      return char2cells(c);
    }
-    return utf_char2cells(c);
+    int cells = utf_char2cells(c);
+    if (cells == 1 && p_emoji
+        && intable(emoji_all, ARRAY_SIZE(emoji_all), c)) {
+      int c2 = utf_ptr2char(p_in + len);
+      if (c2 == 0xFE0F) {
+        return 2;  // emoji presentation
+      }
+    }
+    return cells;
  }
  return 1;
 }
@ -603,7 +613,8 @@ int utf_ptr2cells_len(const char *p, int size)
 {
  // Need to convert to a wide character.
  if (size > 0 && (uint8_t)(*p) >= 0x80) {
-    if (utf_ptr2len_len(p, size) < utf8len_tab[(uint8_t)(*p)]) {
+    int len = utf_ptr2len_len(p, size);
+    if (len < utf8len_tab[(uint8_t)(*p)]) {
      return 1;        // truncated
    }
    int c = utf_ptr2char(p);
@ -615,7 +626,16 @@ int utf_ptr2cells_len(const char *p, int size)
    if (c < 0x80) {
      return char2cells(c);
    }
-    return utf_char2cells(c);
+    int cells = utf_char2cells(c);
+    if (cells == 1 && p_emoji && size > len
+        && intable(emoji_all, ARRAY_SIZE(emoji_all), c)
+        && utf_ptr2len_len(p + len, size - len) == utf8len_tab[(uint8_t)p[len]]) {
+      int c2 = utf_ptr2char(p + len);
+      if (c2 == 0xFE0F) {
+        return 2;  // emoji presentation
+      }
+    }
+    return cells;
  }
  return 1;
 }
@ -648,8 +668,8 @@ size_t mb_string2cells_len(const char *str, size_t size)
  size_t clen = 0;

  for (const char *p = str; *p != NUL && p < str + size;
-       p += utfc_ptr2len_len(p, (int)size + (int)(p - str))) {
-    clen += (size_t)utf_ptr2cells(p);
+       p += utfc_ptr2len_len(p, (int)size - (int)(p - str))) {
+    clen += (size_t)utf_ptr2cells_len(p, (int)size - (int)(p - str));
  }

  return clen;
@ -793,29 +813,48 @@ int mb_cptr2char_adv(const char **pp)
  return c;
 }

-/// Check if the character pointed to by "p2" is a composing character when it
-/// comes after "p1".  For Arabic sometimes "ab" is replaced with "c", which
-/// behaves like a composing character.
-bool utf_composinglike(const char *p1, const char *p2)
+/// When "c" is the first char of a string, determine if it needs to be prefixed
+/// by a space byte to be drawn correctly, and not merge with the space left of
+/// the string.
+bool utf_iscomposing_first(int c)
 {
-  int c2 = utf_ptr2char(p2);
-  if (utf_iscomposing(c2)) {
-    return true;
-  }
-  if (!arabic_maycombine(c2)) {
-    return false;
-  }
-  return arabic_combine(utf_ptr2char(p1), c2);
+  return c >= 128 && !utf8proc_grapheme_break(' ', c);
 }

-/// Check if the next character is a composing character when it
-/// comes after the first. For Arabic sometimes "ab" is replaced with "c", which
-/// behaves like a composing character.
-/// returns false for negative values
-bool utf_char_composinglike(int32_t const first, int32_t const next)
-  FUNC_ATTR_PURE
+/// Check if the character pointed to by "p2" is a composing character when it
+/// comes after "p1".
+///
+/// We use the definition in UAX#29 as implemented by utf8proc with the following
+/// exceptions:
+///
+/// - ASCII chars always begin a new cluster. This is a long assumed invariant
+///   in the code base and very useful for performance (we can exit early for ASCII
+///   all over the place, branch predictor go brrr in ASCII-only text).
+///   As of Unicode 15.1 this will only break BOUNDCLASS_PREPEND followed by ASCII,
+///   which should be exceedingly rare (these PREPEND chars are expected to be
+///   followed by multibyte chars within the same script family)
+///
+/// - When 'arabicshape' is active, some pairs of Arabic letters "ab" are replaced with
+///   "c" taking one single cell, which behaves like a cluster.
+///
+/// @param "state" should be set to GRAPHEME_STATE_INIT before first call
+///        it is allowed to be null, but will then not handle some longer
+///        sequences, like ZWJ based emoji
+bool utf_composinglike(const char *p1, const char *p2, GraphemeState *state)
+  FUNC_ATTR_NONNULL_ARG(1, 2)
 {
-  return utf_iscomposing(next) || arabic_combine(first, next);
+  if ((uint8_t)(*p2) < 128) {
+    return false;
+  }
+
+  int first = utf_ptr2char(p1);
+  int second = utf_ptr2char(p2);
+
+  if (!utf8proc_grapheme_break_stateful(first, second, state)) {
+    return true;
+  }
+
+  return arabic_combine(first, second);
 }

 /// Get the screen char at the beginning of a string
@ -834,7 +873,7 @@ schar_T utfc_ptr2schar(const char *p, int *firstc)
 {
  int c = utf_ptr2char(p);
  *firstc = c;  // NOT optional, you are gonna need it
-  bool first_compose = utf_iscomposing(c);
+  bool first_compose = utf_iscomposing_first(c);
  size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose;
  size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen);

@ -845,16 +884,13 @@ schar_T utfc_ptr2schar(const char *p, int *firstc)
  return schar_from_buf_first(p, len, first_compose);
 }

-/// Get the screen char at the beginning of a string with length
+/// Get the screen char from a char with a known length
 ///
 /// Like utfc_ptr2schar but use no more than p[maxlen].
-schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc)
+schar_T utfc_ptrlen2schar(const char *p, int len, int *firstc)
  FUNC_ATTR_NONNULL_ALL
 {
-  assert(maxlen > 0);
-
-  size_t len = (size_t)utf_ptr2len_len(p, maxlen);
-  if (len > (size_t)maxlen || (len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) {
+  if ((len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) {
    // invalid or truncated sequence
    *firstc = (uint8_t)(*p);
    return 0;
@ -862,11 +898,13 @@ schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc)

  int c = utf_ptr2char(p);
  *firstc = c;
-  bool first_compose = utf_iscomposing(c);
-  maxlen = MIN(maxlen, MAX_SCHAR_SIZE - 1 - first_compose);
-  len = (size_t)utfc_ptr2len_len(p, maxlen);
+  bool first_compose = utf_iscomposing_first(c);
+  int maxlen = MAX_SCHAR_SIZE - 1 - first_compose;
+  if (len > maxlen) {
+    len = utfc_ptr2len_len(p, maxlen);
+  }

-  return schar_from_buf_first(p, len, first_compose);
+  return schar_from_buf_first(p, (size_t)len, first_compose);
 }

 /// Caller must ensure there is space for `first_compose`
@ -964,8 +1002,9 @@ int utfc_ptr2len(const char *const p)

  // Check for composing characters.
  int prevlen = 0;
+  GraphemeState state = GRAPHEME_STATE_INIT;
  while (true) {
-    if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) {
+    if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len, &state)) {
      return len;
    }

@ -996,9 +1035,10 @@ int utfc_ptr2len_len(const char *p, int size)
    return 1;
  }

-  // Check for composing characters.  We can handle only the first six, but
+  // Check for composing characters.  We can only display a limited amount, but
  // skip all of them (otherwise the cursor would get stuck).
  int prevlen = 0;
+  GraphemeState state = GRAPHEME_STATE_INIT;
  while (len < size) {
    if ((uint8_t)p[len] < 0x80) {
      break;
@ -1011,7 +1051,7 @@ int utfc_ptr2len_len(const char *p, int size)
      break;
    }

-    if (!utf_composinglike(p + prevlen, p + len)) {
+    if (!utf_composinglike(p + prevlen, p + len, &state)) {
      break;
    }

@ -1084,11 +1124,18 @@ int utf_char2bytes(const int c, char *const buf)
  }
 }

-/// Return true if "c" is a composing UTF-8 character.
-/// This means it will be drawn on top of the preceding character.
+/// Return true if "c" is a legacy composing UTF-8 character.
+///
+/// This is deprecated in favour of utf_composinglike() which uses the modern
+/// stateful algorithm to determine grapheme clusters. Still available
+/// to support some legacy code which hasn't been refactored yet.
+///
+/// To check if a char would combine with a preceding space, use
+/// utf_iscomposing_first() instead.
+///
 /// Based on code from Markus Kuhn.
 /// Returns false for negative values.
-bool utf_iscomposing(int c)
+bool utf_iscomposing_legacy(int c)
 {
  return intable(combining, ARRAY_SIZE(combining), c);
 }
@ -1278,8 +1325,9 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
  return 2;
 }

-bool utf_ambiguous_width(int c)
+bool utf_ambiguous_width(const char *p)
 {
+  int c = utf_ptr2char(p);
  return c >= 0x80 && (intable(ambiguous, ARRAY_SIZE(ambiguous), c)
                       || intable(emoji_all, ARRAY_SIZE(emoji_all), c));
 }
@ -1666,6 +1714,26 @@ void show_utf8(void)
  msg(IObuff, 0);
 }

+/// @return true if boundclass bc always starts a new cluster regardless of what's before
+/// false negatives are allowed (perf cost, not correctness)
+static bool always_break(int bc)
+{
+  return (bc == UTF8PROC_BOUNDCLASS_CONTROL);
+}
+
+/// @return true if bc2 always starts a cluster after bc1
+/// false negatives are allowed (perf cost, not correctness)
+static bool always_break_two(int bc1, int bc2)
+{
+  // don't check for UTF8PROC_BOUNDCLASS_CONTROL for bc2 as it either has been checked by
+  // "always_break" on first iteration or when it was bc1 in the previous iteration
+  return ((bc1 != UTF8PROC_BOUNDCLASS_PREPEND && bc2 == UTF8PROC_BOUNDCLASS_OTHER)
+          || (bc1 >= UTF8PROC_BOUNDCLASS_CR && bc1 <= UTF8PROC_BOUNDCLASS_CONTROL)
+          || (bc2 == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC
+              && (bc1 == UTF8PROC_BOUNDCLASS_OTHER
+                  || bc1 == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC)));
+}
+
 /// Return offset from "p" to the start of a character, including composing characters.
 /// "base" must be the start of the string, which must be NUL terminated.
 /// If "p" points to the NUL at the end of the string return 0.
@ -1679,50 +1747,108 @@ int utf_head_off(const char *base_in, const char *p_in)
  const uint8_t *base = (uint8_t *)base_in;
  const uint8_t *p = (uint8_t *)p_in;

-  // Skip backwards over trailing bytes: 10xx.xxxx
-  // Skip backwards again if on a composing char.
-  const uint8_t *q;
-  for (q = p;; q--) {
-    // Move s to the last byte of this char.
-    const uint8_t *s;
-    for (s = q; (s[1] & 0xc0) == 0x80; s++) {}
+  const uint8_t *start = p;

-    // Move q to the first byte of this char.
-    while (q > base && (*q & 0xc0) == 0x80) {
-      q--;
-    }
-    // Check for illegal sequence. Do allow an illegal byte after where we
-    // started.
-    int len = utf8len_tab[*q];
-    if (len != (int)(s - q + 1) && len != (int)(p - q + 1)) {
-      return 0;
+  // move start to the first byte of this codepoint
+  // might stop on a continuation byte if overlong, handled by utf_ptr2CharInfo_impl
+  while (start > base && (*start & 0xc0) == 0x80 && (p - start) < 6) {
+    start--;
+  }
+
+  uint8_t cur_len = utf8len_tab[*start];
+  int32_t cur_code = utf_ptr2CharInfo_impl(start, (uintptr_t)cur_len);
+  if (cur_code < 0) {
+    return 0;  // p must be part of an illegal sequence
+  }
+  const uint8_t * const safe_end = start + cur_len;
+
+  int cur_bc = utf8proc_get_property(cur_code)->boundclass;
+  if (always_break(cur_bc)) {
+    return (int)(p - start);
+  }
+
+  // backtrack to find the start of a cluster. we might go too far, checked in the next loop
+  const uint8_t *cur_pos = start;
+  const uint8_t *const p_start = start;
+
+  if (start == base) {
+    return (int)(p - start);
+  }
+
+  start--;
+  while (*start >= 0x80) {  // stop on ascii, we are done
+    while (start > base && (*start & 0xc0) == 0x80 && (cur_pos - start) < 6) {
+      start--;
    }

-    if (q <= base) {
+    int32_t prev_code = utf_ptr2CharInfo_impl(start, (uintptr_t)utf8len_tab[*start]);
+    if (prev_code < 0) {
+      start = cur_pos;  // start at valid sequence after invalid bytes
      break;
    }

-    int c = utf_ptr2char((char *)q);
-    if (utf_iscomposing(c)) {
-      continue;
+    int prev_bc = utf8proc_get_property(prev_code)->boundclass;
+    if (always_break_two(prev_bc, cur_bc) && !arabic_combine(prev_code, cur_code)) {
+      start = cur_pos;  // prev_code cannot be a part of this cluster
+      break;
+    } else if (start == base) {
+      break;
    }
+    cur_pos = start;
+    cur_bc = prev_bc;
+    cur_code = prev_code;

-    if (arabic_maycombine(c)) {
-      // Advance to get a sneak-peak at the next char
-      const uint8_t *j = q;
-      j--;
-      // Move j to the first byte of this char.
-      while (j > base && (*j & 0xc0) == 0x80) {
-        j--;
-      }
-      if (arabic_combine(utf_ptr2char((char *)j), c)) {
-        continue;
-      }
-    }
-    break;
+    start--;
  }

-  return (int)(p - q);
+  // hot path: we are already on the first codepoint of a sequence
+  if (start == p_start) {
+    return (int)(p - start);
+  }
+
+  const uint8_t *q = start;
+  while (q < p) {
+    // don't need to find end of cluster. once we reached the codepoint of p, we are done
+    int len = utfc_ptr2len_len((const char *)q, (int)(safe_end - q));
+
+    if (q + len > p) {
+      return (int)(p - q);
+    }
+
+    q += len;
+  }
+
+  return 0;
+}
+
+/// Assumes caller already handles ascii. see `utfc_next`
+StrCharInfo utfc_next_impl(StrCharInfo cur)
+{
+  int32_t prev_code = cur.chr.value;
+  uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len);
+  GraphemeState state = GRAPHEME_STATE_INIT;
+  assert(*next >= 0x80);
+
+  while (true) {
+    uint8_t const next_len = utf8len_tab[*next];
+    int32_t const next_code = utf_ptr2CharInfo_impl(next, (uintptr_t)next_len);
+    if (utf8proc_grapheme_break_stateful(prev_code, next_code, &state)
+        && !arabic_combine(prev_code, next_code)) {
+      return (StrCharInfo){
+        .ptr = (char *)next,
+        .chr = (CharInfo){ .value = next_code, .len = (next_code < 0 ? 1 : next_len) },
+      };
+    }
+
+    prev_code = next_code;
+    next += next_len;
+    if (EXPECT(*next < 0x80U, true)) {
+      return (StrCharInfo){
+        .ptr = (char *)next,
+        .chr = (CharInfo){ .value = *next, .len = 1 },
+      };
+    }
+  }
 }

 // Whether space is NOT allowed before/after 'c'.
@ -2681,7 +2807,7 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si
            c = 0x100; break;                   // not in latin9
          }
        }
-        if (!utf_iscomposing(c)) {              // skip composing chars
+        if (!utf_iscomposing_legacy(c)) {  // skip composing chars
          if (c < 0x100) {
            *d++ = (uint8_t)c;
          } else if (vcp->vc_fail) {
--- a/src/nvim/mbyte.h
+++ b/src/nvim/mbyte.h
@ -3,6 +3,7 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include <sys/types.h>  // IWYU pragma: keep
+#include <utf8proc.h>
 #include <uv.h>  // IWYU pragma: keep

 #include "nvim/cmdexpand_defs.h"  // IWYU pragma: keep
@ -11,6 +12,9 @@
 #include "nvim/mbyte_defs.h"  // IWYU pragma: keep
 #include "nvim/types_defs.h"  // IWYU pragma: keep

+typedef utf8proc_int32_t GraphemeState;
+#define GRAPHEME_STATE_INIT 0
+
 #ifdef INCLUDE_GENERATED_DECLARATIONS
 # include "mbyte.h.generated.h"
 # include "mbyte.h.inline.generated.h"
@ -92,28 +96,16 @@ static inline CharInfo utf_ptr2CharInfo(char const *const p_in)
 static inline StrCharInfo utfc_next(StrCharInfo cur)
  FUNC_ATTR_NONNULL_ALL FUNC_ATTR_ALWAYS_INLINE FUNC_ATTR_PURE
 {
-  int32_t prev_code = cur.chr.value;
+  // handle ASCII case inline
  uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len);
-
-  while (true) {
-    if (EXPECT(*next < 0x80U, true)) {
-      return (StrCharInfo){
-        .ptr = (char *)next,
-        .chr = (CharInfo){ .value = *next, .len = 1 },
-      };
-    }
-    uint8_t const next_len = utf8len_tab[*next];
-    int32_t const next_code = utf_ptr2CharInfo_impl(next, (uintptr_t)next_len);
-    if (!utf_char_composinglike(prev_code, next_code)) {
-      return (StrCharInfo){
-        .ptr = (char *)next,
-        .chr = (CharInfo){ .value = next_code, .len = (next_code < 0 ? 1 : next_len) },
-      };
-    }
-
-    prev_code = next_code;
-    next += next_len;
+  if (EXPECT(*next < 0x80U, true)) {
+    return (StrCharInfo){
+      .ptr = (char *)next,
+      .chr = (CharInfo){ .value = *next, .len = 1 },
+    };
  }
+
+  return utfc_next_impl(cur);
 }

 static inline StrCharInfo utf_ptr2StrCharInfo(char *ptr)
--- a/src/nvim/message.c
+++ b/src/nvim/message.c
@ -446,9 +446,7 @@ void trunc_string(const char *s, char *buf, int room_in, int buflen)
  // Last part: End of the string.
  half = i = (int)strlen(s);
  while (true) {
-    do {
-      half = half - utf_head_off(s, s + half - 1) - 1;
-    } while (half > 0 && utf_iscomposing(utf_ptr2char(s + half)));
+    half = half - utf_head_off(s, s + half - 1) - 1;
    n = ptr2cells(s + half);
    if (len + n > room || half == 0) {
      break;
--- a/src/nvim/normal.c
+++ b/src/nvim/normal.c
@ -837,7 +837,10 @@ static void normal_get_additional_char(NormalState *s)
      while ((s->c = vpeekc()) > 0
             && (s->c >= 0x100 || MB_BYTE2LEN(vpeekc()) > 1)) {
        s->c = plain_vgetc();
-        if (!utf_iscomposing(s->c)) {
+        // TODO(bfredl): only allowing up to two composing chars is far too restrictive.
+        // We could reuse/abuse schar_T to at least allow inputting anything we are able
+        // to display, and use the stateful utf8proc algorithm like utf_composinglike does.
+        if (!utf_iscomposing_legacy(s->c)) {
          vungetc(s->c);                   // it wasn't, put it back
          break;
        } else if (s->ca.ncharC1 == 0) {
--- a/src/nvim/options.lua
+++ b/src/nvim/options.lua
@ -2326,9 +2326,12 @@ return {
      desc = [=[
        When on all Unicode emoji characters are considered to be full width.
        This excludes "text emoji" characters, which are normally displayed as
-        single width.  Unfortunately there is no good specification for this
-        and it has been determined on trial-and-error basis.  Use the
-        |setcellwidths()| function to change the behavior.
+        single width. However, such "text emoji" are treated as full-width
+        emoji if they are followed by the U+FE0F variant selector.
+
+        Unfortunately there is no good specification for this and it has been
+        determined on trial-and-error basis.  Use the |setcellwidths()|
+        function to change the behavior.
      ]=],
      full_name = 'emoji',
      redraw = { 'all_windows', 'ui_option' },
--- a/src/nvim/plines.c
+++ b/src/nvim/plines.c
@ -146,7 +146,7 @@ CharSize charsize_regular(CharsizeArg *csarg, char *const cur, colnr_T const vco
  } else if (cur_char < 0) {
    size = kInvalidByteCells;
  } else {
-    size = char2cells(cur_char);
+    size = ptr2cells(cur);
    is_doublewidth = size == 2 && cur_char > 0x80;
  }

@ -337,8 +337,8 @@ CharSize charsize_regular(CharsizeArg *csarg, char *const cur, colnr_T const vco
 ///
 /// @see charsize_regular
 /// @see charsize_fast
-static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, colnr_T const vcol,
-                                          int32_t const cur_char)
+static inline CharSize charsize_fast_impl(win_T *const wp, const char *cur, bool use_tabstop,
+                                          colnr_T const vcol, int32_t const cur_char)
  FUNC_ATTR_PURE FUNC_ATTR_ALWAYS_INLINE
 {
  // A tab gets expanded, depending on the current column
@ -352,7 +352,11 @@ static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, col
    if (cur_char < 0) {
      width = kInvalidByteCells;
    } else {
-      width = char2cells(cur_char);
+      // TODO(bfredl): perf: often cur_char is enough at this point to determine width.
+      // we likely want a specialized version of utf_ptr2StrCharInfo also determining
+      // the ptr2cells width at the same time without any extra decoding. (also applies
+      // to charsize_regular and charsize_nowrap)
+      width = ptr2cells(cur);
    }

    // If a double-width char doesn't fit at the end of a line, it wraps to the next line,
@ -371,23 +375,23 @@ static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, col
 /// Can be used if CSType is kCharsizeFast.
 ///
 /// @see charsize_regular
-CharSize charsize_fast(CharsizeArg *csarg, colnr_T const vcol, int32_t const cur_char)
+CharSize charsize_fast(CharsizeArg *csarg, const char *cur, colnr_T vcol, int32_t cur_char)
  FUNC_ATTR_PURE
 {
-  return charsize_fast_impl(csarg->win, csarg->use_tabstop, vcol, cur_char);
+  return charsize_fast_impl(csarg->win, cur, csarg->use_tabstop, vcol, cur_char);
 }

 /// Get the number of cells taken up on the screen at given virtual column.
 ///
 /// @see win_chartabsize()
-int charsize_nowrap(buf_T *buf, bool use_tabstop, colnr_T vcol, int32_t cur_char)
+int charsize_nowrap(buf_T *buf, const char *cur, bool use_tabstop, colnr_T vcol, int32_t cur_char)
 {
  if (cur_char == TAB && use_tabstop) {
    return tabstop_padding(vcol, buf->b_p_ts, buf->b_p_vts_array);
  } else if (cur_char < 0) {
    return kInvalidByteCells;
  } else {
-    return char2cells(cur_char);
+    return ptr2cells(cur);  // width from the bytes at cur: a trailing VS16 can widen the char
  }
 }

@ -467,7 +471,7 @@ int linesize_fast(CharsizeArg const *const csarg, int vcol_arg, colnr_T const le

  StrCharInfo ci = utf_ptr2StrCharInfo(line);
  while (ci.ptr - line < len && *ci.ptr != NUL) {
-    vcol += charsize_fast_impl(wp, use_tabstop, vcol_arg, ci.chr.value).width;
+    vcol += charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol_arg, ci.chr.value).width;
    ci = utfc_next(ci);
    if (vcol > MAXCOL) {
      vcol_arg = MAXCOL;
@ -530,7 +534,7 @@ void getvcol(win_T *wp, pos_T *pos, colnr_T *start, colnr_T *cursor, colnr_T *en
        char_size = (CharSize){ .width = 1 };
        break;
      }
-      char_size = charsize_fast_impl(wp, use_tabstop, vcol, ci.chr.value);
+      char_size = charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol, ci.chr.value);
      StrCharInfo const next = utfc_next(ci);
      if (next.ptr - line > end_col) {
        break;
@ -627,7 +631,7 @@ void getvvcol(win_T *wp, pos_T *pos, colnr_T *start, colnr_T *cursor, colnr_T *e
    if (pos->col < ml_get_buf_len(wp->w_buffer, pos->lnum)) {
      int c = utf_ptr2char(ptr + pos->col);
      if ((c != TAB) && vim_isprintc(c)) {
-        endadd = (colnr_T)(char2cells(c) - 1);
+        endadd = (colnr_T)(ptr2cells(ptr + pos->col) - 1);
        if (coladd > endadd) {
          // past end of line
          endadd = 0;
@ -824,7 +828,7 @@ int plines_win_col(win_T *wp, linenr_T lnum, long column)
  if (cstype == kCharsizeFast) {
    bool const use_tabstop = csarg.use_tabstop;
    while (*ci.ptr != NUL && --column >= 0) {
-      vcol += charsize_fast_impl(wp, use_tabstop, vcol, ci.chr.value).width;
+      vcol += charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol, ci.chr.value).width;
      ci = utfc_next(ci);
    }
  } else {
--- a/src/nvim/plines.h
+++ b/src/nvim/plines.h
@ -54,7 +54,7 @@ static inline CharSize win_charsize(CSType cstype, int vcol, char *ptr, int32_t
  FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_ALWAYS_INLINE
 {
  if (cstype == kCharsizeFast) {
-    return charsize_fast(csarg, vcol, chr);
+    return charsize_fast(csarg, ptr, vcol, chr);
  } else {
    return charsize_regular(csarg, ptr, vcol, chr);
  }
--- a/src/nvim/regexp.c
+++ b/src/nvim/regexp.c
@ -3031,7 +3031,7 @@ static bool use_multibytecode(int c)
 {
  return utf_char2len(c) > 1
         && (re_multi_type(peekchr()) != NOT_MULTI
-             || utf_iscomposing(c));
+             || utf_iscomposing_legacy(c));
 }

 // Emit (if appropriate) a byte of code
@ -4326,7 +4326,7 @@ static uint8_t *regatom(int *flagp)
    }
    // When '.' is followed by a composing char ignore the dot, so that
    // the composing char is matched here.
-    if (c == Magic('.') && utf_iscomposing(peekchr())) {
+    if (c == Magic('.') && utf_iscomposing_legacy(peekchr())) {
      c = getchr();
      goto do_multibyte;
    }
@ -5001,9 +5001,10 @@ do_multibyte:
          int l;

          // Need to get composing character too.
+          GraphemeState state = GRAPHEME_STATE_INIT;
          while (true) {
            l = utf_ptr2len(regparse);
-            if (!utf_composinglike(regparse, regparse + l)) {
+            if (!utf_composinglike(regparse, regparse + l, &state)) {
              break;
            }
            regmbc(utf_ptr2char(regparse));
@ -6569,7 +6570,7 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
            // Check for following composing character, unless %C
            // follows (skips over all composing chars).
            if (status != RA_NOMATCH
-                && utf_composinglike((char *)rex.input, (char *)rex.input + len)
+                && utf_composinglike((char *)rex.input, (char *)rex.input + len, NULL)
                && !rex.reg_icombine
                && OP(next) != RE_COMPOSING) {
              // raaron: This code makes a composing character get
@ -6624,14 +6625,14 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
            break;
          }
          const int opndc = utf_ptr2char((char *)opnd);
-          if (utf_iscomposing(opndc)) {
+          if (utf_iscomposing_legacy(opndc)) {
            // When only a composing char is given match at any
            // position where that composing char appears.
            status = RA_NOMATCH;
            for (i = 0; rex.input[i] != NUL;
                 i += utf_ptr2len((char *)rex.input + i)) {
              const int inpc = utf_ptr2char((char *)rex.input + i);
-              if (!utf_iscomposing(inpc)) {
+              if (!utf_iscomposing_legacy(inpc)) {
                if (i > 0) {
                  break;
                }
@ -6654,7 +6655,7 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)

        case RE_COMPOSING:
          // Skip composing characters.
-          while (utf_iscomposing(utf_ptr2char((char *)rex.input))) {
+          while (utf_iscomposing_legacy(utf_ptr2char((char *)rex.input))) {
            rex.input += utf_ptr2len((char *)rex.input);
          }
          break;
@ -10070,7 +10071,7 @@ static int nfa_regatom(void)
    }
    // When '.' is followed by a composing char ignore the dot, so that
    // the composing char is matched here.
-    if (c == Magic('.') && utf_iscomposing(peekchr())) {
+    if (c == Magic('.') && utf_iscomposing_legacy(peekchr())) {
      old_regparse = (uint8_t *)regparse;
      c = getchr();
      goto nfa_do_multibyte;
@ -10705,7 +10706,7 @@ collection:
 nfa_do_multibyte:
    // plen is length of current char with composing chars
    if (utf_char2len(c) != (plen = utfc_ptr2len((char *)old_regparse))
-        || utf_iscomposing(c)) {
+        || utf_iscomposing_legacy(c)) {
      int i = 0;

      // A base character plus composing characters, or just one
@ -14033,7 +14034,7 @@ static int find_match_text(colnr_T *startcol, int regstart, uint8_t *match_text)
    }
    if (match
        // check that no composing char follows
-        && !utf_iscomposing(utf_ptr2char((char *)s2))) {
+        && !utf_iscomposing_legacy(utf_ptr2char((char *)s2))) {
      cleanup_subexpr();
      if (REG_MULTI) {
        rex.reg_startpos[0].lnum = rex.lnum;
@ -14278,7 +14279,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
        // is not really a match.
        if (!rex.reg_icombine
            && rex.input != rex.line
-            && utf_iscomposing(curc)) {
+            && utf_iscomposing_legacy(curc)) {
          break;
        }
        nfa_match = true;
@ -14622,7 +14623,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm

        sta = t->state->out;
        len = 0;
-        if (utf_iscomposing(sta->c)) {
+        if (utf_iscomposing_legacy(sta->c)) {
          // Only match composing character(s), ignore base
          // character.  Used for ".{composing}" and "{composing}"
          // (no preceding character).
@ -14724,7 +14725,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
            int j;

            sta = t->state->out->out;
-            if (utf_iscomposing(sta->c)) {
+            if (utf_iscomposing_legacy(sta->c)) {
              // Only match composing character(s), ignore base
              // character.  Used for ".{composing}" and "{composing}"
              // (no preceding character).
@ -14846,7 +14847,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
      case NFA_ANY_COMPOSING:
        // On a composing character skip over it.  Otherwise do
        // nothing.  Always matches.
-        if (utf_iscomposing(curc)) {
+        if (utf_iscomposing_legacy(curc)) {
          add_off = clen;
        } else {
          add_here = true;
--- a/src/nvim/search.c
+++ b/src/nvim/search.c
@ -1260,7 +1260,7 @@ int do_search(oparg_T *oap, int dirc, int search_delim, char *pat, size_t patlen
      // empty for the search_stat feature.
      if (!cmd_silent) {
        msgbuf[0] = (char)dirc;
-        if (utf_iscomposing(utf_ptr2char(p))) {
+        if (utf_iscomposing_first(utf_ptr2char(p))) {
          // Use a space to draw the composing char on.
          msgbuf[1] = ' ';
          memmove(msgbuf + 2, p, plen);
--- a/src/nvim/sign.c
+++ b/src/nvim/sign.c
@ -376,7 +376,7 @@ int init_sign_text(sign_T *sp, schar_T *sign_text, char *text)
    if (!vim_isprintc(c)) {
      break;
    }
-    int width = utf_char2cells(c);
+    int width = utf_ptr2cells(s);
    if (width == 2) {
      sign_text[cells + 1] = 0;
    }
--- a/src/nvim/spellsuggest.c
+++ b/src/nvim/spellsuggest.c
@ -1792,10 +1792,8 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun
              // For changing a composing character adjust
              // the score from SCORE_SUBST to
              // SCORE_SUBCOMP.
-              if (utf_iscomposing(utf_ptr2char(tword + sp->ts_twordlen
-                                               - sp->ts_tcharlen))
-                  && utf_iscomposing(utf_ptr2char(fword
-                                                  + sp->ts_fcharstart))) {
+              if (utf_iscomposing_legacy(utf_ptr2char(tword + sp->ts_twordlen - sp->ts_tcharlen))
+                  && utf_iscomposing_legacy(utf_ptr2char(fword + sp->ts_fcharstart))) {
                sp->ts_score -= SCORE_SUBST - SCORE_SUBCOMP;
              } else if (!soundfold
                         && slang->sl_has_map
@ -1811,7 +1809,7 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun
                       && sp->ts_twordlen > sp->ts_tcharlen) {
              p = tword + sp->ts_twordlen - sp->ts_tcharlen;
              c = utf_ptr2char(p);
-              if (utf_iscomposing(c)) {
+              if (utf_iscomposing_legacy(c)) {
                // Inserting a composing char doesn't
                // count that much.
                sp->ts_score -= SCORE_INS - SCORE_INSCOMP;
@ -1876,7 +1874,7 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun
        c = utf_ptr2char(fword + sp->ts_fidx);
        stack[depth].ts_fidx =
          (uint8_t)(stack[depth].ts_fidx + utfc_ptr2len(fword + sp->ts_fidx));
-        if (utf_iscomposing(c)) {
+        if (utf_iscomposing_legacy(c)) {
          stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP;
        } else if (c == utf_ptr2char(fword + stack[depth].ts_fidx)) {
          stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP;
--- a/src/nvim/textformat.c
+++ b/src/nvim/textformat.c
@ -47,7 +47,7 @@ static bool did_add_space = false;  ///< auto_format() added an extra space
                                    ///< under the cursor

 #define WHITECHAR(cc) (ascii_iswhite(cc) \
-                       && !utf_iscomposing(utf_ptr2char((char *)get_cursor_pos_ptr() + 1)))
+                       && !utf_iscomposing_first(utf_ptr2char((char *)get_cursor_pos_ptr() + 1)))

 /// Return true if format option 'x' is in effect.
 /// Take care of no formatting when 'paste' is set.
--- a/src/nvim/tui/tui.c
+++ b/src/nvim/tui/tui.c
@ -109,6 +109,7 @@ struct TUIData {
  bool set_cursor_color_as_str;
  bool cursor_color_changed;
  bool is_starting;
+  bool did_set_grapheme_cluster_mode;
  FILE *screenshot;
  cursorentry_T cursor_shapes[SHAPE_IDX_COUNT];
  HlAttrs clear_attrs;
@ -220,6 +221,7 @@ static void tui_set_term_mode(TUIData *tui, TermMode mode, bool set)
 void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state)
  FUNC_ATTR_NONNULL_ALL
 {
+  bool is_set = false;
  switch (state) {
  case kTermModeNotRecognized:
  case kTermModePermanentlySet:
@ -228,6 +230,8 @@ void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state)
    // then there is nothing to do
    break;
  case kTermModeSet:
+    is_set = true;
+    FALLTHROUGH;
  case kTermModeReset:
    // The terminal supports changing the given mode
    switch (mode) {
@ -240,6 +244,12 @@ void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state)
      signal_watcher_stop(&tui->winch_handle);
      tui_set_term_mode(tui, mode, true);
      break;
+    case kTermModeGraphemeClusters:
+      if (!is_set) {
+        tui_set_term_mode(tui, mode, true);
+        tui->did_set_grapheme_cluster_mode = true;
+      }
+      break;
    }
  }
 }
@ -434,6 +444,7 @@ static void terminfo_start(TUIData *tui)
  if (!nsterm) {
    tui_request_term_mode(tui, kTermModeSynchronizedOutput);
    tui_request_term_mode(tui, kTermModeResizeEvents);
+    tui_request_term_mode(tui, kTermModeGraphemeClusters);
  }

  // Don't use DECRQSS in screen or tmux, as they behave strangely when receiving it.
@ -494,7 +505,9 @@ static void terminfo_stop(TUIData *tui)

  // Disable resize events
  tui_set_term_mode(tui, kTermModeResizeEvents, false);
-
+  if (tui->did_set_grapheme_cluster_mode) {
+    tui_set_term_mode(tui, kTermModeGraphemeClusters, false);
+  }
  // May restore old title before exiting alternate screen.
  tui_set_title(tui, NULL_STRING);
  if (ui_client_exit_status == 0) {
@ -1010,7 +1023,7 @@ static void print_cell_at_pos(TUIData *tui, int row, int col, UCell *cell, bool
  char buf[MAX_SCHAR_SIZE];
  schar_get(buf, cell->data);
  int c = utf_ptr2char(buf);
-  bool is_ambiwidth = utf_ambiguous_width(c);
+  bool is_ambiwidth = utf_ambiguous_width(buf);
  if (is_doublewidth && (is_ambiwidth || utf_char2cells(c) == 1)) {
    // If the server used setcellwidths() to treat a single-width char as double-width,
    // it needs to be treated like an ambiguous-width char.
--- a/src/nvim/tui/tui_defs.h
+++ b/src/nvim/tui/tui_defs.h
@ -4,6 +4,7 @@ typedef struct TUIData TUIData;

 typedef enum {
  kTermModeSynchronizedOutput = 2026,
+  kTermModeGraphemeClusters = 2027,  // assumed: private mode 2027, grapheme clustering (confirm)
  kTermModeResizeEvents = 2048,
 } TermMode;

--- a/test/functional/api/vim_spec.lua
+++ b/test/functional/api/vim_spec.lua
@ -1435,6 +1435,28 @@ describe('API', function()
    it('cannot handle NULs', function()
      eq(0, api.nvim_strwidth('\0abc'))
    end)
+
+    it('can handle emoji with variant selectors and ZWJ', function()
+      local selector = '❤️'
+      eq(2, fn.strchars(selector)) -- heart + VS16: two codepoints...
+      eq(1, fn.strcharlen(selector)) -- ...forming a single cluster
+      eq(2, api.nvim_strwidth(selector)) -- VS16 makes the heart double-width
+
+      local no_selector = '❤'
+      eq(1, fn.strchars(no_selector))
+      eq(1, fn.strcharlen(no_selector))
+      eq(1, api.nvim_strwidth(no_selector)) -- without VS16 it stays single-width
+
+      local selector_zwj_selector = '🏳️‍⚧️'
+      eq(5, fn.strchars(selector_zwj_selector)) -- five codepoints...
+      eq(1, fn.strcharlen(selector_zwj_selector)) -- ...joined into one cluster
+      eq(2, api.nvim_strwidth(selector_zwj_selector))
+
+      local emoji_zwj_emoji = '🧑‍🌾'
+      eq(3, fn.strchars(emoji_zwj_emoji)) -- emoji + ZWJ + emoji
+      eq(1, fn.strcharlen(emoji_zwj_emoji))
+      eq(2, api.nvim_strwidth(emoji_zwj_emoji)) -- the whole sequence takes two cells
+    end)
  end)

  describe('nvim_get_current_line, nvim_set_current_line', function()
--- a/test/functional/ui/decorations_spec.lua
+++ b/test/functional/ui/decorations_spec.lua
@ -5620,6 +5620,27 @@ l5
      ]]
    })
  end)
+
+  it('supports emoji as signs', function()
+    insert(example_test3)
+    feed 'gg'
+    api.nvim_buf_set_extmark(0, ns, 1, 0, {sign_text='🧑‍🌾'})
+    -- VS16 can change width of character
+    api.nvim_buf_set_extmark(0, ns, 2, 0, {sign_text='❤️'})
+    api.nvim_buf_set_extmark(0, ns, 3, 0, {sign_text='❤'})
+    api.nvim_buf_set_extmark(0, ns, 4, 0, {sign_text='❤x'})
+    screen:expect([[
+      {7:  }^l1                                              |
+      🧑‍🌾l2                                              |
+      ❤️l3                                              |
+      ❤ l4                                              |
+      ❤xl5                                              |
+      {7:  }                                                |
+      {1:~                                                 }|*3
+                                                        |
+    ]])
+    eq("Invalid 'sign_text'", pcall_err(api.nvim_buf_set_extmark, 0, ns, 5, 0, {sign_text='❤️x'}))
+  end)
 end)

 describe('decorations: virt_text', function()
--- a/test/functional/ui/messages_spec.lua
+++ b/test/functional/ui/messages_spec.lua
@ -1436,6 +1436,41 @@ vimComment     xxx match /\s"[^\-:.%#=*].*$/ms=s+1,lc=1  excludenl contains=@vim
    }
  end)

+  it('supports nvim_echo messages with emoji', function()
+    -- stylua: ignore
+    async_meths.nvim_echo(
+      { { 'wow, 🏳️‍⚧️🧑‍🌾❤️😂🏴‍☠️\nvariant ❤️ one\nvariant ❤ two' } }, true, {}
+    )
+
+    screen:expect([[
+                                                                  |
+      {1:~                                                           }|
+      {3:                                                            }|
+      wow, 🏳️‍⚧️🧑‍🌾❤️😂🏴‍☠️                                             |
+      variant ❤️ one                                              |
+      variant ❤ two                                               |
+      {6:Press ENTER or type command to continue}^                     |
+    ]])
+
+    feed '<cr>'
+    screen:expect([[
+      ^                                                            |
+      {1:~                                                           }|*5
+                                                                  |
+    ]])
+
+    feed ':messages<cr>'
+    screen:expect([[
+                                                                  |
+      {1:~                                                           }|
+      {3:                                                            }|
+      wow, 🏳️‍⚧️🧑‍🌾❤️😂🏴‍☠️                                             |
+      variant ❤️ one                                              |
+      variant ❤ two                                               |
+      {6:Press ENTER or type command to continue}^                     |
+    ]])
+  end)
+
  it('prints lines in Ex mode correctly with a burst of carriage returns #19341', function()
    command('set number')
    api.nvim_buf_set_lines(0, 0, 0, true, { 'aaa', 'bbb', 'ccc' })
--- a/test/functional/ui/multibyte_spec.lua
+++ b/test/functional/ui/multibyte_spec.lua
@ -296,6 +296,86 @@ describe('multibyte rendering', function()
    ]],
    }
  end)
+
+  it('supports emoji with variant selectors and ZWJ', function()
+    command('set ruler')
+    insert('🏳️‍⚧️')
+    screen:expect([[
+      ^🏳️‍⚧️                                                          |
+      {1:~                                                           }|*4
+                                                1,1           All |
+    ]])
+
+    feed('a word<esc>')
+    screen:expect([[
+      🏳️‍⚧️ wor^d                                                     |
+      {1:~                                                           }|*4
+                                                1,21-7        All |
+    ]])
+
+    feed('0')
+    screen:expect([[
+      ^🏳️‍⚧️ word                                                     |
+      {1:~                                                           }|*4
+                                                1,1           All |
+    ]])
+
+    feed('l')
+    screen:expect([[
+        🏳️‍⚧️^ word                                                     |
+        {1:~                                                           }|*4
+                                                  1,17-3        All |
+    ]])
+
+    feed('h')
+    screen:expect([[
+      ^🏳️‍⚧️ word                                                     |
+      {1:~                                                           }|*4
+                                                1,1           All |
+    ]])
+
+    feed('o❤️ variant selected<esc>')
+    screen:expect([[
+      🏳️‍⚧️ word                                                     |
+      ❤️ variant selecte^d                                         |
+      {1:~                                                           }|*3
+                                                2,23-19       All |
+    ]])
+
+    feed('0')
+    screen:expect([[
+      🏳️‍⚧️ word                                                     |
+      ^❤️ variant selected                                         |
+      {1:~                                                           }|*3
+                                                2,1           All |
+    ]])
+
+    feed('l')
+    screen:expect([[
+      🏳️‍⚧️ word                                                     |
+      ❤️^ variant selected                                         |
+      {1:~                                                           }|*3
+                                                2,7-3         All |
+    ]])
+
+    feed('h')
+    screen:expect([[
+      🏳️‍⚧️ word                                                     |
+      ^❤️ variant selected                                         |
+      {1:~                                                           }|*3
+                                                2,1           All |
+    ]])
+
+    -- without selector: single width (note column 18 and not 19)
+    feed('o❤ variant selected<esc>')
+    screen:expect([[
+      🏳️‍⚧️ word                                                     |
+      ❤️ variant selected                                         |
+      ❤ variant selecte^d                                          |
+      {1:~                                                           }|*2
+                                                3,20-18       All |
+    ]])
+  end)
 end)

 describe('multibyte rendering: statusline', function()
@ -348,11 +428,12 @@ describe('multibyte rendering: statusline', function()
  it('non-printable followed by MAX_MCO unicode combination points', function()
    command('set statusline≠⃯ᷰ⃐⃧⃝')
    -- U+9F + U+1DF0 + U+20EF + U+0338 + U+20D0 + U+20E7 + U+20DD
+    -- TODO: not ideal (combining chars land on ">"); better: plain ">", then space+combining
    screen:expect([[
-    ^                                        |
-    {1:~                                       }|
-    {3:<9f><1df0><20ef><0338><20d0><20e7><20dd>}|
-                                            |
+      ^                                        |
+      {1:~                                       }|
+      {3:<9f≯⃯ᷰ⃐⃧⃝                                    }|
+                                              |
    ]])
  end)

@ -368,9 +449,20 @@ describe('multibyte rendering: statusline', function()
    }
  end)

-  it('unprintable chars in filename with default stl', function()
+  it('emoji with ZWJ in filename with default stl', function()
    command('file 🧑‍💻')
-    -- TODO: this is wrong but avoids a crash
+    screen:expect {
+      grid = [[
+      ^                                        |
+      {1:~                                       }|
+      {3:🧑‍💻                                      }|
+                                              |
+    ]],
+    }
+  end)
+
+  it('unprintable chars in filename with default stl', function()
+    command('file 🧑💻')
    screen:expect {
      grid = [[
      ^                                        |
@ -381,15 +473,27 @@ describe('multibyte rendering: statusline', function()
    }
  end)

-  it('unprintable chars in filename with custom stl', function()
+  it('emoji with ZWJ in filename with custom stl', function()
    command('set statusline=xx%#ErrorMsg#%f%##yy')
    command('file 🧑‍💻')
-    -- TODO: this is also wrong but also avoids a crash
    screen:expect {
      grid = [[
      ^                                        |
      {1:~                                       }|
-      {3:xx}{9:🧑<200d>💻}{3:yy                          }|
+      {3:xx}{9:🧑‍💻}{3:yy                                  }|
+                                              |
+    ]],
+    }
+  end)
+
+  it('unprintable chars in filename with custom stl', function()
+    command('set statusline=xx%#ErrorMsg#%f%##yy')
+    command('file 🧑💻')
+    screen:expect {
+      grid = [[
+      ^                                        |
+      {1:~                                       }|
+      {3:xx}{9:🧑<200b>💻}{3:yy                          }|
                                              |
    ]],
    }
--- a/test/old/testdir/test_functions.vim
+++ b/test/old/testdir/test_functions.vim
@ -3663,7 +3663,7 @@ func Test_string_reverse()
    call assert_equal('', reverse(v:_null_string))
    for [s1, s2] in [['', ''], ['a', 'a'], ['ab', 'ba'], ['abc', 'cba'],
                   \ ['abcd', 'dcba'], ['«-«-»-»', '»-»-«-«'],
-                   \ ['🇦', '🇦'], ['🇦🇧', '🇧🇦'], ['🇦🇧🇨', '🇨🇧🇦'],
+                   \ ['🇦', '🇦'], ['🇦🇧', '🇦🇧'], ['🇦🇧🇨', '🇨🇦🇧'],
                   \ ['🇦«🇧-🇨»🇩', '🇩»🇨-🇧«🇦']]
      call assert_equal(s2, reverse(s1))
    endfor
--- a/test/old/testdir/test_normal.vim
+++ b/test/old/testdir/test_normal.vim
@ -3897,9 +3897,9 @@ func Test_normal_count_after_operator()
  bw!
 endfunc

-func Test_normal_gj_on_extra_wide_char()
+func Test_normal_gj_on_6_cell_wide_unprintable_char()
  new | 25vsp
-  let text='1 foooooooo ar e  ins‍zwe1 foooooooo ins‍zwei' .
+  let text='1 foooooooo ar e  inszwe1 foooooooo inszwei' .
         \ ' i drei vier fünf sechs sieben acht un zehn elf zwöfl' .
         \ ' dreizehn v ierzehn fünfzehn'
  put =text
--- a/test/unit/mbyte_spec.lua
+++ b/test/unit/mbyte_spec.lua
@ -3,8 +3,15 @@ local itp = t.gen_itp(it)

 local ffi = t.ffi
 local eq = t.eq
+local to_cstr = t.to_cstr
+local ok = t.ok

-local lib = t.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h')
+local lib = t.cimport(
+  './src/nvim/mbyte.h',
+  './src/nvim/charset.h',
+  './src/nvim/grid.h',
+  './src/nvim/option_vars.h'
+)

 describe('mbyte', function()
  -- Convert from bytes to string
@ -45,12 +52,21 @@ describe('mbyte', function()
    end)
  end

-  describe('utfc_ptr2schar_len', function()
+  describe('utfc_ptr2schar', function()
    local function test_seq(seq)
      local firstc = ffi.new('int[1]')
      local buf = ffi.new('char[32]')
-      lib.schar_get(buf, lib.utfc_ptr2schar_len(to_string(seq), #seq, firstc))
-      return { ffi.string(buf), firstc[0] }
+      lib.schar_get(buf, lib.utfc_ptr2schar(to_string(seq), firstc))
+      local str = ffi.string(buf)
+      if 1 > 2 then -- never true; flip the condition to print the result bytes when debugging
+        local tabel = {}
+        for i = 1, #str do
+          table.insert(tabel, string.format('0x%02x', string.byte(str, i)))
+        end
+        print('{ ' .. table.concat(tabel, ', ') .. ' }')
+        io.stdout:flush()
+      end
+      return { str, firstc[0] } -- { bytes read back via schar_get, value stored in firstc }
    end

    local function byte(val)
@ -88,7 +104,9 @@ describe('mbyte', function()
      eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0x80 })

      -- Combining character is U+0300
-      eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 })
+      eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80 })
+      -- invalid start byte for combining
+      eq({ '\x7f', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 })

      -- No UTF-8 sequence
      eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc })
@ -102,18 +120,21 @@ describe('mbyte', function()
    itp('4-byte sequences', function()
      -- No following combining character
      eq(byte(0x7f), test_seq { 0x7f, 0x7f, 0xcc, 0x80 })
+      eq(byte(0x29), test_seq { 0x29, 0x29, 0xcc, 0x80 })
      -- No second UTF-8 character
      eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80 })

      -- Combining character U+0300
-      eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc })
+      eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc })

      -- No UTF-8 sequence
      eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80 })
      -- No following UTF-8 character
      eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc })
      -- Combining character U+0301
-      eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 })
+      eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81 })
+      -- U+0080 : not a valid start char
+      eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 })

      -- One UTF-8 character
      eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80 })
@ -126,36 +147,36 @@ describe('mbyte', function()
      eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80, 0x80 })

      -- Combining character U+0300
-      eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x00 })
+      eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x00 })

      -- Combining characters U+0300 and U+0301
-      eq({ '\x7f\xcc\x80\xcc\x81', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81 })
+      eq({ '\x29\xcc\x80\xcc\x81', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81 })
      -- Combining characters U+0300, U+0301, U+0302
      eq(
-        { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
-        test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 }
+        { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
+        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 }
      )
      -- Combining characters U+0300, U+0301, U+0302, U+0303
      eq(
-        { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x7f },
-        test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 }
+        { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x29 },
+        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 }
      )
      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
      eq(
-        { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x7f },
-        test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 }
+        { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x29 },
+        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 }
      )
      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
      eq(
-        { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x7f },
-        test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 }
+        { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x29 },
+        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 }
      )

      -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
      eq(
-        { '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x7f },
+        { '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x29 },
        test_seq {
-          0x7f,
+          0x29,
          0xcc,
          0x80,
          0xcc,
@ -175,18 +196,18 @@ describe('mbyte', function()

      -- Only three following combining characters U+0300, U+0301, U+0302
      eq(
-        { '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
-        test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 }
+        { '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
+        test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 }
      )

      -- No UTF-8 sequence
      eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80, 0x80 })
      -- No following UTF-8 character
-      eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc, 0x80 })
+      eq({ '\xc2\xbc', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0xcc, 0x80 })
      -- Combining character U+0301
-      eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0x7f })
+      eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0x7f })
      -- Combining character U+0301
-      eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0xcc })
+      eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0xcc })

      -- One UTF-8 character
      eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0x7f })
@ -205,8 +226,6 @@ describe('mbyte', function()
  end)

  describe('utf_cp_bounds_len', function()
-    local to_cstr = t.to_cstr
-
    local tests = {
      {
        name = 'for valid string',
@ -273,4 +292,52 @@ describe('mbyte', function()
      eq(expected_offsets, { b = b_offsets, e = e_offsets })
    end)
  end)
+
+  itp('utf_head_off', function()
+    --- Segment `str` into clusters with lib.utfc_ptr2len and then verify that
+    --- lib.utf_head_off reports, for every byte offset, the distance back to
+    --- the start of the cluster that contains it.
+    ---
+    --- @param str string text to segment
+    --- @param expected_glyphs string[] the clusters longer than one byte that
+    ---        are expected, in order (single-byte clusters are not recorded)
+    local function check(str, expected_glyphs)
+      local len = #str
+      local cstr = to_cstr(str)
+      local breaks = { 0 } -- SOT
+      local pos = 0
+      local mb_glyphs = {}
+      while pos < len do
+        local clen = lib.utfc_ptr2len(cstr + pos)
+        ok(clen > 0) -- otherwise we get stuck
+        if clen > 1 then
+          -- string.sub is 1-based and inclusive, pos is a 0-based byte offset
+          table.insert(mb_glyphs, string.sub(str, pos + 1, pos + clen))
+        end
+        pos = pos + clen
+        table.insert(breaks, pos)
+      end
+      -- eq() takes (expected, actual), like every other call in this file
+      eq(len, breaks[#breaks]) -- include EOT as break
+      -- we could also send in breaks, but this is more human readable
+      eq(expected_glyphs, mb_glyphs)
+
+      for i = 1, #breaks - 1 do
+        -- `stop` rather than `next`, to avoid shadowing the Lua builtin
+        local start, stop = breaks[i], breaks[i + 1]
+
+        -- every byte inside [start, stop) must point back to `start`
+        for p = start, stop - 1 do
+          eq(p - start, lib.utf_head_off(cstr, cstr + p))
+        end
+      end
+      eq(0, lib.utf_head_off(cstr, cstr + len)) -- NUL byte is safe
+    end
+    -- stylua doesn't like ZWJ chars..
+    -- stylua: ignore start
+    check('hej och hå 🧑‍🌾!', { 'å', '🧑‍🌾' })
+    -- emoji only (various kinds of combinations, use g8 to see them)
+    check("🏳️‍⚧️🧑‍🌾❤️😂🏴‍☠️", {"🏳️‍⚧️", "🧑‍🌾", "❤️", "😂", "🏴‍☠️"})
+    check('🏳️‍⚧️xy🧑‍🌾\r❤️😂å🏴‍☠️', { '🏳️‍⚧️', '🧑‍🌾', '❤️', '😂', 'å', '🏴‍☠️', '' })
+
+    check('🇦🅱️ 🇦🇽 🇦🇨🇦 🇲🇽🇹🇱',{'🇦', '🅱️', '🇦🇽', '🇦🇨', '🇦', '🇲🇽', '🇹🇱'})
+    check('🏴󠁧󠁢󠁳󠁣󠁴󠁿🏴󠁧󠁢󠁷󠁬󠁳󠁿', {'🏴󠁧󠁢󠁳󠁣󠁴󠁿', '🏴󠁧󠁢󠁷󠁬󠁳󠁿'})
+
+    lib.p_arshape = true -- default
+    check('سلام', { 'س', 'لا', 'م' })
+    lib.p_arshape = false
+    check('سلام', { 'س', 'ل', 'ا', 'م' })
+
+    check('L̓̉̑̒̌̚ơ̗̌̒̄̀ŕ̈̈̎̐̕è̇̅̄̄̐m̖̟̟̅̄̚', {'L̓̉̑̒̌̚', 'ơ̗̌̒̄̀', 'ŕ̈̈̎̐̕', 'è̇̅̄̄̐', 'm̖̟̟̅̄̚'})
+    -- stylua: ignore end
+  end)
 end)