From ad5a155b1f4b387d3aaa54c91d0146cb0287bb9f Mon Sep 17 00:00:00 2001
From: VanaIgr <vanaigranov@gmail.com>
Date: Mon, 26 Feb 2024 04:12:55 -0600
Subject: [PATCH] fix(mbyte): fix bugs in utf_cp_*_off() functions

Problems:
- Illegal bytes after valid UTF-8 char cause utf_cp_*_off() to fail.
- When stream isn't NUL-terminated, utf_cp_*_off() may go over the end.
Solution: Don't go over end of the char of end of the string.
---
 runtime/doc/dev_vimpatch.txt |   3 +-
 src/nvim/buffer.c            |   2 +-
 src/nvim/ex_getln.c          |   4 +-
 src/nvim/grid.c              |   2 +-
 src/nvim/lua/stdlib.c        |  14 +++--
 src/nvim/mbyte.c             | 117 +++++++++++------------------------
 src/nvim/mbyte.h             |  10 +++
 src/nvim/mbyte_defs.h        |   5 ++
 test/unit/mbyte_spec.lua     |  70 +++++++++++++++++++++
 9 files changed, 134 insertions(+), 93 deletions(-)

diff --git a/runtime/doc/dev_vimpatch.txt b/runtime/doc/dev_vimpatch.txt
index 4ed585589c..e72aa05c90 100644
--- a/runtime/doc/dev_vimpatch.txt
+++ b/runtime/doc/dev_vimpatch.txt
@@ -203,7 +203,8 @@ information.
   mb_off2cells                                         utf_off2cells
   mb_ptr2char                                          utf_ptr2char
   mb_head_off                                          utf_head_off
-  mb_tail_off                                         utf_cp_tail_off
+  mb_tail_off                                          utf_cp_bounds
+  mb_off_next                                          utf_cp_bounds
   mb_lefthalve                                        grid_lefthalve
   mb_fix_col                                           grid_fix_col
   utf_off2cells                                       grid_off2cells
diff --git a/src/nvim/buffer.c b/src/nvim/buffer.c
index ac6ccf223c..2ceed20768 100644
--- a/src/nvim/buffer.c
+++ b/src/nvim/buffer.c
@@ -3436,7 +3436,7 @@ void maketitle(void)
       int len = (int)strlen(buf_p);
       if (len > 100) {
         len -= 100;
-        len += utf_cp_tail_off(buf_p, buf_p + len) + 1;
+        len += utf_cp_bounds(buf_p, buf_p + len).end_off;
         buf_p += len;
       }
       STRCPY(icon_str, buf_p);
diff --git a/src/nvim/ex_getln.c b/src/nvim/ex_getln.c
index 8af5730ffe..c396bbaae3 100644
--- a/src/nvim/ex_getln.c
+++ b/src/nvim/ex_getln.c
@@ -1512,8 +1512,8 @@ static int command_line_erase_chars(CommandLineState *s)
   }
 
   if (s->c == K_DEL) {
-    ccline.cmdpos += mb_off_next(ccline.cmdbuff,
-                                 ccline.cmdbuff + ccline.cmdpos);
+    CharBoundsOff bounds = utf_cp_bounds(ccline.cmdbuff, ccline.cmdbuff + ccline.cmdpos);
+    ccline.cmdpos += bounds.begin_off != 0 ? bounds.end_off : 0;
   }
 
   if (ccline.cmdpos > 0) {
diff --git a/src/nvim/grid.c b/src/nvim/grid.c
index b6a8a8bd9f..e386853022 100644
--- a/src/nvim/grid.c
+++ b/src/nvim/grid.c
@@ -277,7 +277,7 @@ void line_do_arabic_shape(schar_T *buf, int cols)
       // Too bigly, discard one code-point.
       // This should be enough as c0 cannot grow more than from 2 to 4 bytes
       // (base arabic to extended arabic)
-      rest -= (size_t)utf_cp_head_off(scbuf + off, scbuf + off + rest - 1) + 1;
+      rest -= (size_t)utf_cp_bounds(scbuf + off, scbuf + off + rest - 1).begin_off + 1;
     }
     memcpy(scbuf_new + len, scbuf + off, rest);
     buf[i] = schar_from_buf(scbuf_new, len + rest);
diff --git a/src/nvim/lua/stdlib.c b/src/nvim/lua/stdlib.c
index 5fea1ba5d8..8f58fd1a1a 100644
--- a/src/nvim/lua/stdlib.c
+++ b/src/nvim/lua/stdlib.c
@@ -226,11 +226,12 @@ static int nlua_str_utf_start(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL
   size_t s1_len;
   const char *s1 = luaL_checklstring(lstate, 1, &s1_len);
   ptrdiff_t offset = luaL_checkinteger(lstate, 2);
-  if (offset < 0 || offset > (intptr_t)s1_len) {
+  if (offset <= 0 || offset > (intptr_t)s1_len) {
     return luaL_error(lstate, "index out of range");
   }
-  int head_offset = -utf_cp_head_off(s1, s1 + offset - 1);
-  lua_pushinteger(lstate, head_offset);
+  size_t const off = (size_t)(offset - 1);
+  int head_off = -utf_cp_bounds_len(s1, s1 + off, (int)(s1_len - off)).begin_off;
+  lua_pushinteger(lstate, head_off);
   return 1;
 }
 
@@ -246,11 +247,12 @@ static int nlua_str_utf_end(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL
   size_t s1_len;
   const char *s1 = luaL_checklstring(lstate, 1, &s1_len);
   ptrdiff_t offset = luaL_checkinteger(lstate, 2);
-  if (offset < 0 || offset > (intptr_t)s1_len) {
+  if (offset <= 0 || offset > (intptr_t)s1_len) {
     return luaL_error(lstate, "index out of range");
   }
-  int tail_offset = utf_cp_tail_off(s1, s1 + offset - 1);
-  lua_pushinteger(lstate, tail_offset);
+  size_t const off = (size_t)(offset - 1);
+  int tail_off = utf_cp_bounds_len(s1, s1 + off, (int)(s1_len - off)).end_off - 1;
+  lua_pushinteger(lstate, tail_off);
   return 1;
 }
 
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index f8451e62e2..cf206aa68b 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -1884,99 +1884,52 @@ void mb_copy_char(const char **const fp, char **const tp)
   *fp += l;
 }
 
-/// Return the offset from "p_in" to the first byte of a character.  When "p_in" is
-/// at the start of a character 0 is returned, otherwise the offset to the next
-/// character.  Can start anywhere in a stream of bytes.
-int mb_off_next(const char *base, const char *p_in)
-{
-  const uint8_t *p = (uint8_t *)p_in;
-  int i;
-
-  if (*p < 0x80) {              // be quick for ASCII
-    return 0;
-  }
-
-  // Find the next character that isn't 10xx.xxxx
-  for (i = 0; (p[i] & 0xc0) == 0x80; i++) {}
-  if (i > 0) {
-    int j;
-    // Check for illegal sequence.
-    for (j = 0; p - j > (uint8_t *)base; j++) {
-      if ((p[-j] & 0xc0) != 0x80) {
-        break;
-      }
-    }
-    if (utf8len_tab[p[-j]] != i + j) {
-      return 0;
-    }
-  }
-  return i;
-}
-
-/// Return the offset from `p_in` to the last byte of the codepoint it points
-/// to.  Can start anywhere in a stream of bytes.
+/// Returns the offset in bytes from "p_in" to the first and one-past-end bytes
+/// of the codepoint it points to.
+/// "p_in" can point anywhere in a stream of bytes.
+/// "p_len" limits number of bytes after "p_in".
 /// Note: Counts individual codepoints of composed characters separately.
-int utf_cp_tail_off(const char *base, const char *p_in)
+CharBoundsOff utf_cp_bounds_len(char const *base, char const *p_in, int p_len)
+  FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL
 {
-  const uint8_t *p = (uint8_t *)p_in;
-  int i;
-  int j;
-
-  if (*p == NUL) {
-    return 0;
+  assert(base <= p_in && p_len > 0);
+  uint8_t const *const b = (uint8_t *)base;
+  uint8_t const *const p = (uint8_t *)p_in;
+  if (*p < 0x80U) {  // be quick for ASCII
+    return (CharBoundsOff){ 0, 1 };
   }
 
-  // Find the last character that is 10xx.xxxx
-  for (i = 0; (p[i + 1] & 0xc0) == 0x80; i++) {}
-
-  // Check for illegal sequence.
-  for (j = 0; p_in - j > base; j++) {
-    if ((p[-j] & 0xc0) != 0x80) {
-      break;
+  int const max_first_off = -MIN((int)(p - b), MB_MAXCHAR - 1);
+  int first_off = 0;
+  for (; utf_is_trail_byte(p[first_off]); first_off--) {
+    if (first_off == max_first_off) {  // failed to find first byte
+      return (CharBoundsOff){ 0, 1 };
     }
   }
 
-  if (utf8len_tab[p[-j]] != i + j + 1) {
-    return 0;
+  int const max_end_off = utf8len_tab[p[first_off]] + first_off;
+  if (max_end_off <= 0 || max_end_off > p_len) {  // illegal or incomplete sequence
+    return (CharBoundsOff){ 0, 1 };
   }
-  return i;
+
+  for (int end_off = 1; end_off < max_end_off; end_off++) {
+    if (!utf_is_trail_byte(p[end_off])) {  // not enough trail bytes
+      return (CharBoundsOff){ 0, 1 };
+    }
+  }
+
+  return (CharBoundsOff){ .begin_off = (int8_t)-first_off, .end_off = (int8_t)max_end_off };
 }
 
-/// Return the offset from "p" to the first byte of the codepoint it points
-/// to. Can start anywhere in a stream of bytes.
-/// Note: Unlike `utf_head_off`, this counts individual codepoints of composed characters
-/// separately.
-///
-/// @param[in] base  Pointer to start of string
-/// @param[in] p     Pointer to byte for which to return the offset to the previous codepoint
-//
-/// @return 0 if invalid sequence, else number of bytes to previous codepoint
-int utf_cp_head_off(const char *base, const char *p)
+/// Returns the offset in bytes from "p_in" to the first and one-past-end bytes
+/// of the codepoint it points to.
+/// "p_in" can point anywhere in a stream of bytes.
+/// Stream must be NUL-terminated.
+/// Note: Counts individual codepoints of composed characters separately.
+CharBoundsOff utf_cp_bounds(char const *base, char const *p_in)
+  FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL
 {
-  int i;
-
-  if (*p == NUL) {
-    return 0;
-  }
-
-  // Find the first character that is not 10xx.xxxx
-  for (i = 0; p - i >= base; i++) {
-    if (((uint8_t)p[-i] & 0xc0) != 0x80) {
-      break;
-    }
-  }
-
-  // Find the last character that is 10xx.xxxx (condition terminates on NUL)
-  int j = 1;
-  while (((uint8_t)p[j] & 0xc0) == 0x80) {
-    j++;
-  }
-
-  // Check for illegal sequence.
-  if (utf8len_tab[(uint8_t)p[-i]] != j + i) {
-    return 0;
-  }
-  return i;
+  return utf_cp_bounds_len(base, p_in, INT_MAX);
 }
 
 // Find the next illegal byte sequence.
diff --git a/src/nvim/mbyte.h b/src/nvim/mbyte.h
index 2f17c706c1..ddac040aae 100644
--- a/src/nvim/mbyte.h
+++ b/src/nvim/mbyte.h
@@ -52,6 +52,16 @@ extern const uint8_t utf8len_tab[256];
 #define MB_PTR_BACK(s, p) \
   (p -= utf_head_off((char *)(s), (char *)(p) - 1) + 1)
 
+/// Check whether a given UTF-8 byte is a trailing byte (10xx.xxxx).
+static inline bool utf_is_trail_byte(uint8_t byte)
+  REAL_FATTR_CONST REAL_FATTR_ALWAYS_INLINE;
+
+static inline bool utf_is_trail_byte(uint8_t const byte)
+{
+  // uint8_t is for clang to use smaller cmp
+  return (uint8_t)(byte & 0xC0U) == 0x80U;
+}
+
 static inline CharInfo utf_ptr2CharInfo(char const *p_in)
   REAL_FATTR_NONNULL_ALL REAL_FATTR_PURE REAL_FATTR_WARN_UNUSED_RESULT REAL_FATTR_ALWAYS_INLINE;
 
diff --git a/src/nvim/mbyte_defs.h b/src/nvim/mbyte_defs.h
index 058ec353e5..8670a0595d 100644
--- a/src/nvim/mbyte_defs.h
+++ b/src/nvim/mbyte_defs.h
@@ -66,3 +66,8 @@ typedef struct {
   char *ptr;     ///< Pointer to the first byte of the character.
   CharInfo chr;  ///< Information about the character.
 } StrCharInfo;
+
+typedef struct {
+  int8_t begin_off;  ///< Offset to the first byte of the codepoint.
+  int8_t end_off;    ///< Offset to one past the end byte of the codepoint.
+} CharBoundsOff;
diff --git a/test/unit/mbyte_spec.lua b/test/unit/mbyte_spec.lua
index 67220d7c19..00a8c06ceb 100644
--- a/test/unit/mbyte_spec.lua
+++ b/test/unit/mbyte_spec.lua
@@ -203,4 +203,74 @@ describe('mbyte', function()
       )
     end)
   end)
+
+  describe('utf_cp_bounds_len', function()
+    local to_cstr = helpers.to_cstr
+
+    local tests = {
+      {
+        name = 'for valid string',
+        str = 'iÀiiⱠiⱠⱠ𐀀i',
+        offsets = {
+          b = { 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 0 },
+          e = { 1, 2, 1, 1, 1, 3, 2, 1, 1, 3, 2, 1, 3, 2, 1, 4, 3, 2, 1, 1 },
+        },
+      },
+      {
+        name = 'for string with incomplete sequence',
+        str = 'i\xC3iÀⱠiÀ\xE2\xB1Ⱡ\xF0\x90\x80',
+        offsets = {
+          b = { 0, 0, 0, 0, 1, 0, 1, 2, 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 0 },
+          e = { 1, 1, 1, 2, 1, 3, 2, 1, 1, 2, 1, 1, 1, 3, 2, 1, 1, 1, 1 },
+        },
+      },
+      {
+        name = 'for string with trailing bytes after multibyte',
+        str = 'iÀ\xA0Ⱡ\xA0Ⱡ𐀀\xA0i',
+        offsets = {
+          b = { 0, 0, 1, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 2, 3, 0, 0 },
+          e = { 1, 2, 1, 1, 3, 2, 1, 1, 3, 2, 1, 4, 3, 2, 1, 1, 1 },
+        },
+      },
+    }
+
+    for _, test in ipairs(tests) do
+      itp(test.name, function()
+        local cstr = to_cstr(test.str)
+        local b_offsets, e_offsets = {}, {}
+        for i = 1, #test.str do
+          local result = lib.utf_cp_bounds_len(cstr, cstr + i - 1, #test.str - (i - 1))
+          table.insert(b_offsets, result.begin_off)
+          table.insert(e_offsets, result.end_off)
+        end
+        eq(test.offsets, { b = b_offsets, e = e_offsets })
+      end)
+    end
+
+    itp('does not read before start', function()
+      local str = '𐀀'
+      local expected_offsets = { b = { 0, 0, 0 }, e = { 1, 1, 1 } }
+      local cstr = to_cstr(str) + 1
+      local b_offsets, e_offsets = {}, {}
+      for i = 1, 3 do
+        local result = lib.utf_cp_bounds_len(cstr, cstr + i - 1, 3 - (i - 1))
+        table.insert(b_offsets, result.begin_off)
+        table.insert(e_offsets, result.end_off)
+      end
+      eq(expected_offsets, { b = b_offsets, e = e_offsets })
+    end)
+
+    itp('does not read past the end', function()
+      local str = '𐀀'
+      local expected_offsets = { b = { 0, 0, 0 }, e = { 1, 1, 1 } }
+      local cstr = to_cstr(str)
+      local b_offsets, e_offsets = {}, {}
+      for i = 1, 3 do
+        local result = lib.utf_cp_bounds_len(cstr, cstr + i - 1, 3 - (i - 1))
+        table.insert(b_offsets, result.begin_off)
+        table.insert(e_offsets, result.end_off)
+      end
+      eq(expected_offsets, { b = b_offsets, e = e_offsets })
+    end)
+  end)
 end)