diff --git a/runtime/doc/dev_vimpatch.txt b/runtime/doc/dev_vimpatch.txt index 4ed585589c..e72aa05c90 100644 --- a/runtime/doc/dev_vimpatch.txt +++ b/runtime/doc/dev_vimpatch.txt @@ -203,7 +203,8 @@ information. mb_off2cells utf_off2cells mb_ptr2char utf_ptr2char mb_head_off utf_head_off - mb_tail_off utf_cp_tail_off + mb_tail_off utf_cp_bounds + mb_off_next utf_cp_bounds mb_lefthalve grid_lefthalve mb_fix_col grid_fix_col utf_off2cells grid_off2cells diff --git a/src/nvim/buffer.c b/src/nvim/buffer.c index ac6ccf223c..2ceed20768 100644 --- a/src/nvim/buffer.c +++ b/src/nvim/buffer.c @@ -3436,7 +3436,7 @@ void maketitle(void) int len = (int)strlen(buf_p); if (len > 100) { len -= 100; - len += utf_cp_tail_off(buf_p, buf_p + len) + 1; + len += utf_cp_bounds(buf_p, buf_p + len).end_off; buf_p += len; } STRCPY(icon_str, buf_p); diff --git a/src/nvim/ex_getln.c b/src/nvim/ex_getln.c index 8af5730ffe..c396bbaae3 100644 --- a/src/nvim/ex_getln.c +++ b/src/nvim/ex_getln.c @@ -1512,8 +1512,8 @@ static int command_line_erase_chars(CommandLineState *s) } if (s->c == K_DEL) { - ccline.cmdpos += mb_off_next(ccline.cmdbuff, - ccline.cmdbuff + ccline.cmdpos); + CharBoundsOff bounds = utf_cp_bounds(ccline.cmdbuff, ccline.cmdbuff + ccline.cmdpos); + ccline.cmdpos += bounds.begin_off != 0 ? bounds.end_off : 0; } if (ccline.cmdpos > 0) { diff --git a/src/nvim/grid.c b/src/nvim/grid.c index b6a8a8bd9f..e386853022 100644 --- a/src/nvim/grid.c +++ b/src/nvim/grid.c @@ -277,7 +277,7 @@ void line_do_arabic_shape(schar_T *buf, int cols) // Too bigly, discard one code-point. // This should be enough as c0 cannot grow more than from 2 to 4 bytes // (base arabic to extended arabic) - rest -= (size_t)utf_cp_head_off(scbuf + off, scbuf + off + rest - 1) + 1; + rest -= (size_t)utf_cp_bounds(scbuf + off, scbuf + off + rest - 1).begin_off + 1; } memcpy(scbuf_new + len, scbuf + off, rest); buf[i] = schar_from_buf(scbuf_new, len + rest); diff --git a/src/nvim/lua/stdlib.c b/src/nvim/lua/stdlib.c index 5fea1ba5d8..8f58fd1a1a 100644 --- a/src/nvim/lua/stdlib.c +++ b/src/nvim/lua/stdlib.c @@ -226,11 +226,12 @@ static int nlua_str_utf_start(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL size_t s1_len; const char *s1 = luaL_checklstring(lstate, 1, &s1_len); ptrdiff_t offset = luaL_checkinteger(lstate, 2); - if (offset < 0 || offset > (intptr_t)s1_len) { + if (offset <= 0 || offset > (intptr_t)s1_len) { return luaL_error(lstate, "index out of range"); } - int head_offset = -utf_cp_head_off(s1, s1 + offset - 1); - lua_pushinteger(lstate, head_offset); + size_t const off = (size_t)(offset - 1); + int head_off = -utf_cp_bounds_len(s1, s1 + off, (int)(s1_len - off)).begin_off; + lua_pushinteger(lstate, head_off); return 1; } @@ -246,11 +247,12 @@ static int nlua_str_utf_end(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL size_t s1_len; const char *s1 = luaL_checklstring(lstate, 1, &s1_len); ptrdiff_t offset = luaL_checkinteger(lstate, 2); - if (offset < 0 || offset > (intptr_t)s1_len) { + if (offset <= 0 || offset > (intptr_t)s1_len) { return luaL_error(lstate, "index out of range"); } - int tail_offset = utf_cp_tail_off(s1, s1 + offset - 1); - lua_pushinteger(lstate, tail_offset); + size_t const off = (size_t)(offset - 1); + int tail_off = utf_cp_bounds_len(s1, s1 + off, (int)(s1_len - off)).end_off - 1; + lua_pushinteger(lstate, tail_off); return 1; } diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c index f8451e62e2..cf206aa68b 100644 --- a/src/nvim/mbyte.c +++ b/src/nvim/mbyte.c @@ -1884,99 +1884,52 @@ void mb_copy_char(const char **const fp, char **const tp) *fp += l; } -/// Return the offset from "p_in" to the first byte of a character. When "p_in" is -/// at the start of a character 0 is returned, otherwise the offset to the next -/// character. Can start anywhere in a stream of bytes. -int mb_off_next(const char *base, const char *p_in) -{ - const uint8_t *p = (uint8_t *)p_in; - int i; - - if (*p < 0x80) { // be quick for ASCII - return 0; - } - - // Find the next character that isn't 10xx.xxxx - for (i = 0; (p[i] & 0xc0) == 0x80; i++) {} - if (i > 0) { - int j; - // Check for illegal sequence. - for (j = 0; p - j > (uint8_t *)base; j++) { - if ((p[-j] & 0xc0) != 0x80) { - break; - } - } - if (utf8len_tab[p[-j]] != i + j) { - return 0; - } - } - return i; -} - -/// Return the offset from `p_in` to the last byte of the codepoint it points -/// to. Can start anywhere in a stream of bytes. +/// Returns the offset in bytes from "p_in" to the first and one-past-end bytes +/// of the codepoint it points to. +/// "p_in" can point anywhere in a stream of bytes. +/// "p_len" limits number of bytes after "p_in". /// Note: Counts individual codepoints of composed characters separately. -int utf_cp_tail_off(const char *base, const char *p_in) +CharBoundsOff utf_cp_bounds_len(char const *base, char const *p_in, int p_len) + FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL { - const uint8_t *p = (uint8_t *)p_in; - int i; - int j; - - if (*p == NUL) { - return 0; + assert(base <= p_in && p_len > 0); + uint8_t const *const b = (uint8_t *)base; + uint8_t const *const p = (uint8_t *)p_in; + if (*p < 0x80U) { // be quick for ASCII + return (CharBoundsOff){ 0, 1 }; } - // Find the last character that is 10xx.xxxx - for (i = 0; (p[i + 1] & 0xc0) == 0x80; i++) {} - - // Check for illegal sequence. - for (j = 0; p_in - j > base; j++) { - if ((p[-j] & 0xc0) != 0x80) { - break; + int const max_first_off = -MIN((int)(p - b), MB_MAXCHAR - 1); + int first_off = 0; + for (; utf_is_trail_byte(p[first_off]); first_off--) { + if (first_off == max_first_off) { // failed to find first byte + return (CharBoundsOff){ 0, 1 }; } } - if (utf8len_tab[p[-j]] != i + j + 1) { - return 0; + int const max_end_off = utf8len_tab[p[first_off]] + first_off; + if (max_end_off <= 0 || max_end_off > p_len) { // illegal or incomplete sequence + return (CharBoundsOff){ 0, 1 }; } - return i; + + for (int end_off = 1; end_off < max_end_off; end_off++) { + if (!utf_is_trail_byte(p[end_off])) { // not enough trail bytes + return (CharBoundsOff){ 0, 1 }; + } + } + + return (CharBoundsOff){ .begin_off = (int8_t)-first_off, .end_off = (int8_t)max_end_off }; } -/// Return the offset from "p" to the first byte of the codepoint it points -/// to. Can start anywhere in a stream of bytes. -/// Note: Unlike `utf_head_off`, this counts individual codepoints of composed characters -/// separately. -/// -/// @param[in] base Pointer to start of string -/// @param[in] p Pointer to byte for which to return the offset to the previous codepoint -// -/// @return 0 if invalid sequence, else number of bytes to previous codepoint -int utf_cp_head_off(const char *base, const char *p) +/// Returns the offset in bytes from "p_in" to the first and one-past-end bytes +/// of the codepoint it points to. +/// "p_in" can point anywhere in a stream of bytes. +/// Stream must be NUL-terminated. +/// Note: Counts individual codepoints of composed characters separately. +CharBoundsOff utf_cp_bounds(char const *base, char const *p_in) + FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL { - int i; - - if (*p == NUL) { - return 0; - } - - // Find the first character that is not 10xx.xxxx - for (i = 0; p - i >= base; i++) { - if (((uint8_t)p[-i] & 0xc0) != 0x80) { - break; - } - } - - // Find the last character that is 10xx.xxxx (condition terminates on NUL) - int j = 1; - while (((uint8_t)p[j] & 0xc0) == 0x80) { - j++; - } - - // Check for illegal sequence. - if (utf8len_tab[(uint8_t)p[-i]] != j + i) { - return 0; - } - return i; + return utf_cp_bounds_len(base, p_in, INT_MAX); } // Find the next illegal byte sequence. diff --git a/src/nvim/mbyte.h b/src/nvim/mbyte.h index 2f17c706c1..ddac040aae 100644 --- a/src/nvim/mbyte.h +++ b/src/nvim/mbyte.h @@ -52,6 +52,16 @@ extern const uint8_t utf8len_tab[256]; #define MB_PTR_BACK(s, p) \ (p -= utf_head_off((char *)(s), (char *)(p) - 1) + 1) +/// Check whether a given UTF-8 byte is a trailing byte (10xx.xxxx). +static inline bool utf_is_trail_byte(uint8_t byte) + REAL_FATTR_CONST REAL_FATTR_ALWAYS_INLINE; + +static inline bool utf_is_trail_byte(uint8_t const byte) +{ + // uint8_t is for clang to use smaller cmp + return (uint8_t)(byte & 0xC0U) == 0x80U; +} + static inline CharInfo utf_ptr2CharInfo(char const *p_in) REAL_FATTR_NONNULL_ALL REAL_FATTR_PURE REAL_FATTR_WARN_UNUSED_RESULT REAL_FATTR_ALWAYS_INLINE; diff --git a/src/nvim/mbyte_defs.h b/src/nvim/mbyte_defs.h index 058ec353e5..8670a0595d 100644 --- a/src/nvim/mbyte_defs.h +++ b/src/nvim/mbyte_defs.h @@ -66,3 +66,8 @@ typedef struct { char *ptr; ///< Pointer to the first byte of the character. CharInfo chr; ///< Information about the character. } StrCharInfo; + +typedef struct { + int8_t begin_off; ///< Offset to the first byte of the codepoint. + int8_t end_off; ///< Offset to one past the end byte of the codepoint. +} CharBoundsOff; diff --git a/test/unit/mbyte_spec.lua b/test/unit/mbyte_spec.lua index 67220d7c19..00a8c06ceb 100644 --- a/test/unit/mbyte_spec.lua +++ b/test/unit/mbyte_spec.lua @@ -203,4 +203,74 @@ describe('mbyte', function() ) end) end) + + describe('utf_cp_bounds_len', function() + local to_cstr = helpers.to_cstr + + local tests = { + { + name = 'for valid string', + str = 'iÀiiⱠiⱠⱠ𐀀i', + offsets = { + b = { 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 0 }, + e = { 1, 2, 1, 1, 1, 3, 2, 1, 1, 3, 2, 1, 3, 2, 1, 4, 3, 2, 1, 1 }, + }, + }, + { + name = 'for string with incomplete sequence', + str = 'i\xC3iÀⱠiÀ\xE2\xB1Ⱡ\xF0\x90\x80', + offsets = { + b = { 0, 0, 0, 0, 1, 0, 1, 2, 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 0 }, + e = { 1, 1, 1, 2, 1, 3, 2, 1, 1, 2, 1, 1, 1, 3, 2, 1, 1, 1, 1 }, + }, + }, + { + name = 'for string with trailing bytes after multibyte', + str = 'iÀ\xA0Ⱡ\xA0Ⱡ𐀀\xA0i', + offsets = { + b = { 0, 0, 1, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 2, 3, 0, 0 }, + e = { 1, 2, 1, 1, 3, 2, 1, 1, 3, 2, 1, 4, 3, 2, 1, 1, 1 }, + }, + }, + } + + for _, test in ipairs(tests) do + itp(test.name, function() + local cstr = to_cstr(test.str) + local b_offsets, e_offsets = {}, {} + for i = 1, #test.str do + local result = lib.utf_cp_bounds_len(cstr, cstr + i - 1, #test.str - (i - 1)) + table.insert(b_offsets, result.begin_off) + table.insert(e_offsets, result.end_off) + end + eq(test.offsets, { b = b_offsets, e = e_offsets }) + end) + end + + itp('does not read before start', function() + local str = '𐀀' + local expected_offsets = { b = { 0, 0, 0 }, e = { 1, 1, 1 } } + local cstr = to_cstr(str) + 1 + local b_offsets, e_offsets = {}, {} + for i = 1, 3 do + local result = lib.utf_cp_bounds_len(cstr, cstr + i - 1, 3 - (i - 1)) + table.insert(b_offsets, result.begin_off) + table.insert(e_offsets, result.end_off) + end + eq(expected_offsets, { b = b_offsets, e = e_offsets }) + end) + + itp('does not read past the end', function() + local str = '𐀀' + local expected_offsets = { b = { 0, 0, 0 }, e = { 1, 1, 1 } } + local cstr = to_cstr(str) + local b_offsets, e_offsets = {}, {} + for i = 1, 3 do + local result = lib.utf_cp_bounds_len(cstr, cstr + i - 1, 3 - (i - 1)) + table.insert(b_offsets, result.begin_off) + table.insert(e_offsets, result.end_off) + end + eq(expected_offsets, { b = b_offsets, e = e_offsets }) + end) + end) end)