Merge pull request #30014 from bfredl/neoemoji

support emojis with ZWJ and variant selectors
This commit is contained in:
bfredl 2024-08-30 12:58:48 +02:00 committed by GitHub
commit 5f95f1249f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
34 changed files with 657 additions and 221 deletions

View File

@ -646,6 +646,12 @@ widespread as file format.
A composing or combining character is used to change the meaning of the
character before it. The combining characters are drawn on top of the
preceding character.
Nvim largely follows the definition of extended grapheme clusters in UAX#29
in the Unicode standard, with some modifications: An ASCII char will always
start a new cluster. In addition 'arabicshape' enables the combining of some
Arabic letters, when they are shaped to be displayed together in a single cell.
Too big combined characters cannot be displayed, but they can still be
inspected using the |g8| and |ga| commands described below.
When editing text a composing character is mostly considered part of the

View File

@ -200,6 +200,12 @@ These existing features changed their behavior.
top lines are calculated using screen line numbers which take virtual lines
into account.
• The implementation of grapheme clusters (or combining chars |mbyte-combining|)
was upgraded to closely follow extended grapheme clusters as defined by UAX#29
in the Unicode standard. Notably, this enables proper display of many
more emoji characters than before, including those encoded with multiple
emoji codepoints combined with ZWJ (zero width joiner) codepoints.
==============================================================================
REMOVED FEATURES *news-removed*

View File

@ -2217,9 +2217,12 @@ A jump table for the options with a short description can be found at |Q_op|.
global
When on, all Unicode emoji characters are considered to be full width.
This excludes "text emoji" characters, which are normally displayed as
single width. Unfortunately there is no good specification for this
and it has been determined on trial-and-error basis. Use the
|setcellwidths()| function to change the behavior.
single width. However, such "text emoji" are treated as full-width
emoji if they are followed by the U+FE0F variant selector.
Unfortunately there is no good specification for this and it has been
determined on trial-and-error basis. Use the |setcellwidths()|
function to change the behavior.
*'encoding'* *'enc'*
'encoding' 'enc' string (default "utf-8")

View File

@ -1829,9 +1829,12 @@ vim.go.ead = vim.go.eadirection
--- When on, all Unicode emoji characters are considered to be full width.
--- This excludes "text emoji" characters, which are normally displayed as
--- single width. Unfortunately there is no good specification for this
--- and it has been determined on trial-and-error basis. Use the
--- `setcellwidths()` function to change the behavior.
--- single width. However, such "text emoji" are treated as full-width
--- emoji if they are followed by the U+FE0F variant selector.
---
--- Unfortunately there is no good specification for this and it has been
--- determined on trial-and-error basis. Use the `setcellwidths()`
--- function to change the behavior.
---
--- @type boolean
vim.o.emoji = true

View File

@ -571,7 +571,7 @@ Integer nvim_buf_set_extmark(Buffer buffer, Integer ns_id, Integer line, Integer
String c = opts->conceal;
if (c.size > 0) {
int ch;
hl.conceal_char = utfc_ptr2schar_len(c.data, (int)c.size, &ch);
hl.conceal_char = utfc_ptr2schar(c.data, &ch);
if (!hl.conceal_char || !vim_isprintc(ch)) {
api_set_error(err, kErrorTypeValidation, "conceal char has to be printable");
goto error;

View File

@ -847,7 +847,7 @@ void remote_ui_raw_line(RemoteUI *ui, Integer grid, Integer row, Integer startco
char sc_buf[MAX_SCHAR_SIZE];
schar_get(sc_buf, chunk[i]);
remote_ui_put(ui, sc_buf);
if (utf_ambiguous_width(utf_ptr2char(sc_buf))) {
if (utf_ambiguous_width(sc_buf)) {
ui->client_col = -1; // force cursor update
}
}

View File

@ -896,14 +896,15 @@ int del_bytes(colnr_T count, bool fixpos_arg, bool use_delcombine)
// delete the last combining character.
if (p_deco && use_delcombine && utfc_ptr2len(oldp + col) >= count) {
char *p0 = oldp + col;
if (utf_composinglike(p0, p0 + utf_ptr2len(p0))) {
GraphemeState state = GRAPHEME_STATE_INIT;
if (utf_composinglike(p0, p0 + utf_ptr2len(p0), &state)) {
// Find the last composing char, there can be several.
int n = col;
do {
col = n;
count = utf_ptr2len(oldp + n);
n += count;
} while (utf_composinglike(oldp + col, oldp + n));
} while (utf_composinglike(oldp + col, oldp + n, &state));
fixpos = false;
}
}
@ -1694,7 +1695,7 @@ bool open_line(int dir, int flags, int second_line_indent, bool *did_do_comment)
}
if (curbuf->b_p_ai || (flags & OPENLINE_DELSPACES)) {
while ((*p_extra == ' ' || *p_extra == '\t')
&& !utf_iscomposing(utf_ptr2char(p_extra + 1))) {
&& !utf_iscomposing_first(utf_ptr2char(p_extra + 1))) {
if (REPLACE_NORMAL(State)) {
replace_push(*p_extra);
}

View File

@ -1865,7 +1865,7 @@ static void printdigraph(const digr_T *dp, result_T *previous)
p = buf;
// add a space to draw a composing char on
if (utf_iscomposing(dp->result)) {
if (utf_iscomposing_first(dp->result)) {
*p++ = ' ';
}
p += utf_char2bytes(dp->result, p);

View File

@ -1826,7 +1826,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
// If a double-width char doesn't fit display a '>' in the last column.
// Don't advance the pointer but put the character at the start of the next line.
if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) {
if (wlv.col >= grid->cols - 1 && schar_cells(mb_schar) == 2) {
mb_c = '>';
mb_l = 1;
(void)mb_l;
@ -1922,7 +1922,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
// If a double-width char doesn't fit display a '>' in the
// last column; the character is displayed at the start of the
// next line.
if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) {
if (wlv.col >= grid->cols - 1 && schar_cells(mb_schar) == 2) {
mb_schar = schar_from_ascii('>');
mb_c = '>';
mb_l = 1;
@ -2393,6 +2393,12 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
|| (decor_conceal && decor_state.conceal_char)
|| wp->w_p_cole == 1)
&& wp->w_p_cole != 3) {
if (schar_cells(mb_schar) > 1) {
// When the first char to be concealed is double-width,
// need to advance one more virtual column.
wlv.n_extra++;
}
// First time at this concealed item: display one
// character.
if (has_match_conc && match_conc) {
@ -2410,12 +2416,6 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
mb_schar = schar_from_ascii(' ');
}
if (utf_char2cells(mb_c) > 1) {
// When the first char to be concealed is double-width,
// need to advance one more virtual column.
wlv.n_extra++;
}
mb_c = schar_get_first_codepoint(mb_schar);
prev_syntax_id = syntax_seqnr;
@ -2484,7 +2484,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
&& mb_schar != NUL) {
mb_schar = wp->w_p_lcs_chars.prec;
lcs_prec_todo = NUL;
if (utf_char2cells(mb_c) > 1) {
if (schar_cells(mb_schar) > 1) {
// Double-width character being overwritten by the "precedes"
// character, need to fill up half the character.
wlv.sc_extra = schar_from_ascii(MB_FILLER_CHAR);
@ -2725,7 +2725,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
linebuf_vcol[wlv.off] = wlv.vcol;
if (utf_char2cells(mb_c) > 1) {
if (schar_cells(mb_schar) > 1) {
// Need to fill two screen columns.
wlv.off++;
wlv.col++;
@ -2744,7 +2744,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
wlv.off++;
wlv.col++;
} else if (wp->w_p_cole > 0 && is_concealing) {
bool concealed_wide = utf_char2cells(mb_c) > 1;
bool concealed_wide = schar_cells(mb_schar) > 1;
wlv.skip_cells--;
wlv.vcol_off_co++;

View File

@ -2832,6 +2832,8 @@ int replace_push_mb(char *p)
{
int l = utfc_ptr2len(p);
// TODO(bfredl): stop doing this insanity and instead use utf_head_off() when popping.
// or just keep a secondary array with char byte lengths
for (int j = l - 1; j >= 0; j--) {
replace_push(p[j]);
}
@ -2911,7 +2913,9 @@ static void mb_replace_pop_ins(int cc)
for (int i = 1; i < n; i++) {
buf[i] = (uint8_t)replace_pop();
}
if (utf_iscomposing(utf_ptr2char((char *)buf))) {
// TODO(bfredl): by fixing replace_push_mb, upgrade to use
// the new composing algorithm
if (utf_iscomposing_legacy(utf_ptr2char((char *)buf))) {
ins_bytes_len((char *)buf, (size_t)n);
} else {
// Not a composing char, put it back.
@ -3843,7 +3847,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
space_sci = sci;
space_vcol = vcol;
}
vcol += charsize_nowrap(curbuf, use_ts, vcol, sci.chr.value);
vcol += charsize_nowrap(curbuf, sci.ptr, use_ts, vcol, sci.chr.value);
sci = utfc_next(sci);
prev_space = cur_space;
}
@ -3859,7 +3863,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
// Find the position to stop backspacing.
// Use charsize_nowrap() so that virtual text and wrapping are ignored.
while (true) {
int size = charsize_nowrap(curbuf, use_ts, space_vcol, space_sci.chr.value);
int size = charsize_nowrap(curbuf, space_sci.ptr, use_ts, space_vcol, space_sci.chr.value);
if (space_vcol + size > want_vcol) {
break;
}
@ -3930,7 +3934,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
bool has_composing = false;
if (p_deco) {
char *p0 = get_cursor_pos_ptr();
has_composing = utf_composinglike(p0, p0 + utf_ptr2len(p0));
has_composing = utf_composinglike(p0, p0 + utf_ptr2len(p0), NULL);
}
del_char(false);
// If there are combining characters and 'delcombine' is set

View File

@ -204,7 +204,7 @@ void do_ascii(exarg_T *eap)
IObuff[iobuff_len++] = ' ';
}
IObuff[iobuff_len++] = '<';
if (utf_iscomposing(c)) {
if (utf_iscomposing_first(c)) {
IObuff[iobuff_len++] = ' '; // Draw composing char on top of a space.
}
iobuff_len += (size_t)utf_char2bytes(c, IObuff + iobuff_len);

View File

@ -2118,7 +2118,7 @@ static int command_line_handle_key(CommandLineState *s)
s->do_abbr = false; // don't do abbreviation now
ccline.special_char = NUL;
// may need to remove ^ when composing char was typed
if (utf_iscomposing(s->c) && !cmd_silent) {
if (utf_iscomposing_first(s->c) && !cmd_silent) {
if (ui_has(kUICmdline)) {
// TODO(bfredl): why not make unputcmdline also work with true?
unputcmdline();
@ -3585,7 +3585,9 @@ void put_on_cmdline(const char *str, int len, bool redraw)
// backup to the character before it. There could be two of them.
int i = 0;
int c = utf_ptr2char(ccline.cmdbuff + ccline.cmdpos);
while (ccline.cmdpos > 0 && utf_iscomposing(c)) {
// TODO(bfredl): this can be corrected/simplified as utf_head_off implements the
// correct grapheme cluster breaks
while (ccline.cmdpos > 0 && utf_iscomposing_legacy(c)) {
i = utf_head_off(ccline.cmdbuff, ccline.cmdbuff + ccline.cmdpos - 1) + 1;
ccline.cmdpos -= i;
len += i;

View File

@ -186,6 +186,24 @@ size_t schar_len(schar_T sc)
}
}
/// Get the number of display cells occupied by a screen char.
///
/// Values that pass the cheap endian-specific test below are reported as one
/// cell without decoding (presumably a lone ASCII byte -- confirm against the
/// schar_T encoding). Everything else is expanded into a byte buffer and
/// measured with utf_ptr2cells().
///
/// @param sc  screen char to measure
/// @return cell count as reported by utf_ptr2cells(), or 1 on the fast path
int schar_cells(schar_T sc)
{
  // hot path: pick the byte-order specific form of the "trivial value" test
#ifdef ORDER_BIG_ENDIAN
  const bool trivial = (sc & 0x80FFFFFF) == 0;
#else
  const bool trivial = sc < 0x80;
#endif
  if (trivial) {
    return 1;
  }

  // slow path: materialize the bytes and let the UTF-8 width logic decide
  char decoded[MAX_SCHAR_SIZE];
  schar_get(decoded, sc);
  return utf_ptr2cells(decoded);
}
/// gets first raw UTF-8 byte of an schar
static char schar_get_first_byte(schar_T sc)
{
@ -428,14 +446,19 @@ int grid_line_puts(int col, const char *text, int textlen, int attr)
const int max_col = grid_line_maxcol;
while (col < max_col && (len < 0 || (int)(ptr - text) < len) && *ptr != NUL) {
// check if this is the first byte of a multibyte
int mbyte_blen = len > 0
? utfc_ptr2len_len(ptr, (int)((text + len) - ptr))
: utfc_ptr2len(ptr);
int mbyte_blen;
if (len >= 0) {
int maxlen = (int)((text + len) - ptr);
mbyte_blen = utfc_ptr2len_len(ptr, maxlen);
if (mbyte_blen > maxlen) {
mbyte_blen = 1;
}
} else {
mbyte_blen = utfc_ptr2len(ptr);
}
int firstc;
schar_T schar = len >= 0
? utfc_ptr2schar_len(ptr, (int)((text + len) - ptr), &firstc)
: utfc_ptr2schar(ptr, &firstc);
int mbyte_cells = utf_char2cells(firstc);
schar_T schar = utfc_ptrlen2schar(ptr, mbyte_blen, &firstc);
int mbyte_cells = utf_ptr2cells_len(ptr, mbyte_blen);
if (mbyte_cells > 2 || schar == 0) {
mbyte_cells = 1;
schar = schar_from_char(0xFFFD);

View File

@ -511,20 +511,30 @@ int utf_char2cells(int c)
/// Return the number of display cells character at "*p" occupies.
/// This doesn't take care of unprintable characters, use ptr2cells() for that.
int utf_ptr2cells(const char *p)
int utf_ptr2cells(const char *p_in)
{
const uint8_t *p = (const uint8_t *)p_in;
// Need to convert to a character number.
if ((uint8_t)(*p) >= 0x80) {
int c = utf_ptr2char(p);
if ((*p) >= 0x80) {
int len = utf8len_tab[*p];
int32_t c = utf_ptr2CharInfo_impl(p, (uintptr_t)len);
// An illegal byte is displayed as <xx>.
if (utf_ptr2len(p) == 1 || c == NUL) {
if (c <= 0) {
return 4;
}
// If the char is ASCII it must be an overlong sequence.
if (c < 0x80) {
return char2cells(c);
}
return utf_char2cells(c);
int cells = utf_char2cells(c);
if (cells == 1 && p_emoji
&& intable(emoji_all, ARRAY_SIZE(emoji_all), c)) {
int c2 = utf_ptr2char(p_in + len);
if (c2 == 0xFE0F) {
return 2; // emoji presentation
}
}
return cells;
}
return 1;
}
@ -603,7 +613,8 @@ int utf_ptr2cells_len(const char *p, int size)
{
// Need to convert to a wide character.
if (size > 0 && (uint8_t)(*p) >= 0x80) {
if (utf_ptr2len_len(p, size) < utf8len_tab[(uint8_t)(*p)]) {
int len = utf_ptr2len_len(p, size);
if (len < utf8len_tab[(uint8_t)(*p)]) {
return 1; // truncated
}
int c = utf_ptr2char(p);
@ -615,7 +626,16 @@ int utf_ptr2cells_len(const char *p, int size)
if (c < 0x80) {
return char2cells(c);
}
return utf_char2cells(c);
int cells = utf_char2cells(c);
if (cells == 1 && p_emoji && size > len
&& intable(emoji_all, ARRAY_SIZE(emoji_all), c)
&& utf_ptr2len_len(p + len, size - len) == utf8len_tab[(uint8_t)p[len]]) {
int c2 = utf_ptr2char(p + len);
if (c2 == 0xFE0F) {
return 2; // emoji presentation
}
}
return cells;
}
return 1;
}
@ -648,8 +668,8 @@ size_t mb_string2cells_len(const char *str, size_t size)
size_t clen = 0;
for (const char *p = str; *p != NUL && p < str + size;
p += utfc_ptr2len_len(p, (int)size + (int)(p - str))) {
clen += (size_t)utf_ptr2cells(p);
p += utfc_ptr2len_len(p, (int)size - (int)(p - str))) {
clen += (size_t)utf_ptr2cells_len(p, (int)size - (int)(p - str));
}
return clen;
@ -793,29 +813,48 @@ int mb_cptr2char_adv(const char **pp)
return c;
}
/// Check if the character pointed to by "p2" is a composing character when it
/// comes after "p1". For Arabic sometimes "ab" is replaced with "c", which
/// behaves like a composing character.
bool utf_composinglike(const char *p1, const char *p2)
/// When "c" is the first char of a string, determine if it needs to be prefixed
/// by a space byte to be drawn correctly, and not merge with the space left of
/// the string.
bool utf_iscomposing_first(int c)
{
int c2 = utf_ptr2char(p2);
if (utf_iscomposing(c2)) {
return true;
}
if (!arabic_maycombine(c2)) {
return false;
}
return arabic_combine(utf_ptr2char(p1), c2);
return c >= 128 && !utf8proc_grapheme_break(' ', c);
}
/// Check if the next character is a composing character when it
/// comes after the first. For Arabic sometimes "ab" is replaced with "c", which
/// behaves like a composing character.
/// returns false for negative values
bool utf_char_composinglike(int32_t const first, int32_t const next)
FUNC_ATTR_PURE
/// Check if the character pointed to by "p2" is a composing character when it
/// comes after "p1".
///
/// We use the definition in UAX#29 as implemented by utf8proc with the following
/// exceptions:
///
/// - ASCII chars always begin a new cluster. This is a long assumed invariant
/// in the code base and very useful for performance (we can exit early for ASCII
/// all over the place, branch predictor go brrr in ASCII-only text).
/// As of Unicode 15.1 this will only break BOUNDCLASS_PREPEND followed by ASCII,
/// which should be exceedingly rare (these PREPEND chars are expected to be
/// followed by multibyte chars within the same script family)
///
/// - When 'arabicshape' is active, some pairs of arabic letters "ab" are replaced with
/// "c" taking one single cell, which behaves like a cluster.
///
/// @param "state" should be set to GRAPHEME_STATE_INIT before first call
/// it is allowed to be null, but will then not handle some longer
/// sequences, like ZWJ based emoji
bool utf_composinglike(const char *p1, const char *p2, GraphemeState *state)
FUNC_ATTR_NONNULL_ARG(1, 2)
{
return utf_iscomposing(next) || arabic_combine(first, next);
if ((uint8_t)(*p2) < 128) {
return false;
}
int first = utf_ptr2char(p1);
int second = utf_ptr2char(p2);
if (!utf8proc_grapheme_break_stateful(first, second, state)) {
return true;
}
return arabic_combine(first, second);
}
/// Get the screen char at the beginning of a string
@ -834,7 +873,7 @@ schar_T utfc_ptr2schar(const char *p, int *firstc)
{
int c = utf_ptr2char(p);
*firstc = c; // NOT optional, you are gonna need it
bool first_compose = utf_iscomposing(c);
bool first_compose = utf_iscomposing_first(c);
size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose;
size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen);
@ -845,16 +884,13 @@ schar_T utfc_ptr2schar(const char *p, int *firstc)
return schar_from_buf_first(p, len, first_compose);
}
/// Get the screen char at the beginning of a string with length
/// Get the screen char from a char with a known length
///
/// Like utfc_ptr2schar but use no more than p[maxlen].
schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc)
schar_T utfc_ptrlen2schar(const char *p, int len, int *firstc)
FUNC_ATTR_NONNULL_ALL
{
assert(maxlen > 0);
size_t len = (size_t)utf_ptr2len_len(p, maxlen);
if (len > (size_t)maxlen || (len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) {
if ((len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) {
// invalid or truncated sequence
*firstc = (uint8_t)(*p);
return 0;
@ -862,11 +898,13 @@ schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc)
int c = utf_ptr2char(p);
*firstc = c;
bool first_compose = utf_iscomposing(c);
maxlen = MIN(maxlen, MAX_SCHAR_SIZE - 1 - first_compose);
len = (size_t)utfc_ptr2len_len(p, maxlen);
bool first_compose = utf_iscomposing_first(c);
int maxlen = MAX_SCHAR_SIZE - 1 - first_compose;
if (len > maxlen) {
len = utfc_ptr2len_len(p, maxlen);
}
return schar_from_buf_first(p, len, first_compose);
return schar_from_buf_first(p, (size_t)len, first_compose);
}
/// Caller must ensure there is space for `first_compose`
@ -964,8 +1002,9 @@ int utfc_ptr2len(const char *const p)
// Check for composing characters.
int prevlen = 0;
GraphemeState state = GRAPHEME_STATE_INIT;
while (true) {
if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) {
if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len, &state)) {
return len;
}
@ -996,9 +1035,10 @@ int utfc_ptr2len_len(const char *p, int size)
return 1;
}
// Check for composing characters. We can handle only the first six, but
// Check for composing characters. We can only display a limited amount, but
// skip all of them (otherwise the cursor would get stuck).
int prevlen = 0;
GraphemeState state = GRAPHEME_STATE_INIT;
while (len < size) {
if ((uint8_t)p[len] < 0x80) {
break;
@ -1011,7 +1051,7 @@ int utfc_ptr2len_len(const char *p, int size)
break;
}
if (!utf_composinglike(p + prevlen, p + len)) {
if (!utf_composinglike(p + prevlen, p + len, &state)) {
break;
}
@ -1084,11 +1124,18 @@ int utf_char2bytes(const int c, char *const buf)
}
}
/// Return true if "c" is a composing UTF-8 character.
/// This means it will be drawn on top of the preceding character.
/// Return true if "c" is a legacy composing UTF-8 character.
///
/// This is deprecated in favour of utf_composinglike() which uses the modern
/// stateful algorithm to determine grapheme clusters. Still available
/// to support some legacy code which hasn't been refactored yet.
///
/// To check if a char would combine with a preceding space, use
/// utf_iscomposing_first() instead.
///
/// Based on code from Markus Kuhn.
/// Returns false for negative values.
bool utf_iscomposing(int c)
bool utf_iscomposing_legacy(int c)
{
return intable(combining, ARRAY_SIZE(combining), c);
}
@ -1278,8 +1325,9 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
return 2;
}
bool utf_ambiguous_width(int c)
bool utf_ambiguous_width(const char *p)
{
int c = utf_ptr2char(p);
return c >= 0x80 && (intable(ambiguous, ARRAY_SIZE(ambiguous), c)
|| intable(emoji_all, ARRAY_SIZE(emoji_all), c));
}
@ -1666,6 +1714,26 @@ void show_utf8(void)
msg(IObuff, 0);
}
/// Cheap test for boundclasses that start a new cluster no matter what
/// precedes them.
///
/// @param bc  utf8proc boundclass of the codepoint being inspected
/// @return true if "bc" always begins a cluster. A false negative is
///         permitted: it only costs performance, never correctness.
static bool always_break(int bc)
{
  switch (bc) {
  case UTF8PROC_BOUNDCLASS_CONTROL:
    return true;
  default:
    return false;
  }
}
/// Cheap test for pairs of boundclasses where the second one is known to
/// start a new cluster after the first.
///
/// UTF8PROC_BOUNDCLASS_CONTROL is deliberately not tested for "bc2": it has
/// either been handled by always_break() on the first iteration, or it was
/// "bc1" in the previous iteration.
///
/// @param bc1  boundclass of the preceding codepoint
/// @param bc2  boundclass of the following codepoint
/// @return true if "bc2" always starts a cluster after "bc1". A false
///         negative is permitted: it only costs performance, never correctness.
static bool always_break_two(int bc1, int bc2)
{
  // Anything in the CR..CONTROL range of boundclass values ends the cluster.
  if (bc1 >= UTF8PROC_BOUNDCLASS_CR && bc1 <= UTF8PROC_BOUNDCLASS_CONTROL) {
    return true;
  }

  // A plain "other" codepoint breaks unless it follows a PREPEND codepoint.
  if (bc2 == UTF8PROC_BOUNDCLASS_OTHER) {
    return bc1 != UTF8PROC_BOUNDCLASS_PREPEND;
  }

  // A pictographic codepoint breaks after "other" or another pictographic.
  if (bc2 == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
    return bc1 == UTF8PROC_BOUNDCLASS_OTHER
           || bc1 == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
  }

  return false;
}
/// Return offset from "p" to the start of a character, including composing characters.
/// "base" must be the start of the string, which must be NUL terminated.
/// If "p" points to the NUL at the end of the string return 0.
@ -1679,50 +1747,108 @@ int utf_head_off(const char *base_in, const char *p_in)
const uint8_t *base = (uint8_t *)base_in;
const uint8_t *p = (uint8_t *)p_in;
// Skip backwards over trailing bytes: 10xx.xxxx
// Skip backwards again if on a composing char.
const uint8_t *q;
for (q = p;; q--) {
// Move s to the last byte of this char.
const uint8_t *s;
for (s = q; (s[1] & 0xc0) == 0x80; s++) {}
const uint8_t *start = p;
// Move q to the first byte of this char.
while (q > base && (*q & 0xc0) == 0x80) {
q--;
}
// Check for illegal sequence. Do allow an illegal byte after where we
// started.
int len = utf8len_tab[*q];
if (len != (int)(s - q + 1) && len != (int)(p - q + 1)) {
return 0;
// move start to the first byte of this codepoint
// might stop on a continuation byte if overlong, handled by utf_ptr2CharInfo_impl
while (start > base && (*start & 0xc0) == 0x80 && (p - start) < 6) {
start--;
}
uint8_t cur_len = utf8len_tab[*start];
int32_t cur_code = utf_ptr2CharInfo_impl(start, (uintptr_t)cur_len);
if (cur_code < 0) {
return 0; // p must be part of an illegal sequence
}
const uint8_t * const safe_end = start + cur_len;
int cur_bc = utf8proc_get_property(cur_code)->boundclass;
if (always_break(cur_bc)) {
return (int)(p - start);
}
// backtrack to find the start of a cluster. we might go too far, checked in the next loop
const uint8_t *cur_pos = start;
const uint8_t *const p_start = start;
if (start == base) {
return (int)(p - start);
}
start--;
while (*start >= 0x80) { // stop on ascii, we are done
while (start > base && (*start & 0xc0) == 0x80 && (cur_pos - start) < 6) {
start--;
}
if (q <= base) {
int32_t prev_code = utf_ptr2CharInfo_impl(start, (uintptr_t)utf8len_tab[*start]);
if (prev_code < 0) {
start = cur_pos; // start at valid sequence after invalid bytes
break;
}
int c = utf_ptr2char((char *)q);
if (utf_iscomposing(c)) {
continue;
int prev_bc = utf8proc_get_property(prev_code)->boundclass;
if (always_break_two(prev_bc, cur_bc) && !arabic_combine(prev_code, cur_code)) {
start = cur_pos; // prev_code cannot be a part of this cluster
break;
} else if (start == base) {
break;
}
cur_pos = start;
cur_bc = prev_bc;
cur_code = prev_code;
if (arabic_maycombine(c)) {
// Advance to get a sneak-peak at the next char
const uint8_t *j = q;
j--;
// Move j to the first byte of this char.
while (j > base && (*j & 0xc0) == 0x80) {
j--;
}
if (arabic_combine(utf_ptr2char((char *)j), c)) {
continue;
}
}
break;
start--;
}
return (int)(p - q);
// hot path: we are already on the first codepoint of a sequence
if (start == p_start) {
return (int)(p - start);
}
const uint8_t *q = start;
while (q < p) {
// don't need to find end of cluster. once we reached the codepoint of p, we are done
int len = utfc_ptr2len_len((const char *)q, (int)(safe_end - q));
if (q + len > p) {
return (int)(p - q);
}
q += len;
}
return 0;
}
/// Out-of-line slow path for advancing past one composed character.
///
/// Assumes caller already handles ascii. see `utfc_next`
///
/// Starting at the byte right after "cur", codepoints are consumed for as long
/// as they attach to the previous one, i.e. while
/// utf8proc_grapheme_break_stateful() reports no break or arabic_combine()
/// accepts the pair.
///
/// @param cur  position and decoded value of the current character; the byte
///             at cur.ptr + cur.chr.len must be >= 0x80 (asserted below)
/// @return position and decoded first codepoint of the next character
StrCharInfo utfc_next_impl(StrCharInfo cur)
{
int32_t prev_code = cur.chr.value;
uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len);
// break state is carried across all iterations of the loop below
GraphemeState state = GRAPHEME_STATE_INIT;
assert(*next >= 0x80);
while (true) {
// decode the codepoint at "next"; its length comes from the lead byte table
uint8_t const next_len = utf8len_tab[*next];
int32_t const next_code = utf_ptr2CharInfo_impl(next, (uintptr_t)next_len);
// a break that is not overridden by arabic_combine(): "next" starts the next character
if (utf8proc_grapheme_break_stateful(prev_code, next_code, &state)
&& !arabic_combine(prev_code, next_code)) {
return (StrCharInfo){
.ptr = (char *)next,
// a negative decode result is reported with a length of one byte
.chr = (CharInfo){ .value = next_code, .len = (next_code < 0 ? 1 : next_len) },
};
}
// "next" attaches to the previous codepoint, skip over it
prev_code = next_code;
next += next_len;
// a byte below 0x80 is returned directly as the next character, without a break check
if (EXPECT(*next < 0x80U, true)) {
return (StrCharInfo){
.ptr = (char *)next,
.chr = (CharInfo){ .value = *next, .len = 1 },
};
}
}
}
// Whether space is NOT allowed before/after 'c'.
@ -2681,7 +2807,7 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si
c = 0x100; break; // not in latin9
}
}
if (!utf_iscomposing(c)) { // skip composing chars
if (!utf_iscomposing_legacy(c)) { // skip composing chars
if (c < 0x100) {
*d++ = (uint8_t)c;
} else if (vcp->vc_fail) {

View File

@ -3,6 +3,7 @@
#include <stdbool.h>
#include <stdint.h>
#include <sys/types.h> // IWYU pragma: keep
#include <utf8proc.h>
#include <uv.h> // IWYU pragma: keep
#include "nvim/cmdexpand_defs.h" // IWYU pragma: keep
@ -11,6 +12,9 @@
#include "nvim/mbyte_defs.h" // IWYU pragma: keep
#include "nvim/types_defs.h" // IWYU pragma: keep
typedef utf8proc_int32_t GraphemeState;
#define GRAPHEME_STATE_INIT 0
#ifdef INCLUDE_GENERATED_DECLARATIONS
# include "mbyte.h.generated.h"
# include "mbyte.h.inline.generated.h"
@ -92,28 +96,16 @@ static inline CharInfo utf_ptr2CharInfo(char const *const p_in)
static inline StrCharInfo utfc_next(StrCharInfo cur)
FUNC_ATTR_NONNULL_ALL FUNC_ATTR_ALWAYS_INLINE FUNC_ATTR_PURE
{
int32_t prev_code = cur.chr.value;
// handle ASCII case inline
uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len);
while (true) {
if (EXPECT(*next < 0x80U, true)) {
return (StrCharInfo){
.ptr = (char *)next,
.chr = (CharInfo){ .value = *next, .len = 1 },
};
}
uint8_t const next_len = utf8len_tab[*next];
int32_t const next_code = utf_ptr2CharInfo_impl(next, (uintptr_t)next_len);
if (!utf_char_composinglike(prev_code, next_code)) {
return (StrCharInfo){
.ptr = (char *)next,
.chr = (CharInfo){ .value = next_code, .len = (next_code < 0 ? 1 : next_len) },
};
}
prev_code = next_code;
next += next_len;
if (EXPECT(*next < 0x80U, true)) {
return (StrCharInfo){
.ptr = (char *)next,
.chr = (CharInfo){ .value = *next, .len = 1 },
};
}
return utfc_next_impl(cur);
}
static inline StrCharInfo utf_ptr2StrCharInfo(char *ptr)

View File

@ -446,9 +446,7 @@ void trunc_string(const char *s, char *buf, int room_in, int buflen)
// Last part: End of the string.
half = i = (int)strlen(s);
while (true) {
do {
half = half - utf_head_off(s, s + half - 1) - 1;
} while (half > 0 && utf_iscomposing(utf_ptr2char(s + half)));
half = half - utf_head_off(s, s + half - 1) - 1;
n = ptr2cells(s + half);
if (len + n > room || half == 0) {
break;

View File

@ -837,7 +837,10 @@ static void normal_get_additional_char(NormalState *s)
while ((s->c = vpeekc()) > 0
&& (s->c >= 0x100 || MB_BYTE2LEN(vpeekc()) > 1)) {
s->c = plain_vgetc();
if (!utf_iscomposing(s->c)) {
// TODO(bfredl): only allowing up to two composing chars is cringe af.
// Could reuse/abuse schar_T to at least allow us to input anything we are able
// to display and use the stateful utf8proc algorithm like utf_composinglike
if (!utf_iscomposing_legacy(s->c)) {
vungetc(s->c); // it wasn't, put it back
break;
} else if (s->ca.ncharC1 == 0) {

View File

@ -2326,9 +2326,12 @@ return {
desc = [=[
When on, all Unicode emoji characters are considered to be full width.
This excludes "text emoji" characters, which are normally displayed as
single width. Unfortunately there is no good specification for this
and it has been determined on trial-and-error basis. Use the
|setcellwidths()| function to change the behavior.
single width. However, such "text emoji" are treated as full-width
emoji if they are followed by the U+FE0F variant selector.
Unfortunately there is no good specification for this and it has been
determined on trial-and-error basis. Use the |setcellwidths()|
function to change the behavior.
]=],
full_name = 'emoji',
redraw = { 'all_windows', 'ui_option' },

View File

@ -146,7 +146,7 @@ CharSize charsize_regular(CharsizeArg *csarg, char *const cur, colnr_T const vco
} else if (cur_char < 0) {
size = kInvalidByteCells;
} else {
size = char2cells(cur_char);
size = ptr2cells(cur);
is_doublewidth = size == 2 && cur_char > 0x80;
}
@ -337,8 +337,8 @@ CharSize charsize_regular(CharsizeArg *csarg, char *const cur, colnr_T const vco
///
/// @see charsize_regular
/// @see charsize_fast
static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, colnr_T const vcol,
int32_t const cur_char)
static inline CharSize charsize_fast_impl(win_T *const wp, const char *cur, bool use_tabstop,
colnr_T const vcol, int32_t const cur_char)
FUNC_ATTR_PURE FUNC_ATTR_ALWAYS_INLINE
{
// A tab gets expanded, depending on the current column
@ -352,7 +352,11 @@ static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, col
if (cur_char < 0) {
width = kInvalidByteCells;
} else {
width = char2cells(cur_char);
// TODO(bfredl): perf: often cur_char is enough at this point to determine width.
// we likely want a specialized version of utf_ptr2StrCharInfo also determining
// the ptr2cells width at the same time without any extra decoding. (also applies
// to charsize_regular and charsize_nowrap)
width = ptr2cells(cur);
}
// If a double-width char doesn't fit at the end of a line, it wraps to the next line,
@ -371,23 +375,23 @@ static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, col
/// Can be used if CSType is kCharsizeFast.
///
/// @see charsize_regular
CharSize charsize_fast(CharsizeArg *csarg, colnr_T const vcol, int32_t const cur_char)
CharSize charsize_fast(CharsizeArg *csarg, const char *cur, colnr_T vcol, int32_t cur_char)
FUNC_ATTR_PURE
{
return charsize_fast_impl(csarg->win, csarg->use_tabstop, vcol, cur_char);
return charsize_fast_impl(csarg->win, cur, csarg->use_tabstop, vcol, cur_char);
}
/// Get the number of cells taken up on the screen at given virtual column.
///
/// @see win_chartabsize()
int charsize_nowrap(buf_T *buf, bool use_tabstop, colnr_T vcol, int32_t cur_char)
int charsize_nowrap(buf_T *buf, const char *cur, bool use_tabstop, colnr_T vcol, int32_t cur_char)
{
if (cur_char == TAB && use_tabstop) {
return tabstop_padding(vcol, buf->b_p_ts, buf->b_p_vts_array);
} else if (cur_char < 0) {
return kInvalidByteCells;
} else {
return char2cells(cur_char);
return ptr2cells(cur);
}
}
@ -467,7 +471,7 @@ int linesize_fast(CharsizeArg const *const csarg, int vcol_arg, colnr_T const le
StrCharInfo ci = utf_ptr2StrCharInfo(line);
while (ci.ptr - line < len && *ci.ptr != NUL) {
vcol += charsize_fast_impl(wp, use_tabstop, vcol_arg, ci.chr.value).width;
vcol += charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol_arg, ci.chr.value).width;
ci = utfc_next(ci);
if (vcol > MAXCOL) {
vcol_arg = MAXCOL;
@ -530,7 +534,7 @@ void getvcol(win_T *wp, pos_T *pos, colnr_T *start, colnr_T *cursor, colnr_T *en
char_size = (CharSize){ .width = 1 };
break;
}
char_size = charsize_fast_impl(wp, use_tabstop, vcol, ci.chr.value);
char_size = charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol, ci.chr.value);
StrCharInfo const next = utfc_next(ci);
if (next.ptr - line > end_col) {
break;
@ -627,7 +631,7 @@ void getvvcol(win_T *wp, pos_T *pos, colnr_T *start, colnr_T *cursor, colnr_T *e
if (pos->col < ml_get_buf_len(wp->w_buffer, pos->lnum)) {
int c = utf_ptr2char(ptr + pos->col);
if ((c != TAB) && vim_isprintc(c)) {
endadd = (colnr_T)(char2cells(c) - 1);
endadd = (colnr_T)(ptr2cells(ptr + pos->col) - 1);
if (coladd > endadd) {
// past end of line
endadd = 0;
@ -824,7 +828,7 @@ int plines_win_col(win_T *wp, linenr_T lnum, long column)
if (cstype == kCharsizeFast) {
bool const use_tabstop = csarg.use_tabstop;
while (*ci.ptr != NUL && --column >= 0) {
vcol += charsize_fast_impl(wp, use_tabstop, vcol, ci.chr.value).width;
vcol += charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol, ci.chr.value).width;
ci = utfc_next(ci);
}
} else {

View File

@ -54,7 +54,7 @@ static inline CharSize win_charsize(CSType cstype, int vcol, char *ptr, int32_t
FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_ALWAYS_INLINE
{
if (cstype == kCharsizeFast) {
return charsize_fast(csarg, vcol, chr);
return charsize_fast(csarg, ptr, vcol, chr);
} else {
return charsize_regular(csarg, ptr, vcol, chr);
}

View File

@ -3031,7 +3031,7 @@ static bool use_multibytecode(int c)
{
return utf_char2len(c) > 1
&& (re_multi_type(peekchr()) != NOT_MULTI
|| utf_iscomposing(c));
|| utf_iscomposing_legacy(c));
}
// Emit (if appropriate) a byte of code
@ -4326,7 +4326,7 @@ static uint8_t *regatom(int *flagp)
}
// When '.' is followed by a composing char ignore the dot, so that
// the composing char is matched here.
if (c == Magic('.') && utf_iscomposing(peekchr())) {
if (c == Magic('.') && utf_iscomposing_legacy(peekchr())) {
c = getchr();
goto do_multibyte;
}
@ -5001,9 +5001,10 @@ do_multibyte:
int l;
// Need to get composing character too.
GraphemeState state = GRAPHEME_STATE_INIT;
while (true) {
l = utf_ptr2len(regparse);
if (!utf_composinglike(regparse, regparse + l)) {
if (!utf_composinglike(regparse, regparse + l, &state)) {
break;
}
regmbc(utf_ptr2char(regparse));
@ -6569,7 +6570,7 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
// Check for following composing character, unless %C
// follows (skips over all composing chars).
if (status != RA_NOMATCH
&& utf_composinglike((char *)rex.input, (char *)rex.input + len)
&& utf_composinglike((char *)rex.input, (char *)rex.input + len, NULL)
&& !rex.reg_icombine
&& OP(next) != RE_COMPOSING) {
// raaron: This code makes a composing character get
@ -6624,14 +6625,14 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
break;
}
const int opndc = utf_ptr2char((char *)opnd);
if (utf_iscomposing(opndc)) {
if (utf_iscomposing_legacy(opndc)) {
// When only a composing char is given match at any
// position where that composing char appears.
status = RA_NOMATCH;
for (i = 0; rex.input[i] != NUL;
i += utf_ptr2len((char *)rex.input + i)) {
const int inpc = utf_ptr2char((char *)rex.input + i);
if (!utf_iscomposing(inpc)) {
if (!utf_iscomposing_legacy(inpc)) {
if (i > 0) {
break;
}
@ -6654,7 +6655,7 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
case RE_COMPOSING:
// Skip composing characters.
while (utf_iscomposing(utf_ptr2char((char *)rex.input))) {
while (utf_iscomposing_legacy(utf_ptr2char((char *)rex.input))) {
rex.input += utf_ptr2len((char *)rex.input);
}
break;
@ -10070,7 +10071,7 @@ static int nfa_regatom(void)
}
// When '.' is followed by a composing char ignore the dot, so that
// the composing char is matched here.
if (c == Magic('.') && utf_iscomposing(peekchr())) {
if (c == Magic('.') && utf_iscomposing_legacy(peekchr())) {
old_regparse = (uint8_t *)regparse;
c = getchr();
goto nfa_do_multibyte;
@ -10705,7 +10706,7 @@ collection:
nfa_do_multibyte:
// plen is length of current char with composing chars
if (utf_char2len(c) != (plen = utfc_ptr2len((char *)old_regparse))
|| utf_iscomposing(c)) {
|| utf_iscomposing_legacy(c)) {
int i = 0;
// A base character plus composing characters, or just one
@ -14033,7 +14034,7 @@ static int find_match_text(colnr_T *startcol, int regstart, uint8_t *match_text)
}
if (match
// check that no composing char follows
&& !utf_iscomposing(utf_ptr2char((char *)s2))) {
&& !utf_iscomposing_legacy(utf_ptr2char((char *)s2))) {
cleanup_subexpr();
if (REG_MULTI) {
rex.reg_startpos[0].lnum = rex.lnum;
@ -14278,7 +14279,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
// is not really a match.
if (!rex.reg_icombine
&& rex.input != rex.line
&& utf_iscomposing(curc)) {
&& utf_iscomposing_legacy(curc)) {
break;
}
nfa_match = true;
@ -14622,7 +14623,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
sta = t->state->out;
len = 0;
if (utf_iscomposing(sta->c)) {
if (utf_iscomposing_legacy(sta->c)) {
// Only match composing character(s), ignore base
// character. Used for ".{composing}" and "{composing}"
// (no preceding character).
@ -14724,7 +14725,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
int j;
sta = t->state->out->out;
if (utf_iscomposing(sta->c)) {
if (utf_iscomposing_legacy(sta->c)) {
// Only match composing character(s), ignore base
// character. Used for ".{composing}" and "{composing}"
// (no preceding character).
@ -14846,7 +14847,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
case NFA_ANY_COMPOSING:
// On a composing character skip over it. Otherwise do
// nothing. Always matches.
if (utf_iscomposing(curc)) {
if (utf_iscomposing_legacy(curc)) {
add_off = clen;
} else {
add_here = true;

View File

@ -1260,7 +1260,7 @@ int do_search(oparg_T *oap, int dirc, int search_delim, char *pat, size_t patlen
// empty for the search_stat feature.
if (!cmd_silent) {
msgbuf[0] = (char)dirc;
if (utf_iscomposing(utf_ptr2char(p))) {
if (utf_iscomposing_first(utf_ptr2char(p))) {
// Use a space to draw the composing char on.
msgbuf[1] = ' ';
memmove(msgbuf + 2, p, plen);

View File

@ -376,7 +376,7 @@ int init_sign_text(sign_T *sp, schar_T *sign_text, char *text)
if (!vim_isprintc(c)) {
break;
}
int width = utf_char2cells(c);
int width = utf_ptr2cells(s);
if (width == 2) {
sign_text[cells + 1] = 0;
}

View File

@ -1792,10 +1792,8 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun
// For changing a composing character adjust
// the score from SCORE_SUBST to
// SCORE_SUBCOMP.
if (utf_iscomposing(utf_ptr2char(tword + sp->ts_twordlen
- sp->ts_tcharlen))
&& utf_iscomposing(utf_ptr2char(fword
+ sp->ts_fcharstart))) {
if (utf_iscomposing_legacy(utf_ptr2char(tword + sp->ts_twordlen - sp->ts_tcharlen))
&& utf_iscomposing_legacy(utf_ptr2char(fword + sp->ts_fcharstart))) {
sp->ts_score -= SCORE_SUBST - SCORE_SUBCOMP;
} else if (!soundfold
&& slang->sl_has_map
@ -1811,7 +1809,7 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun
&& sp->ts_twordlen > sp->ts_tcharlen) {
p = tword + sp->ts_twordlen - sp->ts_tcharlen;
c = utf_ptr2char(p);
if (utf_iscomposing(c)) {
if (utf_iscomposing_legacy(c)) {
// Inserting a composing char doesn't
// count that much.
sp->ts_score -= SCORE_INS - SCORE_INSCOMP;
@ -1876,7 +1874,7 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun
c = utf_ptr2char(fword + sp->ts_fidx);
stack[depth].ts_fidx =
(uint8_t)(stack[depth].ts_fidx + utfc_ptr2len(fword + sp->ts_fidx));
if (utf_iscomposing(c)) {
if (utf_iscomposing_legacy(c)) {
stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP;
} else if (c == utf_ptr2char(fword + stack[depth].ts_fidx)) {
stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP;

View File

@ -47,7 +47,7 @@ static bool did_add_space = false; ///< auto_format() added an extra space
///< under the cursor
#define WHITECHAR(cc) (ascii_iswhite(cc) \
&& !utf_iscomposing(utf_ptr2char((char *)get_cursor_pos_ptr() + 1)))
&& !utf_iscomposing_first(utf_ptr2char((char *)get_cursor_pos_ptr() + 1)))
/// Return true if format option 'x' is in effect.
/// Take care of no formatting when 'paste' is set.

View File

@ -109,6 +109,7 @@ struct TUIData {
bool set_cursor_color_as_str;
bool cursor_color_changed;
bool is_starting;
bool did_set_grapheme_cluster_mode;
FILE *screenshot;
cursorentry_T cursor_shapes[SHAPE_IDX_COUNT];
HlAttrs clear_attrs;
@ -220,6 +221,7 @@ static void tui_set_term_mode(TUIData *tui, TermMode mode, bool set)
void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state)
FUNC_ATTR_NONNULL_ALL
{
bool is_set = false;
switch (state) {
case kTermModeNotRecognized:
case kTermModePermanentlySet:
@ -228,6 +230,8 @@ void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state)
// then there is nothing to do
break;
case kTermModeSet:
is_set = true;
FALLTHROUGH;
case kTermModeReset:
// The terminal supports changing the given mode
switch (mode) {
@ -240,6 +244,12 @@ void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state)
signal_watcher_stop(&tui->winch_handle);
tui_set_term_mode(tui, mode, true);
break;
case kTermModeGraphemeClusters:
if (!is_set) {
tui_set_term_mode(tui, mode, true);
tui->did_set_grapheme_cluster_mode = true;
}
break;
}
}
}
@ -434,6 +444,7 @@ static void terminfo_start(TUIData *tui)
if (!nsterm) {
tui_request_term_mode(tui, kTermModeSynchronizedOutput);
tui_request_term_mode(tui, kTermModeResizeEvents);
tui_request_term_mode(tui, kTermModeGraphemeClusters);
}
// Don't use DECRQSS in screen or tmux, as they behave strangely when receiving it.
@ -494,7 +505,9 @@ static void terminfo_stop(TUIData *tui)
// Disable resize events
tui_set_term_mode(tui, kTermModeResizeEvents, false);
if (tui->did_set_grapheme_cluster_mode) {
tui_set_term_mode(tui, kTermModeGraphemeClusters, false);
}
// May restore old title before exiting alternate screen.
tui_set_title(tui, NULL_STRING);
if (ui_client_exit_status == 0) {
@ -1010,7 +1023,7 @@ static void print_cell_at_pos(TUIData *tui, int row, int col, UCell *cell, bool
char buf[MAX_SCHAR_SIZE];
schar_get(buf, cell->data);
int c = utf_ptr2char(buf);
bool is_ambiwidth = utf_ambiguous_width(c);
bool is_ambiwidth = utf_ambiguous_width(buf);
if (is_doublewidth && (is_ambiwidth || utf_char2cells(c) == 1)) {
// If the server used setcellwidths() to treat a single-width char as double-width,
// it needs to be treated like an ambiguous-width char.

View File

@ -4,6 +4,7 @@ typedef struct TUIData TUIData;
typedef enum {
kTermModeSynchronizedOutput = 2026,
kTermModeGraphemeClusters = 2027,
kTermModeResizeEvents = 2048,
} TermMode;

View File

@ -1435,6 +1435,28 @@ describe('API', function()
it('cannot handle NULs', function()
eq(0, api.nvim_strwidth('\0abc'))
end)
it('can handle emoji with variant selectors and ZWJ', function()
  -- For every sample: strchars() counts individual code points, strcharlen()
  -- counts the sequence as a single character, and nvim_strwidth() gives the
  -- number of display cells.

  -- U+2764 U+FE0F: heart followed by VS16 (emoji presentation) => double width
  local selector = '❤️'
  eq(2, fn.strchars(selector))
  eq(1, fn.strcharlen(selector))
  eq(2, api.nvim_strwidth(selector))

  -- U+2764 alone (no variation selector) => single width.
  -- Spelled as UTF-8 bytes so that no editor or renderer can silently add or
  -- strip a selector and turn this into a different test case.
  local no_selector = '\226\157\164'
  eq(1, fn.strchars(no_selector))
  eq(1, fn.strcharlen(no_selector))
  eq(1, api.nvim_strwidth(no_selector))

  -- U+1F3F3 U+FE0F U+200D U+26A7 U+FE0F: selector, ZWJ, selector
  local selector_zwj_selector = '🏳️‍⚧️'
  eq(5, fn.strchars(selector_zwj_selector))
  eq(1, fn.strcharlen(selector_zwj_selector))
  eq(2, api.nvim_strwidth(selector_zwj_selector))

  -- U+1F9D1 U+200D U+1F33E: two emoji joined by ZWJ
  local emoji_zwj_emoji = '🧑‍🌾'
  eq(3, fn.strchars(emoji_zwj_emoji))
  eq(1, fn.strcharlen(emoji_zwj_emoji))
  eq(2, api.nvim_strwidth(emoji_zwj_emoji))
end)
end)
describe('nvim_get_current_line, nvim_set_current_line', function()

View File

@ -5620,6 +5620,27 @@ l5
]]
})
end)
it('supports emoji as signs', function()
insert(example_test3)
feed 'gg'
api.nvim_buf_set_extmark(0, ns, 1, 0, {sign_text='🧑‍🌾'})
-- VS16 can change width of character
api.nvim_buf_set_extmark(0, ns, 2, 0, {sign_text='❤️'})
api.nvim_buf_set_extmark(0, ns, 3, 0, {sign_text=''})
api.nvim_buf_set_extmark(0, ns, 4, 0, {sign_text='❤x'})
screen:expect([[
{7: }^l1 |
🧑🌾l2 |
l3 |
l4 |
xl5 |
{7: } |
{1:~ }|*3
|
]])
eq("Invalid 'sign_text'", pcall_err(api.nvim_buf_set_extmark, 0, ns, 5, 0, {sign_text='x'}))
end)
end)
describe('decorations: virt_text', function()

View File

@ -1436,6 +1436,41 @@ vimComment xxx match /\s"[^\-:.%#=*].*$/ms=s+1,lc=1 excludenl contains=@vim
}
end)
it('supports nvim_echo messages with emoji', function()
-- stylua: ignore
async_meths.nvim_echo(
{ { 'wow, 🏳️‍⚧️🧑‍🌾❤️😂🏴‍☠️\nvariant ❤️ one\nvariant ❤ two' } }, true, {}
)
screen:expect([[
|
{1:~ }|
{3: }|
wow, 🏳🧑🌾😂🏴 |
variant one |
variant two |
{6:Press ENTER or type command to continue}^ |
]])
feed '<cr>'
screen:expect([[
^ |
{1:~ }|*5
|
]])
feed ':messages<cr>'
screen:expect([[
|
{1:~ }|
{3: }|
wow, 🏳🧑🌾😂🏴 |
variant one |
variant two |
{6:Press ENTER or type command to continue}^ |
]])
end)
it('prints lines in Ex mode correctly with a burst of carriage returns #19341', function()
command('set number')
api.nvim_buf_set_lines(0, 0, 0, true, { 'aaa', 'bbb', 'ccc' })

View File

@ -296,6 +296,86 @@ describe('multibyte rendering', function()
]],
}
end)
it('supports emoji with variant selectors and ZWJ', function()
command('set ruler')
insert('🏳️‍⚧️')
screen:expect([[
^🏳 |
{1:~ }|*4
1,1 All |
]])
feed('a word<esc>')
screen:expect([[
🏳 wor^d |
{1:~ }|*4
1,21-7 All |
]])
feed('0')
screen:expect([[
^🏳 word |
{1:~ }|*4
1,1 All |
]])
feed('l')
screen:expect([[
🏳^ word |
{1:~ }|*4
1,17-3 All |
]])
feed('h')
screen:expect([[
^🏳 word |
{1:~ }|*4
1,1 All |
]])
feed('o❤ variant selected<esc>')
screen:expect([[
🏳 word |
variant selecte^d |
{1:~ }|*3
2,23-19 All |
]])
feed('0')
screen:expect([[
🏳 word |
^ variant selected |
{1:~ }|*3
2,1 All |
]])
feed('l')
screen:expect([[
🏳 word |
^ variant selected |
{1:~ }|*3
2,7-3 All |
]])
feed('h')
screen:expect([[
🏳 word |
^ variant selected |
{1:~ }|*3
2,1 All |
]])
-- without selector: single width (note column 18 and not 19)
feed('o❤ variant selected<esc>')
screen:expect([[
🏳 word |
variant selected |
variant selecte^d |
{1:~ }|*2
3,20-18 All |
]])
end)
end)
describe('multibyte rendering: statusline', function()
@ -348,11 +428,12 @@ describe('multibyte rendering: statusline', function()
it('non-printable followed by MAX_MCO unicode combination points', function()
command('set statusline=Ÿ̸⃯ᷰ⃐⃧⃝')
-- U+9F + U+1DF0 + U+20EF + U+0338 + U+20D0 + U+20E7 + U+20DD
-- TODO: not ideal; it would be better to show a plain ">" and then a space carrying the combining chars
screen:expect([[
^ |
{1:~ }|
{3:<9f><1df0><20ef><0338><20d0><20e7><20dd>}|
|
^ |
{1:~ }|
{3:<9f≯ }|
|
]])
end)
@ -368,9 +449,20 @@ describe('multibyte rendering: statusline', function()
}
end)
it('unprintable chars in filename with default stl', function()
it('emoji with ZWJ in filename with default stl', function()
command('file 🧑‍💻')
-- TODO: this is wrong but avoids a crash
screen:expect {
grid = [[
^ |
{1:~ }|
{3:🧑💻 }|
|
]],
}
end)
it('unprintable chars in filename with default stl', function()
command('file 🧑​💻')
screen:expect {
grid = [[
^ |
@ -381,15 +473,27 @@ describe('multibyte rendering: statusline', function()
}
end)
it('unprintable chars in filename with custom stl', function()
it('emoji with ZWJ in filename with custom stl', function()
command('set statusline=xx%#ErrorMsg#%f%##yy')
command('file 🧑‍💻')
-- TODO: this is also wrong but also avoids a crash
screen:expect {
grid = [[
^ |
{1:~ }|
{3:xx}{9:🧑<200d>💻}{3:yy }|
{3:xx}{9:🧑💻}{3:yy }|
|
]],
}
end)
it('unprintable chars in filename with custom stl', function()
command('set statusline=xx%#ErrorMsg#%f%##yy')
command('file 🧑​💻')
screen:expect {
grid = [[
^ |
{1:~ }|
{3:xx}{9:🧑<200b>💻}{3:yy }|
|
]],
}

View File

@ -3663,7 +3663,7 @@ func Test_string_reverse()
call assert_equal('', reverse(v:_null_string))
for [s1, s2] in [['', ''], ['a', 'a'], ['ab', 'ba'], ['abc', 'cba'],
\ ['abcd', 'dcba'], ['«-«-»-»', '»-»-«-«'],
\ ['🇦', '🇦'], ['🇦🇧', '🇧🇦'], ['🇦🇧🇨', '🇨🇧🇦'],
\ ['🇦', '🇦'], ['🇦🇧', '🇦🇧'], ['🇦🇧🇨', '🇨🇦🇧'],
\ ['🇦«🇧-🇨»🇩', '🇩»🇨-🇧«🇦']]
call assert_equal(s2, reverse(s1))
endfor

View File

@ -3897,9 +3897,9 @@ func Test_normal_count_after_operator()
bw!
endfunc
func Test_normal_gj_on_extra_wide_char()
func Test_normal_gj_on_6_cell_wide_unprintable_char()
new | 25vsp
let text='1 foooooooo ar e inszwe1 foooooooo inszwei' .
let text='1 foooooooo ar e inszwe1 foooooooo inszwei' .
\ ' i drei vier fünf sechs sieben acht un zehn elf zwöfl' .
\ ' dreizehn v ierzehn fünfzehn'
put =text

View File

@ -3,8 +3,15 @@ local itp = t.gen_itp(it)
local ffi = t.ffi
local eq = t.eq
local to_cstr = t.to_cstr
local ok = t.ok
local lib = t.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h')
local lib = t.cimport(
'./src/nvim/mbyte.h',
'./src/nvim/charset.h',
'./src/nvim/grid.h',
'./src/nvim/option_vars.h'
)
describe('mbyte', function()
-- Convert from bytes to string
@ -45,12 +52,21 @@ describe('mbyte', function()
end)
end
describe('utfc_ptr2schar_len', function()
describe('utfc_ptr2schar', function()
local function test_seq(seq)
local firstc = ffi.new('int[1]')
local buf = ffi.new('char[32]')
lib.schar_get(buf, lib.utfc_ptr2schar_len(to_string(seq), #seq, firstc))
return { ffi.string(buf), firstc[0] }
lib.schar_get(buf, lib.utfc_ptr2schar(to_string(seq), firstc))
local str = ffi.string(buf)
if 1 > 2 then -- for debugging
local tabel = {}
for i = 1, #str do
table.insert(tabel, string.format('0x%02x', string.byte(str, i)))
end
print('{ ' .. table.concat(tabel, ', ') .. ' }')
io.stdout:flush()
end
return { str, firstc[0] }
end
local function byte(val)
@ -88,7 +104,9 @@ describe('mbyte', function()
eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0x80 })
-- Combining character is U+0300
eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 })
eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80 })
-- invalid start byte for combining
eq({ '\x7f', 0x7f }, test_seq { 0x7f, 0xcc, 0x80 })
-- No UTF-8 sequence
eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc })
@ -102,18 +120,21 @@ describe('mbyte', function()
itp('4-byte sequences', function()
-- No following combining character
eq(byte(0x7f), test_seq { 0x7f, 0x7f, 0xcc, 0x80 })
eq(byte(0x29), test_seq { 0x29, 0x29, 0xcc, 0x80 })
-- No second UTF-8 character
eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80 })
-- Combining character U+0300
eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc })
eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc })
-- No UTF-8 sequence
eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80 })
-- No following UTF-8 character
eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc })
-- Combining character U+0301
eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 })
eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81 })
-- U+0080 : not a valid start char
eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81 })
-- One UTF-8 character
eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80 })
@ -126,36 +147,36 @@ describe('mbyte', function()
eq(byte(0x7f), test_seq { 0x7f, 0xc2, 0xcc, 0x80, 0x80 })
-- Combining character U+0300
eq({ '\x7f\xcc\x80', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x00 })
eq({ '\x29\xcc\x80', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x00 })
-- Combining characters U+0300 and U+0301
eq({ '\x7f\xcc\x80\xcc\x81', 0x7f }, test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81 })
eq({ '\x29\xcc\x80\xcc\x81', 0x29 }, test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81 })
-- Combining characters U+0300, U+0301, U+0302
eq(
{ '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 }
{ '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82 }
)
-- Combining characters U+0300, U+0301, U+0302, U+0303
eq(
{ '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x7f },
test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 }
{ '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83', 0x29 },
test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83 }
)
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
eq(
{ '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x7f },
test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 }
{ '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84', 0x29 },
test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84 }
)
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
eq(
{ '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x7f },
test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 }
{ '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85', 0x29 },
test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85 }
)
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
eq(
{ '\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x7f },
{ '\x29\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86', 0x29 },
test_seq {
0x7f,
0x29,
0xcc,
0x80,
0xcc,
@ -175,18 +196,18 @@ describe('mbyte', function()
-- Only three following combining characters U+0300, U+0301, U+0302
eq(
{ '\x7f\xcc\x80\xcc\x81\xcc\x82', 0x7f },
test_seq { 0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 }
{ '\x29\xcc\x80\xcc\x81\xcc\x82', 0x29 },
test_seq { 0x29, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85 }
)
-- No UTF-8 sequence
eq({ '', 0xc2 }, test_seq { 0xc2, 0x7f, 0xcc, 0x80, 0x80 })
-- No following UTF-8 character
eq({ '\xc2\x80', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0xcc, 0x80 })
eq({ '\xc2\xbc', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0xcc, 0x80 })
-- Combining character U+0301
eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0x7f })
eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0x7f })
-- Combining character U+0301
eq({ '\xc2\x80\xcc\x81', 0x80 }, test_seq { 0xc2, 0x80, 0xcc, 0x81, 0xcc })
eq({ '\xc2\xbc\xcc\x81', 0xbc }, test_seq { 0xc2, 0xbc, 0xcc, 0x81, 0xcc })
-- One UTF-8 character
eq({ '\xf4\x80\x80\x80', 0x100000 }, test_seq { 0xf4, 0x80, 0x80, 0x80, 0x7f })
@ -205,8 +226,6 @@ describe('mbyte', function()
end)
describe('utf_cp_bounds_len', function()
local to_cstr = t.to_cstr
local tests = {
{
name = 'for valid string',
@ -273,4 +292,52 @@ describe('mbyte', function()
eq(expected_offsets, { b = b_offsets, e = e_offsets })
end)
end)
itp('utf_head_off', function()
-- Walk `str` one cluster at a time with utfc_ptr2len(), then verify two things:
--   1. the clusters longer than one byte are exactly `expected_glyphs`, in order;
--   2. for every byte offset inside a cluster, utf_head_off() returns the
--      distance back to the start of that cluster.
--
-- str: UTF-8 encoded text to segment
-- expected_glyphs: list of the expected multi-byte clusters
local function check(str, expected_glyphs)
  local len = #str
  local cstr = to_cstr(str)
  local breaks = { 0 } -- SOT
  local pos = 0
  local mb_glyphs = {}
  while pos < len do
    local clen = lib.utfc_ptr2len(cstr + pos)
    ok(clen > 0) -- otherwise we get stuck
    if clen > 1 then
      -- string.sub() is 1-based and inclusive, `pos` is a 0-based byte offset
      table.insert(mb_glyphs, string.sub(str, pos + 1, pos + clen))
    end
    pos = pos + clen
    table.insert(breaks, pos)
  end
  -- eq() takes (expected, actual); keep that order so failures are labelled correctly
  eq(len, breaks[#breaks]) -- include EOT as break
  -- we could also send in breaks, but this is more human readable
  eq(expected_glyphs, mb_glyphs)
  for i = 1, #breaks - 1 do
    local start, next = breaks[i], breaks[i + 1]
    for p = start, next - 1 do
      eq(p - start, lib.utf_head_off(cstr, cstr + p))
    end
  end
  eq(0, lib.utf_head_off(cstr, cstr + len)) -- NUL byte is safe
end
-- stylua doesn't like ZWJ chars..
-- stylua: ignore start
check('hej och hå 🧑‍🌾!', { 'å', '🧑‍🌾' })
-- emoji only (various kinds of combinations, use g8 to see them)
check("🏳️‍⚧️🧑‍🌾❤️😂🏴‍☠️", {"🏳️‍⚧️", "🧑‍🌾", "❤️", "😂", "🏴‍☠️"})
check('🏳xy🧑🌾\r❤️😂å🏴‍☠️€', { '🏳️‍⚧️', '🧑‍🌾', '❤️', '😂', 'å', '🏴‍☠️', '€' })
check('🇦🅱️ 🇦🇽 🇦🇨🇦 🇲🇽🇹🇱',{'🇦', '🅱️', '🇦🇽', '🇦🇨', '🇦', '🇲🇽', '🇹🇱'})
check('🏴󠁧󠁢󠁳󠁣󠁴󠁿🏴󠁧󠁢󠁷󠁬󠁳󠁿', {'🏴󠁧󠁢󠁳󠁣󠁴󠁿', '🏴󠁧󠁢󠁷󠁬󠁳󠁿'})
lib.p_arshape = true -- default
check('سلام', { 'س', 'لا', 'م' })
lib.p_arshape = false
check('سلام', { 'س', 'ل', 'ا', 'م' })
check('L̓̉̑̒̌̚ơ̗̌̒̄̀ŕ̈̈̎̐̕è̇̅̄̄̐m̖̟̟̅̄̚', {'L̓̉̑̒̌̚', 'ơ̗̌̒̄̀', 'ŕ̈̈̎̐̕', 'è̇̅̄̄̐', 'm̖̟̟̅̄̚'})
-- stylua: ignore end
end)
end)