mirror of
https://github.com/neovim/neovim.git
synced 2024-09-17 20:58:20 -04:00
Merge pull request #2905 from bfredl/utf8
Only allow encoding=utf-8 and simplify multibyte code
This commit is contained in:
commit
9147331e21
@ -1029,8 +1029,8 @@ A string constant accepts these special characters:
|
||||
\x. byte specified with one hex number (must be followed by non-hex char)
|
||||
\X.. same as \x..
|
||||
\X. same as \x.
|
||||
\u.... character specified with up to 4 hex numbers, stored according to the
|
||||
current value of 'encoding' (e.g., "\u02a4")
|
||||
\u.... character specified with up to 4 hex numbers, stored as UTF-8
|
||||
(e.g., "\u02a4")
|
||||
\U.... same as \u but allows up to 8 hex numbers.
|
||||
\b backspace <BS>
|
||||
\e escape <Esc>
|
||||
@ -1045,8 +1045,7 @@ A string constant accepts these special characters:
|
||||
utf-8 character, use \uxxxx as mentioned above.
|
||||
|
||||
Note that "\xff" is stored as the byte 255, which may be invalid in some
|
||||
encodings. Use "\u00ff" to store character 255 according to the current value
|
||||
of 'encoding'.
|
||||
encodings. Use "\u00ff" to store character 255 correctly as UTF-8.
|
||||
|
||||
Note that "\000" and "\x00" force the end of the string.
|
||||
|
||||
@ -2532,8 +2531,6 @@ byteidxcomp({expr}, {nr}) *byteidxcomp()*
|
||||
< The first and third echo result in 3 ('e' plus composing
|
||||
character is 3 bytes), the second echo results in 1 ('e' is
|
||||
one byte).
|
||||
Only works different from byteidx() when 'encoding' is set to
|
||||
a Unicode encoding.
|
||||
|
||||
call({func}, {arglist} [, {dict}]) *call()* *E699*
|
||||
Call function {func} with the items in |List| {arglist} as
|
||||
@ -2568,11 +2565,11 @@ char2nr({expr}[, {utf8}]) *char2nr()*
|
||||
Return number value of the first char in {expr}. Examples: >
|
||||
char2nr(" ") returns 32
|
||||
char2nr("ABC") returns 65
|
||||
< When {utf8} is omitted or zero, the current 'encoding' is used.
|
||||
Example for "utf-8": >
|
||||
char2nr("á") returns 225
|
||||
char2nr("á"[0]) returns 195
|
||||
< With {utf8} set to 1, always treat as utf-8 characters.
|
||||
< Non-ASCII characters are always treated as UTF-8 characters.
|
||||
{utf8} has no effect, and exists only for
|
||||
backwards-compatibility.
|
||||
A combining character is a separate character.
|
||||
|nr2char()| does the opposite.
|
||||
|
||||
@ -4225,11 +4222,7 @@ iconv({expr}, {from}, {to}) *iconv()*
|
||||
Most conversions require Vim to be compiled with the |+iconv|
|
||||
feature. Otherwise only UTF-8 to latin1 conversion and back
|
||||
can be done.
|
||||
This can be used to display messages with special characters,
|
||||
no matter what 'encoding' is set to. Write the message in
|
||||
UTF-8 and use: >
|
||||
echo iconv(utf8_str, "utf-8", &enc)
|
||||
< Note that Vim uses UTF-8 for all Unicode encodings, conversion
|
||||
Note that Vim uses UTF-8 for all Unicode encodings, conversion
|
||||
from/to UCS-2 is automatically changed to use UTF-8. You
|
||||
cannot use UCS-2 in a string anyway, because of the NUL bytes.
|
||||
{only available when compiled with the |+multi_byte| feature}
|
||||
@ -4513,9 +4506,7 @@ join({list} [, {sep}]) *join()*
|
||||
json_decode({expr}) *json_decode()*
|
||||
Convert {expr} from JSON object. Accepts |readfile()|-style
|
||||
list as the input, as well as regular string. May output any
|
||||
Vim value. When 'encoding' is not UTF-8 string is converted
|
||||
from UTF-8 to 'encoding', failing conversion fails
|
||||
json_decode(). In the following cases it will output
|
||||
Vim value. In the following cases it will output
|
||||
|msgpack-special-dict|:
|
||||
1. Dictionary contains duplicate key.
|
||||
2. Dictionary contains empty key.
|
||||
@ -4523,33 +4514,22 @@ json_decode({expr}) *json_decode()*
|
||||
dictionary and for string will be emitted in case string
|
||||
with NUL byte was a dictionary key.
|
||||
|
||||
Note: function treats its input as UTF-8 always regardless of
|
||||
'encoding' value. This is needed because JSON source is
|
||||
supposed to be external (e.g. |readfile()|) and JSON standard
|
||||
allows only a few encodings, of which UTF-8 is recommended and
|
||||
the only one required to be supported. Non-UTF-8 characters
|
||||
are an error.
|
||||
Note: function treats its input as UTF-8 always. The JSON
|
||||
standard allows only a few encodings, of which UTF-8 is
|
||||
recommended and the only one required to be supported.
|
||||
Non-UTF-8 characters are an error.
|
||||
|
||||
json_encode({expr}) *json_encode()*
|
||||
Convert {expr} into a JSON string. Accepts
|
||||
|msgpack-special-dict| as the input. Converts from 'encoding'
|
||||
to UTF-8 when encoding strings. Will not convert |Funcref|s,
|
||||
|msgpack-special-dict| as the input. Will not convert |Funcref|s,
|
||||
mappings with non-string keys (can be created as
|
||||
|msgpack-special-dict|), values with self-referencing
|
||||
containers, strings which contain non-UTF-8 characters,
|
||||
pseudo-UTF-8 strings which contain codepoints reserved for
|
||||
surrogate pairs (such strings are not valid UTF-8 strings).
|
||||
When converting 'encoding' is taken into account, if it is not
|
||||
"utf-8", then conversion is performed before encoding strings.
|
||||
Non-printable characters are converted into "\u1234" escapes
|
||||
or special escapes like "\t", others are dumped as-is.
|
||||
|
||||
Note: all characters above U+0079 are considered non-printable
|
||||
when 'encoding' is not UTF-8. This function always outputs
|
||||
UTF-8 strings as required by the standard thus when 'encoding'
|
||||
is not unicode resulting string will look incorrect if
|
||||
"\u1234" notation is not used.
|
||||
|
||||
keys({dict}) *keys()*
|
||||
Return a |List| with all the keys of {dict}. The |List| is in
|
||||
arbitrary order.
|
||||
@ -4651,9 +4631,9 @@ line2byte({lnum}) *line2byte()*
|
||||
Return the byte count from the start of the buffer for line
|
||||
{lnum}. This includes the end-of-line character, depending on
|
||||
the 'fileformat' option for the current buffer. The first
|
||||
line returns 1. 'encoding' matters, 'fileencoding' is ignored.
|
||||
This can also be used to get the byte count for the line just
|
||||
below the last line: >
|
||||
line returns 1. UTF-8 encoding is used, 'fileencoding' is
|
||||
ignored. This can also be used to get the byte count for the
|
||||
line just below the last line: >
|
||||
line2byte(line("$") + 1)
|
||||
< This is the buffer size plus one. If 'fileencoding' is empty
|
||||
it is the file size plus one.
|
||||
@ -5172,10 +5152,10 @@ nr2char({expr}[, {utf8}]) *nr2char()*
|
||||
value {expr}. Examples: >
|
||||
nr2char(64) returns "@"
|
||||
nr2char(32) returns " "
|
||||
< When {utf8} is omitted or zero, the current 'encoding' is used.
|
||||
Example for "utf-8": >
|
||||
< Example for "utf-8": >
|
||||
nr2char(300) returns I with bow character
|
||||
< With {utf8} set to 1, always return utf-8 characters.
|
||||
< UTF-8 encoding is always used; the {utf8} option has no effect,
|
||||
and exists only for backwards-compatibility.
|
||||
Note that a NUL character in the file is specified with
|
||||
nr2char(10), because NULs are represented with newline
|
||||
characters. nr2char(0) is a real NUL and terminates the
|
||||
@ -5417,7 +5397,7 @@ py3eval({expr}) *py3eval()*
|
||||
converted to Vim data structures.
|
||||
Numbers and strings are returned as they are (strings are
|
||||
copied though, Unicode strings are additionally converted to
|
||||
'encoding').
|
||||
UTF-8).
|
||||
Lists are represented as Vim |List| type.
|
||||
Dictionaries are represented as Vim |Dictionary| type with
|
||||
keys converted to strings.
|
||||
@ -5467,8 +5447,7 @@ readfile({fname} [, {binary} [, {max}]])
|
||||
Otherwise:
|
||||
- CR characters that appear before a NL are removed.
|
||||
- Whether the last line ends in a NL or not does not matter.
|
||||
- When 'encoding' is Unicode any UTF-8 byte order mark is
|
||||
removed from the text.
|
||||
- Any UTF-8 byte order mark is removed from the text.
|
||||
When {max} is given this specifies the maximum number of lines
|
||||
to be read. Useful if you only want to check the first ten
|
||||
lines of a file: >
|
||||
@ -6621,8 +6600,7 @@ string({expr}) Return {expr} converted to a String. If {expr} is a Number,
|
||||
for infinite and NaN floating-point values representations
|
||||
which use |str2float()|. Strings are also dumped literally,
|
||||
only single quote is escaped, which does not allow using YAML
|
||||
for parsing back binary strings (including text when
|
||||
'encoding' is not UTF-8). |eval()| should always work for
|
||||
for parsing back binary strings. |eval()| should always work for
|
||||
strings and floats though and this is the only official
|
||||
method, use |msgpackdump()| or |json_encode()| if you need to
|
||||
share data with other applications.
|
||||
|
@ -70,29 +70,24 @@ See |mbyte-locale| for details.
|
||||
|
||||
ENCODING
|
||||
|
||||
If your locale works properly, Vim will try to set the 'encoding' option
|
||||
accordingly. If this doesn't work you can overrule its value: >
|
||||
Nvim always uses UTF-8 internally. Thus the 'encoding' option is always set
|
||||
to "utf-8" and cannot be changed.
|
||||
|
||||
:set encoding=utf-8
|
||||
All the text that is used inside Vim will be in UTF-8. Not only the text in
|
||||
the buffers, but also in registers, variables, etc.
|
||||
|
||||
See |encoding-values| for a list of acceptable values.
|
||||
|
||||
The result is that all the text that is used inside Vim will be in this
|
||||
encoding. Not only the text in the buffers, but also in registers, variables,
|
||||
etc. 'encoding' is read-only after startup because changing it would make the
|
||||
existing text invalid.
|
||||
|
||||
You can edit files in another encoding than what 'encoding' is set to. Vim
|
||||
You can edit files in encodings other than UTF-8. Nvim
|
||||
will convert the file when you read it and convert it back when you write it.
|
||||
See 'fileencoding', 'fileencodings' and |++enc|.
|
||||
|
||||
|
||||
DISPLAY AND FONTS
|
||||
|
||||
If you are working in a terminal (emulator) you must make sure it accepts the
|
||||
same encoding as which Vim is working with.
|
||||
If you are working in a terminal (emulator) you must make sure it accepts
|
||||
UTF-8, the encoding which Vim is working with. Otherwise only ASCII can
|
||||
be displayed and edited correctly.
|
||||
|
||||
For the GUI you must select fonts that work with the current 'encoding'. This
|
||||
For the GUI you must select fonts that work with UTF-8. This
|
||||
is the difficult part. It depends on the system you are using, the locale and
|
||||
a few other things. See the chapters on fonts: |mbyte-fonts-X11| for
|
||||
X-Windows and |mbyte-fonts-MSwin| for MS-Windows.
|
||||
@ -216,10 +211,9 @@ You could make a small shell script for this.
|
||||
==============================================================================
|
||||
3. Encoding *mbyte-encoding*
|
||||
|
||||
Vim uses the 'encoding' option to specify how characters are identified and
|
||||
encoded when they are used inside Vim. This applies to all the places where
|
||||
text is used, including buffers (files loaded into memory), registers and
|
||||
variables.
|
||||
In Nvim UTF-8 is always used internally to encode characters.
|
||||
This applies to all the places where text is used, including buffers (files
|
||||
loaded into memory), registers and variables.
|
||||
|
||||
*charset* *codeset*
|
||||
Charset is another name for encoding. There are subtle differences, but these
|
||||
@ -240,7 +234,7 @@ matter what language is used. Thus you might see the right text even when the
|
||||
encoding was set wrong.
|
||||
|
||||
*encoding-names*
|
||||
Vim can use many different character encodings. There are three major groups:
|
||||
Vim can edit files in different character encodings. There are three major groups:
|
||||
|
||||
1 8bit Single-byte encodings, 256 different characters. Mostly used
|
||||
in USA and Europe. Example: ISO-8859-1 (Latin1). All
|
||||
@ -255,11 +249,10 @@ u Unicode Universal encoding, can replace all others. ISO 10646.
|
||||
Millions of different characters. Example: UTF-8. The
|
||||
relation between bytes and screen cells is complex.
|
||||
|
||||
Other encodings cannot be used by Vim internally. But files in other
|
||||
Only UTF-8 is used by Vim internally. But files in other
|
||||
encodings can be edited by using conversion, see 'fileencoding'.
|
||||
Note that all encodings must use ASCII for the characters up to 128.
|
||||
|
||||
Supported 'encoding' values are: *encoding-values*
|
||||
Recognized 'fileencoding' values include: *encoding-values*
|
||||
1 latin1 8-bit characters (ISO 8859-1, also used for cp1252)
|
||||
1 iso-8859-n ISO_8859 variant (n = 2 to 15)
|
||||
1 koi8-r Russian
|
||||
@ -311,11 +304,11 @@ u ucs-4 32 bit UCS-4 encoded Unicode (ISO/IEC 10646-1)
|
||||
u ucs-4le like ucs-4, little endian
|
||||
|
||||
The {name} can be any encoding name that your system supports. It is passed
|
||||
to iconv() to convert between the encoding of the file and the current locale.
|
||||
to iconv() to convert between UTF-8 and the encoding of the file.
|
||||
For MS-Windows "cp{number}" means using codepage {number}.
|
||||
Examples: >
|
||||
:set encoding=8bit-cp1252
|
||||
:set encoding=2byte-cp932
|
||||
:set fileencoding=8bit-cp1252
|
||||
:set fileencoding=2byte-cp932
|
||||
|
||||
The MS-Windows codepage 1252 is very similar to latin1. For practical reasons
|
||||
the same encoding is used and it's called latin1. 'isprint' can be used to
|
||||
@ -337,8 +330,7 @@ u ucs-2be same as ucs-2 (big endian)
|
||||
u ucs-4be same as ucs-4 (big endian)
|
||||
u utf-32 same as ucs-4
|
||||
u utf-32le same as ucs-4le
|
||||
default stands for the default value of 'encoding', depends on the
|
||||
environment
|
||||
default the encoding of the current locale.
|
||||
|
||||
For the UCS codes the byte order matters. This is tricky, use UTF-8 whenever
|
||||
you can. The default is to use big-endian (most significant byte comes
|
||||
@ -363,13 +355,12 @@ or when conversion is not possible:
|
||||
CONVERSION *charset-conversion*
|
||||
|
||||
Vim will automatically convert from one to another encoding in several places:
|
||||
- When reading a file and 'fileencoding' is different from 'encoding'
|
||||
- When writing a file and 'fileencoding' is different from 'encoding'
|
||||
- When reading a file and 'fileencoding' is different from "utf-8"
|
||||
- When writing a file and 'fileencoding' is different from "utf-8"
|
||||
- When displaying messages and the encoding used for LC_MESSAGES differs from
|
||||
'encoding' (requires a gettext version that supports this).
|
||||
"utf-8" (requires a gettext version that supports this).
|
||||
- When reading a Vim script where |:scriptencoding| is different from
|
||||
'encoding'.
|
||||
- When reading or writing a |shada| file.
|
||||
"utf-8".
|
||||
Most of these require the |+iconv| feature. Conversion for reading and
|
||||
writing files may also be specified with the 'charconvert' option.
|
||||
|
||||
@ -408,11 +399,11 @@ Useful utilities for converting the charset:
|
||||
|
||||
|
||||
*mbyte-conversion*
|
||||
When reading and writing files in an encoding different from 'encoding',
|
||||
When reading and writing files in an encoding different from "utf-8",
|
||||
conversion needs to be done. These conversions are supported:
|
||||
- All conversions between Latin-1 (ISO-8859-1), UTF-8, UCS-2 and UCS-4 are
|
||||
handled internally.
|
||||
- For MS-Windows, when 'encoding' is a Unicode encoding, conversion from and
|
||||
- For MS-Windows, conversion from and
|
||||
to any codepage should work.
|
||||
- Conversion specified with 'charconvert'
|
||||
- Conversion with the iconv library, if it is available.
|
||||
@ -468,8 +459,6 @@ and you will have a working UTF-8 terminal emulator. Try both >
|
||||
with the demo text that comes with ucs-fonts.tar.gz in order to see
|
||||
whether there are any problems with UTF-8 in your xterm.
|
||||
|
||||
For Vim you may need to set 'encoding' to "utf-8".
|
||||
|
||||
==============================================================================
|
||||
5. Fonts on X11 *mbyte-fonts-X11*
|
||||
|
||||
@ -864,11 +853,11 @@ between two keyboard settings.
|
||||
The value of the 'keymap' option specifies a keymap file to use. The name of
|
||||
this file is one of these two:
|
||||
|
||||
keymap/{keymap}_{encoding}.vim
|
||||
keymap/{keymap}_utf-8.vim
|
||||
keymap/{keymap}.vim
|
||||
|
||||
Here {keymap} is the value of the 'keymap' option and {encoding} of the
|
||||
'encoding' option. The file name with the {encoding} included is tried first.
|
||||
Here {keymap} is the value of the 'keymap' option.
|
||||
The file name with "utf-8" included is tried first.
|
||||
|
||||
'runtimepath' is used to find these files. To see an overview of all
|
||||
available keymap files, use this: >
|
||||
@ -950,7 +939,7 @@ this is unusual. But you can use various ways to specify the character: >
|
||||
A <char-0141> octal value
|
||||
x <Space> special key name
|
||||
|
||||
The characters are assumed to be encoded for the current value of 'encoding'.
|
||||
The characters are assumed to be encoded in UTF-8.
|
||||
It's possible to use ":scriptencoding" when all characters are given
|
||||
literally. That doesn't work when using the <char-> construct, because the
|
||||
conversion is done on the keymap file, not on the resulting character.
|
||||
@ -1170,21 +1159,13 @@ Useful commands:
|
||||
message is truncated, use ":messages").
|
||||
- "g8" shows the bytes used in a UTF-8 character, also the composing
|
||||
characters, as hex numbers.
|
||||
- ":set encoding=utf-8 fileencodings=" forces using UTF-8 for all files. The
|
||||
default is to use the current locale for 'encoding' and set 'fileencodings'
|
||||
to automatically detect the encoding of a file.
|
||||
- ":set fileencodings=" forces using UTF-8 for all files. The
|
||||
default is to automatically detect the encoding of a file.
|
||||
|
||||
|
||||
STARTING VIM
|
||||
|
||||
If your current locale is in an utf-8 encoding, Vim will automatically start
|
||||
in utf-8 mode.
|
||||
|
||||
If you are using another locale: >
|
||||
|
||||
set encoding=utf-8
|
||||
|
||||
You might also want to select the font used for the menus. Unfortunately this
|
||||
You might want to select the font used for the menus. Unfortunately this
|
||||
doesn't always work. See the system specific remarks below, and 'langmenu'.
|
||||
|
||||
|
||||
@ -1245,10 +1226,9 @@ not everybody is able to type a composing character.
|
||||
These options are relevant for editing multi-byte files. Check the help in
|
||||
options.txt for detailed information.
|
||||
|
||||
'encoding' Encoding used for the keyboard and display. It is also the
|
||||
default encoding for files.
|
||||
'encoding' Internal text encoding, always "utf-8".
|
||||
|
||||
'fileencoding' Encoding of a file. When it's different from 'encoding'
|
||||
'fileencoding' Encoding of a file. When it's different from "utf-8"
|
||||
conversion is done when reading or writing the file.
|
||||
|
||||
'fileencodings' List of possible encodings of a file. When opening a file
|
||||
|
@ -52,7 +52,6 @@ achieve special effects. These options come in three forms:
|
||||
:se[t] all& Set all options to their default value. The values of
|
||||
these options are not changed:
|
||||
'columns'
|
||||
'encoding'
|
||||
'lines'
|
||||
Warning: This may have a lot of side effects.
|
||||
|
||||
@ -615,7 +614,6 @@ A jump table for the options with a short description can be found at |Q_op|.
|
||||
global
|
||||
{only available when compiled with the |+multi_byte|
|
||||
feature}
|
||||
Only effective when 'encoding' is "utf-8" or another Unicode encoding.
|
||||
Tells Vim what to do with characters with East Asian Width Class
|
||||
Ambiguous (such as Euro, Registered Sign, Copyright Sign, Greek
|
||||
letters, Cyrillic letters).
|
||||
@ -668,7 +666,6 @@ A jump table for the options with a short description can be found at |Q_op|.
|
||||
- Set the 'keymap' option to "arabic"; in Insert mode CTRL-^ toggles
|
||||
between typing English and Arabic key mapping.
|
||||
- Set the 'delcombine' option
|
||||
Note that 'encoding' must be "utf-8" for working with Arabic text.
|
||||
|
||||
Resetting this option will:
|
||||
- Reset the 'rightleft' option.
|
||||
@ -1078,8 +1075,7 @@ A jump table for the options with a short description can be found at |Q_op|.
|
||||
{not available when compiled without the |+linebreak|
|
||||
feature}
|
||||
This option lets you choose which characters might cause a line
|
||||
break if 'linebreak' is on. Only works for ASCII and also for 8-bit
|
||||
characters when 'encoding' is an 8-bit encoding.
|
||||
break if 'linebreak' is on. Only works for ASCII characters.
|
||||
|
||||
*'breakindent'* *'bri'*
|
||||
'breakindent' 'bri' boolean (default off)
|
||||
@ -1214,11 +1210,9 @@ A jump table for the options with a short description can be found at |Q_op|.
|
||||
Specifies details about changing the case of letters. It may contain
|
||||
these words, separated by a comma:
|
||||
internal Use internal case mapping functions, the current
|
||||
locale does not change the case mapping. This only
|
||||
matters when 'encoding' is a Unicode encoding,
|
||||
"latin1" or "iso-8859-15". When "internal" is
|
||||
omitted, the towupper() and towlower() system library
|
||||
functions are used when available.
|
||||
locale does not change the case mapping. When
|
||||
"internal" is omitted, the towupper() and towlower()
|
||||
system library functions are used when available.
|
||||
keepascii For the ASCII characters (0x00 to 0x7f) use the US
|
||||
case mapping, the current locale is not effective.
|
||||
This probably only matters for Turkish.
|
||||
@ -1271,13 +1265,12 @@ A jump table for the options with a short description can be found at |Q_op|.
|
||||
file to convert from. You will have to save the text in a file first.
|
||||
The expression must return zero or an empty string for success,
|
||||
non-zero for failure.
|
||||
The possible encoding names encountered are in 'encoding'.
|
||||
See |encoding-names| for possible encoding names.
|
||||
Additionally, names given in 'fileencodings' and 'fileencoding' are
|
||||
used.
|
||||
Conversion between "latin1", "unicode", "ucs-2", "ucs-4" and "utf-8"
|
||||
is done internally by Vim, 'charconvert' is not used for this.
|
||||
'charconvert' is also used to convert the shada file, if 'encoding' is
|
||||
not "utf-8". Also used for Unicode conversion.
|
||||
Also used for Unicode conversion.
|
||||
Example: >
|
||||
set charconvert=CharConvert()
|
||||
fun CharConvert()
|
||||
@ -1292,8 +1285,6 @@ A jump table for the options with a short description can be found at |Q_op|.
|
||||
v:fname_in name of the input file
|
||||
v:fname_out name of the output file
|
||||
Note that v:fname_in and v:fname_out will never be the same.
|
||||
Note that v:charconvert_from and v:charconvert_to may be different
|
||||
from 'encoding'. Vim internally uses UTF-8 instead of UCS-2 or UCS-4.
|
||||
This option cannot be set from a |modeline| or in the |sandbox|, for
|
||||
security reasons.
|
||||
|
||||
@ -2140,44 +2131,14 @@ A jump table for the options with a short description can be found at |Q_op|.
|
||||
|
||||
|
||||
*'encoding'* *'enc'* *E543*
|
||||
'encoding' 'enc' string (default: "utf-8")
|
||||
global
|
||||
{only available when compiled with the |+multi_byte|
|
||||
feature}
|
||||
Sets the character encoding used inside Vim. It applies to text in
|
||||
the buffers, registers, Strings in expressions, text stored in the
|
||||
shada file, etc. It sets the kind of characters which Vim can work
|
||||
with. See |encoding-names| for the possible values.
|
||||
'encoding' 'enc' Removed. |vim-differences| {Nvim}
|
||||
Nvim always uses UTF-8 internally. RPC communication
|
||||
(remote plugins/GUIs) must use UTF-8 strings.
|
||||
|
||||
'encoding' cannot be changed after startup, because (1) it causes
|
||||
non-ASCII text inside Vim to become invalid, and (2) it complicates
|
||||
runtime logic. The recommended 'encoding' is "utf-8". Remote plugins
|
||||
and GUIs only support utf-8. See |multibyte|.
|
||||
|
||||
The character encoding of files can be different from 'encoding'.
|
||||
The character encoding of files can be different from UTF-8.
|
||||
This is specified with 'fileencoding'. The conversion is done with
|
||||
iconv() or as specified with 'charconvert'.
|
||||
|
||||
If you need to know whether 'encoding' is a multi-byte encoding, you
|
||||
can use: >
|
||||
if has("multi_byte_encoding")
|
||||
<
|
||||
When you set this option, it fires the |EncodingChanged| autocommand
|
||||
event so that you can set up fonts if necessary.
|
||||
|
||||
When the option is set, the value is converted to lowercase. Thus
|
||||
you can set it with uppercase values too. Underscores are translated
|
||||
to '-' signs.
|
||||
When the encoding is recognized, it is changed to the standard name.
|
||||
For example "Latin-1" becomes "latin1", "ISO_88592" becomes
|
||||
"iso-8859-2" and "utf8" becomes "utf-8".
|
||||
|
||||
When "unicode", "ucs-2" or "ucs-4" is used, Vim internally uses utf-8.
|
||||
You don't notice this while editing, but it does matter for the
|
||||
|shada-file|. And Vim expects the terminal to use utf-8 too. Thus
|
||||
setting 'encoding' to one of these values instead of utf-8 only has
|
||||
effect for encoding used for files when 'fileencoding' is empty.
|
||||
|
||||
*'endofline'* *'eol'* *'noendofline'* *'noeol'*
|
||||
'endofline' 'eol' boolean (default on)
|
||||
local to buffer
|
||||
@ -2304,20 +2265,14 @@ A jump table for the options with a short description can be found at |Q_op|.
|
||||
feature}
|
||||
Sets the character encoding for the file of this buffer.
|
||||
|
||||
When 'fileencoding' is different from 'encoding', conversion will be
|
||||
When 'fileencoding' is different from "utf-8", conversion will be
|
||||
done when writing the file. For reading see below.
|
||||
When 'fileencoding' is empty, the same value as 'encoding' will be
|
||||
used (no conversion when reading or writing a file).
|
||||
Conversion will also be done when 'encoding' and 'fileencoding' are
|
||||
both a Unicode encoding and 'fileencoding' is not utf-8. That's
|
||||
because internally Unicode is always stored as utf-8.
|
||||
WARNING: Conversion can cause loss of information! When
|
||||
'encoding' is "utf-8" or another Unicode encoding, conversion
|
||||
is most likely done in a way that the reverse conversion
|
||||
results in the same text. When 'encoding' is not "utf-8" some
|
||||
characters may be lost!
|
||||
When 'fileencoding' is empty, the file will be saved with utf-8
|
||||
encoding (no conversion when reading or writing a file).
|
||||
WARNING: Conversion to a non-Unicode encoding can cause loss of
|
||||
information!
|
||||
|
||||
See 'encoding' for the possible values. Additionally, values may be
|
||||
See |encoding-names| for the possible values. Additionally, values may be
|
||||
specified that can be handled by the converter, see
|
||||
|mbyte-conversion|.
|
||||
|
||||
@ -2330,8 +2285,8 @@ A jump table for the options with a short description can be found at |Q_op|.
|
||||
Prepending "8bit-" and "2byte-" has no meaning here, they are ignored.
|
||||
When the option is set, the value is converted to lowercase. Thus
|
||||
you can set it with uppercase values too. '_' characters are
|
||||
replaced with '-'. If a name is recognized from the list for
|
||||
'encoding', it is replaced by the standard name. For example
|
||||
replaced with '-'. If a name is recognized from the list at
|
||||
|encoding-names|, it is replaced by the standard name. For example
|
||||
"ISO8859-2" becomes "iso-8859-2".
|
||||
|
||||
When this option is set, after starting to edit a file, the 'modified'
|
||||
@ -2354,12 +2309,8 @@ A jump table for the options with a short description can be found at |Q_op|.
|
||||
mentioned character encoding. If an error is detected, the next one
|
||||
in the list is tried. When an encoding is found that works,
|
||||
'fileencoding' is set to it. If all fail, 'fileencoding' is set to
|
||||
an empty string, which means the value of 'encoding' is used.
|
||||
WARNING: Conversion can cause loss of information! When
|
||||
'encoding' is "utf-8" (or one of the other Unicode variants)
|
||||
conversion is most likely done in a way that the reverse
|
||||
conversion results in the same text. When 'encoding' is not
|
||||
"utf-8" some non-ASCII characters may be lost! You can use
|
||||
an empty string, which means that UTF-8 is used.
|
||||
WARNING: Conversion can cause loss of information! You can use
|
||||
the |++bad| argument to specify what is done with characters
|
||||
that can't be converted.
|
||||
For an empty file or a file with only ASCII characters most encodings
|
||||
@ -2385,11 +2336,11 @@ A jump table for the options with a short description can be found at |Q_op|.
|
||||
because Vim cannot detect an error, thus the encoding is always
|
||||
accepted.
|
||||
The special value "default" can be used for the encoding from the
|
||||
environment. It is useful when 'encoding' is set to "utf-8" and
|
||||
your environment uses a non-latin1 encoding, such as Russian.
|
||||
When 'encoding' is "utf-8" and a file contains an illegal byte
|
||||
sequence it won't be recognized as UTF-8. You can use the |8g8|
|
||||
command to find the illegal byte sequence.
|
||||
environment. It is useful when your environment uses a non-latin1
|
||||
encoding, such as Russian.
|
||||
When a file contains an illegal UTF-8 byte sequence it won't be
|
||||
recognized as "utf-8". You can use the |8g8| command to find the
|
||||
illegal byte sequence.
|
||||
WRONG VALUES: WHAT'S WRONG:
|
||||
latin1,utf-8 "latin1" will always be used
|
||||
utf-8,ucs-bom,latin1 BOM won't be recognized in an utf-8
|
||||
@ -3048,8 +2999,7 @@ A jump table for the options with a short description can be found at |Q_op|.
|
||||
Note: The size of these fonts must be exactly twice as wide as the one
|
||||
specified with 'guifont' and the same height.
|
||||
|
||||
'guifontwide' is only used when 'encoding' is set to "utf-8" and
|
||||
'guifontset' is empty or invalid.
|
||||
'guifontwide' is only used when 'guifontset' is empty or invalid.
|
||||
When 'guifont' is set and a valid font is found in it and
|
||||
'guifontwide' is empty Vim will attempt to find a matching
|
||||
double-width font and set 'guifontwide' to it.
|
||||
@ -3702,7 +3652,7 @@ A jump table for the options with a short description can be found at |Q_op|.
|
||||
128 - 159 "~@" - "~_"
|
||||
160 - 254 "| " - "|~"
|
||||
255 "~?"
|
||||
When 'encoding' is a Unicode one, illegal bytes from 128 to 255 are
|
||||
Illegal bytes from 128 to 255 (invalid UTF-8) are
|
||||
displayed as <xx>, with the hexadecimal value of the byte.
|
||||
When 'display' contains "uhex" all unprintable characters are
|
||||
displayed as <xx>.
|
||||
@ -3980,8 +3930,7 @@ A jump table for the options with a short description can be found at |Q_op|.
|
||||
omitted.
|
||||
|
||||
The characters ':' and ',' should not be used. UTF-8 characters can
|
||||
be used when 'encoding' is "utf-8", otherwise only printable
|
||||
characters are allowed. All characters must be single width.
|
||||
be used. All characters must be single width.
|
||||
|
||||
Examples: >
|
||||
:set lcs=tab:>-,trail:-
|
||||
@ -4078,7 +4027,6 @@ A jump table for the options with a short description can be found at |Q_op|.
|
||||
{only available when compiled with the |+multi_byte|
|
||||
feature}
|
||||
The maximum number of combining characters supported for displaying.
|
||||
Only used when 'encoding' is "utf-8".
|
||||
The default is OK for most languages. Hebrew may require 4.
|
||||
Maximum value is 6.
|
||||
Even when this option is set to 2 you can still edit text with more
|
||||
@ -5825,9 +5773,6 @@ A jump table for the options with a short description can be found at |Q_op|.
|
||||
(_xx is an underscore, two letters and followed by a non-letter).
|
||||
This is mainly for testing purposes. You must make sure the correct
|
||||
encoding is used, Vim doesn't check it.
|
||||
When 'encoding' is set the word lists are reloaded. Thus it's a good
|
||||
idea to set 'spelllang' after setting 'encoding' to avoid loading the
|
||||
files twice.
|
||||
How the related spell files are found is explained here: |spell-load|.
|
||||
|
||||
If the |spellfile.vim| plugin is active and you use a language name
|
||||
|
@ -40,7 +40,6 @@ these differences.
|
||||
- 'complete' doesn't include "i"
|
||||
- 'directory' defaults to ~/.local/share/nvim/swap// (|xdg|), auto-created
|
||||
- 'display' defaults to "lastline"
|
||||
- 'encoding' defaults to "utf-8"
|
||||
- 'formatoptions' defaults to "tcqj"
|
||||
- 'history' defaults to 10000 (the maximum)
|
||||
- 'hlsearch' is set by default
|
||||
@ -159,7 +158,7 @@ are always available and may be used simultaneously in separate plugins. The
|
||||
'p')) mkdir() will silently exit. In Vim this was an error.
|
||||
3. mkdir() error messages now include strerror() text when mkdir fails.
|
||||
|
||||
'encoding' cannot be changed after startup.
|
||||
'encoding' is always "utf-8".
|
||||
|
||||
|string()| and |:echo| behaviour changed:
|
||||
1. No maximum recursion depth limit is applied to nested container
|
||||
@ -266,6 +265,7 @@ Highlight groups:
|
||||
Other options:
|
||||
'antialias'
|
||||
'cpoptions' ("g", "w", "H", "*", "-", "j", and all POSIX flags were removed)
|
||||
'encoding' ("utf-8" is always used)
|
||||
'guioptions' "t" flag was removed
|
||||
*'guipty'* (Nvim uses pipes and PTYs consistently on all platforms.)
|
||||
*'imactivatefunc'* *'imaf'*
|
||||
|
@ -1612,9 +1612,7 @@ bool vim_islower(int c)
|
||||
return false;
|
||||
}
|
||||
|
||||
if (enc_latin1like) {
|
||||
return (latin1flags[c] & LATIN1LOWER) == LATIN1LOWER;
|
||||
}
|
||||
return (latin1flags[c] & LATIN1LOWER) == LATIN1LOWER;
|
||||
}
|
||||
return islower(c);
|
||||
}
|
||||
@ -1643,9 +1641,7 @@ bool vim_isupper(int c)
|
||||
return false;
|
||||
}
|
||||
|
||||
if (enc_latin1like) {
|
||||
return (latin1flags[c] & LATIN1UPPER) == LATIN1UPPER;
|
||||
}
|
||||
return (latin1flags[c] & LATIN1UPPER) == LATIN1UPPER;
|
||||
}
|
||||
return isupper(c);
|
||||
}
|
||||
@ -1670,9 +1666,7 @@ int vim_toupper(int c)
|
||||
return c;
|
||||
}
|
||||
|
||||
if (enc_latin1like) {
|
||||
return latin1upper[c];
|
||||
}
|
||||
return latin1upper[c];
|
||||
}
|
||||
return TOUPPER_LOC(c);
|
||||
}
|
||||
@ -1697,9 +1691,7 @@ int vim_tolower(int c)
|
||||
return c;
|
||||
}
|
||||
|
||||
if (enc_latin1like) {
|
||||
return latin1lower[c];
|
||||
}
|
||||
return latin1lower[c];
|
||||
}
|
||||
return TOLOWER_LOC(c);
|
||||
}
|
||||
|
@ -4165,9 +4165,8 @@ static bool need_conversion(const char_u *fenc)
|
||||
same_encoding = (enc_flags != 0 && fenc_flags == enc_flags);
|
||||
}
|
||||
if (same_encoding) {
|
||||
/* Specified encoding matches with 'encoding'. This requires
|
||||
* conversion when 'encoding' is Unicode but not UTF-8. */
|
||||
return enc_unicode != 0;
|
||||
// Specified file encoding matches UTF-8.
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Encodings differ. However, conversion is not needed when 'enc' is any
|
||||
|
@ -778,44 +778,18 @@ EXTERN int vr_lines_changed INIT(= 0); /* #Lines changed by "gR" so far */
|
||||
# define DBCS_2BYTE 1 /* 2byte- */
|
||||
# define DBCS_DEBUG -1
|
||||
|
||||
EXTERN int enc_dbcs INIT(= 0); /* One of DBCS_xxx values if
|
||||
DBCS encoding */
|
||||
EXTERN int enc_unicode INIT(= 0); /* 2: UCS-2 or UTF-16, 4: UCS-4 */
|
||||
EXTERN bool enc_utf8 INIT(= false); /* UTF-8 encoded Unicode */
|
||||
EXTERN int enc_latin1like INIT(= TRUE); /* 'encoding' is latin1 comp. */
|
||||
EXTERN int has_mbyte INIT(= 0); /* any multi-byte encoding */
|
||||
// mbyte flags that used to depend on 'encoding'. These are now deprecated, as
|
||||
// 'encoding' is always "utf-8". Code that uses them can be refactored to
|
||||
// remove dead code.
|
||||
#define enc_dbcs false
|
||||
#define enc_utf8 true
|
||||
#define has_mbyte true
|
||||
|
||||
/// Encoding used when 'fencs' is set to "default"
|
||||
EXTERN char_u *fenc_default INIT(= NULL);
|
||||
|
||||
/*
|
||||
* To speed up BYTELEN() we fill a table with the byte lengths whenever
|
||||
* enc_utf8 or enc_dbcs changes.
|
||||
*/
|
||||
EXTERN char mb_bytelen_tab[256];
|
||||
|
||||
/*
|
||||
* Function pointers, used to quickly get to the right function. Each has
|
||||
* three possible values: latin_ (8-bit), utfc_ or utf_ (utf-8) and dbcs_
|
||||
* (DBCS).
|
||||
* The value is set in mb_init();
|
||||
*/
|
||||
/* length of char in bytes, including following composing chars */
|
||||
EXTERN int (*mb_ptr2len)(const char_u *p) INIT(= latin_ptr2len);
|
||||
/* idem, with limit on string length */
|
||||
EXTERN int (*mb_ptr2len_len)(const char_u *p, int size) INIT(= latin_ptr2len_len);
|
||||
/* byte length of char */
|
||||
EXTERN int (*mb_char2len)(int c) INIT(= latin_char2len);
|
||||
/* convert char to bytes, return the length */
|
||||
EXTERN int (*mb_char2bytes)(int c, char_u *buf) INIT(= latin_char2bytes);
|
||||
EXTERN int (*mb_ptr2cells)(const char_u *p) INIT(= latin_ptr2cells);
|
||||
EXTERN int (*mb_ptr2cells_len)(const char_u *p, int size) INIT(
|
||||
= latin_ptr2cells_len);
|
||||
EXTERN int (*mb_char2cells)(int c) INIT(= latin_char2cells);
|
||||
EXTERN int (*mb_off2cells)(unsigned off, unsigned max_off) INIT(
|
||||
= latin_off2cells);
|
||||
EXTERN int (*mb_ptr2char)(const char_u *p) INIT(= latin_ptr2char);
|
||||
EXTERN int (*mb_head_off)(const char_u *base, const char_u *p) INIT(= latin_head_off);
|
||||
// To speed up BYTELEN() we keep a table with the byte lengths for utf-8
|
||||
EXTERN char utf8len_tab[256];
|
||||
|
||||
# if defined(USE_ICONV) && defined(DYNAMIC_ICONV)
|
||||
/* Pointers to functions and variables to be loaded at runtime */
|
||||
|
@ -122,32 +122,29 @@
|
||||
/* Whether to draw the vertical bar on the right side of the cell. */
|
||||
# define CURSOR_BAR_RIGHT (curwin->w_p_rl && (!(State & CMDLINE) || cmdmsg_rl))
|
||||
|
||||
/*
|
||||
* mb_ptr_adv(): advance a pointer to the next character, taking care of
|
||||
* multi-byte characters if needed.
|
||||
* mb_ptr_back(): backup a pointer to the previous character, taking care of
|
||||
* multi-byte characters if needed.
|
||||
* MB_COPY_CHAR(f, t): copy one char from "f" to "t" and advance the pointers.
|
||||
* PTR2CHAR(): get character from pointer.
|
||||
*/
|
||||
/* Get the length of the character p points to */
|
||||
# define MB_PTR2LEN(p) (has_mbyte ? (*mb_ptr2len)(p) : 1)
|
||||
/* Advance multi-byte pointer, skip over composing chars. */
|
||||
# define mb_ptr_adv(p) (p += has_mbyte ? (*mb_ptr2len)((char_u *)p) : 1)
|
||||
/* Advance multi-byte pointer, do not skip over composing chars. */
|
||||
# define mb_cptr_adv(p) (p += \
|
||||
enc_utf8 ? utf_ptr2len(p) : has_mbyte ? (*mb_ptr2len)(p) : 1)
|
||||
/* Backup multi-byte pointer. Only use with "p" > "s" ! */
|
||||
# define mb_ptr_back(s, p) (p -= has_mbyte ? ((*mb_head_off)((char_u *)s, (char_u *)p - 1) + 1) : 1)
|
||||
/* get length of multi-byte char, not including composing chars */
|
||||
# define mb_cptr2len(p) (enc_utf8 ? utf_ptr2len(p) : (*mb_ptr2len)(p))
|
||||
// mb_ptr_adv(): advance a pointer to the next character, taking care of
|
||||
// multi-byte characters if needed.
|
||||
// mb_ptr_back(): backup a pointer to the previous character, taking care of
|
||||
// multi-byte characters if needed.
|
||||
// MB_COPY_CHAR(f, t): copy one char from "f" to "t" and advance the pointers.
|
||||
// PTR2CHAR(): get character from pointer.
|
||||
|
||||
# define MB_COPY_CHAR(f, t) \
|
||||
if (has_mbyte) mb_copy_char((const char_u **)(&f), &t); \
|
||||
else *t++ = *f++
|
||||
# define MB_CHARLEN(p) (has_mbyte ? mb_charlen(p) : (int)STRLEN(p))
|
||||
# define MB_CHAR2LEN(c) (has_mbyte ? mb_char2len(c) : 1)
|
||||
# define PTR2CHAR(p) (has_mbyte ? mb_ptr2char(p) : (int)*(p))
|
||||
// Get the length of the character p points to
|
||||
# define MB_PTR2LEN(p) mb_ptr2len(p)
|
||||
// Advance multi-byte pointer, skip over composing chars.
|
||||
# define mb_ptr_adv(p) (p += mb_ptr2len((char_u *)p))
|
||||
// Advance multi-byte pointer, do not skip over composing chars.
|
||||
# define mb_cptr_adv(p) (p += utf_ptr2len(p))
|
||||
// Backup multi-byte pointer. Only use with "p" > "s" !
|
||||
# define mb_ptr_back(s, p) (p -= mb_head_off((char_u *)s, (char_u *)p - 1) + 1)
|
||||
// get length of multi-byte char, not including composing chars
|
||||
# define mb_cptr2len(p) utf_ptr2len(p)
|
||||
|
||||
// Copy one character from "f" to "t" and advance both pointers.
// Deliberately has no trailing semicolon: the caller supplies it, so the
// macro expands to a single statement and is safe in an unbraced if/else.
# define MB_COPY_CHAR(f, t) mb_copy_char((const char_u **)(&f), &t)
|
||||
|
||||
# define MB_CHARLEN(p) mb_charlen(p)
|
||||
# define MB_CHAR2LEN(c) mb_char2len(c)
|
||||
# define PTR2CHAR(p) mb_ptr2char(p)
|
||||
|
||||
# define RESET_BINDING(wp) (wp)->w_p_scb = FALSE; (wp)->w_p_crb = FALSE
|
||||
|
||||
|
@ -177,7 +177,6 @@ void early_init(void)
|
||||
fs_init();
|
||||
handle_init();
|
||||
|
||||
(void)mb_init(); // init mb_bytelen_tab[] to ones
|
||||
eval_init(); // init global variables
|
||||
|
||||
// Init the table of Normal mode commands.
|
||||
|
609
src/nvim/mbyte.c
609
src/nvim/mbyte.c
@ -1,68 +1,27 @@
|
||||
/*
|
||||
* mbyte.c: Code specifically for handling multi-byte characters.
|
||||
* Multibyte extensions partly by Sung-Hoon Baek
|
||||
*
|
||||
* The encoding used in the core is set with 'encoding'. When 'encoding' is
|
||||
* changed, the following four variables are set (for speed).
|
||||
* Currently these types of character encodings are supported:
|
||||
*
|
||||
* "enc_dbcs" When non-zero it tells the type of double byte character
|
||||
* encoding (Chinese, Korean, Japanese, etc.).
|
||||
* The cell width on the display is equal to the number of
|
||||
* bytes. (exception: DBCS_JPNU with first byte 0x8e)
|
||||
* Recognizing the first or second byte is difficult, it
|
||||
* requires checking a byte sequence from the start.
|
||||
* "enc_utf8" When TRUE use Unicode characters in UTF-8 encoding.
|
||||
* The cell width on the display needs to be determined from
|
||||
* the character value.
|
||||
* Recognizing bytes is easy: 0xxx.xxxx is a single-byte
|
||||
* char, 10xx.xxxx is a trailing byte, 11xx.xxxx is a leading
|
||||
* byte of a multi-byte character.
|
||||
* To make things complicated, up to six composing characters
|
||||
* are allowed. These are drawn on top of the first char.
|
||||
* For most editing the sequence of bytes with composing
|
||||
* characters included is considered to be one character.
|
||||
* "enc_unicode" When 2 use 16-bit Unicode characters (or UTF-16).
|
||||
* When 4 use 32-bit Unicode characters.
|
||||
* Internally characters are stored in UTF-8 encoding to
|
||||
* avoid NUL bytes. Conversion happens when doing I/O.
|
||||
* "enc_utf8" will also be TRUE.
|
||||
*
|
||||
* "has_mbyte" is set when "enc_dbcs" or "enc_utf8" is non-zero.
|
||||
*
|
||||
* If none of these is TRUE, 8-bit bytes are used for a character. The
|
||||
* encoding isn't currently specified (TODO).
|
||||
*
|
||||
* 'encoding' specifies the encoding used in the core. This is in registers,
|
||||
* text manipulation, buffers, etc. Conversion has to be done when characters
|
||||
* in another encoding are received or sent:
|
||||
*
|
||||
* clipboard
|
||||
* ^
|
||||
* | (2)
|
||||
* V
|
||||
* +---------------+
|
||||
* (1) | | (3)
|
||||
* keyboard ----->| core |-----> display
|
||||
* | |
|
||||
* +---------------+
|
||||
* ^
|
||||
* | (4)
|
||||
* V
|
||||
* file
|
||||
*
|
||||
* (1) Typed characters arrive in the current locale.
|
||||
* (2) Text will be made available with the encoding specified with
|
||||
* 'encoding'. If this is not sufficient, system-specific conversion
|
||||
* might be required.
|
||||
* (3) For the GUI the correct font must be selected, no conversion done.
|
||||
* (4) The encoding of the file is specified with 'fileencoding'. Conversion
|
||||
* is to be done when it's different from 'encoding'.
|
||||
*
|
||||
* The ShaDa file is a special case: Only text is converted, not file names.
|
||||
* Vim scripts may contain an ":encoding" command. This has an effect for
|
||||
* some commands, like ":menutrans"
|
||||
*/
|
||||
/// mbyte.c: Code specifically for handling multi-byte characters.
|
||||
/// Multibyte extensions partly by Sung-Hoon Baek
|
||||
///
|
||||
/// The encoding used in nvim is always UTF-8. "enc_utf8" and "has_mbyte" are
|
||||
/// thus always true. "enc_dbcs" is always zero. The 'encoding' option is
|
||||
/// read-only and always reads "utf-8".
|
||||
///
|
||||
/// The cell width on the display needs to be determined from the character
|
||||
/// value. Recognizing UTF-8 bytes is easy: 0xxx.xxxx is a single-byte char,
|
||||
/// 10xx.xxxx is a trailing byte, 11xx.xxxx is a leading byte of a multi-byte
|
||||
/// character. To make things complicated, up to six composing characters
|
||||
/// are allowed. These are drawn on top of the first char. For most editing
|
||||
/// the sequence of bytes with composing characters included is considered to
|
||||
/// be one character.
|
||||
///
|
||||
/// UTF-8 is used everywhere in the core. This is in registers, text
|
||||
/// manipulation, buffers, etc. Nvim core communicates with external plugins
|
||||
/// and GUIs in this encoding.
|
||||
///
|
||||
/// The encoding of a file is specified with 'fileencoding'. Conversion
|
||||
/// is to be done when it's different from "utf-8".
|
||||
///
|
||||
/// Vim scripts may contain a ":scriptencoding" command. This has an effect
|
||||
/// for some commands, like ":menutrans".
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdbool.h>
|
||||
@ -115,7 +74,7 @@ struct interval {
|
||||
* Bytes which are illegal when used as the first byte have a 1.
|
||||
* The NUL byte has length 1.
|
||||
*/
|
||||
static char utf8len_tab[256] =
|
||||
char utf8len_tab[256] =
|
||||
{
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
@ -384,207 +343,6 @@ int enc_canon_props(const char_u *name)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set up for using multi-byte characters.
|
||||
* Called in three cases:
|
||||
* - by main() to initialize (p_enc == NULL)
|
||||
* - by set_init_1() after 'encoding' was set to its default.
|
||||
* - by do_set() when 'encoding' has been set.
|
||||
* p_enc must have been passed through enc_canonize() already.
|
||||
* Sets the "enc_unicode", "enc_utf8", "enc_dbcs" and "has_mbyte" flags.
|
||||
* Fills mb_bytelen_tab[] and returns NULL when there are no problems.
|
||||
* When there is something wrong: Returns an error message and doesn't change
|
||||
* anything.
|
||||
*/
|
||||
char_u * mb_init(void)
|
||||
{
|
||||
int i;
|
||||
int idx;
|
||||
int n;
|
||||
int enc_dbcs_new = 0;
|
||||
#if defined(USE_ICONV) && !defined(WIN3264) && !defined(WIN32UNIX) \
|
||||
&& !defined(MACOS)
|
||||
# define LEN_FROM_CONV
|
||||
vimconv_T vimconv;
|
||||
char_u *p;
|
||||
#endif
|
||||
|
||||
if (p_enc == NULL) {
|
||||
/* Just starting up: set the whole table to one's. */
|
||||
for (i = 0; i < 256; ++i)
|
||||
mb_bytelen_tab[i] = 1;
|
||||
return NULL;
|
||||
} else if (STRNCMP(p_enc, "8bit-", 5) == 0
|
||||
|| STRNCMP(p_enc, "iso-8859-", 9) == 0) {
|
||||
/* Accept any "8bit-" or "iso-8859-" name. */
|
||||
enc_unicode = 0;
|
||||
enc_utf8 = false;
|
||||
} else if (STRNCMP(p_enc, "2byte-", 6) == 0) {
|
||||
/* Unix: accept any "2byte-" name, assume current locale. */
|
||||
enc_dbcs_new = DBCS_2BYTE;
|
||||
} else if ((idx = enc_canon_search(p_enc)) >= 0) {
|
||||
i = enc_canon_table[idx].prop;
|
||||
if (i & ENC_UNICODE) {
|
||||
/* Unicode */
|
||||
enc_utf8 = true;
|
||||
if (i & (ENC_2BYTE | ENC_2WORD))
|
||||
enc_unicode = 2;
|
||||
else if (i & ENC_4BYTE)
|
||||
enc_unicode = 4;
|
||||
else
|
||||
enc_unicode = 0;
|
||||
} else if (i & ENC_DBCS) {
|
||||
/* 2byte, handle below */
|
||||
enc_dbcs_new = enc_canon_table[idx].codepage;
|
||||
} else {
|
||||
/* Must be 8-bit. */
|
||||
enc_unicode = 0;
|
||||
enc_utf8 = false;
|
||||
}
|
||||
} else /* Don't know what encoding this is, reject it. */
|
||||
return e_invarg;
|
||||
|
||||
if (enc_dbcs_new != 0) {
|
||||
enc_unicode = 0;
|
||||
enc_utf8 = false;
|
||||
}
|
||||
enc_dbcs = enc_dbcs_new;
|
||||
has_mbyte = (enc_dbcs != 0 || enc_utf8);
|
||||
|
||||
|
||||
/* Detect an encoding that uses latin1 characters. */
|
||||
enc_latin1like = (enc_utf8 || STRCMP(p_enc, "latin1") == 0
|
||||
|| STRCMP(p_enc, "iso-8859-15") == 0);
|
||||
|
||||
/*
|
||||
* Set the function pointers.
|
||||
*/
|
||||
if (enc_utf8) {
|
||||
mb_ptr2len = utfc_ptr2len;
|
||||
mb_ptr2len_len = utfc_ptr2len_len;
|
||||
mb_char2len = utf_char2len;
|
||||
mb_char2bytes = utf_char2bytes;
|
||||
mb_ptr2cells = utf_ptr2cells;
|
||||
mb_ptr2cells_len = utf_ptr2cells_len;
|
||||
mb_char2cells = utf_char2cells;
|
||||
mb_off2cells = utf_off2cells;
|
||||
mb_ptr2char = utf_ptr2char;
|
||||
mb_head_off = utf_head_off;
|
||||
} else if (enc_dbcs != 0) {
|
||||
mb_ptr2len = dbcs_ptr2len;
|
||||
mb_ptr2len_len = dbcs_ptr2len_len;
|
||||
mb_char2len = dbcs_char2len;
|
||||
mb_char2bytes = dbcs_char2bytes;
|
||||
mb_ptr2cells = dbcs_ptr2cells;
|
||||
mb_ptr2cells_len = dbcs_ptr2cells_len;
|
||||
mb_char2cells = dbcs_char2cells;
|
||||
mb_off2cells = dbcs_off2cells;
|
||||
mb_ptr2char = dbcs_ptr2char;
|
||||
mb_head_off = dbcs_head_off;
|
||||
} else {
|
||||
mb_ptr2len = latin_ptr2len;
|
||||
mb_ptr2len_len = latin_ptr2len_len;
|
||||
mb_char2len = latin_char2len;
|
||||
mb_char2bytes = latin_char2bytes;
|
||||
mb_ptr2cells = latin_ptr2cells;
|
||||
mb_ptr2cells_len = latin_ptr2cells_len;
|
||||
mb_char2cells = latin_char2cells;
|
||||
mb_off2cells = latin_off2cells;
|
||||
mb_ptr2char = latin_ptr2char;
|
||||
mb_head_off = latin_head_off;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fill the mb_bytelen_tab[] for MB_BYTE2LEN().
|
||||
*/
|
||||
#ifdef LEN_FROM_CONV
|
||||
/* When 'encoding' is different from the current locale mblen() won't
|
||||
* work. Use conversion to "utf-8" instead. */
|
||||
vimconv.vc_type = CONV_NONE;
|
||||
if (enc_dbcs) {
|
||||
p = enc_locale();
|
||||
if (p == NULL || STRCMP(p, p_enc) != 0) {
|
||||
convert_setup(&vimconv, p_enc, (char_u *)"utf-8");
|
||||
vimconv.vc_fail = true;
|
||||
}
|
||||
xfree(p);
|
||||
}
|
||||
#endif
|
||||
|
||||
for (i = 0; i < 256; ++i) {
|
||||
/* Our own function to reliably check the length of UTF-8 characters,
|
||||
* independent of mblen(). */
|
||||
if (enc_utf8)
|
||||
n = utf8len_tab[i];
|
||||
else if (enc_dbcs == 0)
|
||||
n = 1;
|
||||
else {
|
||||
char buf[MB_MAXBYTES + 1];
|
||||
if (i == NUL) /* just in case mblen() can't handle "" */
|
||||
n = 1;
|
||||
else {
|
||||
buf[0] = i;
|
||||
buf[1] = 0;
|
||||
#ifdef LEN_FROM_CONV
|
||||
if (vimconv.vc_type != CONV_NONE) {
|
||||
/*
|
||||
* string_convert() should fail when converting the first
|
||||
* byte of a double-byte character.
|
||||
*/
|
||||
p = string_convert(&vimconv, (char_u *)buf, NULL);
|
||||
if (p != NULL) {
|
||||
xfree(p);
|
||||
n = 1;
|
||||
} else
|
||||
n = 2;
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
/*
|
||||
* mblen() should return -1 for invalid (means the leading
|
||||
* multibyte) character. However there are some platforms
|
||||
* where mblen() returns 0 for invalid character.
|
||||
* Therefore, following condition includes 0.
|
||||
*/
|
||||
ignored = mblen(NULL, 0); /* First reset the state. */
|
||||
if (mblen(buf, (size_t)1) <= 0)
|
||||
n = 2;
|
||||
else
|
||||
n = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
mb_bytelen_tab[i] = n;
|
||||
}
|
||||
|
||||
#ifdef LEN_FROM_CONV
|
||||
convert_setup(&vimconv, NULL, NULL);
|
||||
#endif
|
||||
|
||||
/* The cell width depends on the type of multi-byte characters. */
|
||||
(void)init_chartab();
|
||||
|
||||
/* When enc_utf8 is set or reset, (de)allocate ScreenLinesUC[] */
|
||||
screenalloc(false);
|
||||
|
||||
#ifdef HAVE_WORKING_LIBINTL
|
||||
/* GNU gettext 0.10.37 supports this feature: set the codeset used for
|
||||
* translated messages independently from the current locale. */
|
||||
(void)bind_textdomain_codeset(PROJECT_NAME,
|
||||
enc_utf8 ? "utf-8" : (char *)p_enc);
|
||||
#endif
|
||||
|
||||
|
||||
/* Fire an autocommand to let people do custom font setup. This must be
|
||||
* after Vim has been setup for the new encoding. */
|
||||
apply_autocmds(EVENT_ENCODINGCHANGED, NULL, (char_u *)"", FALSE, curbuf);
|
||||
|
||||
/* Need to reload spell dictionaries */
|
||||
spell_reload();
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the size of the BOM for the current buffer:
|
||||
* 0 - no BOM
|
||||
@ -597,20 +355,15 @@ int bomb_size(void)
|
||||
int n = 0;
|
||||
|
||||
if (curbuf->b_p_bomb && !curbuf->b_p_bin) {
|
||||
if (*curbuf->b_p_fenc == NUL) {
|
||||
if (enc_utf8) {
|
||||
if (enc_unicode != 0)
|
||||
n = enc_unicode;
|
||||
else
|
||||
n = 3;
|
||||
}
|
||||
} else if (STRCMP(curbuf->b_p_fenc, "utf-8") == 0)
|
||||
if (*curbuf->b_p_fenc == NUL
|
||||
|| STRCMP(curbuf->b_p_fenc, "utf-8") == 0) {
|
||||
n = 3;
|
||||
else if (STRNCMP(curbuf->b_p_fenc, "ucs-2", 5) == 0
|
||||
|| STRNCMP(curbuf->b_p_fenc, "utf-16", 6) == 0)
|
||||
} else if (STRNCMP(curbuf->b_p_fenc, "ucs-2", 5) == 0
|
||||
|| STRNCMP(curbuf->b_p_fenc, "utf-16", 6) == 0) {
|
||||
n = 2;
|
||||
else if (STRNCMP(curbuf->b_p_fenc, "ucs-4", 5) == 0)
|
||||
} else if (STRNCMP(curbuf->b_p_fenc, "ucs-4", 5) == 0) {
|
||||
n = 4;
|
||||
}
|
||||
}
|
||||
return n;
|
||||
}
|
||||
@ -803,99 +556,6 @@ int dbcs_class(unsigned lead, unsigned trail)
|
||||
return 3;
|
||||
}
|
||||
|
||||
/*
|
||||
* mb_char2len() function pointer.
|
||||
* Return length in bytes of character "c".
|
||||
* Returns 1 for a single-byte character.
|
||||
*/
|
||||
int latin_char2len(int c)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int dbcs_char2len(int c)
|
||||
{
|
||||
if (c >= 0x100)
|
||||
return 2;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* mb_char2bytes() function pointer.
|
||||
* Convert a character to its bytes.
|
||||
* Returns the length in bytes.
|
||||
*/
|
||||
int latin_char2bytes(int c, char_u *buf)
|
||||
{
|
||||
buf[0] = c;
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int dbcs_char2bytes(int c, char_u *buf)
|
||||
{
|
||||
if (c >= 0x100) {
|
||||
buf[0] = (unsigned)c >> 8;
|
||||
buf[1] = c;
|
||||
/* Never use a NUL byte, it causes lots of trouble. It's an invalid
|
||||
* character anyway. */
|
||||
if (buf[1] == NUL)
|
||||
buf[1] = '\n';
|
||||
return 2;
|
||||
}
|
||||
buf[0] = c;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* mb_ptr2len() function pointer.
|
||||
* Get byte length of character at "*p" but stop at a NUL.
|
||||
* For UTF-8 this includes following composing characters.
|
||||
* Returns 0 when *p is NUL.
|
||||
*/
|
||||
int latin_ptr2len(const char_u *p)
|
||||
{
|
||||
return MB_BYTE2LEN(*p);
|
||||
}
|
||||
|
||||
static int dbcs_ptr2len(const char_u *p)
|
||||
{
|
||||
int len;
|
||||
|
||||
/* Check if second byte is not missing. */
|
||||
len = MB_BYTE2LEN(*p);
|
||||
if (len == 2 && p[1] == NUL)
|
||||
len = 1;
|
||||
return len;
|
||||
}
|
||||
|
||||
/*
|
||||
* mb_ptr2len_len() function pointer.
|
||||
* Like mb_ptr2len(), but limit to read "size" bytes.
|
||||
* Returns 0 for an empty string.
|
||||
* Returns 1 for an illegal char or an incomplete byte sequence.
|
||||
*/
|
||||
int latin_ptr2len_len(const char_u *p, int size)
|
||||
{
|
||||
if (size < 1 || *p == NUL)
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int dbcs_ptr2len_len(const char_u *p, int size)
|
||||
{
|
||||
int len;
|
||||
|
||||
if (size < 1 || *p == NUL)
|
||||
return 0;
|
||||
if (size == 1)
|
||||
return 1;
|
||||
/* Check that second byte is not missing. */
|
||||
len = MB_BYTE2LEN(*p);
|
||||
if (len == 2 && p[1] == NUL)
|
||||
len = 1;
|
||||
return len;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return true if "c" is in "table".
|
||||
*/
|
||||
@ -963,16 +623,8 @@ int utf_char2cells(int c)
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* mb_ptr2cells() function pointer.
|
||||
* Return the number of display cells character at "*p" occupies.
|
||||
* This doesn't take care of unprintable characters, use ptr2cells() for that.
|
||||
*/
|
||||
int latin_ptr2cells(const char_u *p)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
/// Return the number of display cells character at "*p" occupies.
|
||||
/// This doesn't take care of unprintable characters, use ptr2cells() for that.
|
||||
int utf_ptr2cells(const char_u *p)
|
||||
{
|
||||
int c;
|
||||
@ -991,26 +643,9 @@ int utf_ptr2cells(const char_u *p)
|
||||
return 1;
|
||||
}
|
||||
|
||||
int dbcs_ptr2cells(const char_u *p)
|
||||
{
|
||||
/* Number of cells is equal to number of bytes, except for euc-jp when
|
||||
* the first byte is 0x8e. */
|
||||
if (enc_dbcs == DBCS_JPNU && *p == 0x8e)
|
||||
return 1;
|
||||
return MB_BYTE2LEN(*p);
|
||||
}
|
||||
|
||||
/*
|
||||
* mb_ptr2cells_len() function pointer.
|
||||
* Like mb_ptr2cells(), but limit string length to "size".
|
||||
* For an empty string or truncated character returns 1.
|
||||
*/
|
||||
int latin_ptr2cells_len(const char_u *p, int size)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int utf_ptr2cells_len(const char_u *p, int size)
|
||||
/// Like utf_ptr2cells(), but limit string length to "size".
|
||||
/// For an empty string or truncated character returns 1.
|
||||
int utf_ptr2cells_len(const char_u *p, int size)
|
||||
{
|
||||
int c;
|
||||
|
||||
@ -1030,35 +665,6 @@ static int utf_ptr2cells_len(const char_u *p, int size)
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int dbcs_ptr2cells_len(const char_u *p, int size)
|
||||
{
|
||||
/* Number of cells is equal to number of bytes, except for euc-jp when
|
||||
* the first byte is 0x8e. */
|
||||
if (size <= 1 || (enc_dbcs == DBCS_JPNU && *p == 0x8e))
|
||||
return 1;
|
||||
return MB_BYTE2LEN(*p);
|
||||
}
|
||||
|
||||
/*
|
||||
* mb_char2cells() function pointer.
|
||||
* Return the number of display cells character "c" occupies.
|
||||
* Only takes care of multi-byte chars, not "^C" and such.
|
||||
*/
|
||||
int latin_char2cells(int c)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int dbcs_char2cells(int c)
|
||||
{
|
||||
/* Number of cells is equal to number of bytes, except for euc-jp when
|
||||
* the first byte is 0x8e. */
|
||||
if (enc_dbcs == DBCS_JPNU && ((unsigned)c >> 8) == 0x8e)
|
||||
return 1;
|
||||
/* use the first byte */
|
||||
return MB_BYTE2LEN((unsigned)c >> 8);
|
||||
}
|
||||
|
||||
/// Calculate the number of cells occupied by string `str`.
|
||||
///
|
||||
/// @param str The source string, may not be NULL, must be a NUL-terminated
|
||||
@ -1075,50 +681,13 @@ size_t mb_string2cells(const char_u *str)
|
||||
return clen;
|
||||
}
|
||||
|
||||
/*
|
||||
* mb_off2cells() function pointer.
|
||||
* Return number of display cells for char at ScreenLines[off].
|
||||
* We make sure that the offset used is less than "max_off".
|
||||
*/
|
||||
int latin_off2cells(unsigned off, unsigned max_off)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
int dbcs_off2cells(unsigned off, unsigned max_off)
|
||||
{
|
||||
/* never check beyond end of the line */
|
||||
if (off >= max_off)
|
||||
return 1;
|
||||
|
||||
/* Number of cells is equal to number of bytes, except for euc-jp when
|
||||
* the first byte is 0x8e. */
|
||||
if (enc_dbcs == DBCS_JPNU && ScreenLines[off] == 0x8e)
|
||||
return 1;
|
||||
return MB_BYTE2LEN(ScreenLines[off]);
|
||||
}
|
||||
|
||||
/// Return number of display cells for char at ScreenLines[off].
|
||||
/// We make sure that the offset used is less than "max_off".
|
||||
int utf_off2cells(unsigned off, unsigned max_off)
|
||||
{
|
||||
return (off + 1 < max_off && ScreenLines[off + 1] == 0) ? 2 : 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* mb_ptr2char() function pointer.
|
||||
* Convert a byte sequence into a character.
|
||||
*/
|
||||
int latin_ptr2char(const char_u *p)
|
||||
{
|
||||
return *p;
|
||||
}
|
||||
|
||||
static int dbcs_ptr2char(const char_u *p)
|
||||
{
|
||||
if (MB_BYTE2LEN(*p) > 1 && p[1] != NUL)
|
||||
return (p[0] << 8) + p[1];
|
||||
return *p;
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert a UTF-8 byte sequence to a wide character.
|
||||
* If the sequence is illegal or truncated by a NUL the first byte is
|
||||
@ -2065,68 +1634,9 @@ void show_utf8(void)
|
||||
msg(IObuff);
|
||||
}
|
||||
|
||||
/*
|
||||
* mb_head_off() function pointer.
|
||||
* Return offset from "p" to the first byte of the character it points into.
|
||||
* If "p" points to the NUL at the end of the string return 0.
|
||||
* Returns 0 when already at the first byte of a character.
|
||||
*/
|
||||
int latin_head_off(const char_u *base, const char_u *p)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
int dbcs_head_off(const char_u *base, const char_u *p)
|
||||
{
|
||||
/* It can't be a trailing byte when not using DBCS, at the start of the
|
||||
* string or the previous byte can't start a double-byte. */
|
||||
if (p <= base || MB_BYTE2LEN(p[-1]) == 1 || *p == NUL) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* This is slow: need to start at the base and go forward until the
|
||||
* byte we are looking for. Return 1 when we went past it, 0 otherwise. */
|
||||
const char_u *q = base;
|
||||
while (q < p) {
|
||||
q += dbcs_ptr2len(q);
|
||||
}
|
||||
|
||||
return (q == p) ? 0 : 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Special version of dbcs_head_off() that works for ScreenLines[], where
|
||||
* single-width DBCS_JPNU characters are stored separately.
|
||||
*/
|
||||
int dbcs_screen_head_off(const char_u *base, const char_u *p)
|
||||
{
|
||||
/* It can't be a trailing byte when not using DBCS, at the start of the
|
||||
* string or the previous byte can't start a double-byte.
|
||||
* For euc-jp an 0x8e byte in the previous cell always means we have a
|
||||
* lead byte in the current cell. */
|
||||
if (p <= base
|
||||
|| (enc_dbcs == DBCS_JPNU && p[-1] == 0x8e)
|
||||
|| MB_BYTE2LEN(p[-1]) == 1
|
||||
|| *p == NUL)
|
||||
return 0;
|
||||
|
||||
/* This is slow: need to start at the base and go forward until the
|
||||
* byte we are looking for. Return 1 when we went past it, 0 otherwise.
|
||||
* For DBCS_JPNU look out for 0x8e, which means the second byte is not
|
||||
* stored as the next byte. */
|
||||
const char_u *q = base;
|
||||
while (q < p) {
|
||||
if (enc_dbcs == DBCS_JPNU && *q == 0x8e) {
|
||||
++q;
|
||||
}
|
||||
else {
|
||||
q += dbcs_ptr2len(q);
|
||||
}
|
||||
}
|
||||
|
||||
return (q == p) ? 0 : 1;
|
||||
}
|
||||
|
||||
/// Return offset from "p" to the first byte of the character it points into.
|
||||
/// If "p" points to the NUL at the end of the string return 0.
|
||||
/// Returns 0 when already at the first byte of a character.
|
||||
int utf_head_off(const char_u *base, const char_u *p)
|
||||
{
|
||||
int c;
|
||||
@ -2232,26 +1742,20 @@ int mb_tail_off(char_u *base, char_u *p)
|
||||
if (*p == NUL)
|
||||
return 0;
|
||||
|
||||
if (enc_utf8) {
|
||||
/* Find the last character that is 10xx.xxxx */
|
||||
for (i = 0; (p[i + 1] & 0xc0) == 0x80; ++i)
|
||||
;
|
||||
/* Check for illegal sequence. */
|
||||
for (j = 0; p - j > base; ++j)
|
||||
if ((p[-j] & 0xc0) != 0x80)
|
||||
break;
|
||||
if (utf8len_tab[p[-j]] != i + j + 1)
|
||||
return 0;
|
||||
return i;
|
||||
// Find the last character that is 10xx.xxxx
|
||||
for (i = 0; (p[i + 1] & 0xc0) == 0x80; i++) {}
|
||||
|
||||
// Check for illegal sequence.
|
||||
for (j = 0; p - j > base; j++) {
|
||||
if ((p[-j] & 0xc0) != 0x80) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* It can't be the first byte if a double-byte when not using DBCS, at the
|
||||
* end of the string or the byte can't start a double-byte. */
|
||||
if (enc_dbcs == 0 || p[1] == NUL || MB_BYTE2LEN(*p) == 1)
|
||||
if (utf8len_tab[p[-j]] != i + j + 1) {
|
||||
return 0;
|
||||
|
||||
/* Return 1 when on the lead byte, 0 when on the tail byte. */
|
||||
return 1 - dbcs_head_off(base, p);
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -2466,13 +1970,10 @@ int mb_fix_col(int col, int row)
|
||||
{
|
||||
col = check_col(col);
|
||||
row = check_row(row);
|
||||
if (has_mbyte && ScreenLines != NULL && col > 0
|
||||
&& ((enc_dbcs
|
||||
&& ScreenLines[LineOffset[row] + col] != NUL
|
||||
&& dbcs_screen_head_off(ScreenLines + LineOffset[row],
|
||||
ScreenLines + LineOffset[row] + col))
|
||||
|| (enc_utf8 && ScreenLines[LineOffset[row] + col] == 0)))
|
||||
if (ScreenLines != NULL && col > 0
|
||||
&& ScreenLines[LineOffset[row] + col] == 0) {
|
||||
return col - 1;
|
||||
}
|
||||
return col;
|
||||
}
|
||||
|
||||
|
@ -9,8 +9,8 @@
|
||||
* MB_BYTE2LEN_CHECK() can be used to count a special key as one byte.
|
||||
* Don't call MB_BYTE2LEN(b) with b < 0 or b > 255!
|
||||
*/
|
||||
#define MB_BYTE2LEN(b) mb_bytelen_tab[b]
|
||||
#define MB_BYTE2LEN_CHECK(b) (((b) < 0 || (b) > 255) ? 1 : mb_bytelen_tab[b])
|
||||
#define MB_BYTE2LEN(b) utf8len_tab[b]
|
||||
#define MB_BYTE2LEN_CHECK(b) (((b) < 0 || (b) > 255) ? 1 : utf8len_tab[b])
|
||||
|
||||
/* properties used in enc_canon_table[] (first three mutually exclusive) */
|
||||
#define ENC_8BIT 0x01
|
||||
@ -28,6 +28,18 @@
|
||||
#define ENC_LATIN9 0x400 /* Latin9 */
|
||||
#define ENC_MACROMAN 0x800 /* Mac Roman (not Macro Man! :-) */
|
||||
|
||||
// TODO(bfredl): eventually we should keep only one of the namings
|
||||
#define mb_ptr2len utfc_ptr2len
|
||||
#define mb_ptr2len_len utfc_ptr2len_len
|
||||
#define mb_char2len utf_char2len
|
||||
#define mb_char2bytes utf_char2bytes
|
||||
#define mb_ptr2cells utf_ptr2cells
|
||||
#define mb_ptr2cells_len utf_ptr2cells_len
|
||||
#define mb_char2cells utf_char2cells
|
||||
#define mb_off2cells utf_off2cells
|
||||
#define mb_ptr2char utf_ptr2char
|
||||
#define mb_head_off utf_head_off
|
||||
|
||||
#ifdef INCLUDE_GENERATED_DECLARATIONS
|
||||
# include "mbyte.h.generated.h"
|
||||
#endif
|
||||
|
@ -1936,8 +1936,7 @@ int swapchar(int op_type, pos_T *pos)
|
||||
if (c >= 0x80 && op_type == OP_ROT13)
|
||||
return FALSE;
|
||||
|
||||
if (op_type == OP_UPPER && c == 0xdf
|
||||
&& (enc_latin1like || STRCMP(p_enc, "iso-8859-2") == 0)) {
|
||||
if (op_type == OP_UPPER && c == 0xdf) {
|
||||
pos_T sp = curwin->w_cursor;
|
||||
|
||||
/* Special handling of German sharp s: change to "SS". */
|
||||
|
@ -780,14 +780,11 @@ void set_init_1(void)
|
||||
}
|
||||
fenc_default = p;
|
||||
|
||||
// Initialize multibyte (utf-8) handling
|
||||
mb_init();
|
||||
|
||||
// Don't change &encoding when resetting to defaults with ":set all&".
|
||||
opt_idx = findoption((char_u *)"encoding");
|
||||
if (opt_idx >= 0) {
|
||||
options[opt_idx].flags |= P_NODEFAULT;
|
||||
}
|
||||
#ifdef HAVE_WORKING_LIBINTL
|
||||
// GNU gettext 0.10.37 supports this feature: set the codeset used for
|
||||
// translated messages independently from the current locale.
|
||||
(void)bind_textdomain_codeset(PROJECT_NAME, (char *)p_enc);
|
||||
#endif
|
||||
|
||||
/* Set the default for 'helplang'. */
|
||||
set_helplang_default(get_mess_lang());
|
||||
@ -2580,19 +2577,17 @@ did_set_string_option (
|
||||
errmsg = e_invarg;
|
||||
/* 'encoding' and 'fileencoding' */
|
||||
} else if (varp == &p_enc || gvarp == &p_fenc) {
|
||||
if (varp == &p_enc && did_source_startup_scripts) {
|
||||
errmsg = e_afterinit;
|
||||
} else if (gvarp == &p_fenc) {
|
||||
if (!MODIFIABLE(curbuf) && opt_flags != OPT_GLOBAL)
|
||||
if (gvarp == &p_fenc) {
|
||||
if (!MODIFIABLE(curbuf) && opt_flags != OPT_GLOBAL) {
|
||||
errmsg = e_modifiable;
|
||||
else if (vim_strchr(*varp, ',') != NULL)
|
||||
/* No comma allowed in 'fileencoding'; catches confusing it
|
||||
* with 'fileencodings'. */
|
||||
} else if (vim_strchr(*varp, ',') != NULL) {
|
||||
// No comma allowed in 'fileencoding'; catches confusing it
|
||||
// with 'fileencodings'.
|
||||
errmsg = e_invarg;
|
||||
else {
|
||||
/* May show a "+" in the title now. */
|
||||
} else {
|
||||
// May show a "+" in the title now.
|
||||
redraw_titles();
|
||||
/* Add 'fileencoding' to the swap file. */
|
||||
// Add 'fileencoding' to the swap file.
|
||||
ml_setflags(curbuf);
|
||||
}
|
||||
}
|
||||
@ -2603,17 +2598,12 @@ did_set_string_option (
|
||||
xfree(*varp);
|
||||
*varp = p;
|
||||
if (varp == &p_enc) {
|
||||
errmsg = mb_init();
|
||||
redraw_titles();
|
||||
// only encoding=utf-8 allowed
|
||||
if (STRCMP(p_enc, "utf-8") != 0) {
|
||||
errmsg = e_invarg;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (errmsg == NULL) {
|
||||
/* When 'keymap' is used and 'encoding' changes, reload the keymap
|
||||
* (with another encoding). */
|
||||
if (varp == &p_enc && *curbuf->b_p_keymap != NUL)
|
||||
(void)keymap_init();
|
||||
}
|
||||
} else if (varp == &p_penc) {
|
||||
/* Canonize printencoding if VIM standard one */
|
||||
p = enc_canonize(p_penc);
|
||||
|
@ -5292,7 +5292,7 @@ void screen_puts_len(char_u *text, int textlen, int row, int col, int attr)
|
||||
int force_redraw_next = FALSE;
|
||||
int need_redraw;
|
||||
|
||||
const int l_has_mbyte = has_mbyte;
|
||||
const bool l_has_mbyte = has_mbyte;
|
||||
const bool l_enc_utf8 = enc_utf8;
|
||||
const int l_enc_dbcs = enc_dbcs;
|
||||
|
||||
@ -5459,9 +5459,6 @@ void screen_puts_len(char_u *text, int textlen, int row, int col, int attr)
|
||||
/* If we detected the next character needs to be redrawn, but the text
|
||||
* doesn't extend up to there, update the character here. */
|
||||
if (force_redraw_next && col < screen_Columns) {
|
||||
if (l_enc_dbcs != 0 && dbcs_off2cells(off, max_off) > 1)
|
||||
screen_char_2(off, row, col);
|
||||
else
|
||||
screen_char(off, row, col);
|
||||
}
|
||||
}
|
||||
|
@ -9266,9 +9266,7 @@ static void allcap_copy(char_u *word, char_u *wcopy)
|
||||
else
|
||||
c = *s++;
|
||||
|
||||
// We only change 0xdf to SS when we are certain latin1 is used. It
|
||||
// would cause weird errors in other 8-bit encodings.
|
||||
if (enc_latin1like && c == 0xdf) {
|
||||
if (c == 0xdf) {
|
||||
c = 'S';
|
||||
if (d - wcopy >= MAXWLEN - 1)
|
||||
break;
|
||||
@ -12602,7 +12600,7 @@ static int spell_edit_score(slang_T *slang, char_u *badword, char_u *goodword)
|
||||
char_u *p;
|
||||
int wbadword[MAXWLEN];
|
||||
int wgoodword[MAXWLEN];
|
||||
const int l_has_mbyte = has_mbyte;
|
||||
const bool l_has_mbyte = has_mbyte;
|
||||
|
||||
if (l_has_mbyte) {
|
||||
// Get the characters from the multi-byte strings and put them in an
|
||||
|
@ -31,8 +31,8 @@ void term_input_init(TermInput *input, Loop *loop)
|
||||
if (!term) {
|
||||
term = ""; // termkey_new_abstract assumes non-null (#2745)
|
||||
}
|
||||
int enc_flag = enc_utf8 ? TERMKEY_FLAG_UTF8 : TERMKEY_FLAG_RAW;
|
||||
input->tk = termkey_new_abstract(term, enc_flag);
|
||||
|
||||
input->tk = termkey_new_abstract(term, TERMKEY_FLAG_UTF8);
|
||||
|
||||
int curflags = termkey_get_canonflags(input->tk);
|
||||
termkey_set_canonflags(input->tk, curflags | TERMKEY_CANON_DELBS);
|
||||
|
@ -489,18 +489,6 @@ describe('json_decode() function', function()
|
||||
'{"b": 3, "a": 1, "c": 4, "d": 2, "\\u0000": 4}')
|
||||
end)
|
||||
|
||||
it('converts strings to latin1 when &encoding is latin1', function()
|
||||
restart('--cmd', 'set encoding=latin1')
|
||||
eq('\171', funcs.json_decode('"\\u00AB"'))
|
||||
sp_decode_eq({_TYPE='string', _VAL={'\n\171\n'}}, '"\\u0000\\u00AB\\u0000"')
|
||||
end)
|
||||
|
||||
it('fails to convert string to latin1 if it is impossible', function()
|
||||
restart('--cmd', 'set encoding=latin1')
|
||||
eq('Vim(call):E474: Failed to convert string "ꯍ" from UTF-8',
|
||||
exc_exec('call json_decode(\'"\\uABCD"\')'))
|
||||
end)
|
||||
|
||||
it('parses U+00C3 correctly', function()
|
||||
eq('\195\131', funcs.json_decode('"\195\131"'))
|
||||
end)
|
||||
@ -528,14 +516,6 @@ describe('json_decode() function', function()
|
||||
eq({key={'val', 'val2'}, key2=1}, funcs.json_decode(str))
|
||||
end)
|
||||
|
||||
it('always treats input as UTF-8', function()
|
||||
-- When &encoding is latin1 string "«" is U+00C2 U+00AB U+00C2: «Â. So if
|
||||
-- '"«"' was parsed as latin1 json_decode would return three characters, and
|
||||
-- only one U+00AB when this string is parsed as latin1.
|
||||
restart('--cmd', 'set encoding=latin1')
|
||||
eq(('%c'):format(0xAB), funcs.json_decode('"«"'))
|
||||
end)
|
||||
|
||||
it('does not overflow when writing error message about decoding ["", ""]',
|
||||
function()
|
||||
eq('\nE474: Attempt to decode a blank string'
|
||||
@ -762,12 +742,6 @@ describe('json_encode() function', function()
|
||||
exc_exec('call json_encode(["", ""], 1)'))
|
||||
end)
|
||||
|
||||
it('converts strings from latin1 when &encoding is latin1', function()
|
||||
clear('--cmd', 'set encoding=latin1')
|
||||
eq('"\\u00AB"', funcs.json_encode('\171'))
|
||||
eq('"\\u0000\\u00AB\\u0000"', eval('json_encode({"_TYPE": v:msgpack_types.string, "_VAL": ["\\n\171\\n"]})'))
|
||||
end)
|
||||
|
||||
it('ignores improper values in &isprint', function()
|
||||
meths.set_option('isprint', '1')
|
||||
eq(1, eval('"\1" =~# "\\\\p"'))
|
||||
|
@ -15,27 +15,26 @@ describe('&encoding', function()
|
||||
execute('set encoding=latin1')
|
||||
-- error message expected
|
||||
feed('<cr>')
|
||||
neq(nil, string.find(eval('v:errmsg'), '^E905:'))
|
||||
neq(nil, string.find(eval('v:errmsg'), '^E474:'))
|
||||
eq('utf-8', eval('&encoding'))
|
||||
-- check nvim is still in utf-8 mode
|
||||
eq(3, eval('strwidth("Bär")'))
|
||||
end)
|
||||
|
||||
it('can be changed before startup', function()
|
||||
it('cannot be changed before startup', function()
|
||||
clear('--cmd', 'set enc=latin1')
|
||||
execute('set encoding=utf-8')
|
||||
-- error message expected
|
||||
feed('<cr>')
|
||||
eq('latin1', eval('&encoding'))
|
||||
eq(4, eval('strwidth("Bär")'))
|
||||
neq(nil, string.find(eval('v:errmsg'), '^E474:'))
|
||||
eq('utf-8', eval('&encoding'))
|
||||
eq(3, eval('strwidth("Bär")'))
|
||||
end)
|
||||
|
||||
it('is not changed by `set all&`', function()
|
||||
-- we need to set &encoding to something non-default. Use 'latin1'
|
||||
clear('--cmd', 'set enc=latin1')
|
||||
execute('set all&')
|
||||
eq('latin1', eval('&encoding'))
|
||||
eq(4, eval('strwidth("Bär")'))
|
||||
end)
|
||||
it('can be set to utf-8 without error', function()
|
||||
execute('set encoding=utf-8')
|
||||
eq("", eval('v:errmsg'))
|
||||
|
||||
clear('--cmd', 'set enc=utf-8')
|
||||
eq("", eval('v:errmsg'))
|
||||
end)
|
||||
end)
|
||||
|
@ -4,9 +4,7 @@ local nvim_command, funcs, meths, nvim_feed, eq =
|
||||
helpers.command, helpers.funcs, helpers.meths, helpers.feed, helpers.eq
|
||||
|
||||
local shada_helpers = require('test.functional.shada.helpers')
|
||||
local reset, set_additional_cmd, clear =
|
||||
shada_helpers.reset, shada_helpers.set_additional_cmd,
|
||||
shada_helpers.clear
|
||||
local reset, clear = shada_helpers.reset, shada_helpers.clear
|
||||
|
||||
describe('ShaDa support code', function()
|
||||
before_each(reset)
|
||||
@ -173,120 +171,48 @@ describe('ShaDa support code', function()
|
||||
eq('goo', funcs.getline(1))
|
||||
end)
|
||||
|
||||
it('dumps and loads history correctly when &encoding is not UTF-8', function()
|
||||
set_additional_cmd('set encoding=latin1')
|
||||
it('dumps and loads history with UTF-8 characters', function()
|
||||
reset()
|
||||
-- \171 is U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK in latin1
|
||||
nvim_feed(':echo "\171"\n')
|
||||
nvim_command('qall')
|
||||
reset()
|
||||
eq('echo "\171"', funcs.histget(':', -1))
|
||||
end)
|
||||
|
||||
it('dumps and loads history correctly when &encoding /= UTF-8 when dumping',
|
||||
function()
|
||||
set_additional_cmd('set encoding=latin1')
|
||||
reset()
|
||||
-- \171 is U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK in latin1
|
||||
nvim_feed(':echo "\171"\n')
|
||||
set_additional_cmd('')
|
||||
nvim_feed(':echo "«"\n')
|
||||
nvim_command('qall')
|
||||
reset()
|
||||
eq('echo "«"', funcs.histget(':', -1))
|
||||
end)
|
||||
|
||||
it('dumps and loads history correctly when &encoding /= UTF-8 when loading',
|
||||
it('dumps and loads replacement with UTF-8 characters',
|
||||
function()
|
||||
-- \171 is U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK in latin1
|
||||
nvim_feed(':echo "«"\n')
|
||||
set_additional_cmd('set encoding=latin1')
|
||||
nvim_command('qall')
|
||||
reset()
|
||||
eq('echo "\171"', funcs.histget(':', -1))
|
||||
end)
|
||||
|
||||
it('dumps and loads replacement correctly when &encoding is not UTF-8',
|
||||
function()
|
||||
set_additional_cmd('set encoding=latin1')
|
||||
reset()
|
||||
-- \171 is U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK in latin1
|
||||
nvim_command('substitute/./\171/ge')
|
||||
nvim_command('substitute/./«/ge')
|
||||
nvim_command('qall!')
|
||||
reset()
|
||||
funcs.setline('.', {'.'})
|
||||
nvim_command('&')
|
||||
eq('\171', funcs.getline('.'))
|
||||
end)
|
||||
|
||||
it('dumps&loads replacement correctly when &encoding /= UTF-8 when dumping',
|
||||
function()
|
||||
set_additional_cmd('set encoding=latin1')
|
||||
reset()
|
||||
-- \171 is U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK in latin1
|
||||
nvim_command('substitute/./\171/ge')
|
||||
set_additional_cmd('')
|
||||
nvim_command('qall')
|
||||
reset()
|
||||
funcs.setline('.', {'.'})
|
||||
nvim_command('&')
|
||||
eq('«', funcs.getline('.'))
|
||||
end)
|
||||
|
||||
it('dumps&loads replacement correctly when &encoding /= UTF-8 when loading',
|
||||
it('dumps and loads substitute pattern with UTF-8 characters',
|
||||
function()
|
||||
-- \171 is U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK in latin1
|
||||
nvim_command('substitute/./«/ge')
|
||||
set_additional_cmd('set encoding=latin1')
|
||||
nvim_command('qall')
|
||||
reset()
|
||||
funcs.setline('.', {'.'})
|
||||
nvim_command('&')
|
||||
eq('\171', funcs.getline('.'))
|
||||
end)
|
||||
|
||||
it('dumps and loads substitute pattern correctly when &encoding is not UTF-8',
|
||||
function()
|
||||
set_additional_cmd('set encoding=latin1')
|
||||
reset()
|
||||
-- \171 is U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK in latin1
|
||||
nvim_command('substitute/\171/./ge')
|
||||
nvim_command('substitute/«/./ge')
|
||||
nvim_command('qall!')
|
||||
reset()
|
||||
funcs.setline('.', {'\171«'})
|
||||
nvim_command('&')
|
||||
eq('.«', funcs.getline('.'))
|
||||
end)
|
||||
|
||||
it('dumps&loads s/pattern correctly when &encoding /= UTF-8 when dumping',
|
||||
function()
|
||||
set_additional_cmd('set encoding=latin1')
|
||||
reset()
|
||||
-- \171 is U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK in latin1
|
||||
nvim_command('substitute/\171/./ge')
|
||||
set_additional_cmd('')
|
||||
nvim_command('qall')
|
||||
reset()
|
||||
funcs.setline('.', {'«\171'})
|
||||
nvim_command('&')
|
||||
eq('.\171', funcs.getline('.'))
|
||||
end)
|
||||
|
||||
it('dumps&loads s/pattern correctly when &encoding /= UTF-8 when loading',
|
||||
it('dumps and loads search pattern with UTF-8 characters',
|
||||
function()
|
||||
-- \171 is U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK in latin1
|
||||
nvim_command('substitute/«/./ge')
|
||||
set_additional_cmd('set encoding=latin1')
|
||||
nvim_command('qall')
|
||||
nvim_command('silent! /«/')
|
||||
nvim_command('set shada+=/0')
|
||||
nvim_command('qall!')
|
||||
reset()
|
||||
funcs.setline('.', {'\171«'})
|
||||
nvim_command('&')
|
||||
eq('.«', funcs.getline('.'))
|
||||
nvim_command('~&')
|
||||
eq('\171', funcs.getline('.'))
|
||||
eq('', funcs.histget('/', -1))
|
||||
end)
|
||||
|
||||
it('dumps and loads search pattern correctly when &encoding is not UTF-8',
|
||||
it('dumps and loads search pattern with 8-bit single-byte',
|
||||
function()
|
||||
set_additional_cmd('set encoding=latin1')
|
||||
reset()
|
||||
-- \171 is U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK in latin1
|
||||
nvim_command('silent! /\171/')
|
||||
nvim_command('set shada+=/0')
|
||||
@ -298,33 +224,4 @@ describe('ShaDa support code', function()
|
||||
eq('', funcs.histget('/', -1))
|
||||
end)
|
||||
|
||||
it('dumps&loads /pattern correctly when &encoding /= UTF-8 when dumping',
|
||||
function()
|
||||
set_additional_cmd('set encoding=latin1')
|
||||
reset()
|
||||
-- \171 is U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK in latin1
|
||||
nvim_command('silent! /\171/')
|
||||
nvim_command('set shada+=/0')
|
||||
set_additional_cmd('')
|
||||
nvim_command('qall')
|
||||
reset()
|
||||
funcs.setline('.', {'«\171'})
|
||||
nvim_command('~&')
|
||||
eq('\171', funcs.getline('.'))
|
||||
eq('', funcs.histget('/', -1))
|
||||
end)
|
||||
|
||||
it('dumps&loads /pattern correctly when &encoding /= UTF-8 when loading',
|
||||
function()
|
||||
-- \171 is U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK in latin1
|
||||
nvim_command('silent! /«/')
|
||||
nvim_command('set shada+=/0')
|
||||
set_additional_cmd('set encoding=latin1')
|
||||
nvim_command('qall')
|
||||
reset()
|
||||
funcs.setline('.', {'\171«'})
|
||||
nvim_command('~&')
|
||||
eq('«', funcs.getline('.'))
|
||||
eq('', funcs.histget('/', -1))
|
||||
end)
|
||||
end)
|
||||
|
@ -128,36 +128,24 @@ describe('ShaDa support code', function()
|
||||
eq({{}, ''}, getreg('h'))
|
||||
end)
|
||||
|
||||
it('dumps and loads register correctly when &encoding is not UTF-8',
|
||||
it('dumps and loads register correctly with utf-8 contents',
|
||||
function()
|
||||
set_additional_cmd('set encoding=latin1')
|
||||
reset()
|
||||
-- \171 is U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK in latin1
|
||||
setreg('e', {'\171'}, 'c')
|
||||
nvim_command('qall')
|
||||
reset()
|
||||
eq({{'\171'}, 'v'}, getreg('e'))
|
||||
end)
|
||||
|
||||
it('dumps and loads history correctly when &encoding /= UTF-8 when dumping',
|
||||
function()
|
||||
set_additional_cmd('set encoding=latin1')
|
||||
reset()
|
||||
-- \171 is U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK in latin1
|
||||
setreg('e', {'\171'}, 'c')
|
||||
set_additional_cmd('')
|
||||
setreg('e', {'«'}, 'c')
|
||||
nvim_command('qall')
|
||||
reset()
|
||||
eq({{'«'}, 'v'}, getreg('e'))
|
||||
end)
|
||||
|
||||
it('dumps and loads history correctly when &encoding /= UTF-8 when loading',
|
||||
it('dumps and loads history correctly with 8-bit single-byte',
|
||||
function()
|
||||
reset()
|
||||
-- \171 is U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK in latin1
|
||||
setreg('e', {'«'}, 'c')
|
||||
set_additional_cmd('set encoding=latin1')
|
||||
setreg('e', {'\171«'}, 'c')
|
||||
set_additional_cmd('')
|
||||
nvim_command('qall')
|
||||
reset()
|
||||
eq({{'\171'}, 'v'}, getreg('e'))
|
||||
eq({{'\171«'}, 'v'}, getreg('e'))
|
||||
end)
|
||||
|
||||
end)
|
||||
|
@ -91,35 +91,13 @@ describe('ShaDa support code', function()
|
||||
eq(0, funcs.exists('g:str_var'))
|
||||
end)
|
||||
|
||||
it('dumps and loads variables correctly when &encoding is not UTF-8',
|
||||
it('dumps and loads variables correctly with utf-8 strings',
|
||||
function()
|
||||
set_additional_cmd('set encoding=latin1')
|
||||
reset()
|
||||
-- \171 is U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK in latin1
|
||||
meths.set_var('STRVAR', '\171')
|
||||
meths.set_var('LSTVAR', {'\171'})
|
||||
meths.set_var('DCTVAR', {['\171']='\171'})
|
||||
meths.set_var('NESTEDVAR', {['\171']={{'\171'}, {['\171']='\171'},
|
||||
{a='Test'}}})
|
||||
nvim_command('qall')
|
||||
reset()
|
||||
eq('\171', meths.get_var('STRVAR'))
|
||||
eq({'\171'}, meths.get_var('LSTVAR'))
|
||||
eq({['\171']='\171'}, meths.get_var('DCTVAR'))
|
||||
eq({['\171']={{'\171'}, {['\171']='\171'}, {a='Test'}}},
|
||||
meths.get_var('NESTEDVAR'))
|
||||
end)
|
||||
|
||||
it('dumps and loads variables correctly when &encoding /= UTF-8 when dumping',
|
||||
function()
|
||||
set_additional_cmd('set encoding=latin1')
|
||||
reset()
|
||||
-- \171 is U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK in latin1
|
||||
meths.set_var('STRVAR', '\171')
|
||||
meths.set_var('LSTVAR', {'\171'})
|
||||
meths.set_var('DCTVAR', {['\171']='\171'})
|
||||
meths.set_var('NESTEDVAR', {['\171']={{'\171'}, {['\171']='\171'},
|
||||
{a='Test'}}})
|
||||
meths.set_var('STRVAR', '«')
|
||||
meths.set_var('LSTVAR', {'«'})
|
||||
meths.set_var('DCTVAR', {['«']='«'})
|
||||
meths.set_var('NESTEDVAR', {['«']={{'«'}, {['«']='«'}, {a='Test'}}})
|
||||
set_additional_cmd('')
|
||||
nvim_command('qall')
|
||||
reset()
|
||||
@ -129,20 +107,22 @@ describe('ShaDa support code', function()
|
||||
eq({['«']={{'«'}, {['«']='«'}, {a='Test'}}}, meths.get_var('NESTEDVAR'))
|
||||
end)
|
||||
|
||||
it('dumps and loads variables correctly when &encoding /= UTF-8 when loading',
|
||||
it('dumps and loads variables correctly with 8-bit strings',
|
||||
function()
|
||||
reset()
|
||||
-- \171 is U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK in latin1
|
||||
meths.set_var('STRVAR', '«')
|
||||
meths.set_var('LSTVAR', {'«'})
|
||||
meths.set_var('DCTVAR', {['«']='«'})
|
||||
meths.set_var('NESTEDVAR', {['«']={{'«'}, {['«']='«'}, {a='Test'}}})
|
||||
set_additional_cmd('set encoding=latin1')
|
||||
-- This is invalid unicode, but we should still dump and restore it.
|
||||
meths.set_var('STRVAR', '\171')
|
||||
meths.set_var('LSTVAR', {'\171'})
|
||||
meths.set_var('DCTVAR', {['«\171']='«\171'})
|
||||
meths.set_var('NESTEDVAR', {['\171']={{'\171«'}, {['\171']='\171'},
|
||||
{a='Test'}}})
|
||||
nvim_command('qall')
|
||||
reset()
|
||||
eq('\171', meths.get_var('STRVAR'))
|
||||
eq({'\171'}, meths.get_var('LSTVAR'))
|
||||
eq({['\171']='\171'}, meths.get_var('DCTVAR'))
|
||||
eq({['\171']={{'\171'}, {['\171']='\171'}, {a='Test'}}},
|
||||
eq({['«\171']='«\171'}, meths.get_var('DCTVAR'))
|
||||
eq({['\171']={{'\171«'}, {['\171']='\171'}, {a='Test'}}},
|
||||
meths.get_var('NESTEDVAR'))
|
||||
end)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user