Commit: patch 9.2.0622: str2blob() does not work with wide UTF-16 encoding

1 view
Skip to first unread message

Christian Brabandt

unread,
6:15 AM (5 hours ago) 6:15 AM
to vim...@googlegroups.com
patch 9.2.0622: str2blob() does not work with wide UTF-16 encoding

Commit: https://github.com/vim/vim/commit/26dc90a21079a5f5ae472d98c05770ba2eb7868e
Author: Yasuhiro Matsumoto <matt...@gmail.com>
Date: Fri Jun 12 10:00:37 2026 +0000

patch 9.2.0622: str2blob() does not work with wide UTF-16 encoding

Problem: str2blob() does not convert to wide encodings such as UTF-16;
the bytes are returned unconverted.
Solution: Convert to UTF-16, UCS-2, UTF-32 and UCS-4 directly using iconv(),
preserving the endianness of the encoding name (Yasuhiro Matsumoto)

str2blob() routed every target encoding through convert_string(), which
treats all Unicode encodings as utf-8 and therefore left the bytes
unconverted. As a result str2blob(['Hello'], {'encoding': 'utf-16le'})
returned 0z48656C6C6F instead of 0z480065006C006C006F00, breaking the
round-trip with blob2str(). Add the same wide-encoding handling blob2str()
uses: join the list items with a newline, convert the whole string at once
with the endianness-preserving encoding name, and append the raw bytes.

closes: #20466

Signed-off-by: Yasuhiro Matsumoto <matt...@gmail.com>
Signed-off-by: Christian Brabandt <c...@256bit.org>

diff --git a/src/strings.c b/src/strings.c
index ff63a3ef4..71f9d83e2 100644
--- a/src/strings.c
+++ b/src/strings.c
@@ -1584,6 +1584,7 @@ f_str2blob(typval_T *argvars, typval_T *rettv)
return;

char_u *to_encoding = NULL;
+ char_u *to_encoding_raw = NULL; // Encoding name with endianness preserved for iconv
if (argvars[1].v_type != VAR_UNKNOWN)
{
dict_T *d = argvars[1].vval.v_dict;
@@ -1591,50 +1592,144 @@ f_str2blob(typval_T *argvars, typval_T *rettv)
{
char_u *enc = dict_get_string(d, "encoding", FALSE);
if (enc != NULL)
- to_encoding = enc_canonize(enc_skip(enc));
+ {
+ char_u *enc_skipped = enc_skip(enc);
+ to_encoding = enc_canonize(enc_skipped);
+
+ // For iconv, preserve the endianness suffix by creating a
+ // normalized version with hyphens: "utf16le" -> "utf-16le"
+ to_encoding_raw = normalize_encoding_name(enc_skipped);
+ if (to_encoding_raw == NULL)
+ {
+ emsg(_(e_out_of_memory));
+ VIM_CLEAR(to_encoding);
+ return;
+ }
+ }
}
}

- FOR_ALL_LIST_ITEMS(list, li)
+ // Special handling for UTF-16/UCS-2/UTF-32/UCS-4 target encodings: join the
+ // list items with a newline and convert the whole string at once, so that
+ // the wide-encoded newline separators and embedded NUL bytes are preserved
+ // (mirrors blob2str()). convert_string() cannot be used here because it
+ // treats every Unicode encoding as utf-8, leaving the bytes unconverted.
+ int to_prop = 0;
+ if (to_encoding != NULL)
+ to_prop = enc_canon_props(to_encoding);
+ if (to_encoding != NULL && (to_prop & (ENC_2BYTE | ENC_4BYTE | ENC_2WORD)))
{
- if (li->li_tv.v_type != VAR_STRING)
- continue;
+ garray_T str_ga;

- string_T str = {li->li_tv.vval.v_string, 0};
+ ga_init2(&str_ga, 1, 256);
+ FOR_ALL_LIST_ITEMS(list, li)
+ {
+ char_u *s;

- if (str.string == NULL)
- STR_LITERAL_SET(str, "");
- else
- str.length = STRLEN(str.string);
+ if (li->li_tv.v_type != VAR_STRING)
+ continue;
+
+ s = li->li_tv.vval.v_string;

- if (to_encoding != NULL)
+ // Each list string item is separated by a newline in the blob
+ if (li != list->lv_first)
+ ga_append(&str_ga, NL);
+ if (s != NULL && *s != NUL)
+ {
+ int slen = (int)STRLEN(s);
+
+ if (ga_grow(&str_ga, slen) == FAIL)
+ {
+ ga_clear(&str_ga);
+ goto done;
+ }
+ mch_memmove((char_u *)str_ga.ga_data + str_ga.ga_len, s,
+ (size_t)slen);
+ str_ga.ga_len += slen;
+ }
+ }
+
+ if (str_ga.ga_len > 0)
{
- int res;
- string_T converted;
+ vimconv_T vimconv;

- res = convert_string(&str, p_enc, to_encoding, &converted);
- if (res != OK)
+ vimconv.vc_type = CONV_NONE;
+ if (convert_setup_ext(&vimconv, p_enc, FALSE, to_encoding_raw, FALSE)
+ == FAIL)
{
+ ga_clear(&str_ga);
semsg(_(e_str_encoding_to_failed), to_encoding);
goto done;
}
- str.string = converted.string;
- str.length = converted.length;
+ vimconv.vc_fail = TRUE;
+
+ int len = str_ga.ga_len;
+ char_u *converted = string_convert_ext(&vimconv,
+ (char_u *)str_ga.ga_data, &len, NULL);
+ convert_setup(&vimconv, NULL, NULL);
+ ga_clear(&str_ga);
+
+ if (converted == NULL)
+ {
+ semsg(_(e_str_encoding_to_failed), to_encoding);
+ goto done;
+ }
+ if (len > 0 && ga_grow(&blob->bv_ga, len) == OK)
+ {
+ mch_memmove((char_u *)blob->bv_ga.ga_data + blob->bv_ga.ga_len,
+ converted, (size_t)len);
+ blob->bv_ga.ga_len += len;
+ }
+ vim_free(converted);
}
+ else
+ ga_clear(&str_ga);
+ }
+ else
+ {
+ FOR_ALL_LIST_ITEMS(list, li)
+ {
+ if (li->li_tv.v_type != VAR_STRING)
+ continue;

- if (li != list->lv_first)
- // Each list string item is separated by a newline in the blob
- ga_append(&blob->bv_ga, NL);
+ string_T str = {li->li_tv.vval.v_string, 0};

- blob_from_string(str.string, blob);
+ if (str.string == NULL)
+ STR_LITERAL_SET(str, "");
+ else
+ str.length = STRLEN(str.string);

- if (to_encoding != NULL)
- vim_free(str.string);
+ if (to_encoding != NULL)
+ {
+ int res;
+ string_T converted;
+
+ res = convert_string(&str, p_enc, to_encoding, &converted);
+ if (res != OK)
+ {
+ semsg(_(e_str_encoding_to_failed), to_encoding);
+ goto done;
+ }
+ str.string = converted.string;
+ str.length = converted.length;
+ }
+
+ if (li != list->lv_first)
+ // Each list string item is separated by a newline in the blob
+ ga_append(&blob->bv_ga, NL);
+
+ blob_from_string(str.string, blob);
+
+ if (to_encoding != NULL)
+ vim_free(str.string);
+ }
}

done:
if (to_encoding != NULL)
vim_free(to_encoding);
+ if (to_encoding_raw != NULL)
+ vim_free(to_encoding_raw);
}

/*
diff --git a/src/testdir/test_functions.vim b/src/testdir/test_functions.vim
index 375359527..8ca73a62a 100644
--- a/src/testdir/test_functions.vim
+++ b/src/testdir/test_functions.vim
@@ -4513,6 +4513,20 @@ func Test_str2blob()
call assert_equal(0zABBB0AABBB, str2blob(['«»', '«»'], {'encoding': 'latin1'}))
call assert_equal(0zC2ABC2BB, str2blob(['«»'], {'encoding': 'utf8'}))

+ if has('iconv')
+ call assert_equal(0z480065006C006C006F00, str2blob(['Hello'], {'encoding': 'utf-16le'}))
+ call assert_equal(0z480065006C006C006F00, str2blob(['Hello'], {'encoding': 'utf16le'}))
+ call assert_equal(0z00480065006C006C006F, str2blob(['Hello'], {'encoding': 'utf-16be'}))
+ call assert_equal(0z48006900.0A004200.79006500, str2blob(['Hi', 'Bye'], {'encoding': 'utf-16le'}))
+ call assert_equal(0z61000A006200, str2blob(["a\nb"], {'encoding': 'utf-16le'}))
+ call assert_equal(0z, str2blob([''], {'encoding': 'utf-16le'}))
+ call assert_equal(0z0A00, str2blob(['', ''], {'encoding': 'utf-16le'}))
+ for enc in ['utf-16le', 'utf-16be', 'ucs-2le', 'utf-32le', 'utf-32be']
+ call assert_equal(['Hello', 'World'],
+ \ blob2str(str2blob(['Hello', 'World'], {'encoding': enc}), {'encoding': enc}), enc)
+ endfor
+ endif
+
call assert_equal(0z62, str2blob(["b"], test_null_dict()))
call assert_equal(0z63, str2blob(["c"], {'encoding': test_null_string()}))

@@ -4581,12 +4595,14 @@ func Test_blob2str()
call assert_fails("call blob2str(0z6162, {'encoding': []})", 'E730: Using a List as a String')
call assert_fails("call blob2str(0z6162, {'encoding': 'ab12xy'})", 'E1515: Unable to convert from ''ab12xy'' encoding')

- #" UTF-16LE encoding
- call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'utf-16le'}))
- call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'utf16le'}))
- #" UCS-2LE encoding
- call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'ucs-2le'}))
- call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'ucs2le'}))
+ if has("iconv")
+ #" UTF-16LE encoding
+ call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'utf-16le'}))
+ call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'utf16le'}))
+ #" UCS-2LE encoding
+ call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'ucs-2le'}))
+ call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'ucs2le'}))
+ endif
END
call v9.CheckLegacyAndVim9Success(lines)
endfunc
diff --git a/src/version.c b/src/version.c
index fac15c00f..57bd82493 100644
--- a/src/version.c
+++ b/src/version.c
@@ -754,6 +754,8 @@ static char *(features[]) =

static int included_patches[] =
{ /* Add new patch number below this line */
+/**/
+ 622,
/**/
621,
/**/
Reply all
Reply to author
Forward
0 new messages