Confusing stop position on error in ucnv_convertEx()

16 views
Skip to first unread message

Philip McGrath

unread,
Oct 13, 2025, 1:43:54 PM
to icu-s...@unicode.org
Hi,

For context, I am working on a programming language's runtime system to add
support for ICU as an alternative to iconv.

To do that, I have been using ucnv_convertEx() with converters that have
UCNV_TO_U_CALLBACK_STOP and UCNV_FROM_U_CALLBACK_STOP set. However, while
tracking down some test suite failures, I found that, when there is an error,
the position of the source pointer, i.e. the number of source bytes consumed,
differs from our iconv-based implementation.

To be concrete, I'm confused by the behavior on these two examples when
converting from (supposed) UTF-8 to UTF-8:

1. "\xFF\xFF" reports 1 byte consumed, rather than 0.

2. "abc\xF0\x9F\xFF" reports 5 bytes consumed. I would like the answer
to be 3, but I'm also surprised that it isn't 6, given that the bad
"\xFF" was consumed in the previous example. (The two-byte sequence
"\xF0\x9F" could potentially begin a UTF-8 character.)

I've illustrated these examples with a single-file C program that I've posted
to https://gist.github.com/LiberalArtist/3f32e8c0b4d9a7db2a23a0cafe444d12 and
will also include at the end of this message.

I'd be grateful if someone could explain this behavior. More to the point, is
there a good way to emulate the behavior of our iconv backend? Do I need to
implement custom callbacks, or maybe drop down to ucnv_fromUnicode() and
ucnv_toUnicode()?

Thanks,
Philip McGrath

--8<---------------cut here---------------start------------->8---
/* cc -o ucnv-debug ucnv-debug.c `pkg-config --libs --cflags icu-uc`
* SPDX-License-Identifier: (Apache-2.0 OR CC0-1.0)
* SPDX-FileCopyrightText: Philip McGrath <phi...@philipmcgrath.com>
*/
#include <errno.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "unicode/utypes.h" /* Basic ICU data types */
#include "unicode/ucnv.h" /* C Converter API */
#include "unicode/uloc.h" /* for precautionary thread initialization */

/*
 * Open the ICU converter named converterName and install
 * UCNV_TO_U_CALLBACK_STOP and UCNV_FROM_U_CALLBACK_STOP on it.
 *
 * error is an in/out ICU status code. If it indicates failure after the
 * open and both callback installations, any converter that was opened is
 * closed again and NULL is returned; otherwise the converter is returned
 * and the caller is responsible for closing it.
 */
static UConverter *rktio_ucnv_open_and_set_callbacks(const char *converterName,
                                                     UErrorCode *error)
{
    UConverter *cnv = ucnv_open(converterName, error);

    ucnv_setToUCallBack(cnv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, error);
    ucnv_setFromUCallBack(cnv, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, error);

    if (!U_FAILURE(*error))
        return cnv;

    /* Something failed: do not hand back a half-configured converter. */
    if (cnv != NULL)
        ucnv_close(cnv);
    return NULL;
}

/* Number of UChar elements in the pivot buffer of rktio_icu_converter_t. */
#define ICU_BUF_SIZE 1024
/*
 * State for one conversion: a pair of ICU converters plus the UChar pivot
 * buffer and the two pivot cursors that are handed to ucnv_convertEx().
 */
typedef struct rktio_icu_converter_t {
/* rktio_converter_t tag; */
/* Converter opened for the input encoding (from_enc). */
UConverter *sourceCnv;
/* Converter opened for the output encoding (to_enc). */
UConverter *targetCnv;
UChar *pivotSource; /* pointer into buf */
UChar *pivotTarget; /* pointer into buf */
/* Pivot storage; both pivot pointers start at &buf[0]. */
UChar buf[ICU_BUF_SIZE];
} rktio_icu_converter_t;

/* Converter names used for the input side and the output side. */
const char *from_enc = "utf-8";
const char *to_enc = "UTF-8";

/* Placeholder in this standalone program: intentionally does nothing. */
static void get_posix_error(void)
{
}

static void icu_convert(char **in, intptr_t *in_left,
char **out, intptr_t *out_left)
{
UErrorCode errorCode = U_ZERO_ERROR;
rktio_icu_converter_t *cvt = (rktio_icu_converter_t *)calloc(1, sizeof(rktio_icu_converter_t));
if (NULL == cvt)
return;
cvt->pivotSource = &cvt->buf[0];
cvt->pivotTarget = &cvt->buf[0];
cvt->sourceCnv = rktio_ucnv_open_and_set_callbacks(from_enc, &errorCode);
cvt->targetCnv = rktio_ucnv_open_and_set_callbacks(to_enc, &errorCode);
if (U_FAILURE(errorCode)) {
if (NULL != cvt->sourceCnv)
ucnv_close(cvt->sourceCnv);
free(cvt);
errno = (U_MEMORY_ALLOCATION_ERROR == errorCode) ? ENOMEM : EINVAL;
get_posix_error();
return;
}
if ((NULL == in) || (NULL == *in)) {
return;
} else {
/* Main case: in is not NULL and *in is not NULL */
char *source = *in;
char *target = *out;
size_t ret = 0;
printf("cvt->pivotSource => %p\n", cvt->pivotSource);
printf("cvt->pivotTarget => %p\n", cvt->pivotTarget);
printf("cvt->pivotTarget - cvt->pivotSource => %i\n",
cvt->pivotTarget - cvt->pivotSource);
printf(" ******\n");
ucnv_convertEx(cvt->targetCnv,
cvt->sourceCnv,
&target,
target + *out_left,
(const char **) &source, /* TODO: double-check cast */
source + *in_left,
cvt->buf,
&cvt->pivotSource,
&cvt->pivotTarget,
cvt->buf + ICU_BUF_SIZE,
0, /* reset */
0, /* flush */
&errorCode);
printf("cvt->pivotSource => %p\n", cvt->pivotSource);
printf("cvt->pivotTarget => %p\n", cvt->pivotTarget);
printf("cvt->pivotTarget - cvt->pivotSource => %i\n",
cvt->pivotTarget - cvt->pivotSource);
printf("source => %p\n", source);
printf("*in => %p\n", *in);
ret = source - *in;
printf("ret => %i\n", ret);
printf("old *in_left => %i\n", *in_left);
*in_left = *in_left - ret;
printf("new *in_left => %i\n", *in_left);
*in = source;
printf("target => %p\n", target);
printf("*out => %p\n", *out);
printf("(target - *out) => %i\n", (target - *out));
printf("old *out_left => %i\n", *out_left);
*out_left = *out_left - (target - *out);
printf("new *out_left => %i\n", *out_left);
*out = target;
switch (errorCode) {
case U_ZERO_ERROR:
printf("U_ZERO_ERROR\n");
return;
case U_BUFFER_OVERFLOW_ERROR:
printf("RKTIO_ERROR_CONVERT_NOT_ENOUGH_SPACE\n");
return;
case U_TRUNCATED_CHAR_FOUND:
printf("RKTIO_ERROR_CONVERT_PREMATURE_END\n");
return;
case U_ILLEGAL_CHAR_FOUND:
case U_INVALID_CHAR_FOUND:
printf("RKTIO_ERROR_CONVERT_BAD_SEQUENCE\n");
return;
default:
printf("RKTIO_ERROR_CONVERT_OTHER\n");
return;
};
};
}

/*
 * Print a separator line followed by the quoted label show, then feed the
 * NUL-terminated bytes at in through icu_convert(), offering 100 bytes of
 * a zero-initialized 101-byte local buffer as output space.
 */
void do_example(char *show, char *in)
{
    char buf[101] = {0};
    char *out = &buf[0];
    intptr_t out_left = 100;
    intptr_t in_left;

    printf("--------\n\"%s\"\n", show);
    in_left = strlen(in);
    icu_convert(&in, &in_left, &out, &out_left);
}


/* Run the two malformed-input examples, in order, and exit with status 0. */
int main(void)
{
    /* Each entry pairs a printable label with the raw bytes to convert. */
    struct {
        char *show;
        char *in;
    } examples[] = {
        { "\\xFF" "\\xFF",
          "\xFF" "\xFF" },
        { "abc" "\\xF0" "\\x9F" "\\xFF",
          "abc" "\xF0" "\x9F" "\xFF" },
    };
    size_t i;

    for (i = 0; i < sizeof examples / sizeof examples[0]; i++)
        do_example(examples[i].show, examples[i].in);
    return 0;
}
--8<---------------cut here---------------end--------------->8---
Reply all
Reply to author
Forward
0 new messages