Hi,
For context, I am working on a programming language's runtime system to add
support for ICU as an alternative to iconv.
To do that, I have been using ucnv_convertEx() with converters that have
UCNV_TO_U_CALLBACK_STOP and UCNV_FROM_U_CALLBACK_STOP set. However, while
tracking down some test suite failures, I found that, when there is an error,
the position of the source pointer, i.e. the number of source bytes consumed,
differs from our iconv-based implementation.
To be concrete, I'm confused by the behavior on these two examples when
converting from (supposed) UTF-8 to UTF-8:
 1. "\xFF\xFF" reports 1 byte consumed, rather than 0.
 2. "abc\xF0\x9F\xFF" reports 5 bytes consumed. I would like the answer
    to be 3, but I'm also surprised that it isn't 6, given that the bad
    "\xFF" was consumed in the previous example. (The two-byte sequence
    "\xF0\x9F" could potentially begin a UTF-8 character.)
I've illustrated these examples with a single-file C program that I've posted
to 
https://gist.github.com/LiberalArtist/3f32e8c0b4d9a7db2a23a0cafe444d12 and
will also include at the end of this message.
I'd be grateful if someone could explain this behavior. More to the point, is
there a good way to emulate the behavior of our iconv backend? Do I need to
implement custom callbacks, or maybe drop down to ucnv_fromUnicode() and
ucnv_toUnicode()?
Thanks,
Philip McGrath
--8<---------------cut here---------------start------------->8---
/* cc -o ucnv-debug ucnv-debug.c `pkg-config --libs --cflags icu-uc`
 * SPDX-License-Identifier: (Apache-2.0 OR CC0-1.0)
 * SPDX-FileCopyrightText: Philip McGrath <
phi...@philipmcgrath.com>
 */
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include "unicode/utypes.h" /* Basic ICU data types  */
#include "unicode/ucnv.h"   /* C Converter API */
#include "unicode/uloc.h"   /* for precautionary thread initialization */
/* Open the ICU converter named converterName and install the "stop" callbacks
 * for both directions (to-Unicode and from-Unicode).
 *
 * Returns the configured converter, or NULL when *error holds a failure code
 * after the open and both callback installations; in that case any converter
 * that was opened is closed before returning.
 */
static UConverter *rktio_ucnv_open_and_set_callbacks(const char *converterName,
                                                     UErrorCode *error)
{
  UConverter *cnv = ucnv_open(converterName, error);

  /* All three calls share *error, so one check afterwards covers them all. */
  ucnv_setToUCallBack(cnv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, error);
  ucnv_setFromUCallBack(cnv, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, error);

  if (!U_FAILURE(*error))
    return cnv;

  if (cnv != NULL)
    ucnv_close(cnv);
  return NULL;
}
/* Number of UChar elements in the pivot buffer embedded in each converter. */
#define ICU_BUF_SIZE 1024
/* State for one conversion: a pair of ICU converters plus the pivot buffer
 * and the two pivot cursors that are handed to ucnv_convertEx(). */
typedef struct rktio_icu_converter_t {
  /* rktio_converter_t tag; */
  UConverter *sourceCnv; /* opened for from_enc; the source converter for ucnv_convertEx() */
  UConverter *targetCnv; /* opened for to_enc; the target converter for ucnv_convertEx() */
  UChar *pivotSource; /* pointer into buf */
  UChar *pivotTarget; /* pointer into buf */
  UChar buf[ICU_BUF_SIZE]; /* pivot buffer; both cursors start at buf[0] */
} rktio_icu_converter_t;
/* Converter names used to open the source and target converters. Both name
 * UTF-8, so this program converts (supposed) UTF-8 to UTF-8. */
const char *from_enc = "utf-8";
const char *to_enc = "UTF-8";
/* No-op stub in this standalone program. It is called after errno has been
 * set on a setup failure; presumably it stands in for an error-recording hook
 * in the real runtime (TODO confirm). */
static void get_posix_error(void)
{
}
static void icu_convert(char **in, intptr_t *in_left,
                        char **out, intptr_t *out_left)
{
  UErrorCode errorCode = U_ZERO_ERROR;
  rktio_icu_converter_t *cvt = (rktio_icu_converter_t *)calloc(1, sizeof(rktio_icu_converter_t));
  if (NULL == cvt)
    return;
  cvt->pivotSource = &cvt->buf[0];
  cvt->pivotTarget = &cvt->buf[0];
  cvt->sourceCnv = rktio_ucnv_open_and_set_callbacks(from_enc, &errorCode);
  cvt->targetCnv = rktio_ucnv_open_and_set_callbacks(to_enc, &errorCode);
  if (U_FAILURE(errorCode)) {
    if (NULL != cvt->sourceCnv)
      ucnv_close(cvt->sourceCnv);
    free(cvt);
    errno = (U_MEMORY_ALLOCATION_ERROR == errorCode) ? ENOMEM : EINVAL;
    get_posix_error();
    return;
  }
  if ((NULL == in) || (NULL == *in)) {
    return;
  } else {
    /* Main case: in is not NULL and *in is not NULL */
    char *source = *in;
    char *target = *out;
    size_t ret = 0;
    printf("cvt->pivotSource => %p\n", cvt->pivotSource);
    printf("cvt->pivotTarget => %p\n", cvt->pivotTarget);
    printf("cvt->pivotTarget - cvt->pivotSource => %i\n",
           cvt->pivotTarget - cvt->pivotSource);
    printf("  ******\n");
    ucnv_convertEx(cvt->targetCnv,
                   cvt->sourceCnv,
                   &target,
                   target + *out_left,
                   (const char **) &source, /* TODO: double-check cast */
                   source + *in_left,
                   cvt->buf,
                   &cvt->pivotSource,
                   &cvt->pivotTarget,
                   cvt->buf + ICU_BUF_SIZE,
                   0, /* reset */
                   0, /* flush */
                   &errorCode);
    printf("cvt->pivotSource => %p\n", cvt->pivotSource);
    printf("cvt->pivotTarget => %p\n", cvt->pivotTarget);
    printf("cvt->pivotTarget - cvt->pivotSource => %i\n",
           cvt->pivotTarget - cvt->pivotSource);
    printf("source => %p\n", source);
    printf("*in => %p\n", *in);
    ret = source - *in;
    printf("ret => %i\n", ret);
    printf("old *in_left => %i\n", *in_left);
    *in_left = *in_left - ret;
    printf("new *in_left => %i\n", *in_left);
    *in = source;
    printf("target => %p\n", target);
    printf("*out => %p\n", *out);
    printf("(target - *out) => %i\n", (target - *out));
    printf("old *out_left => %i\n", *out_left);
    *out_left = *out_left - (target - *out);
    printf("new *out_left => %i\n", *out_left);
    *out = target;
    switch (errorCode) {
    case U_ZERO_ERROR:
      printf("U_ZERO_ERROR\n");
      return;
    case U_BUFFER_OVERFLOW_ERROR:
      printf("RKTIO_ERROR_CONVERT_NOT_ENOUGH_SPACE\n");
      return;
    case U_TRUNCATED_CHAR_FOUND:
      printf("RKTIO_ERROR_CONVERT_PREMATURE_END\n");
      return;
    case U_ILLEGAL_CHAR_FOUND:
    case U_INVALID_CHAR_FOUND:
      printf("RKTIO_ERROR_CONVERT_BAD_SEQUENCE\n");
      return;
    default:
      printf("RKTIO_ERROR_CONVERT_OTHER\n");
      return;
    };
  };
}
/* Run one conversion example.
 *
 * show: printable (escaped) rendering of the input, echoed in the banner.
 * in:   the NUL-terminated raw bytes to convert; its strlen() is the input size.
 *
 * The output goes to a zero-filled 101-byte scratch buffer, of which 100
 * bytes are offered to icu_convert().
 */
void do_example(char *show, char *in)
{
  char scratch[101] = {0};
  char *dst = scratch;
  intptr_t dst_room = 100;
  intptr_t src_len = strlen(in);

  printf("--------\n\"%s\"\n", show);
  icu_convert(&in, &src_len, &dst, &dst_room);
}
/* Entry point: feed each malformed-UTF-8 example through do_example(). */
int main(void)
{
  /* Each row pairs an escaped, printable label with the raw bytes it names. */
  struct {
    char *label;
    char *bytes;
  } cases[] = {
    { "\\xFF" "\\xFF",
       "\xFF"  "\xFF" },
    { "abc" "\\xF0" "\\x9F" "\\xFF",
      "abc"  "\xF0"  "\x9F"  "\xFF" },
  };
  size_t count = sizeof cases / sizeof cases[0];
  size_t i;

  for (i = 0; i < count; i++)
    do_example(cases[i].label, cases[i].bytes);
  return 0;
}
--8<---------------cut here---------------end--------------->8---