On Saturday, July 18, 2015 at 3:06:34 AM UTC-5, luser droog wrote:
> I tried to write this straight from the table in the rfc
> (illustrating "my style"). What do y'all thinks?
>
Here's a revised version with comments. Still working on the bulk
of Richard Damon's suggestions.
In a way, I think the addition of comments decreases readability
by increasing the overall size and separating things that are
closely related (like the values in the master table and the next
set of macros where each piece is given a name).
/*
   Adapters between UTF-8 byte strings and arrays of UCS-4 values.
   The names follow an "apps-Hungarian" result_source scheme:
     ucs4_utf8   produces UCS-4 values from UTF-8 input
     utf8_ucs4   produces UTF-8 bytes from UCS-4 input
   Both return a buffer obtained from malloc, or a null pointer if the
   allocation fails.
*/
/* str: UTF-8 input, read up to its first zero byte.
   n:   number of elements allocated for the returned array. */
int_least32_t *ucs4_utf8(char *str, int n);
/* ar: array of n UCS-4 values.
   an: if non-null, receives the number of bytes written, not counting the
       terminating zero byte that is appended to the result. */
char *utf8_ucs4(int_least32_t *ar, int n, int *an);
/* Master translation table, written as an X-macro.
   UCS_RANGE(X) invokes X once per UTF-8 sequence length, passing
     - the first UCS value covered by the row,
     - the last UCS value covered by the row,
     - the number of UTF-8 bytes used for that range,
   followed by one quadruple per encoded byte (lead byte first):
     - mask selecting the header bits of the byte,
     - signature the masked header bits must equal,
     - mask selecting the value bits (the 8-bit complement of the header mask),
     - shift: how many lower value bits are carried by the later bytes.
*/
#define UCS_RANGE(X) \
    X(0x00000000, 0x0000007F, 1, \
      0x80, 0x00, 0x7F,  0) \
    X(0x00000080, 0x000007FF, 2, \
      0xE0, 0xC0, 0x1F,  6, \
      0xC0, 0x80, 0x3F,  0) \
    X(0x00000800, 0x0000FFFF, 3, \
      0xF0, 0xE0, 0x0F, 12, \
      0xC0, 0x80, 0x3F,  6, \
      0xC0, 0x80, 0x3F,  0) \
    X(0x00010000, 0x0010FFFF, 4, \
      0xF8, 0xF0, 0x07, 18, \
      0xC0, 0x80, 0x3F, 12, \
      0xC0, 0x80, 0x3F,  6, \
      0xC0, 0x80, 0x3F,  0)
/* Macros to match each of the 4 valid forms of utf-8 encoding.
   Each MATCHn expands to one complete `if (...) { ... }` statement, so the
   invocations can be chained with `else`.  The expanding scope must provide
     p - pointer to the current (lead) byte of the zero-terminated input,
     u - pointer to the next free slot of the output buffer.
   A case is selected when the lead byte ANDed with its header mask equals
   its signature AND every following byte carries its own signature.  A zero
   byte never has the continuation signature and && short-circuits, so a
   truncated sequence is rejected without reading past the terminator.
   On a match the value bits of all bytes are combined, stored through u
   (which is advanced), and p is left on the last byte consumed so one more
   increment will point to the next sequence.
   The arguments are the per-byte quadruples of the master table; the shift
   of the final byte is unused.  Shifts are done in int_least32_t so that
   they do not depend on the width of int.
*/
#define MATCH1(msk,sig,val,shf) \
if ((*p&(msk))==(sig)) { *u++=*p&(val); }
#define MATCH2(msk0,sig0,val0,shf0, msk1,sig1,val1,shf1) \
if ((*p&(msk0))==(sig0) && (p[1]&(msk1))==(sig1)) \
{ *u++=((int_least32_t)(*p&(val0))<<(shf0)) | (p[1]&(val1)); ++p; }
#define MATCH3(msk0,sig0,val0,shf0, msk1,sig1,val1,shf1, msk2,sig2,val2,shf2) \
if ((*p&(msk0))==(sig0) && (p[1]&(msk1))==(sig1) && (p[2]&(msk2))==(sig2)) \
{ *u++=((int_least32_t)(*p&(val0))<<(shf0)) | ((int_least32_t)(p[1]&(val1))<<(shf1)) \
     | (p[2]&(val2)); p+=2; }
#define MATCH4(msk0,sig0,val0,shf0, msk1,sig1,val1,shf1, msk2,sig2,val2,shf2, msk3,sig3,val3,shf3) \
if ((*p&(msk0))==(sig0) && (p[1]&(msk1))==(sig1) && (p[2]&(msk2))==(sig2) && (p[3]&(msk3))==(sig3)) \
{ *u++=((int_least32_t)(*p&(val0))<<(shf0)) | ((int_least32_t)(p[1]&(val1))<<(shf1)) \
     | ((int_least32_t)(p[2]&(val2))<<(shf2)) | (p[3]&(val3)); p+=3; }
/* Table callback for the decoder.  The UCS bounds of the row are ignored;
   the byte count is pasted onto MATCH to name the macro to invoke, and the
   per-byte quadruples are forwarded to it.  The leading `else` links
   successive rows into one if/else-if chain.
*/
#define UTF_CASE(lo,hi,nbytes,...) \
    else MATCH##nbytes(__VA_ARGS__)
/* Decode a zero-terminated UTF-8 string into UCS-4 values.
     str - input bytes; decoding stops at the first zero byte
     n   - number of elements to allocate for the result
   Allocates the buffer, walks the input decoding each utf-8 sequence with
   the table-driven if/else chain, and returns the buffer (a null pointer if
   n is negative, the size is not representable, or malloc fails).
   Bytes that match none of the table's cases are skipped.  Decoding also
   stops once n values have been stored, so an n that is too small for the
   input cannot overflow the buffer.  The number of values stored is not
   reported to the caller.
   The locals p and u are named as the MATCH macros expect.
*/
int_least32_t *ucs4_utf8(char *str, int n){
    char *p=str;
    int_least32_t *u,*buf;
    /* reject sizes whose byte count would wrap around */
    if (n<0 || (size_t)n>SIZE_MAX/sizeof*u)
        return NULL;
    buf=u=malloc(n*sizeof*u);
    if (buf) {
        for (;*p && u-buf<n;p++) {
            if (0); UCS_RANGE(UTF_CASE)
        }
    }
    return buf;
}
/* Encoder counterparts of the table rows: OUTBYTEn writes the n bytes of
   one utf-8 sequence.  Every byte is its signature ORed with a slice of the
   UCS-4 value; all but the last byte shift the value right before masking.
   The expanding scope must provide
     x - the UCS-4 value being encoded,
     p - the output pointer, advanced by one for each byte written.
   The header masks (msk*) and the final shift are accepted only so that the
   table's quadruples can be passed through unchanged.
   Each macro expands to complete statements; no semicolon is needed after
   an invocation.
*/
#define OUTBYTE_EMIT(signature,bits) *p++=signature|(bits);
#define OUTBYTE1(msk,sig,val,shf) \
    OUTBYTE_EMIT(sig, x&val)
#define OUTBYTE2(msk0,sig0,val0,shf0, msk1,sig1,val1,shf1) \
    OUTBYTE_EMIT(sig0, (x>>shf0)&val0) \
    OUTBYTE_EMIT(sig1, x&val1)
#define OUTBYTE3(msk0,sig0,val0,shf0, msk1,sig1,val1,shf1, msk2,sig2,val2,shf2) \
    OUTBYTE_EMIT(sig0, (x>>shf0)&val0) \
    OUTBYTE_EMIT(sig1, (x>>shf1)&val1) \
    OUTBYTE_EMIT(sig2, x&val2)
#define OUTBYTE4(msk0,sig0,val0,shf0, msk1,sig1,val1,shf1, msk2,sig2,val2,shf2, msk3,sig3,val3,shf3) \
    OUTBYTE_EMIT(sig0, (x>>shf0)&val0) \
    OUTBYTE_EMIT(sig1, (x>>shf1)&val1) \
    OUTBYTE_EMIT(sig2, (x>>shf2)&val2) \
    OUTBYTE_EMIT(sig3, x&val3)
/* Table callback for the encoder: invoke the appropriate OUTBYTE macro when
   ar[i] lies in the row's range.
     a, z - first and last UCS value of the row; both bounds are inclusive,
            so the last value of each range (0x7F, 0x7FF, 0xFFFF, 0x10FFFF)
            is encoded with that row's length, and values below the first
            row's lower bound match no row at all
     n    - byte count, pasted onto OUTBYTE to name the macro to invoke
     ...  - the per-byte quadruples, forwarded unchanged
   The expanding scope must provide ar and i; x is defined here for the
   OUTBYTE macros.  The leading `else` links successive rows into one
   if/else-if chain.
*/
#define UCS_CASE(a,z,n,...) \
else if (ar[i]>=(a) && ar[i]<=(z)) { int_least32_t x=ar[i]; OUTBYTE##n(__VA_ARGS__) }
/* Encode an array of UCS-4 values as a zero-terminated UTF-8 string.
     ar - input array
     n  - number of elements in ar
     an - if non-null, receives the number of bytes written, not counting
          the terminating zero byte
   Allocates 4 bytes per input value plus one for the terminator (4 is the
   longest sequence in the table), walks the input encoding each value with
   the table-driven if/else chain, and returns the buffer.  Values matching
   no table row produce no output.
   Returns a null pointer, leaving *an untouched, if n is negative, the
   buffer size is not representable, or malloc fails.
   The locals p and i are named as the UCS_CASE/OUTBYTE macros expect.
*/
char *utf8_ucs4(int_least32_t *ar, int n, int *an){
    char *p,*buf;
    /* compute the size in size_t and refuse counts that would wrap */
    if (n<0 || (size_t)n>(SIZE_MAX-1)/4)
        return NULL;
    buf=p=malloc((size_t)n*4+1);
    if (buf) {
        int i;
        for (i=0; i<n; i++) {
            if (0); UCS_RANGE(UCS_CASE)
        }
        if (an)
            *an=(int)(p-buf);
        *p=0;
    }
    return buf;
}
And here's the same code processed with `cpp -P` and
`indent -gnu -i4 -br -ce -cdw -nbc -brf -brs -l100 -bbo`.
It's certainly smaller, but now it's chock-full of magic numbers.
/* Prototypes as they appear after preprocessing:
   ucs4_utf8 decodes UTF-8 bytes into UCS-4 values,
   utf8_ucs4 encodes UCS-4 values as UTF-8 bytes. */
int_least32_t *ucs4_utf8 (char *str, int n);
char *utf8_ucs4 (int_least32_t * ar, int n, int *an);
/* Decode a zero-terminated UTF-8 string into UCS-4 values.
     str - input bytes; decoding stops at the first zero byte
     n   - number of elements to allocate for the result
   Returns a malloc'ed array holding the decoded values, or a null pointer
   if n is negative, the size is not representable, or malloc fails.  The
   number of values stored is not reported to the caller.

   A sequence is decoded only when its lead byte and all of its continuation
   bytes (10xxxxxx) have the expected header bits.  A zero byte is never a
   continuation byte and && short-circuits, so a truncated sequence cannot
   make the scan run past the terminator.  Bytes that start no valid
   sequence are skipped.  Decoding also stops once n values have been
   stored, so an n that is too small for the input cannot overflow the
   buffer.  */
int_least32_t *
ucs4_utf8 (char *str, int n) {
    /* unsigned bytes: masking and shifting never see a negative value */
    const unsigned char *p = (const unsigned char *) str;
    int_least32_t *u, *buf;

    if (n < 0 || (size_t) n > SIZE_MAX / sizeof *u)
        return NULL;
    buf = u = malloc (n * sizeof *u);
    if (buf) {
        for (; *p && u - buf < n; p++) {
            if ((*p & 0x80) == 0x00) {
                *u++ = *p & 0x7F;
            } else if ((*p & 0xE0) == 0xC0 && (p[1] & 0xC0) == 0x80) {
                *u++ = ((int_least32_t) (*p & 0x1F) << 6) | (p[1] & 0x3F);
                ++p;
            } else if ((*p & 0xF0) == 0xE0 && (p[1] & 0xC0) == 0x80 && (p[2] & 0xC0) == 0x80) {
                *u++ = ((int_least32_t) (*p & 0x0F) << 12)
                    | ((int_least32_t) (p[1] & 0x3F) << 6) | (p[2] & 0x3F);
                p += 2;
            } else if ((*p & 0xF8) == 0xF0 && (p[1] & 0xC0) == 0x80 && (p[2] & 0xC0) == 0x80
                       && (p[3] & 0xC0) == 0x80) {
                /* the lead byte is masked with 0x07, not multiplied by it */
                *u++ = ((int_least32_t) (*p & 0x07) << 18)
                    | ((int_least32_t) (p[1] & 0x3F) << 12)
                    | ((int_least32_t) (p[2] & 0x3F) << 6) | (p[3] & 0x3F);
                p += 3;
            }
        }
    }
    return buf;
}
/* Encode an array of UCS-4 values as a zero-terminated UTF-8 string.
     ar - input array
     n  - number of elements in ar
     an - if non-null, receives the number of bytes written, not counting
          the terminating zero byte
   Returns a malloc'ed buffer of at most 4 bytes per input value plus the
   terminator, or a null pointer (leaving *an untouched) if n is negative,
   the buffer size is not representable, or malloc fails.

   The range bounds are inclusive, so 0x7F, 0x7FF, 0xFFFF and 0x10FFFF are
   encoded with the shortest form.  Values outside 0..0x10FFFF produce no
   output.  */
char *
utf8_ucs4 (int_least32_t * ar, int n, int *an) {
    char *buf;
    unsigned char *p;
    int i;

    /* compute the size in size_t and refuse counts that would wrap */
    if (n < 0 || (size_t) n > (SIZE_MAX - 1) / 4)
        return NULL;
    buf = malloc ((size_t) n * 4 + 1);
    if (!buf)
        return NULL;

    /* write through unsigned char so bytes >= 0x80 are stored as-is */
    p = (unsigned char *) buf;
    for (i = 0; i < n; i++) {
        int_least32_t x = ar[i];
        if (x >= 0x00000000 && x <= 0x0000007F) {
            *p++ = (unsigned char) (0x00 | (x & 0x7F));
        } else if (x >= 0x00000080 && x <= 0x000007FF) {
            *p++ = (unsigned char) (0xC0 | ((x >> 6) & 0x1F));
            *p++ = (unsigned char) (0x80 | (x & 0x3F));
        } else if (x >= 0x00000800 && x <= 0x0000FFFF) {
            *p++ = (unsigned char) (0xE0 | ((x >> 12) & 0x0F));
            *p++ = (unsigned char) (0x80 | ((x >> 6) & 0x3F));
            *p++ = (unsigned char) (0x80 | (x & 0x3F));
        } else if (x >= 0x00010000 && x <= 0x0010FFFF) {
            *p++ = (unsigned char) (0xF0 | ((x >> 18) & 0x07));
            *p++ = (unsigned char) (0x80 | ((x >> 12) & 0x3F));
            *p++ = (unsigned char) (0x80 | ((x >> 6) & 0x3F));
            *p++ = (unsigned char) (0x80 | (x & 0x3F));
        }
    }
    if (an)
        *an = (int) (p - (unsigned char *) buf);
    *p = 0;
    return buf;
}