Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.

Dismiss

feature share : arbitrary-byte-safe array splitting in gawk-unicode mode

24 views

Skip to first unread message

Kpop 2GM

unread,

Jun 4, 2022, 3:49:56 AM6/4/22

GNU gawk would normally generate warning messages when UTF-8-unsafe bytes are passed to functions like length(), index(), or match(). At the same time, the default array-splitting methodology involves either

1. keeping UTF-8 characters intact, which adds an extra layer of complexity if you want to perform operations on the bytes, e.g. URL-encoding or base64 encoding (without having to resort to LC_ALL=C),

or

2. padding custom SEPs between every single byte, which wastes an individual array cell on each and every ASCII-safe alphanumeric, when it's perfectly safe to take them out as a chunk and substring out what you need.

here's a scripting level solution that doesn't require custom gawk-extension libraries, allowing one to handle ANY arbitrary combination of binary bytes, in gawk-unicode mode, without triggering any warning messages, nor needing to manually suppress them.

The BAU way takes 26 array cells, and you still have to deal with manually splitting up the UTF-8 characters.

The new approach needs 20 cells, and also provides useful metadata to the coder -

e.g. one can quickly identify array index # 4-7 as overly-long 3-byte sequence,

| [ 4] { 1 } <( { \354 EC # utf-8 3-bytes lead } )>
| [ 5] { 1 } <( { \210 88 # utf-8 cont-byte } )>
| [ 6] { 1 } <( { \267 B7 # utf-8 cont-byte } )>
| [ 7] { 1 } <( { \273 BB # utf-8 cont-byte } )>

so together, it constitutes 1 valid UTF-8 3-byte character, specifically, U+C237, plus one extra utf-8 invalid byte.

* caveat 1 : it acts up when using gawk -P flag, but should be mostly safe in gawk -e.

* caveat 2 : it's not 100% fool-proof, in the sense I've only included 3 different custom-SEP pairs without using any of the named ones ,

so you'll need to modify it if you absolutely need it to handle any arbitrary input without the SEPs conflicting with the actual input data and leading to unexpected results.

***** the 0x dfxx only look like that cuz of legacy code-pages. they're individual 8-bit bytes .

-- 0x df80 ( 57216 ) :: [ Ä ]
-- 0x df81 ( 57217 ) :: [ Å ]
-- 0x df82 ( 57218 ) :: [ Ç ]
-- 0x df83 ( 57219 ) :: [ É ]
-- 0x df84 ( 57220 ) :: [ Ñ ]
-- 0x df85 ( 57221 ) :: [ Ö ]
-- 0x df86 ( 57222 ) :: [ Ü ]
-- 0x df87 ( 57223 ) :: [ á ]
-- 0x df88 ( 57224 ) :: [ à ]
-- 0x df89 ( 57225 ) :: [ â ]
-- 0x df8a ( 57226 ) :: [ ä ]
-- 0x df8b ( 57227 ) :: [ ã ]
-- 0x df8c ( 57228 ) :: [ å ]
-- 0x df8d ( 57229 ) :: [ ç ]
-- 0x df8e ( 57230 ) :: [ é ]
-- 0x df8f ( 57231 ) :: [ è ]
-- 0x df90 ( 57232 ) :: [ ê ]
-- 0x df91 ( 57233 ) :: [ ë ]
-- 0x df92 ( 57234 ) :: [ í ]
-- 0x df93 ( 57235 ) :: [ ì ]
-- 0x df94 ( 57236 ) :: [ î ]
-- 0x df95 ( 57237 ) :: [ ï ]
-- 0x df96 ( 57238 ) :: [ ñ ]
-- 0x df97 ( 57239 ) :: [ ó ]
-- 0x df98 ( 57240 ) :: [ ò ]
-- 0x df99 ( 57241 ) :: [ ô ]
-- 0x df9a ( 57242 ) :: [ ö ]
-- 0x df9b ( 57243 ) :: [ õ ]
-- 0x df9c ( 57244 ) :: [ ú ]
-- 0x df9d ( 57245 ) :: [ ù ]
-- 0x df9e ( 57246 ) :: [ û ]
-- 0x df9f ( 57247 ) :: [ ü ]
-- 0x dfa0 ( 57248 ) :: [ † ]
-- 0x dfa1 ( 57249 ) :: [ ° ]
-- 0x dfa2 ( 57250 ) :: [ ¢ ]
-- 0x dfa3 ( 57251 ) :: [ £ ]
-- 0x dfa4 ( 57252 ) :: [ § ]
-- 0x dfa5 ( 57253 ) :: [ • ]
-- 0x dfa6 ( 57254 ) :: [ ¶ ]
-- 0x dfa7 ( 57255 ) :: [ ß ]
-- 0x dfa8 ( 57256 ) :: [ ® ]
-- 0x dfa9 ( 57257 ) :: [ © ]
-- 0x dfaa ( 57258 ) :: [ ™ ]
-- 0x dfab ( 57259 ) :: [ ´ ]
-- 0x dfac ( 57260 ) :: [ ¨ ]
-- 0x dfad ( 57261 ) :: [ ≠ ]
-- 0x dfae ( 57262 ) :: [ Æ ]
-- 0x dfaf ( 57263 ) :: [ Ø ]
-- 0x dfb0 ( 57264 ) :: [ ∞ ]
-- 0x dfb1 ( 57265 ) :: [ ± ]
-- 0x dfb2 ( 57266 ) :: [ ≤ ]
-- 0x dfb3 ( 57267 ) :: [ ≥ ]
-- 0x dfb4 ( 57268 ) :: [ ¥ ]
-- 0x dfb5 ( 57269 ) :: [ µ ]
-- 0x dfb6 ( 57270 ) :: [ ∂ ]
-- 0x dfb7 ( 57271 ) :: [ ∑ ]
-- 0x dfb8 ( 57272 ) :: [ ∏ ]
-- 0x dfb9 ( 57273 ) :: [ π ]
-- 0x dfba ( 57274 ) :: [ ∫ ]
-- 0x dfbb ( 57275 ) :: [ ª ]
-- 0x dfbc ( 57276 ) :: [ º ]
-- 0x dfbd ( 57277 ) :: [ Ω ]
-- 0x dfbe ( 57278 ) :: [ æ ]
-- 0x dfbf ( 57279 ) :: [ ø ]
-- 0x dfc0 ( 57280 ) :: [ ¿ ]
-- 0x dfc1 ( 57281 ) :: [ ¡ ]
-- 0x dfc2 ( 57282 ) :: [ ¬ ]
-- 0x dfc3 ( 57283 ) :: [ √ ]
-- 0x dfc4 ( 57284 ) :: [ ƒ ]
-- 0x dfc5 ( 57285 ) :: [ ≈ ]
-- 0x dfc6 ( 57286 ) :: [ ∆ ]
-- 0x dfc7 ( 57287 ) :: [ « ]
-- 0x dfc8 ( 57288 ) :: [ » ]
-- 0x dfc9 ( 57289 ) :: [ … ]
-- 0x dfca ( 57290 ) :: [ ]
-- 0x dfcb ( 57291 ) :: [ À ]
-- 0x dfcc ( 57292 ) :: [ Ã ]
-- 0x dfcd ( 57293 ) :: [ Õ ]
-- 0x dfce ( 57294 ) :: [ Œ ]
-- 0x dfcf ( 57295 ) :: [ œ ]
-- 0x dfd0 ( 57296 ) :: [ – ]
-- 0x dfd1 ( 57297 ) :: [ — ]
-- 0x dfd2 ( 57298 ) :: [ “ ]
-- 0x dfd3 ( 57299 ) :: [ ” ]
-- 0x dfd4 ( 57300 ) :: [ ‘ ]
-- 0x dfd5 ( 57301 ) :: [ ’ ]
-- 0x dfd6 ( 57302 ) :: [ ÷ ]
-- 0x dfd7 ( 57303 ) :: [ ◊ ]
-- 0x dfd8 ( 57304 ) :: [ ÿ ]
-- 0x dfd9 ( 57305 ) :: [ Ÿ ]
-- 0x dfda ( 57306 ) :: [ ⁄ ]
-- 0x dfdb ( 57307 ) :: [ € ]
-- 0x dfdc ( 57308 ) :: [ ‹ ]
-- 0x dfdd ( 57309 ) :: [ › ]
-- 0x dfde ( 57310 ) :: [ ﬁ ]
-- 0x dfdf ( 57311 ) :: [ ﬂ ]
-- 0x dfe0 ( 57312 ) :: [ ‡ ]
-- 0x dfe1 ( 57313 ) :: [ · ]
-- 0x dfe2 ( 57314 ) :: [ ‚ ]
-- 0x dfe3 ( 57315 ) :: [ „ ]
-- 0x dfe4 ( 57316 ) :: [ ‰ ]
-- 0x dfe5 ( 57317 ) :: [ Â ]
-- 0x dfe6 ( 57318 ) :: [ Ê ]
-- 0x dfe7 ( 57319 ) :: [ Á ]
-- 0x dfe8 ( 57320 ) :: [ Ë ]
-- 0x dfe9 ( 57321 ) :: [ È ]
-- 0x dfea ( 57322 ) :: [ Í ]
-- 0x dfeb ( 57323 ) :: [ Î ]
-- 0x dfec ( 57324 ) :: [ Ï ]
-- 0x dfed ( 57325 ) :: [ Ì ]
-- 0x dfee ( 57326 ) :: [ Ó ]
-- 0x dfef ( 57327 ) :: [ Ô ]
-- 0x dff0 ( 57328 ) :: [  ]
-- 0x dff1 ( 57329 ) :: [ Ò ]
-- 0x dff2 ( 57330 ) :: [ Ú ]
-- 0x dff3 ( 57331 ) :: [ Û ]
-- 0x dff4 ( 57332 ) :: [ Ù ]
-- 0x dff5 ( 57333 ) :: [ ı ]
-- 0x dff6 ( 57334 ) :: [ ˆ ]
-- 0x dff7 ( 57335 ) :: [ ˜ ]
-- 0x dff8 ( 57336 ) :: [ ¯ ]
-- 0x dff9 ( 57337 ) :: [ ˘ ]
-- 0x dffa ( 57338 ) :: [ ˙ ]
-- 0x dffb ( 57339 ) :: [ ˚ ]
-- 0x dffc ( 57340 ) :: [ ¸ ]
-- 0x dffd ( 57341 ) :: [ ˝ ]
-- 0x dffe ( 57342 ) :: [ ˛ ]
-- 0x dfff ( 57343 ) :: [ ˇ ]

{ \0 # >NULL-byte }
{ \a \7 # >BEL/alert }
{ \b \10 # >backspc }
{ \t \11 # >h-TAB }
{ \n \12 # >NL line-feed }
{ \v \13 # >v-TAB }
{ \f \14 # >FF form-feed }
{ \r \15 # >\r\n most common }
{ \33 # >ESCAPE; \e awk-invalid }
{ \34 # >SUBSEP-def. }
{ \177 # >DELETE }
{ \37 1F # [:cntrl:] }
{ \36 1E # [:cntrl:] }
{ \35 1D # [:cntrl:] }
{ \32 1A # [:cntrl:] }
{ \31 19 # [:cntrl:] }
{ \30 18 # [:cntrl:] }
{ \27 17 # [:cntrl:] }
{ \26 16 # [:cntrl:] }
{ \25 15 # [:cntrl:] }
{ \24 14 # [:cntrl:] }
{ \23 13 # [:cntrl:] }
{ \22 12 # [:cntrl:] }
{ \21 11 # [:cntrl:] }
{ \20 10 # [:cntrl:] }
{ \17 0F # [:cntrl:] }
{ \16 0E # [:cntrl:] }
{ \06 06 # [:cntrl:] }
{ \05 05 # [:cntrl:] }
{ \04 04 # [:cntrl:] }
{ \03 03 # [:cntrl:] }
{ \02 02 # [:cntrl:] }
{ \01 01 # [:cntrl:] }
161
input length( abc
12%3=숷??:??5@XYZ6 ~~~ 30 bytes

| [ 1] { 1 } <( "a" )>
| [ 2] { 1 } <( "b" )>
| [ 3] { 1 } <( "c" )>
| [ 4] { 1 } <( { \f \14 # >FF form-feed } )>
| [ 5] { 1 } <( "1" )>
| [ 6] { 1 } <( "2" )>
| [ 7] { 1 } <( "%" )>
| [ 8] { 1 } <( "3" )>
| [ 9] { 1 } <( "=" )>
| [ 10] { 3 } <( "숷" )>
| [ 11] { 1 } <( { \273 BB # utf-8 cont-byte } )>
| [ 12] { 1 } <( { \33 # >ESCAPE; \e awk-invalid } )>
| [ 13] { 1 } <( "+" )>
| [ 14] { 1 } <( { \300 C0 # utf-8 INVALID } )>
| [ 15] { 1 } <( { \255 AD # utf-8 cont-byte } )>
| [ 16] { 1 } <( ":" )>
| [ 17] { 1 } <( { \374 FC # utf-8 INVALID } )>
| [ 18] { 1 } <( { \27 17 # [:cntrl:] } )>
| [ 19] { 1 } <( { \344 E4 # utf-8 3-bytes lead } )>
| [ 20] { 1 } <( "5" )>
| [ 21] { 3 } <( "" )>
| [ 22] { 1 } <( "@" )>
| [ 23] { 1 } <( "X" )>
| [ 24] { 1 } <( "Y" )>
| [ 25] { 1 } <( "Z" )>
| [ 26] { 1 } <( "6" )>

.
_____AFTER_NEW_SPLITTING_____
.
.
.

| [ 1] { 3 } <( "abc" )>
| [ 2] { 1 } <( { \f \14 # >FF form-feed } )>
| [ 3] { 5 } <( "12%3=" )>
| [ 4] { 1 } <( { \354 EC # utf-8 3-bytes lead } )>
| [ 5] { 1 } <( { \210 88 # utf-8 cont-byte } )>
| [ 6] { 1 } <( { \267 B7 # utf-8 cont-byte } )>
| [ 7] { 1 } <( { \273 BB # utf-8 cont-byte } )>
| [ 8] { 1 } <( { \33 # >ESCAPE; \e awk-invalid } )>
| [ 9] { 1 } <( "+" )>
| [ 10] { 1 } <( { \300 C0 # utf-8 INVALID } )>
| [ 11] { 1 } <( { \255 AD # utf-8 cont-byte } )>
| [ 12] { 1 } <( ":" )>
| [ 13] { 1 } <( { \374 FC # utf-8 INVALID } )>
| [ 14] { 1 } <( { \27 17 # [:cntrl:] } )>
| [ 15] { 1 } <( { \344 E4 # utf-8 3-bytes lead } )>
| [ 16] { 1 } <( "5" )>
| [ 17] { 1 } <( { \357 EF # utf-8 3-bytes lead } )>
| [ 18] { 1 } <( { \243 A3 # utf-8 cont-byte } )>
| [ 19] { 1 } <( { \277 BF # utf-8 cont-byte } )>
| [ 20] { 5 } <( "@XYZ6" )>

# gawk profile, created Sat Jun 4 03:08:09 2022

LC_ALL= LANG="en_US.UTF-8" gawk -d- -p- -e '

# BEGIN rule(s)

# Demo driver: builds the byte-description table via initOct(), then splits
# one sample string twice -- first with the stock split(_, __, "") and then
# with a separator-injection scheme -- printing each resulting array cell.
# NOTE: this is gawk profiler output; the leading numbers on statements are
# execution counts, not part of the program.
BEGIN {
# initOct() fills the global octalRE_L table and returns its element count.
1 print initOct()
1 _ = ""
# Regex matching a string made only of printable ASCII (space through "~").
1 _____ = "^[ -~]*$"
# Assign the sample input to _ and print it with its length: length(_) when
# the string is pure printable ASCII, otherwise match(_, "$") - 1, which the
# output labels as bytes.
# NOTE(review): /^[ -~]*?/ is anchored only at the start and can match the
# empty string, so that second test looks always-true -- confirm intent.
1 print " input length( ",
(_ = "abc\f12%3=\354\210\267\273\033+"\
"\300\255:\374\027\3445\357\243\277@XYZ6") (" ~~~ "),
((_) ~ (_____)) && (_ ~ /^[ -~]*?/) \
? length(_) " utf8 chars" \
: (match(_, "$") - 1) " bytes\n\n"

# Baseline: split with an empty separator (26 cells per the profile count).
1 split(_, __, "")
# Print index, measured length, and either the octalRE_L description of the
# cell (when the cell is a key in that table) or the cell itself in quotes.
# NOTE(review): awk does not define for-in traversal order -- the output
# shown with this profile happens to be in index order.
26 for (____ in __) {
26 printf " | [%3.f] { %3.f } <( %45.45s )> \n", ____,
((___ = __[____]) ~ _____) && (___ ~ /^[ -~]*?/) ? length(___) : match(___, "$")-1,
(___ in octalRE_L) ? octalRE_L[___] : "\"" (___) "\""
}

1 printf ("\n .\n _____AFTER_NEW_SPLITTING_____\n .\n .\n .\n\n")

# Pick two separators out of control bytes: for each, take the first
# candidate that does not occur in the input. The third candidate of each
# list is used without being checked against the input.
1 ___ = (_) !~ (___ = "\31") ? (___) : _ !~ ( ___ = "\24\23") ? ___ : "\5\4\32\1"
1 _____ = _ !~ (_____ = "\25") ? _____ : _ !~ (_____="\35\17") ? _____ : "\26\36\6\16"

# Step 1: append the primary separator ___ at every empty-regex match.
1 gsub("", ("&") ___, _)
# Step 2: append one more ___ after each run of (printable-ASCII + ___), so
# a separator remains at the end of the run after step 4.
1 gsub("([ -~]" (___) ")+", ("&") ___, _)
# Step 3: append the secondary separator _____ after each printable ASCII char.
1 gsub("[ -~]", ("&") _____, _)
# Step 4: delete a leading ___, every "_____ ___" pair, and a trailing ___;
# this removes the separators inside printable-ASCII runs.
1 gsub("^" (___) "|" (_____) (___) "|" (___) "$", "", _)
# Split on the primary separator (20 cells per the profile count).
1 split(_, __, ___)
# _____ was reused as a separator above; restore the printable-ASCII regex.
1 _____ = "^[ -~]*$"

# Same report as the first loop, now over the chunked array.
20 for (____ in __) {
20 printf " | [%3.f] { %3.f } <( %45.45s )> \n", ____,
(___=__[____]) ~_____ ? length(___) : match(___, "$")-1,
(___ in octalRE_L) ? octalRE_L[___] : "\"" (___) "\""
}
}

# Functions, listed alphabetically

# initOct: populate the global array octalRE_L with a printable description
# for each high byte and each control character, printing the entries as they
# are created, and return the number of elements in octalRE_L.
# All five parameters serve as local variables; the caller in BEGIN passes no
# arguments.
# Side effects: writes octalRE_L, prints to stdout, sets OFS to ORS and then
# to FS (the OFS value from before the call is not saved).
1 function initOct(_, __, ___, ____, _____)
{
# The next four statements build constants from an unset local by arithmetic.
# Read left to right they give __ = 57343 (then 57344 after ++__),
# ___ = -128 and _ = 256, which agrees with the printed table starting at
# 0x df80 (57216) and the 128 loop iterations in the profile.
# NOTE(review): the result depends on operand evaluation order -- confirm on
# the gawk version in use.
1 __ = -(_ ^= _ < _) + (++_ - ++_ ^ _ + _ ^ ++_) * (_ ^ ++_)
1 ___ = -(_-- ^ --_ + _)
1 _ ^= ++_
1 ++__
# One pass per value of ___ from -128 up to -1: ___ + _ is the number shown
# in octal/hex (128..255) and __ + ___ is the number given to %c.
128 do {
128 ____ = ""
# Print the number in hex and decimal plus the %c result, and keep that
# result in ____ as the table key.
# NOTE(review): per the accompanying post these %c results are individual
# 8-bit bytes -- not verifiable from this listing alone.
128 printf " -- 0x %4x ( %5.f ) :: [ %3.1s ]\n", __ + ___, __ + ___, ____ = sprintf("%c", __ + ___)
# Describe the key by string comparison against octal boundaries:
# below \300 cont-byte; below \340 INVALID if below \302 else 2-bytes lead;
# below \365 3-bytes lead if below \360 else 4-bytes lead; otherwise INVALID.
128 octalRE_L[____] = \
sprintf("{ \\%03o %.2X # %-20.25s }", ___ + _, _ + ___,
("utf-8 ") ((____ < "\300") ? "cont-byte" \
: (____ < "\340") ? ((____ < "\302") ? "INVALID" : "2-bytes lead") \
: (____ < "\365") ? (4 - (____ < "\360")) "-bytes lead" : "INVALID" ))
128 ____ = ""
} while (++___ < -___)
# With OFS set to ORS, each print argument below lands on its own line.
1 OFS = ORS
# Register and print hand-written descriptions for the named control
# characters; each argument is an assignment whose value is also printed.
1 print "", octalRE_L["\0"] = "{ \\0 # >NULL-byte }",
octalRE_L["\a"] = "{ \\a \\7 # >BEL/alert }",
octalRE_L["\b"] = "{ \\b \\10 # >backspc }",
octalRE_L["\t"] = "{ \\t \\11 # >h-TAB }",
octalRE_L["\n"] = "{ \\n \\12 # >NL line-feed }",
octalRE_L["\v"] = "{ \\v \\13 # >v-TAB }",
octalRE_L["\f"] = "{ \\f \\14 # >FF form-feed }",
octalRE_L["\r"] = "{ \\r \\15 # >\\r\\n most common }",

octalRE_L["\33"] = "{ \\33 # >ESCAPE; \\e awk-invalid }",
octalRE_L["\34"] = "{ \\34 # >SUBSEP-def. }",
octalRE_L["\177"] = "{ \\177 # >DELETE }"
1 _ = ""
1 OFS = FS
# Count down from 31 to 1 (the initializer evaluates to 31, matching the
# profile count) and add a generic "[:cntrl:]" entry for every character
# that has no entry yet.
31 for (_ = (_ += _ ^= _ < _) ^ _ - +-(++_ ^ _); _; _--) {
31 if (! ((__ = sprintf("%c", _)) in octalRE_L)) { # 22
22 print octalRE_L[__] = sprintf("{ \\%.2o %.2X # [:cntrl:] }", _, _)
}
}
# Count the table entries (161 in the profiled run) and return the count.
1 _ = ""
161 for (__ in octalRE_L) {
161 ++_
}
1 return +_
}

ARGC: 1
ARGIND: 0
ARGV: array, 1 elements
BINMODE: 0
CONVFMT: "%.6g"
ENVIRON: array, 123 elements
ERRNO: ""
FIELDWIDTHS: ""
FILENAME: ""
FNR: 0
FPAT: "[^[:space:]]+"
FS: " "
FUNCTAB: array, 42 elements
IGNORECASE: 0
LINT: 0
NF: 0
NR: 0
OFMT: "%.6g"
OFS: " "
ORS: "\n"
PREC: 53
PROCINFO: array, 35 elements
RLENGTH: 0
ROUNDMODE: "N"
RS: "\n"
RSTART: 2
RT: ""
SUBSEP: "\034"
SYMTAB: array, 34 elements
TEXTDOMAIN: "messages"
_: "abc\031\f\03112%3=\031\354\031\210\031\267\031\273\031\033\031+\031\300\031\255\031:\031\374\031\027\031\344\0315\031\357\031\243\031\277\031@XYZ6"
__: array, 20 elements
___: "@XYZ6"
____: 20
_____: "^[ -~]*$"
octalRE_L: array, 161 elements

0 new messages