Kpop 2GM
unread,Jun 4, 2022, 3:49:56 AM6/4/22You do not have permission to delete messages in this group
Sign in to report message
Either email addresses are anonymous for this group or you need the view member email addresses permission to view the original message
to
GNU awk (gawk) normally generates warning messages when UTF-8-unsafe bytes are used in functions like length(), index(), or match(). At the same time, the default array-splitting methodology involves either
1. keeping UTF-8 characters intact, which adds an extra layer of complexity if you want to perform operations on the bytes, e.g. URL-encoding or base64 encoding (without having to resort to LC_ALL=C),
or
2. padding custom SEPs between every single byte, and wasting individual array cells for each and every ASCII safe alphanumeric when it's perfectly safe to take a chunk of them out, and substring out what you need.
Here's a scripting-level solution that doesn't require custom gawk extension libraries, allowing one to handle ANY arbitrary combination of binary bytes in gawk's Unicode mode, without triggering any warning messages or needing to manually suppress them.
The BAU way takes 26 array cells, and you still have to deal with manually splitting up the UTF-8 characters.
The new approach needs 20 cells, and also provides useful metadata to the coder -
e.g. one can quickly identify array indices 4-7 as a 3-byte lead followed by three continuation bytes, i.e. one continuation byte too many,
| [ 4] { 1 } <( { \354 EC # utf-8 3-bytes lead } )>
| [ 5] { 1 } <( { \210 88 # utf-8 cont-byte } )>
| [ 6] { 1 } <( { \267 B7 # utf-8 cont-byte } )>
| [ 7] { 1 } <( { \273 BB # utf-8 cont-byte } )>
so together, they constitute one valid UTF-8 3-byte character, specifically U+C237, plus one extra byte that is not valid UTF-8 on its own.
* caveat 1 : it acts up when using gawk -P flag, but should be mostly safe in gawk -e.
* caveat 2 : it's not 100% fool-proof, in the sense I've only included 3 different custom-SEP pairs without using any of the named ones ,
so you'll need to modify it if you absolutely need it to handle any arbitrary input without the SEPs conflicting with the actual input data and leading to unexpected results.
***** the 0x dfxx values only look like that because of legacy code pages; they're individual 8-bit bytes.
-- 0x df80 ( 57216 ) :: [ Ä ]
-- 0x df81 ( 57217 ) :: [ Å ]
-- 0x df82 ( 57218 ) :: [ Ç ]
-- 0x df83 ( 57219 ) :: [ É ]
-- 0x df84 ( 57220 ) :: [ Ñ ]
-- 0x df85 ( 57221 ) :: [ Ö ]
-- 0x df86 ( 57222 ) :: [ Ü ]
-- 0x df87 ( 57223 ) :: [ á ]
-- 0x df88 ( 57224 ) :: [ à ]
-- 0x df89 ( 57225 ) :: [ â ]
-- 0x df8a ( 57226 ) :: [ ä ]
-- 0x df8b ( 57227 ) :: [ ã ]
-- 0x df8c ( 57228 ) :: [ å ]
-- 0x df8d ( 57229 ) :: [ ç ]
-- 0x df8e ( 57230 ) :: [ é ]
-- 0x df8f ( 57231 ) :: [ è ]
-- 0x df90 ( 57232 ) :: [ ê ]
-- 0x df91 ( 57233 ) :: [ ë ]
-- 0x df92 ( 57234 ) :: [ í ]
-- 0x df93 ( 57235 ) :: [ ì ]
-- 0x df94 ( 57236 ) :: [ î ]
-- 0x df95 ( 57237 ) :: [ ï ]
-- 0x df96 ( 57238 ) :: [ ñ ]
-- 0x df97 ( 57239 ) :: [ ó ]
-- 0x df98 ( 57240 ) :: [ ò ]
-- 0x df99 ( 57241 ) :: [ ô ]
-- 0x df9a ( 57242 ) :: [ ö ]
-- 0x df9b ( 57243 ) :: [ õ ]
-- 0x df9c ( 57244 ) :: [ ú ]
-- 0x df9d ( 57245 ) :: [ ù ]
-- 0x df9e ( 57246 ) :: [ û ]
-- 0x df9f ( 57247 ) :: [ ü ]
-- 0x dfa0 ( 57248 ) :: [ † ]
-- 0x dfa1 ( 57249 ) :: [ ° ]
-- 0x dfa2 ( 57250 ) :: [ ¢ ]
-- 0x dfa3 ( 57251 ) :: [ £ ]
-- 0x dfa4 ( 57252 ) :: [ § ]
-- 0x dfa5 ( 57253 ) :: [ • ]
-- 0x dfa6 ( 57254 ) :: [ ¶ ]
-- 0x dfa7 ( 57255 ) :: [ ß ]
-- 0x dfa8 ( 57256 ) :: [ ® ]
-- 0x dfa9 ( 57257 ) :: [ © ]
-- 0x dfaa ( 57258 ) :: [ ™ ]
-- 0x dfab ( 57259 ) :: [ ´ ]
-- 0x dfac ( 57260 ) :: [ ¨ ]
-- 0x dfad ( 57261 ) :: [ ≠ ]
-- 0x dfae ( 57262 ) :: [ Æ ]
-- 0x dfaf ( 57263 ) :: [ Ø ]
-- 0x dfb0 ( 57264 ) :: [ ∞ ]
-- 0x dfb1 ( 57265 ) :: [ ± ]
-- 0x dfb2 ( 57266 ) :: [ ≤ ]
-- 0x dfb3 ( 57267 ) :: [ ≥ ]
-- 0x dfb4 ( 57268 ) :: [ ¥ ]
-- 0x dfb5 ( 57269 ) :: [ µ ]
-- 0x dfb6 ( 57270 ) :: [ ∂ ]
-- 0x dfb7 ( 57271 ) :: [ ∑ ]
-- 0x dfb8 ( 57272 ) :: [ ∏ ]
-- 0x dfb9 ( 57273 ) :: [ π ]
-- 0x dfba ( 57274 ) :: [ ∫ ]
-- 0x dfbb ( 57275 ) :: [ ª ]
-- 0x dfbc ( 57276 ) :: [ º ]
-- 0x dfbd ( 57277 ) :: [ Ω ]
-- 0x dfbe ( 57278 ) :: [ æ ]
-- 0x dfbf ( 57279 ) :: [ ø ]
-- 0x dfc0 ( 57280 ) :: [ ¿ ]
-- 0x dfc1 ( 57281 ) :: [ ¡ ]
-- 0x dfc2 ( 57282 ) :: [ ¬ ]
-- 0x dfc3 ( 57283 ) :: [ √ ]
-- 0x dfc4 ( 57284 ) :: [ ƒ ]
-- 0x dfc5 ( 57285 ) :: [ ≈ ]
-- 0x dfc6 ( 57286 ) :: [ ∆ ]
-- 0x dfc7 ( 57287 ) :: [ « ]
-- 0x dfc8 ( 57288 ) :: [ » ]
-- 0x dfc9 ( 57289 ) :: [ … ]
-- 0x dfca ( 57290 ) :: [ ]
-- 0x dfcb ( 57291 ) :: [ À ]
-- 0x dfcc ( 57292 ) :: [ Ã ]
-- 0x dfcd ( 57293 ) :: [ Õ ]
-- 0x dfce ( 57294 ) :: [ Œ ]
-- 0x dfcf ( 57295 ) :: [ œ ]
-- 0x dfd0 ( 57296 ) :: [ – ]
-- 0x dfd1 ( 57297 ) :: [ — ]
-- 0x dfd2 ( 57298 ) :: [ “ ]
-- 0x dfd3 ( 57299 ) :: [ ” ]
-- 0x dfd4 ( 57300 ) :: [ ‘ ]
-- 0x dfd5 ( 57301 ) :: [ ’ ]
-- 0x dfd6 ( 57302 ) :: [ ÷ ]
-- 0x dfd7 ( 57303 ) :: [ ◊ ]
-- 0x dfd8 ( 57304 ) :: [ ÿ ]
-- 0x dfd9 ( 57305 ) :: [ Ÿ ]
-- 0x dfda ( 57306 ) :: [ ⁄ ]
-- 0x dfdb ( 57307 ) :: [ € ]
-- 0x dfdc ( 57308 ) :: [ ‹ ]
-- 0x dfdd ( 57309 ) :: [ › ]
-- 0x dfde ( 57310 ) :: [ fi ]
-- 0x dfdf ( 57311 ) :: [ fl ]
-- 0x dfe0 ( 57312 ) :: [ ‡ ]
-- 0x dfe1 ( 57313 ) :: [ · ]
-- 0x dfe2 ( 57314 ) :: [ ‚ ]
-- 0x dfe3 ( 57315 ) :: [ „ ]
-- 0x dfe4 ( 57316 ) :: [ ‰ ]
-- 0x dfe5 ( 57317 ) :: [ Â ]
-- 0x dfe6 ( 57318 ) :: [ Ê ]
-- 0x dfe7 ( 57319 ) :: [ Á ]
-- 0x dfe8 ( 57320 ) :: [ Ë ]
-- 0x dfe9 ( 57321 ) :: [ È ]
-- 0x dfea ( 57322 ) :: [ Í ]
-- 0x dfeb ( 57323 ) :: [ Î ]
-- 0x dfec ( 57324 ) :: [ Ï ]
-- 0x dfed ( 57325 ) :: [ Ì ]
-- 0x dfee ( 57326 ) :: [ Ó ]
-- 0x dfef ( 57327 ) :: [ Ô ]
-- 0x dff0 ( 57328 ) :: [ ]
-- 0x dff1 ( 57329 ) :: [ Ò ]
-- 0x dff2 ( 57330 ) :: [ Ú ]
-- 0x dff3 ( 57331 ) :: [ Û ]
-- 0x dff4 ( 57332 ) :: [ Ù ]
-- 0x dff5 ( 57333 ) :: [ ı ]
-- 0x dff6 ( 57334 ) :: [ ˆ ]
-- 0x dff7 ( 57335 ) :: [ ˜ ]
-- 0x dff8 ( 57336 ) :: [ ¯ ]
-- 0x dff9 ( 57337 ) :: [ ˘ ]
-- 0x dffa ( 57338 ) :: [ ˙ ]
-- 0x dffb ( 57339 ) :: [ ˚ ]
-- 0x dffc ( 57340 ) :: [ ¸ ]
-- 0x dffd ( 57341 ) :: [ ˝ ]
-- 0x dffe ( 57342 ) :: [ ˛ ]
-- 0x dfff ( 57343 ) :: [ ˇ ]
{ \0 # >NULL-byte }
{ \a \7 # >BEL/alert }
{ \b \10 # >backspc }
{ \t \11 # >h-TAB }
{ \n \12 # >NL line-feed }
{ \v \13 # >v-TAB }
{ \f \14 # >FF form-feed }
{ \r \15 # >\r\n most common }
{ \33 # >ESCAPE; \e awk-invalid }
{ \34 # >SUBSEP-def. }
{ \177 # >DELETE }
{ \37 1F # [:cntrl:] }
{ \36 1E # [:cntrl:] }
{ \35 1D # [:cntrl:] }
{ \32 1A # [:cntrl:] }
{ \31 19 # [:cntrl:] }
{ \30 18 # [:cntrl:] }
{ \27 17 # [:cntrl:] }
{ \26 16 # [:cntrl:] }
{ \25 15 # [:cntrl:] }
{ \24 14 # [:cntrl:] }
{ \23 13 # [:cntrl:] }
{ \22 12 # [:cntrl:] }
{ \21 11 # [:cntrl:] }
{ \20 10 # [:cntrl:] }
{ \17 0F # [:cntrl:] }
{ \16 0E # [:cntrl:] }
{ \06 06 # [:cntrl:] }
{ \05 05 # [:cntrl:] }
{ \04 04 # [:cntrl:] }
{ \03 03 # [:cntrl:] }
{ \02 02 # [:cntrl:] }
{ \01 01 # [:cntrl:] }
161
input length( abc
12%3=숷??:??5@XYZ6 ~~~ 30 bytes
| [ 1] { 1 } <( "a" )>
| [ 2] { 1 } <( "b" )>
| [ 3] { 1 } <( "c" )>
| [ 4] { 1 } <( { \f \14 # >FF form-feed } )>
| [ 5] { 1 } <( "1" )>
| [ 6] { 1 } <( "2" )>
| [ 7] { 1 } <( "%" )>
| [ 8] { 1 } <( "3" )>
| [ 9] { 1 } <( "=" )>
| [ 10] { 3 } <( "숷" )>
| [ 11] { 1 } <( { \273 BB # utf-8 cont-byte } )>
| [ 12] { 1 } <( { \33 # >ESCAPE; \e awk-invalid } )>
| [ 13] { 1 } <( "+" )>
| [ 14] { 1 } <( { \300 C0 # utf-8 INVALID } )>
| [ 15] { 1 } <( { \255 AD # utf-8 cont-byte } )>
| [ 16] { 1 } <( ":" )>
| [ 17] { 1 } <( { \374 FC # utf-8 INVALID } )>
| [ 18] { 1 } <( { \27 17 # [:cntrl:] } )>
| [ 19] { 1 } <( { \344 E4 # utf-8 3-bytes lead } )>
| [ 20] { 1 } <( "5" )>
| [ 21] { 3 } <( "" )>
| [ 22] { 1 } <( "@" )>
| [ 23] { 1 } <( "X" )>
| [ 24] { 1 } <( "Y" )>
| [ 25] { 1 } <( "Z" )>
| [ 26] { 1 } <( "6" )>
.
_____AFTER_NEW_SPLITTING_____
.
.
.
| [ 1] { 3 } <( "abc" )>
| [ 2] { 1 } <( { \f \14 # >FF form-feed } )>
| [ 3] { 5 } <( "12%3=" )>
| [ 4] { 1 } <( { \354 EC # utf-8 3-bytes lead } )>
| [ 5] { 1 } <( { \210 88 # utf-8 cont-byte } )>
| [ 6] { 1 } <( { \267 B7 # utf-8 cont-byte } )>
| [ 7] { 1 } <( { \273 BB # utf-8 cont-byte } )>
| [ 8] { 1 } <( { \33 # >ESCAPE; \e awk-invalid } )>
| [ 9] { 1 } <( "+" )>
| [ 10] { 1 } <( { \300 C0 # utf-8 INVALID } )>
| [ 11] { 1 } <( { \255 AD # utf-8 cont-byte } )>
| [ 12] { 1 } <( ":" )>
| [ 13] { 1 } <( { \374 FC # utf-8 INVALID } )>
| [ 14] { 1 } <( { \27 17 # [:cntrl:] } )>
| [ 15] { 1 } <( { \344 E4 # utf-8 3-bytes lead } )>
| [ 16] { 1 } <( "5" )>
| [ 17] { 1 } <( { \357 EF # utf-8 3-bytes lead } )>
| [ 18] { 1 } <( { \243 A3 # utf-8 cont-byte } )>
| [ 19] { 1 } <( { \277 BF # utf-8 cont-byte } )>
| [ 20] { 5 } <( "@XYZ6" )>
# gawk profile, created Sat Jun 4 03:08:09 2022
LC_ALL= LANG="en_US.UTF-8" gawk -d- -p- -e '
# BEGIN rule(s)
# Demo driver: builds the octalRE_L description table, then splits one test
# string twice -- first with split(_, __, ""), then with a separator-injection
# scheme -- and prints every resulting cell with its length and description.
# The leading numbers on each statement are gawk profiler execution counts.
# Variables: _ = test string, __ = cell array, ___ / _____ = scratch values
# that are reused as regex, separator and current cell, ____ = loop index.
BEGIN {
# initOct() fills octalRE_L (printing its entries as it goes) and returns
# the number of entries, which is printed here.
1 print initOct()
1 _ = ""
# Pattern for strings made only of characters in the range space..tilde.
1 _____ = "^[ -~]*$"
# Assign the test string to _ and print it followed by " ~~~ ". If _ matches
# the space..tilde pattern, report length(_) labelled "utf8 chars"; otherwise
# report match(_, "$") - 1 labelled "bytes".
1 print " input length( ",
(_ = "abc\f12%3=\354\210\267\273\033+"\
"\300\255:\374\027\3445\357\243\277@XYZ6") (" ~~~ "),
((_) ~ (_____)) && (_ ~ /^[ -~]*?/) \
? length(_) " utf8 chars" \
: (match(_, "$") - 1) " bytes\n\n"
# Baseline split with an empty separator; the profiler count on the loop
# below shows 26 cells for this input.
1 split(_, __, "")
# NOTE(review): awk does not guarantee for-in iteration order; the ordered
# listing appears to rely on gawk's behaviour for this array -- confirm.
26 for (____ in __) {
# Print the cell index, the cell's length (length() when it matches the
# space..tilde pattern, match(___, "$") - 1 otherwise) and either its
# octalRE_L description or the cell itself in double quotes.
26 printf " | [%3.f] { %3.f } <( %45.45s )> \n", ____,
((___ = __[____]) ~ _____) && (___ ~ /^[ -~]*?/) ? length(___) : match(___, "$")-1,
(___ in octalRE_L) ? octalRE_L[___] : "\"" (___) "\""
}
1 printf ("\n .\n _____AFTER_NEW_SPLITTING_____\n .\n .\n .\n\n")
# Pick the cell separator ___: the first of "\31", "\24\23" that _ does not
# match, falling back unconditionally to "\5\4\32\1".
1 ___ = (_) !~ (___ = "\31") ? (___) : _ !~ ( ___ = "\24\23") ? ___ : "\5\4\32\1"
# Pick a second marker _____ the same way from "\25", "\35\17",
# with "\26\36\6\16" as the unconditional fallback.
1 _____ = _ !~ (_____ = "\25") ? _____ : _ !~ (_____="\35\17") ? _____ : "\26\36\6\16"
# Step 1: append the separator to every match of the empty regex.
1 gsub("", ("&") ___, _)
# Step 2: after each run of (space..tilde char + separator) pairs, append one
# more separator.
1 gsub("([ -~]" (___) ")+", ("&") ___, _)
# Step 3: append the second marker after every space..tilde character.
1 gsub("[ -~]", ("&") _____, _)
# Step 4: delete a leading separator, every marker+separator pair, and a
# trailing separator.
1 gsub("^" (___) "|" (_____) (___) "|" (___) "$", "", _)
# Split on the separator; the profiler count on the loop below shows 20 cells.
1 split(_, __, ___)
# Restore the space..tilde pattern, which was overwritten by the marker above.
1 _____ = "^[ -~]*$"
20 for (____ in __) {
# Same report format as the first loop, using only the single pattern test.
20 printf " | [%3.f] { %3.f } <( %45.45s )> \n", ____,
(___=__[____]) ~_____ ? length(___) : match(___, "$")-1,
(___ in octalRE_L) ? octalRE_L[___] : "\"" (___) "\""
}
}
# Functions, listed alphabetically
# initOct: populate the global array octalRE_L with a printable description
# for 128 values produced by "%c" plus a set of control characters, printing
# the entries as they are created, and return the number of entries.
# All five parameters are used only as local scratch variables; the call in
# BEGIN passes no arguments. octalRE_L is not a parameter, so it is global.
# The leading numbers on each statement are gawk profiler execution counts.
1 function initOct(_, __, ___, ____, _____)
{
# Obfuscated arithmetic that seeds the counters from an unset _.
# NOTE(review): worked out by hand, assuming operands are evaluated left to
# right, this gives __ = 57344 (after the ++__ below), ___ = -128 and
# _ = 256. That agrees with the first printed row (0x df80) and with the 128
# loop iterations in the profiler counts -- confirm before relying on it.
1 __ = -(_ ^= _ < _) + (++_ - ++_ ^ _ + _ ^ ++_) * (_ ^ ++_)
1 ___ = -(_-- ^ --_ + _)
1 _ ^= ++_
1 ++__
# Loop while ++___ < -___, i.e. until ___ reaches 0.
128 do {
128 ____ = ""
# Print __ + ___ in hex and decimal, then the result of "%c" for that value;
# the "%c" result is kept in ____ and used as the table key.
# NOTE(review): the surrounding post says these values come out as
# individual 8-bit bytes -- this depends on gawk's %c handling, confirm.
128 printf " -- 0x %4x ( %5.f ) :: [ %3.1s ]\n", __ + ___, __ + ___, ____ = sprintf("%c", __ + ___)
# Describe the key: ___ + _ in octal and hex, plus a UTF-8 role chosen by
# string comparison of the key: below "\300" cont-byte; below "\302" INVALID;
# below "\340" 2-bytes lead; below "\360" 3-bytes lead; below "\365"
# 4-bytes lead; otherwise INVALID.
128 octalRE_L[____] = \
sprintf("{ \\%03o %.2X # %-20.25s }", ___ + _, _ + ___,
("utf-8 ") ((____ < "\300") ? "cont-byte" \
: (____ < "\340") ? ((____ < "\302") ? "INVALID" : "2-bytes lead") \
: (____ < "\365") ? (4 - (____ < "\360")) "-bytes lead" : "INVALID" ))
128 ____ = ""
} while (++___ < -___)
# Use the record separator as field separator so that each argument of the
# following print lands on its own line.
1 OFS = ORS
# Register hand-written descriptions for named control characters; each
# assignment is also a print argument, so the descriptions are echoed.
1 print "", octalRE_L["\0"] = "{ \\0 # >NULL-byte }",
octalRE_L["\a"] = "{ \\a \\7 # >BEL/alert }",
octalRE_L["\b"] = "{ \\b \\10 # >backspc }",
octalRE_L["\t"] = "{ \\t \\11 # >h-TAB }",
octalRE_L["\n"] = "{ \\n \\12 # >NL line-feed }",
octalRE_L["\v"] = "{ \\v \\13 # >v-TAB }",
octalRE_L["\f"] = "{ \\f \\14 # >FF form-feed }",
octalRE_L["\r"] = "{ \\r \\15 # >\\r\\n most common }",
octalRE_L["\33"] = "{ \\33 # >ESCAPE; \\e awk-invalid }",
octalRE_L["\34"] = "{ \\34 # >SUBSEP-def. }",
octalRE_L["\177"] = "{ \\177 # >DELETE }"
1 _ = ""
# Set OFS to the current value of FS.
1 OFS = FS
# Count _ down to 1 from an obfuscated start value; the profiler count shows
# 31 iterations.
31 for (_ = (_ += _ ^= _ < _) ^ _ - +-(++_ ^ _); _; _--) {
# Only characters that have no description yet get the generic entry.
31 if (! ((__ = sprintf("%c", _)) in octalRE_L)) { # 22
22 print octalRE_L[__] = sprintf("{ \\%.2o %.2X # [:cntrl:] }", _, _)
}
}
# Count the table entries by iterating over the array; _ is reset to "" first
# and incremented numerically.
1 _ = ""
161 for (__ in octalRE_L) {
161 ++_
}
# Unary plus forces a numeric return value.
1 return +_
}
ARGC: 1
ARGIND: 0
ARGV: array, 1 elements
BINMODE: 0
CONVFMT: "%.6g"
ENVIRON: array, 123 elements
ERRNO: ""
FIELDWIDTHS: ""
FILENAME: ""
FNR: 0
FPAT: "[^[:space:]]+"
FS: " "
FUNCTAB: array, 42 elements
IGNORECASE: 0
LINT: 0
NF: 0
NR: 0
OFMT: "%.6g"
OFS: " "
ORS: "\n"
PREC: 53
PROCINFO: array, 35 elements
RLENGTH: 0
ROUNDMODE: "N"
RS: "\n"
RSTART: 2
RT: ""
SUBSEP: "\034"
SYMTAB: array, 34 elements
TEXTDOMAIN: "messages"
_: "abc\031\f\03112%3=\031\354\031\210\031\267\031\273\031\033\031+\031\300\031\255\031:\031\374\031\027\031\344\0315\031\357\031\243\031\277\031@XYZ6"
__: array, 20 elements
___: "@XYZ6"
____: 20
_____: "^[ -~]*$"
octalRE_L: array, 161 elements