On 4/15/2014 7:54 AM, Janis Papanagnou wrote:
> On 15.04.2014 14:43, Ed Morton wrote:
> [...]
>>
>> Sure there's a few additional things I'd like to have in it but no big deal if
>> I have to work around them and the only "missing" functionality I've needed
>> frequently enough to actively put some effort into providing for myself was
>> the ability to specify here documents in print statements so with my home-spun
>> hack you can simply do:
>>
>> awk '/some regexp/ {
>> print <<!
>> /* Here is some random function
>> * being added to some code.
>> */
>> int foo()
>> {
>> int c = bar();
>>
>> printf("c is now: %d\n", c);
>>
>> return 0;
>> }
>> !
>> }' file
>>
>> to add multi-line text to files.
>
> Are you doing some preprocessing or did you modify the gawk source to obtain
> that feature?
Sadly it's a pre-processor. It parses the input with awk to create a new awk
script, essentially replacing:
print <<!
line 1
line 2
!
with:
print "line 1"
print "line 2"
and then executes that generated script on the original input file.
Beyond that, it just does some interesting stuff with the indenting and allowing
awk variables and quotes within the print statements.
> In the latter case I'd be interested whether you consider that
> code to be clean enough to ask Arnold to incorporate it in gawk.
>
> In the past I occasionally needed such a feature as well, and worked around
> it by either writing multiple print/printf lines, or by solving it in shell.
>
> (Seeing your code, and until such feature is available, I suppose next time
> I'll use such a shell syntax and pre-process awk programs that are using it.)
Mine's attached below in case it's useful, with usage examples at the end. I
call it "epawk" for "Extended Print AWK".
Ed.
#!/usr/bin/bash
# The above must be the first line of this script as bash or zsh is
# required for the shell array reference syntax used in this script.
##########################################################
# Extended Print AWK
#
# Allows printing of pre-formatted blocks of multi-line text in awk scripts.
#
# Before invoking the tool, do the following IN ORDER:
#
# 1) Start each block of pre-formatted text in your script with
# print << TERMINATOR
# on its own line and end it with
# TERMINATOR
# on its own line. TERMINATOR can be any sequence of non-blank characters
# you like. Spaces are allowed around the symbols but are not required.
# If << is followed by -, e.g.:
# print <<- TERMINATOR
# then all leading tabs are removed from the block of pre-formatted
# text (just like shell here documents), if it's followed by + instead, e.g.:
# print <<+ TERMINATOR
# then however many leading tabs are common across all non-blank lines
# in the current pre-formatted block are removed.
# If << is followed by =, e.g.
# print <<= TERMINATOR
# then whatever leading white space (tabs or blanks) occurs before the
# "print" command will be removed from all non-blank lines in
# the current pre-formatted block.
# By default no leading spaces are removed. Anything you place after
# the TERMINATOR will be reproduced as-is after every line in the
# post-processed script, so this for example:
# print << HERE |"cat>&2"
# foo
# HERE
# would cause "foo" to be printed to stderr.
#
# 2) Within each block of pre-formatted text only:
# a) Put a backslash character before every backslash (\ -> \\).
# b) Put a backslash character before every double quote (" -> \").
# c) Enclose awk variables in double quotes without leading
# backslashes (awkVar -> "awkVar").
# d) Enclose awk record and field references ($0, $1, $2, etc.)
# in double quotes without leading backslashes ($1 -> "$1").
#
# 3) If the script is specified on the command line instead of via
# "-f script" then replace all single quote characters (') in or out
# of the pre-formatted blocks with their ANSI octal escape sequence (\047)
# or the sequence '\'' (tick backslash tick tick). This is normal and is
# required because command-line awk scripts cannot contain single quote
# characters as those delimit the script. Do not use hex \x27, see
# http://awk.freeshell.org/PrintASingleQuote.
#
# Then just use it like you would gawk with the small caveat that only
# "-W <option>", not "--<option>", is supported for long options so you
# can use "-W re-interval" but not "--re-interval" for example.
#
# To just see the post-processed script and not execute it, call this
# script with the "-X" option.
#
# See the bottom of this file for usage examples.
##########################################################
# Name this tool was invoked as, used as the prefix of error messages.
# Parameter expansion strips the directory part without forking basename(1)
# and, unlike `basename "$0"`, cannot mistake a leading "-" for an option.
toolName="${0##*/}"
#######################################
# Pre-process awk source, expanding every "print << TERMINATOR" block
# into one ordinary print statement per line of the block.
# Globals:
#   toolName (read, optional) - prefix for error messages; defaults to "epawk"
# Arguments:
#   awk script file names; with none, the script is read from stdin
# Outputs:
#   the expanded awk script on stdout, diagnostics on stderr
# Returns:
#   the exit status of gawk; 1 if a block is still open at end of input
#######################################
expand_prints() {
  gawk -v toolName="${toolName:-epawk}" '
    # Outside a block: a "print <<" line opens one, anything else is
    # passed through unchanged by the final { print } rule.
    !inBlock {
      if ( match($0, /^[[:blank:]]*print[[:blank:]]*<</) ) {
        # Save any blanks before the print in case skipType "=" is used.
        leadBlanks = $0
        sub(/[^[:blank:]].*$/, "", leadBlanks)

        # Keep only what follows "<<":  [-+=] TERMINATOR [postprint]
        $0 = substr($0, RSTART + RLENGTH)
        if      ( sub(/^[-]/, "") ) { skipType = "-" }
        else if ( sub(/^[+]/, "") ) { skipType = "+" }
        else if ( sub(/^[=]/, "") ) { skipType = "=" }
        else                        { skipType = ""  }
        gsub(/(^[[:blank:]]+|[[:blank:]]+$)/, "")

        # The first blank-delimited word is the terminator; the rest of
        # the line (if any) is appended to every generated print.
        terminator = $0
        postprint  = ""
        if ( /[[:blank:]]/ ) {
          sub(/[[:blank:]].*/, "", terminator)
          postprint = $0
          sub(/[^[:blank:]]+[[:blank:]]+/, "", postprint)
        }

        startBlock()
        next
      }
    }

    # Inside a block: collect lines until one that, ignoring surrounding
    # blanks, is exactly the terminator.
    inBlock {
      stripped = $0
      gsub(/(^[[:blank:]]+|[[:blank:]]+$)/, "", stripped)
      if ( stripped"" == terminator"" ) {
        endBlock()
      }
      else {
        updBlock()
      }
      next
    }

    { print }

    # A block that never saw its terminator would otherwise vanish from
    # the output without any hint, so report it and fail.
    END {
      if ( inBlock ) {
        printf "%s: ERROR: unterminated \"print << %s\" block.\n",
               toolName, terminator > "/dev/stderr"
        exit 1
      }
    }

    function startBlock() { inBlock = 1; numLines = 0 }

    function updBlock() { block[++numLines] = $0 }

    # Emit the collected block as print statements, first removing the
    # leading white space selected by skipType. Every pattern is anchored
    # with "^" so that only LEADING white space is ever removed; a line
    # without the expected indent is left untouched.
    function endBlock(    i, numSkip, indent) {
      indent = ""    # skipType "": do not skip anything

      if ( skipType == "-" ) {
        # skip all leading tabs
        indent = "^[\t]+"
      }
      else if ( skipType == "+" ) {
        # skip however many leading tabs are common across
        # all non-blank lines in the current pre-formatted block
        for ( i = 1; i <= numLines; i++ ) {
          if ( block[i] ~ /[^[:blank:]]/ ) {
            # RLENGTH is -1 when the line has no leading tab at all
            match(block[i], /^[\t]+/)
            if ( (numSkip == "") || (numSkip > RLENGTH) ) {
              numSkip = RLENGTH
            }
          }
        }
        for ( i = 1; i <= numSkip; i++ ) {
          indent = indent "\t"
        }
        if ( indent != "" ) {
          indent = "^" indent
        }
      }
      else if ( skipType == "=" ) {
        # skip whatever pattern of blanks existed
        # before the "print" statement
        if ( leadBlanks != "" ) {
          indent = "^" leadBlanks
        }
      }

      for ( i = 1; i <= numLines; i++ ) {
        if ( indent != "" ) {
          sub(indent, "", block[i])
        }
        print "print \"" block[i] "\"\t" postprint
      }
      inBlock = 0
    }
  ' "$@"
}
#######################################
# Entry point: parse the gawk-style command line, expand the
# "print <<" blocks of the awk script, then either show the expanded
# script (-X) or run it with gawk.
# Globals:
#   toolName (read) - prefix for error messages
# Arguments:
#   [-v var=val] [-F fs] [-W option] [-X] {-f script... | 'script text'}
#   followed by data file names and/or var=value assignments
# Outputs:
#   whatever the awk script prints, or the expanded script with -X
# Returns:
#   gawk's exit status; non-zero on a usage or pre-processing error
#######################################
main() {
  local -a awkArgs=() scriptFiles=()
  # scriptText is initialised explicitly so that a variable of the same
  # name inherited from the environment can never be run as the program.
  local expandOnly=0 scriptText="" arg expanded

  while getopts "v:F:W:f:X" arg; do
    case "$arg" in
      f) scriptFiles+=("$OPTARG") ;;
      [vFW]) awkArgs+=("-$arg" "$OPTARG") ;;
      X) expandOnly=1 ;;
      *) exit 1 ;;
    esac
  done
  shift $((OPTIND - 1))

  if (( ${#scriptFiles[@]} == 0 && $# > 0 )); then
    # The script cannot contain literal 's because in cases like this:
    #   'BEGIN{ ...abc'def... }'
    # the args parsed here (and later again by gawk) would be:
    #   $1 = BEGIN{ ...abc
    #   $2 = def... }
    # Replace 's with \047 or '\'' if you need them:
    #   'BEGIN{ ...abc\047def... }'
    #   'BEGIN{ ...abc'\''def... }'
    scriptText="$1"
    shift
  fi

  # Remaining symbols in "$@" must be data file names and/or variable
  # assignments that do not use the "-v name=value" syntax.
  if (( ${#scriptFiles[@]} > 0 )); then
    if (( expandOnly == 1 )); then
      expand_prints "${scriptFiles[@]}"
    else
      # Do not run gawk on a partial program if the expansion failed
      # (e.g. an unreadable script file).
      expanded="$(expand_prints "${scriptFiles[@]}")" || exit
      gawk "${awkArgs[@]}" "$expanded" "$@"
    fi
  elif [[ -n "$scriptText" ]]; then
    if (( expandOnly == 1 )); then
      printf '%s\n' "$scriptText" | expand_prints
    else
      expanded="$(printf '%s\n' "$scriptText" | expand_prints)" || exit
      gawk "${awkArgs[@]}" "$expanded" "$@"
    fi
  else
    printf '%s: ERROR: no awk script specified.\n' "$toolName" >&2
    exit 1
  fi
}

main "$@"
# Stop here with the status of main: everything below this line is
# plain-text documentation that the shell must never try to execute.
exit
##########################################################
USAGE EXAMPLES:
$ cat data.txt
abc def"ghi
$
#######
$ cat script.awk
{
awkVar="bar"
print "----------------"
print << HERE
backslash: \\
quoted text: \"text\"
single quote as ANSI sequence: \047
literal single quote (ONLY works when script is in a file): '
awk variable: "awkVar"
awk field: "$2"
HERE
print "----------------"
print <<-!
backslash: \\
quoted text: \"text\"
single quote as ANSI sequence: \047
literal single quote (ONLY works when script is in a file): '
awk variable: "awkVar"
awk field: "$2"
!
print "----------------"
print <<+ whatever
backslash: \\
quoted text: \"text\"
single quote as ANSI sequence: \047
literal single quote (ONLY works when script is in a file): '
awk variable: "awkVar"
awk field: "$2"
whatever
print "----------------"
}
$ epawk -f script.awk data.txt
----------------
backslash: \
quoted text: "text"
single quote as ANSI sequence: '
literal single quote (ONLY works when script is in a file): '
awk variable: bar
awk field: def"ghi
----------------
backslash: \
quoted text: "text"
single quote as ANSI sequence: '
literal single quote (ONLY works when script is in a file): '
awk variable: bar
awk field: def"ghi
----------------
backslash: \
quoted text: "text"
single quote as ANSI sequence: '
literal single quote (ONLY works when script is in a file): '
awk variable: bar
awk field: def"ghi
----------------
#######
$ epawk -F\" '{
print <<!
ANSI-tick-surrounded quote-separated field 2 (will work): \047"$2"\047
!
}' data.txt
ANSI-tick-surrounded quote-separated field 2 (will work): 'ghi'
$
#######
$ epawk -F\" '{
print <<!
Shell-escaped-tick-surrounded quote-separated field 2 (will work): '\''"$2"'\''
!
}' data.txt
Shell-escaped-tick-surrounded quote-separated field 2 (will work): 'ghi'
$
#######
$ epawk -F\" '{
print <<!
Literal-tick-surrounded quote-separated field 2 (will not work): '"$2"'
!
}' data.txt
Literal-tick-surrounded quote-separated field 2 (will not work):
$
#######
$ epawk -X 'BEGIN{
print <<!
foo
bar
!
}'
BEGIN{
print " foo"
print " bar"
}
$
#######
$ cat file
a
b
c
$ epawk '{
print <<+! |"cat>o2"
numLines="NR"
numFields="NF", $0="$0", $1="$1"
!
}' file
$ cat o2
numLines=1
numFields=1, $0=a, $1=a
numLines=2
numFields=1, $0=b, $1=b
numLines=3
numFields=1, $0=c, $1=c
$
#######
$ epawk 'BEGIN{
cmd = "sort"
print <<+! |& cmd
d
b
a
c
!
close(cmd, "to")
while ( (cmd |& getline line) > 0 ) {
print "got:", line
}
close(cmd)
}' file
got: a
got: b
got: c
got: d
$