From 282fbd3bc2cca32b1a83c28cb9649de46cf404da Mon Sep 17 00:00:00 2001 From: Kerin Millar Date: Thu, 1 Aug 2024 07:30:19 +0100 Subject: Render quote_args() robust and implement a test case Coerce the effective character set as being C (US-ASCII) in the course of executing awk(1). Some implementations are strict and will otherwise fail in situations where the bytes cannot be decoded. $ uname -o Darwin $ echo "$LC_ALL" en_GB.UTF-8 $ printf '\200' | awk '/[\001-\037\177-\377]/' awk: towc: multibyte conversion failure on: '' In the above case, awk aborts because it has a need to decode the input, which turns out not to be valid UTF-8. Now, it is rather beyond the purview of quote_args() to guarantee that its parameters adhere to any particular character encoding. Fortunately, for it to contend with strings on a byte-by-byte basis is acceptable. Refactor the code somewhat. The behaviour has been adjusted so to be virtually identical to that of the "${*@Q}" expansion in bash, with the exception that the ESC character is rendered as $'\e' instead of $'\E'. Such an exception is necessary for POSIX-1.2024 conformance, wherein dollar-single-quotes are now a standard feature (see section 2.2.4 of the Shell Command Language). Revise the comment preceding the function so as to accurately document its behaviour. Finally, add a test case. It works by calling quote_args for every possible single-byte string before calculating a CRC checksum for the cumulative output and comparing it against a pre-determined value. Signed-off-by: Kerin Millar --- functions.sh | 65 +++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 25 deletions(-) (limited to 'functions.sh') diff --git a/functions.sh b/functions.sh index faacdca..036e3a7 100644 --- a/functions.sh +++ b/functions.sh @@ -425,47 +425,62 @@ parallel_run() # # Prints the positional parameters in a format that may be reused as shell # input. 
For each considered, it shall be determined whether its value contains -# any non-printable characters in lieu of the US-ASCII character set. If no such -# characters are found, the value shall have each instance of <apostrophe> be -# replaced by <apostrophe><backslash><apostrophe><apostrophe> before being -# enclosed by a pair of <apostrophe> characters. Otherwise, non-printable -# characters shall be replaced by octal escape sequences, <apostrophe> by -# <backslash><apostrophe> and <backslash> by <backslash><backslash>, prior to -# the value being given a prefix of <dollar-sign><apostrophe> and a suffix of -# <apostrophe>, per POSIX-1.2024. Finally, the resulting values shall be printed -# as <space> separated. The latter quoting strategy can be suppressed by setting -# the POSIXLY_CORRECT variable as non-empty in the environment. +# any bytes that are either outside the scope of the US-ASCII character set or +# which are considered as non-printable. If no such bytes are found, the value +# shall have each instance of <apostrophe> be replaced by <apostrophe> +# <backslash><apostrophe><apostrophe> before being enclosed by a pair of +# <apostrophe> characters. However, as a special case, a value consisting of a +# single <apostrophe> shall be replaced by <backslash><apostrophe>. +# +# If any such bytes are found, the value shall instead be requoted in a manner +# that conforms with section 2.2.4 of the Shell Command Language, wherein the +# the use of dollar-single-quotes sequences is described. Such sequences are +# standard as of POSIX-1.2024. However, as of August 2024, many implementations +# lack support for this feature. So as to mitigate this state of affairs, the +# use of dollar-single-quotes may be suppressed by setting POSIXLY_CORRECT as a +# non-empty string. # quote_args() { - awk -v q=\' -f - -- "$@" <<-'EOF' + LC_ALL=C awk -v q=\' -f - -- "$@" <<-'EOF' + function init_table() { + # Iterate over ranges \001-\037 and \177-\377. + for (i = 1; i <= 255; i += (i == 31 ? 
96 : 1)) { + char = sprintf("%c", i) + seq_by[char] = sprintf("%03o", i) + } + seq_by["\007"] = "a" + seq_by["\010"] = "b" + seq_by["\011"] = "t" + seq_by["\012"] = "n" + seq_by["\013"] = "v" + seq_by["\014"] = "f" + seq_by["\015"] = "r" + seq_by["\033"] = "e" + seq_by["\047"] = "'" + seq_by["\134"] = "\\" + } BEGIN { strictly_posix = length(ENVIRON["POSIXLY_CORRECT"]) argc = ARGC ARGC = 1 for (arg_idx = 1; arg_idx < argc; arg_idx++) { arg = ARGV[arg_idx] - if (strictly_posix || arg !~ /[\001-\037\177]/) { + if (arg == q) { + word = "\\" q + } else if (strictly_posix || arg !~ /[\001-\037\177-\377]/) { gsub(q, q "\\" q q, arg) word = q arg q } else { - # Use $'' quoting per POSIX-1.2024 - if (! ("\001" in ord_by)) { - for (i = 1; i < 32; i++) { - char = sprintf("%c", i) - ord_by[char] = i - } - ord_by["\177"] = 127 + # Use $'' quoting per POSIX-1.2024. + if (! ("\001" in seq_by)) { + init_table() } word = "$'" for (i = 1; i <= length(arg); i++) { char = substr(arg, i, 1) - if (char == "\\") { - word = word "\\\\" - } else if (char == q) { - word = word "\\'" - } else if (char in ord_by) { - word = word "\\" sprintf("%03o", ord_by[char]) + if (char in seq_by) { + word = word "\\" seq_by[char] } else { word = word char } -- cgit v1.2.3-65-gdbad