From b9996f8a8f4c327500df8830b9959723ea19c676 Mon Sep 17 00:00:00 2001 From: Peter Lobsinger Date: Wed, 20 Nov 2024 22:48:29 -0500 Subject: [PATCH] Use a sed script to vis-encode non-ASCII bytes for mtree Bazel's Starlark does not provide access to a string's bytes, only its codepoints, so we are unable to do this escaping in Starlark. So a second pass is needed, at least until the spec and implementation work to get a [`bytes` type](https://github.com/bazelbuild/starlark/issues/112) lands. Fixes https://github.com/bazel-contrib/bazel-lib/issues/794 --- lib/private/BUILD.bazel | 1 + lib/private/gen_vis_scripts/BUILD.bazel | 3 + .../gen_vis_scripts/gen_vis_scripts.go | 15 ++ lib/private/tar.bzl | 26 +++- lib/private/vis_escape_nonascii.sed | 132 ++++++++++++++++++ 5 files changed, 172 insertions(+), 5 deletions(-) create mode 100644 lib/private/vis_escape_nonascii.sed diff --git a/lib/private/BUILD.bazel b/lib/private/BUILD.bazel index 3ae7d66de..8b1f13c35 100644 --- a/lib/private/BUILD.bazel +++ b/lib/private/BUILD.bazel @@ -8,6 +8,7 @@ exports_files( "modify_mtree.awk", "parse_status_file.jq", "parse_status_file.yq", + "vis_escape_nonascii.sed", ], visibility = ["//visibility:public"], ) diff --git a/lib/private/gen_vis_scripts/BUILD.bazel b/lib/private/gen_vis_scripts/BUILD.bazel index 52596abf4..7a9378001 100644 --- a/lib/private/gen_vis_scripts/BUILD.bazel +++ b/lib/private/gen_vis_scripts/BUILD.bazel @@ -11,9 +11,11 @@ run_binary( name = "run_gen_vis_scripts", outs = [ "vis_escape_ascii.bzl", + "vis_escape_nonascii.sed", ], args = [ "vis_escape_ascii.bzl=$(location vis_escape_ascii.bzl)", + "vis_escape_nonascii.sed=$(location vis_escape_nonascii.sed)", ], tool = ":gen_vis_scripts", ) @@ -26,5 +28,6 @@ write_source_files( # files = { "//lib/private:vis_escape_ascii.bzl": ":vis_escape_ascii.bzl", + "//lib/private:vis_escape_nonascii.sed": ":vis_escape_nonascii.sed", }, ) diff --git a/lib/private/gen_vis_scripts/gen_vis_scripts.go b/lib/private/gen_vis_scripts/gen_vis_scripts.go index 6180695b5..0b21a16c3 100644 --- a/lib/private/gen_vis_scripts/gen_vis_scripts.go +++ b/lib/private/gen_vis_scripts/gen_vis_scripts.go @@ -26,6 +26,8 @@ func main() { switch name { case "vis_escape_ascii.bzl": writeEscapeASCIIBzl(f) + case "vis_escape_nonascii.sed": + writeEscapeNonASCIISed(f) default: log.Fatal("unknown generated content:", name) } @@ -63,3 +65,16 @@ VIS_ESCAPE_ASCII = maketrans({ } fmt.Fprintln(w, "})") } + +func writeEscapeNonASCIISed(w io.Writer) { + fmt.Fprintln(w, strings.TrimSpace(` +# Code generated by gen_vis_scripts. DO NOT EDIT. +# Replace non-ASCII bytes with their octal escape sequences. +# Escaping of ASCII is done in Starlark prior to writing content out. + `)) + fmt.Fprintln(w, "") + + for i := 0x80; i <= 0xFF; i++ { + fmt.Fprintf(w, `s/\x%02[1]x/\\%03[1]o/g%[2]c`, i, newline) + } +} diff --git a/lib/private/tar.bzl b/lib/private/tar.bzl index 1332aa12f..91af77332 100644 --- a/lib/private/tar.bzl +++ b/lib/private/tar.bzl @@ -121,11 +121,13 @@ Possible values: values = [-1, 0, 1], ), "_compute_unused_inputs_flag": attr.label(default = Label("//lib:tar_compute_unused_inputs")), + "_vis_escape_nonascii": attr.label(allow_single_file = True, default = Label("//lib/private:vis_escape_nonascii.sed")), } _mtree_attrs = { "srcs": attr.label_list(doc = "Files that are placed into the tar", allow_files = True), "out": attr.output(doc = "Resulting specification file to write"), + "_vis_escape_nonascii": attr.label(allow_single_file = True, default = Label("//lib/private:vis_escape_nonascii.sed")), } def _add_compression_args(compress, args): @@ -255,14 +257,14 @@ def _configured_unused_inputs_file(ctx, srcs, keep): # See also: https://github.com/bazel-contrib/bazel-lib/issues/794 ctx.actions.run_shell( outputs = [unused_inputs], - inputs = [prunable_inputs, keep_inputs, ctx.file.mtree], + inputs = [prunable_inputs, keep_inputs, ctx.file.mtree, ctx.file._vis_escape_nonascii], tools = [coreutils], command = ''' "$COREUTILS" join -v 1 \\ - <("$COREUTILS" sort -u "$PRUNABLE_INPUTS") \\ + <(sed -f "$VIS_ESCAPE_NONASCII" "$PRUNABLE_INPUTS" | "$COREUTILS" sort -u) \\ <("$COREUTILS" sort -u \\ <(grep -o '\\bcontents\\?=\\S*' "$MTREE" | "$COREUTILS" cut -d'=' -f 2-) \\ - "$KEEP_INPUTS" \\ + <(sed -f "$VIS_ESCAPE_NONASCII" "$KEEP_INPUTS") \\ ) \\ | "$COREUTILS" cut -d' ' -f 2- \\ > "$UNUSED_INPUTS" @@ -273,6 +275,7 @@ def _configured_unused_inputs_file(ctx, srcs, keep): "KEEP_INPUTS": keep_inputs.path, "MTREE": ctx.file.mtree.path, "UNUSED_INPUTS": unused_inputs.path, + "VIS_ESCAPE_NONASCII": ctx.file._vis_escape_nonascii.path, }, mnemonic = "UnusedTarInputs", toolchain = "@aspect_bazel_lib//lib:coreutils_toolchain_type", @@ -373,7 +376,8 @@ def _to_rlocation_path(file, workspace): return workspace + "/" + file.short_path def _vis_encode(filename): - # TODO(#794): correctly encode all filenames by using vis(3) (or porting it) + # Escaping of non-ASCII bytes cannot be performed within Starlark. + # After writing content out, a second pass is performed with vis_escape_nonascii.sed. return str_translate(filename, VIS_ESCAPE_ASCII) def _expand(file, expander, transform = to_repository_relative_path): @@ -401,6 +405,7 @@ def _expand(file, expander, transform = to_repository_relative_path): def _mtree_impl(ctx): out = ctx.outputs.out or ctx.actions.declare_file(ctx.attr.name + ".spec") + unescaped = ctx.actions.declare_file(ctx.attr.name + ".spec.unescaped") content = ctx.actions.args() content.set_param_file_format("multiline") @@ -445,7 +450,18 @@ def _mtree_impl(ctx): _mtree_line(_vis_encode(runfiles_dir + "/_repo_mapping"), "file", content = _vis_encode(repo_mapping.path)), ) - ctx.actions.write(out, content = content) + ctx.actions.write(unescaped, content = content) + ctx.actions.run_shell( + outputs = [out], + inputs = [unescaped, ctx.file._vis_escape_nonascii], + command = 'sed -f "$VIS_ESCAPE_NONASCII" "$UNESCAPED" > "$OUT"', + env = { + "VIS_ESCAPE_NONASCII": ctx.file._vis_escape_nonascii.path, + "UNESCAPED": unescaped.path, + "OUT": out.path, + }, + mnemonic = "EscapeNonAscii", + ) return DefaultInfo(files = depset([out]), runfiles = ctx.runfiles([out])) diff --git a/lib/private/vis_escape_nonascii.sed b/lib/private/vis_escape_nonascii.sed new file mode 100644 index 000000000..744713564 --- /dev/null +++ b/lib/private/vis_escape_nonascii.sed @@ -0,0 +1,132 @@ +# Code generated by gen_vis_scripts. DO NOT EDIT. +# Replace non-ASCII bytes with their octal escape sequences. +# Escaping of ASCII is done in Starlark prior to writing content out. + +s/\x80/\\200/g +s/\x81/\\201/g +s/\x82/\\202/g +s/\x83/\\203/g +s/\x84/\\204/g +s/\x85/\\205/g +s/\x86/\\206/g +s/\x87/\\207/g +s/\x88/\\210/g +s/\x89/\\211/g +s/\x8a/\\212/g +s/\x8b/\\213/g +s/\x8c/\\214/g +s/\x8d/\\215/g +s/\x8e/\\216/g +s/\x8f/\\217/g +s/\x90/\\220/g +s/\x91/\\221/g +s/\x92/\\222/g +s/\x93/\\223/g +s/\x94/\\224/g +s/\x95/\\225/g +s/\x96/\\226/g +s/\x97/\\227/g +s/\x98/\\230/g +s/\x99/\\231/g +s/\x9a/\\232/g +s/\x9b/\\233/g +s/\x9c/\\234/g +s/\x9d/\\235/g +s/\x9e/\\236/g +s/\x9f/\\237/g +s/\xa0/\\240/g +s/\xa1/\\241/g +s/\xa2/\\242/g +s/\xa3/\\243/g +s/\xa4/\\244/g +s/\xa5/\\245/g +s/\xa6/\\246/g +s/\xa7/\\247/g +s/\xa8/\\250/g +s/\xa9/\\251/g +s/\xaa/\\252/g +s/\xab/\\253/g +s/\xac/\\254/g +s/\xad/\\255/g +s/\xae/\\256/g +s/\xaf/\\257/g +s/\xb0/\\260/g +s/\xb1/\\261/g +s/\xb2/\\262/g +s/\xb3/\\263/g +s/\xb4/\\264/g +s/\xb5/\\265/g +s/\xb6/\\266/g +s/\xb7/\\267/g +s/\xb8/\\270/g +s/\xb9/\\271/g +s/\xba/\\272/g +s/\xbb/\\273/g +s/\xbc/\\274/g +s/\xbd/\\275/g +s/\xbe/\\276/g +s/\xbf/\\277/g +s/\xc0/\\300/g +s/\xc1/\\301/g +s/\xc2/\\302/g +s/\xc3/\\303/g +s/\xc4/\\304/g +s/\xc5/\\305/g +s/\xc6/\\306/g +s/\xc7/\\307/g +s/\xc8/\\310/g +s/\xc9/\\311/g +s/\xca/\\312/g +s/\xcb/\\313/g +s/\xcc/\\314/g +s/\xcd/\\315/g +s/\xce/\\316/g +s/\xcf/\\317/g +s/\xd0/\\320/g +s/\xd1/\\321/g +s/\xd2/\\322/g +s/\xd3/\\323/g +s/\xd4/\\324/g +s/\xd5/\\325/g +s/\xd6/\\326/g +s/\xd7/\\327/g +s/\xd8/\\330/g +s/\xd9/\\331/g +s/\xda/\\332/g +s/\xdb/\\333/g +s/\xdc/\\334/g +s/\xdd/\\335/g +s/\xde/\\336/g +s/\xdf/\\337/g +s/\xe0/\\340/g +s/\xe1/\\341/g +s/\xe2/\\342/g +s/\xe3/\\343/g +s/\xe4/\\344/g +s/\xe5/\\345/g +s/\xe6/\\346/g +s/\xe7/\\347/g +s/\xe8/\\350/g +s/\xe9/\\351/g +s/\xea/\\352/g +s/\xeb/\\353/g +s/\xec/\\354/g +s/\xed/\\355/g +s/\xee/\\356/g +s/\xef/\\357/g +s/\xf0/\\360/g +s/\xf1/\\361/g +s/\xf2/\\362/g +s/\xf3/\\363/g +s/\xf4/\\364/g +s/\xf5/\\365/g +s/\xf6/\\366/g +s/\xf7/\\367/g +s/\xf8/\\370/g +s/\xf9/\\371/g +s/\xfa/\\372/g +s/\xfb/\\373/g +s/\xfc/\\374/g +s/\xfd/\\375/g +s/\xfe/\\376/g +s/\xff/\\377/g