From 0f2312b7f20125812a301519948af16c1e560544 Mon Sep 17 00:00:00 2001 From: Tianon Gravi Date: Wed, 30 Apr 2025 16:42:05 -0700 Subject: [PATCH] Implement more TODOs in `oci-validate` code I'm much happier with this interface now -- it can properly handle an image with attestations now, for example. --- helpers/oci-import.sh | 8 +- helpers/oci-sbom.sh | 7 +- helpers/oci-validate.sh | 159 ++++++++++++++++++++++++++-------------- oci.jq | 91 ++++++++++++++++++----- 4 files changed, 184 insertions(+), 81 deletions(-) diff --git a/helpers/oci-import.sh b/helpers/oci-import.sh index 81661e7..3d503ef 100755 --- a/helpers/oci-import.sh +++ b/helpers/oci-import.sh @@ -98,7 +98,7 @@ jq -L"$BASHBREW_META_SCRIPTS" --slurp --tab ' else . end | .mediaType //= media_type_oci_index # TODO index normalize function? just force this to be set/valid instead? - | validate_oci_index + | validate_oci_index({ indexPlatformsOptional: true }) | validate_length(.manifests; 1) # TODO allow upstream attestation in the future? # purge maintainer-provided URLs / annotations (https://github.com/docker-library/bashbrew/blob/4e0ea8d8aba49d54daf22bd8415fabba65dc83ee/cmd/bashbrew/oci-builder.go#L146-L147) @@ -123,7 +123,6 @@ jq -L"$BASHBREW_META_SCRIPTS" --slurp --tab ' $build | .source.arches[.build.arch].platform ) - # TODO .manifests[1].platform ? # inject our build annotations | .manifests[0].annotations += ( @@ -136,8 +135,6 @@ jq -L"$BASHBREW_META_SCRIPTS" --slurp --tab ' ' "$file" | tee index.json.new mv -f index.json.new index.json -# TODO "crane validate" is definitely interesting here -- it essentially validates all the descriptors recursively, including diff_ids, but it only supports "remote" or "tarball" (which refers to the *old* "docker save" tarball format), so isn't useful here, but we need to do basically that exact work - # now that "index.json" represents the exact index we want to push, let's push it down into a blob and make a new appropriate "index.json" for "crane push" # TODO we probably want/need some "traverse/manipulate an OCI layout" helpers 😭 mediaType="$(jq --raw-output '.mediaType' index.json)" @@ -159,3 +156,6 @@ jq -L"$BASHBREW_META_SCRIPTS" --null-input --tab ' } | normalize_manifest ' > index.json + +# TODO move this further out +"$BASHBREW_META_SCRIPTS/helpers/oci-validate.sh" . diff --git a/helpers/oci-sbom.sh b/helpers/oci-sbom.sh index 51eea2b..354457d 100755 --- a/helpers/oci-sbom.sh +++ b/helpers/oci-sbom.sh @@ -38,7 +38,7 @@ cd "$output" imageIndex="$(jq -L"$BASHBREW_META_SCRIPTS" --raw-output ' include "oci"; include "validate"; - validate_oci_index + validate_oci_index({ indexPlatformsOptional: true }) | validate_length(.manifests; 1) | validate_IN(.manifests[0].mediaType; media_types_index) | .manifests[0].digest @@ -91,7 +91,7 @@ done sbomIndex="$(jq -L"$BASHBREW_META_SCRIPTS" --raw-output ' include "oci"; include "validate"; - validate_oci_index + validate_oci_index({ indexPlatformsOptional: true }) | validate_length(.manifests; 1) | validate_IN(.manifests[0].mediaType; media_types_index) | .manifests[0].digest @@ -146,3 +146,6 @@ jq -L"$BASHBREW_META_SCRIPTS" --null-input --tab ' } | normalize_manifest ' > index.json + +# TODO move this further out +"$BASHBREW_META_SCRIPTS/helpers/oci-validate.sh" . diff --git a/helpers/oci-validate.sh b/helpers/oci-validate.sh index 16ed3dc..761c3ba 100755 --- a/helpers/oci-validate.sh +++ b/helpers/oci-validate.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -Eeuo pipefail -# given an OCI image layout (https://github.com/opencontainers/image-spec/blob/v1.1.1/image-layout.md), verifies all descriptors as much as possible (digest matches content, size, some media types, layer diff_ids, etc) +# given an OCI image layout (https://github.com/opencontainers/image-spec/blob/v1.1.1/image-layout.md), verifies all descriptors as much as possible (digest matches content, size, media types, layer diff_ids, etc) layout="$1"; shift @@ -23,85 +23,134 @@ jq -L"$BASHBREW_META_SCRIPTS" --slurp ' | empty ' oci-layout -# TODO this is all rubbish; it needs more thought (the jq functions it invokes are pretty solid now though) +# TODO (recursively?) validate subject descriptors in here somewhere 🤔 + +# TODO handle objects that *only* exist in the "data" field too 🤔 https://github.com/docker-library/meta-scripts/pull/125#discussion_r2070633122 +# maybe descriptor takes a "--data" flag that then returns the input descriptor, but enhanced with a "data" field so the other functions can use that to extract the data instead of relying on files? descriptor() { local file="$1"; shift # "blobs/sha256/xxx" - echo "blob: $file" - local digest="$1"; shift # "sha256:xxx" - local size="$1"; shift # "123" - local algo="${digest%%:*}" # sha256 - local hash="${digest#$algo:}" # xxx - local diskSize - [ "$algo" = 'sha256' ] # TODO error message - diskSize="$(stat --dereference --format '%s' "$file")" - [ "$size" = "$diskSize" ] # TODO error message - "${algo}sum" <<<"$hash *$file" --check --quiet --strict - + local desc; desc="$(cat)" + local shell + shell="$(jq <<<"$desc" -L"$BASHBREW_META_SCRIPTS" --slurp --raw-output ' + include "validate"; + include "oci"; + validate_one + | validate_oci_descriptor + | ( + @sh "local algo=\( + .digest + | split(":")[0] + | validate_IN(.; "sha256", "sha512") # TODO more algorithms? need more tools on the host + )", + + @sh "local data=\( + if has("data") then + .data + else " " end # empty string is valid base64 (which we should validate), but spaces are not, so we can use a single space to detect "data not set" + )", + + empty + ) + ')" + eval "$shell" + local digest size dataDigest= dataSize= + digest="$("${algo}sum" "$file" | cut -d' ' -f1)" + digest="$algo:$digest" + size="$(stat --dereference --format '%s' "$file")" + if [ "$data" != ' ' ]; then + dataDigest="$(base64 <<<"$data" -d | "${algo}sum" | cut -d' ' -f1)" + dataDigest="$algo:$dataDigest" + dataSize="$(base64 <<<"$data" -d | wc --bytes)" + # TODO *technically* we could get clever here and pass `base64 -d` to something like `tee >(wc --bytes) >(dig="$(sha256sum | cut -d' ' -f1)" && echo "sha256:$dig" && false) > /dev/null` to avoid parsing the base64 twice, but then failure cases are less likely to be caught, so it's safer to simply redecode (and we can't decode into a variable because this might be binary data *and* bash will do newline munging in both directions) + fi + jq <<<"$desc" -L"$BASHBREW_META_SCRIPTS" --slurp --arg digest "$digest" --arg size "$size" --arg dataDigest "$dataDigest" --arg dataSize "$dataSize" ' + include "validate"; + validate_one + | validate_IN(.digest; $digest) + | validate_IN(.size; $size | tonumber) + | if has("data") then + validate(.data; + $digest == $dataDigest + and $size == $dataSize + ; "(decoded) data has size \($dataSize) and digest \($dataDigest) (expected \($size) and \($digest))") + else . end + | empty + ' } -images() { - echo "image: $*" +# TODO validate config (diff_ids, history, platform - gotta carry *two* levels of descriptors for that, and decompress all the layers 🙊) +# TODO validate provenance/SBOM layer contents? + +image() { + local file="$1"; shift + echo "image: $file" + local desc; desc="$(cat)" + descriptor <<<"$desc" "$file" local shell shell="$( - jq -L"$BASHBREW_META_SCRIPTS" --arg expected "$#" --slurp --raw-output ' + jq <<<"$desc" -L"$BASHBREW_META_SCRIPTS" --slurp --raw-output ' include "validate"; include "oci"; - # TODO technically, this would pass if one file is empty and another file has two documents in it (since it is counting the total), so that is not great, but probably is not a real problem - validate_length(.; $expected | tonumber) - | map(validate_oci_image) + validate_length(.; 2) + | .[0] as $desc + | .[1] + | validate_oci_image({ + imageAttestation: IN($desc.annotations["vnd.docker.reference.type"]; "attestation-manifest"), + }) + | if $desc then + validate_IN(.mediaType; $desc.mediaType) + | validate_IN(.artifactType; $desc.artifactType) + else . end | ( ( - .[].config, .[].layers[] - | @sh "descriptor \("blobs/\(.digest | sub(":"; "/"))") \(.digest) \(.size)" - # TODO data? + .config, .layers[] + | @sh "descriptor <<<\(tojson) \(.digest | "blobs/\(sub(":"; "/"))")" ), empty # trailing comma ) - ' "$@" + ' /dev/stdin "$file" )" eval "$shell" } -# TODO pass descriptor values down so we can validate that they match (.mediaType, .artifactType, .platform across *two* levels index->manifest->config), similar to .data -# TODO disallow urls completely? - -indexes() { - echo "index: $*" +index() { + local file="$1"; shift + echo "index: $file" + local desc; desc="$(cat)" + if [ "$desc" != 'null' ]; then + descriptor <<<"$desc" "$file" + fi local shell shell="$( - jq -L"$BASHBREW_META_SCRIPTS" --arg expected "$#" --slurp --raw-output ' + jq <<<"$desc" -L"$BASHBREW_META_SCRIPTS" --slurp --raw-output ' include "validate"; include "oci"; - # TODO technically, this would pass if one file is empty and another file has two documents in it (since it is counting the total), so that is not great, but probably is not a real problem - validate_length(.; $expected | tonumber) - | map(validate_oci_index) + validate_length(.; 2) + | .[0] as $desc + | .[1] + | validate_oci_index({ + indexPlatformsOptional: (input_filename == "index.json"), + }) + | if $desc then + validate_IN(.mediaType; $desc.mediaType) + | validate_IN(.artifactType; $desc.artifactType) + else . end + | .manifests[] | ( - ( - .[].manifests[] - | @sh "descriptor \("blobs/\(.digest | sub(":"; "/"))") \(.digest) \(.size)" - # TODO data? - ), - - ( - [ .[].manifests[] | select(IN(.mediaType; media_types_image)) | .digest ] - | if length > 0 then - "images \(map("blobs/\(sub(":"; "/"))" | @sh) | join(" "))" - else empty end - ), - - ( - [ .[].manifests[] | select(IN(.mediaType; media_types_index)) | .digest ] - | if length > 0 then - "indexes \(map("blobs/\(sub(":"; "/"))" | @sh) | join(" "))" - else empty end - ), - - empty # trailing comma - ) - ' "$@" + .mediaType + | if IN(media_types_index) then + "index" + elif IN(media_types_image) then + "image" + else + error("UNSUPPORTED MEDIA TYPE: \(.)") + end + ) + @sh " <<<\(tojson) \(.digest | "blobs/\(sub(":"; "/"))")" + ' /dev/stdin "$file" )" eval "$shell" } -indexes index.json +index <<<'null' index.json diff --git a/oci.jq b/oci.jq index efc3279..7b9794e 100644 --- a/oci.jq +++ b/oci.jq @@ -162,12 +162,16 @@ def validate_oci_digest: $ ") // null) as $dig | validate(.; $dig; "invalid digest syntax") - | { - "sha256": [ 64 ], - "sha512": [ 128 ], - } as $lengths - | validate_IN($dig.algorithm; $lengths | keys[]) - | validate_length($dig.encoded; $lengths[$dig.algorithm][]) + | validate($dig; + validate_IN(.algorithm; "sha256", "sha512", "blake3") + | if .algorithm == "sha256" then + validate(.encoded; test("^[a-f0-9]{64}$"); "the encoded portion MUST match /[a-f0-9]{64}/") + elif .algorithm == "sha512" then + validate(.encoded; test("^[a-f0-9]{128}$"); "the encoded portion MUST match /[a-f0-9]{128}/") + elif .algorithm == "blake3" then # https://github.com/opencontainers/image-spec/pull/1240 + validate(.encoded; test("^[a-f0-9]{64}$"); "the encoded portion MUST match /[a-f0-9]{64}/") + else . end + ) ; # https://github.com/opencontainers/image-spec/blob/v1.1.1/annotations.md#rules @@ -175,6 +179,7 @@ def validate_oci_annotations_haver: if has("annotations") then validate(.annotations; type == "object"; "if present, annotations must be an object") | validate(.annotations[]; type == "string"; "annotation values must be strings") + # TODO validate that keys are not bare words (reverse DNS or vendor/bar) else . end ; @@ -191,7 +196,11 @@ def validate_oci_descriptor: | validate(.size; . == floor; "size must be whole") | validate(.size; . == ceil; "size must be whole") - # TODO urls? + | if has("urls") then + validate(.urls; type == "array") + | validate(.urls[]; type == "string") + | validate_length(.urls; 0) # TODO this intentionally contradicts the above lines -- are there cases where we should allow urls? + else . end | validate_oci_annotations_haver @@ -205,7 +214,9 @@ def validate_oci_descriptor: # someday, maybe we can validate that .data matches .digest here (needs more jq functionality, including and especially the ability to deal with non-UTF8 binary data from base64 and perform sha256 over it) else . end - # TODO artifactType? + | if has("artifactType") then + validate(.artifactType; type == "string") + else . end # https://github.com/opencontainers/image-spec/blob/v1.1.1/image-index.md#image-index-property-descriptions | if has("platform") then @@ -240,47 +251,87 @@ def validate_oci_subject_haver: ; # https://github.com/opencontainers/image-spec/blob/v1.1.1/image-index.md -def validate_oci_index: +def validate_oci_index($opt): validate_IN(type; "object") | validate_IN(.schemaVersion; 2) - | validate_IN(.mediaType; media_types_index) # TODO allow "null" here too? (https://github.com/opencontainers/image-spec/security/advisories/GHSA-77vh-xpmg-72qh) - # TODO artifactType? + | validate_IN(.mediaType; media_types_index) + | if has("artifactType") then + validate(.artifactType; type == "string") + | validate_IN(.artifactType; null) # TODO acceptable values? (this check intentionally contradicts the one above so artifactType generates an error) + else . end | validate(.manifests[]; validate_oci_descriptor | validate_IN(.mediaType; media_types_index, media_types_image) | validate(.size; . > 2; "manifest size must be at *least* big enough for {} plus *some* content") # https://github.com/opencontainers/distribution-spec/pull/293#issuecomment-1452780554 | validate(.size; . <= 4 * 1024 * 1024; "manifest size must be 4MiB (\(4 * 1024 * 1024)) or less") + + # slightly stricter enforcement than "validate_oci_descriptor" by default + | if $opt.indexPlatformsOptional then . else + validate(.platform; type == "object") + end + + # https://github.com/moby/buildkit/blob/c6145c2423de48f891862ac02f9b2653864d3c9e/docs/attestations/attestation-storage.md + | if .annotations | has("vnd.docker.reference.type") or has("vnd.docker.reference.digest") then + validate_IN(.mediaType; media_type_oci_image) + | validate_IN(.artifactType; null, "application/vnd.docker.attestation.manifest.v1+json") # https://github.com/moby/buildkit/pull/5573/files#r2069525281 + | validate_IN(.annotations["vnd.docker.reference.type"]; "attestation-manifest") + | validate(.annotations["vnd.docker.reference.digest"]; validate_oci_digest) + | validate_IN(.platform.os; "unknown") + | validate_IN(.platform.architecture; "unknown") + else + validate_IN(.artifactType; null) + end ) - # TODO if any manifest has "vnd.docker.reference.digest", validate the subject exists in the list + | if any(.manifests[].annotations; has("vnd.docker.reference.digest")) then + [ .manifests[].digest ] as $digests + | validate_IN(.manifests[].annotations["vnd.docker.reference.digest"]; null, $digests[]) + else . end | validate_oci_subject_haver | validate_oci_annotations_haver ; +def validate_oci_index: validate_oci_index({}); # https://github.com/opencontainers/image-spec/blob/v1.1.1/manifest.md -def validate_oci_image: +def validate_oci_image($opt): validate_IN(type; "object") | validate_IN(.schemaVersion; 2) - | validate_IN(.mediaType; media_types_image) # TODO allow "null" here too? (https://github.com/opencontainers/image-spec/security/advisories/GHSA-77vh-xpmg-72qh) - # TODO artifactType (but only selectively / certain values) + | validate_IN(.mediaType; media_types_image) + | if has("artifactType") then + validate(.artifactType; type == "string") + | validate_IN(.artifactType; + if $opt.imageAttestation then + "application/vnd.docker.attestation.manifest.v1+json" # https://github.com/moby/buildkit/pull/5573/files#r2069525281 + else null end # (this check intentionally contradicts the one above so artifactType normally generates an error) + ) + else . end | validate(.config; validate_oci_descriptor | validate(.size; . >= 2; "config must be at *least* big enough for {}") | validate_IN(.mediaType; media_types_config) + | validate_IN(.artifactType; null) ) | validate(.layers[]; validate_oci_descriptor - | validate_IN(.mediaType; media_types_layer) # TODO allow "application/vnd.in-toto+json" and friends, but selectively 🤔 + | if $opt.imageAttestation then + # https://github.com/moby/buildkit/blob/c6145c2423de48f891862ac02f9b2653864d3c9e/docs/attestations/attestation-storage.md + validate_IN(.mediaType; "application/vnd.in-toto+json") + | validate_IN(.annotations["in-toto.io/predicate-type"]; + "https://slsa.dev/provenance/v0.2", + "https://spdx.dev/Document", + empty # trailing comma + ) + else + validate_IN(.mediaType; media_types_layer) + end + | validate_IN(.artifactType; null) ) | validate_oci_subject_haver | validate_oci_annotations_haver ; +def validate_oci_image: validate_oci_image({}); # https://github.com/opencontainers/image-spec/blob/v1.1.1/image-layout.md#oci-layout-file def validate_oci_layout_file: validate_IN(.imageLayoutVersion; "1.0.0") ; - -# TODO validate digest, size of blobs (*somewhere*, probably not here - this is all "cheap" validations / version+ordering+format assumption validations) -# TODO if .data, validate that somehow too (size, digest); https://github.com/jqlang/jq/issues/1116#issuecomment-2515814615 -# TODO also we should validate that the length of every/any manifest is <= 4MiB (https://github.com/opencontainers/distribution-spec/pull/293#issuecomment-1452780554)