From a6b95b7fd91c84c1b1daec86452200a461e11487 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Fri, 24 Apr 2026 13:29:34 +0200 Subject: [PATCH] verify-action-build: handle source-detached orphan release tags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some actions (e.g. slackapi/slack-github-action with its changesets-driven release flow) publish their version tag as a rootless orphan commit whose tree contains only distributable artifacts — action.yml, dist/, LICENSE, README.md. Consumers pin to that commit SHA, so the verifier clones it and runs npm run build, but there is no src/ and no package.json at the tag and the rebuild silently produces nothing, causing a misleading "DIFFERENCES DETECTED" failure. Detect that pattern and resolve the corresponding default-branch source commit via the GitHub Releases API: find the tag(s) pointing at the commit, read published_at, and pick the most recent default-branch commit at or just before published_at (plus a one-minute tolerance) that has a buildable package.json — preferring "chore: release"-style messages (changesets / release-please / Version Packages). The Docker build then captures /original-dist from the orphan tag and checks out the resolved source commit before building, so the rebuild runs against real source and the diff is against the tag's published dist. The detection is narrow: only top-level tags whose tree has dist/ but no package.json and no src/. Monorepo sub-actions and normal actions are untouched. Verified end-to-end against slackapi/slack-github-action@v3.0.2 (byte-identical rebuild) and v3.0.1, with regression checks on actions/checkout, astral-sh/setup-uv, and scacap/action-surefire-report (not flagged as detached, normal path preserved). 
--- utils/verify_action_build/docker_build.py | 14 + .../dockerfiles/build_action.Dockerfile | 13 + utils/verify_action_build/release_lookup.py | 240 ++++++++++++++++++ utils/verify_action_build/verification.py | 50 ++++ 4 files changed, 317 insertions(+) create mode 100644 utils/verify_action_build/release_lookup.py diff --git a/utils/verify_action_build/docker_build.py b/utils/verify_action_build/docker_build.py index e6929987..ec6f625d 100644 --- a/utils/verify_action_build/docker_build.py +++ b/utils/verify_action_build/docker_build.py @@ -104,6 +104,7 @@ def build_in_docker( cache: bool = True, show_build_steps: bool = False, approved_hash: str = "", + source_commit_hash: str = "", ) -> tuple[Path, Path, str, str, bool, Path, Path]: """Build the action in a Docker container and extract original + rebuilt dist. @@ -111,6 +112,11 @@ def build_in_docker( from that commit so the rebuild uses the same dev-dependency versions that produced the original dist/. + When *source_commit_hash* is supplied the Docker build captures the original + dist/ from *commit_hash* (a source-detached release tag) and then switches + the tree to *source_commit_hash* before building. Used for actions whose + tagged commit is an orphan tree without buildable source. + Returns (original_dir, rebuilt_dir, action_type, out_dir_name, has_node_modules, original_node_modules, rebuilt_node_modules). 
""" @@ -139,6 +145,12 @@ def build_in_docker( info_table.add_column() info_table.add_row("Action", repo_link) info_table.add_row("Commit", commit_link) + if source_commit_hash: + source_link = link( + f"https://github.com/{org}/{repo}/commit/{source_commit_hash}", + source_commit_hash, + ) + info_table.add_row("Source commit", source_link) console.print() console.print(Panel(info_table, title="Action Build Verification", border_style="blue")) @@ -160,6 +172,8 @@ def build_in_docker( f"SUB_PATH={sub_path}", "--build-arg", f"APPROVED_HASH={approved_hash}", + "--build-arg", + f"SOURCE_COMMIT_HASH={source_commit_hash}", "-t", image_tag, "-f", diff --git a/utils/verify_action_build/dockerfiles/build_action.Dockerfile b/utils/verify_action_build/dockerfiles/build_action.Dockerfile index 49c3b7eb..34e94e34 100644 --- a/utils/verify_action_build/dockerfiles/build_action.Dockerfile +++ b/utils/verify_action_build/dockerfiles/build_action.Dockerfile @@ -114,6 +114,19 @@ RUN MAIN_PATH=$(cat /main-path.txt); \ RUN OUT_DIR=$(cat /out-dir.txt); \ if [ -d "$OUT_DIR" ]; then cp -r "$OUT_DIR" /original-dist; else mkdir /original-dist; fi +# Some actions publish their release tag as an orphan commit containing only the +# distributable artifacts (action.yml, dist/, LICENSE, README.md) — no src/, no +# package.json, no lock files. When that pattern is detected upstream (in +# release_lookup.py) we're handed SOURCE_COMMIT_HASH: the default-branch commit +# the release was cut from. Swap the tree to that commit now — /original-dist +# has already been captured from COMMIT_HASH — so the rebuild below runs against +# real source. 
+ARG SOURCE_COMMIT_HASH="" +RUN if [ -n "$SOURCE_COMMIT_HASH" ]; then \ + echo "source-commit: $SOURCE_COMMIT_HASH (rebuilding from default-branch source)" >> /build-info.log; \ + git checkout "$SOURCE_COMMIT_HASH"; \ + fi + # Detect if node_modules/ is committed (vendored dependencies pattern) RUN if [ -d "node_modules" ]; then \ echo "true" > /has-node-modules.txt; \ diff --git a/utils/verify_action_build/release_lookup.py b/utils/verify_action_build/release_lookup.py new file mode 100644 index 00000000..2aa20df5 --- /dev/null +++ b/utils/verify_action_build/release_lookup.py @@ -0,0 +1,240 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +"""Detect and resolve source-detached release tags. + +Some actions (e.g. slackapi/slack-github-action, ones using changesets' +release automation) publish their tagged commit as a rootless *orphan* +commit whose tree only contains the distributable artifacts — ``action.yml``, +``dist/``, ``LICENSE``, ``README.md``, etc — but no ``src/``, no +``package.json`` and no lock files. Consumers pin to that commit SHA, but +there is literally no source to rebuild from, so a naive ``git checkout && +npm run build`` produces empty output. 
+ +This module detects that pattern and resolves the corresponding *source +commit* on the default branch that the release was cut from, so the verifier +can rebuild against real source and still diff the rebuilt ``dist/`` against +the orphan tag's published ``dist/``. +""" + +from __future__ import annotations + +import json +import subprocess +from datetime import datetime, timedelta, timezone + + +def _gh_api(endpoint: str) -> dict | list | None: + """Call ``gh api`` and return parsed JSON, or ``None`` on any failure.""" + result = subprocess.run( + ["gh", "api", endpoint], + capture_output=True, text=True, + ) + if result.returncode != 0 or not result.stdout.strip(): + return None + try: + return json.loads(result.stdout) + except json.JSONDecodeError: + return None + + +def _tree_top_level_names(org: str, repo: str, commit_hash: str) -> set[str]: + """Return the set of top-level entry names in the commit's tree. + + Returns an empty set if the lookup fails — callers should treat that as + "unknown, don't infer anything". + """ + data = _gh_api(f"repos/{org}/{repo}/git/trees/{commit_hash}") + if not isinstance(data, dict): + return set() + return {entry.get("path", "") for entry in data.get("tree", []) if entry.get("path")} + + +def is_source_detached(org: str, repo: str, commit_hash: str, sub_path: str = "") -> bool: + """Return True when the commit tree lacks buildable source. + + A "source-detached" commit is one whose root tree holds only + distributable artifacts: ``dist/`` but neither ``package.json`` nor + ``src``. When ``sub_path`` is set (monorepo sub-action) or the tree + lookup fails, this returns False; only the repo root is inspected. + Requiring ``dist/`` keeps the heuristic narrow, so repos that simply + don't use a build step (composite/docker actions) are not flagged. + """ + # Monorepo sub-actions typically keep their build tooling at the repo + # root, so a sub_path without package.json is expected, not source- + # detached. 
Limit this heuristic to the top-level case for now. + if sub_path: + return False + + names = _tree_top_level_names(org, repo, commit_hash) + if not names: + return False + + has_dist = "dist" in names + has_pkg = "package.json" in names + has_src_tree = "src" in names + return has_dist and not has_pkg and not has_src_tree + + +def _find_tags_for_commit(org: str, repo: str, commit_hash: str) -> list[str]: + """Return every tag name that points at ``commit_hash``, most specific first. + + Actions often attach both a pinned version tag (``v3.0.2``) and rolling + major/minor tags (``v3``, ``v3.0``) to the same commit. Only the pinned + version has its own GitHub Release, so we return all matches and sort by + specificity — longest tag name first, which for semver-like schemes is a + good proxy for "more specific" (``v3.0.2`` beats ``v3``). + """ + matches: list[str] = [] + for page in range(1, 6): + data = _gh_api( + f"repos/{org}/{repo}/git/matching-refs/tags?per_page=100&page={page}" + ) + if not isinstance(data, list) or not data: + break + for ref in data: + obj = ref.get("object", {}) + obj_sha = obj.get("sha") + obj_type = obj.get("type") + ref_name = ref.get("ref", "") + if not ref_name.startswith("refs/tags/"): + continue + tag_name = ref_name[len("refs/tags/"):] + if obj_sha == commit_hash: + matches.append(tag_name) + continue + if obj_type == "tag": + # Annotated tag — the ref points at a tag object whose + # .object.sha is the actual commit. Fetch and check. 
+ tag_obj = _gh_api(f"repos/{org}/{repo}/git/tags/{obj_sha}") + if isinstance(tag_obj, dict): + inner = tag_obj.get("object", {}) + if inner.get("sha") == commit_hash: + matches.append(tag_name) + if len(data) < 100: + break + matches.sort(key=lambda t: (-len(t), t)) + return matches + + +def _release_published_at(org: str, repo: str, tag_name: str) -> datetime | None: + """Return the release's published_at timestamp for the given tag, or None.""" + data = _gh_api(f"repos/{org}/{repo}/releases/tags/{tag_name}") + if not isinstance(data, dict): + return None + ts = data.get("published_at") or data.get("created_at") + if not ts: + return None + try: + # GitHub returns ISO8601 with trailing Z. + return datetime.fromisoformat(ts.replace("Z", "+00:00")) + except ValueError: + return None + + +def _default_branch(org: str, repo: str) -> str: + """Return the repo's default branch name (falls back to ``main``).""" + data = _gh_api(f"repos/{org}/{repo}") + if isinstance(data, dict): + br = data.get("default_branch") + if isinstance(br, str) and br: + return br + return "main" + + +def _commit_has_package_json( + org: str, repo: str, commit_hash: str, sub_path: str = "", +) -> bool: + """Cheap tree check: does this commit have a buildable package.json?""" + if sub_path: + data = _gh_api(f"repos/{org}/{repo}/contents/{sub_path}?ref={commit_hash}") + if not isinstance(data, list): + return False + return any(e.get("name") == "package.json" for e in data) + names = _tree_top_level_names(org, repo, commit_hash) + return "package.json" in names + + +def resolve_source_commit( + org: str, repo: str, commit_hash: str, sub_path: str = "", +) -> tuple[str, str] | None: + """Resolve the default-branch source commit that a source-detached tag was cut from. + + Returns ``(source_commit_sha, tag_name)`` on success, or ``None`` if we + couldn't confidently identify the source commit. + + Strategy: + 1. Find the tag name(s) that point at ``commit_hash``. + 2. 
Look up the GitHub Release object for that tag — use its + ``published_at`` as a time anchor. + 3. List commits on the default branch at or just before that time. + 4. Pick the most recent one whose tree actually has ``package.json`` + at the build root (confirming it's buildable source). + """ + candidate_tags = _find_tags_for_commit(org, repo, commit_hash) + if not candidate_tags: + return None + + tag_name = None + published = None + for candidate in candidate_tags: + ts = _release_published_at(org, repo, candidate) + if ts is not None: + tag_name = candidate + published = ts + break + if tag_name is None or published is None: + return None + + default_branch = _default_branch(org, repo) + + # The orphan tag is typically pushed a few seconds *after* the release + # PR lands on the default branch, so we cap the window at published_at + + # a short tolerance to cover race conditions while keeping commits that + # landed *after* the release (e.g. subsequent dependabot bumps) out. + cutoff = published + timedelta(minutes=1) + until = cutoff.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + commits = _gh_api( + f"repos/{org}/{repo}/commits?sha={default_branch}&until={until}&per_page=20" + ) + if not isinstance(commits, list): + return None + + # Prefer commits whose message looks like a release commit (changesets + # uses "chore: release", release-please uses "chore(main): release + # x.y.z", other automations use "Release …"). Fall back to the most + # recent buildable commit in the window otherwise. 
+ release_markers = ("chore: release", "chore(main): release", "release:", "Release ", "Version Packages") + + def _is_release_commit(commit: dict) -> bool: + msg = commit.get("commit", {}).get("message", "") + first_line = msg.splitlines()[0] if msg else "" + return any(marker.lower() in first_line.lower() for marker in release_markers) + + ordered = sorted( + commits, + key=lambda c: (not _is_release_commit(c), commits.index(c)), + ) + for commit in ordered: + sha = commit.get("sha") + if not sha: + continue + if _commit_has_package_json(org, repo, sha, sub_path): + return sha, tag_name + + return None diff --git a/utils/verify_action_build/verification.py b/utils/verify_action_build/verification.py index e7cac08b..563bd74a 100644 --- a/utils/verify_action_build/verification.py +++ b/utils/verify_action_build/verification.py @@ -33,6 +33,7 @@ from .diff_source import diff_approved_vs_new from .docker_build import build_in_docker from .github_client import GitHubClient +from .release_lookup import is_source_detached, resolve_source_commit from .security import ( analyze_action_metadata, analyze_binary_downloads_recursive, @@ -181,15 +182,64 @@ def verify_single_action( matched_with_approved_lockfile = False binary_download_failures: list[str] = [] + # Detect source-detached release tags (orphan commits containing only + # distributable artifacts) and resolve the default-branch source commit + # the release was cut from, so the rebuild has real source to build from. 
+ source_commit_hash = "" + source_detached_detail = "" + if is_source_detached(org, repo, commit_hash, sub_path): + resolved = resolve_source_commit(org, repo, commit_hash, sub_path) + if resolved: + source_commit_hash, tag_name = resolved + source_detached_detail = ( + f"orphan tag {tag_name}; rebuilding from {source_commit_hash[:12]}" + ) + console.print() + console.print( + Panel( + f"[yellow]Release tag [bold]{tag_name}[/bold] at " + f"[bold]{commit_hash[:12]}[/bold] is a source-detached " + f"orphan commit (no src/ or package.json at the tag).\n" + f"Rebuilding from the default-branch source commit " + f"[bold]{source_commit_hash[:12]}[/bold] the release was " + f"cut from, then diffing the rebuilt dist/ against the " + f"tag's published dist/.[/yellow]", + border_style="yellow", + title="SOURCE-DETACHED RELEASE TAG", + ) + ) + else: + source_detached_detail = "detected but source commit could not be resolved" + console.print() + console.print( + Panel( + f"[red]Tag commit [bold]{commit_hash[:12]}[/bold] has no " + f"buildable source at the tag, and the default-branch " + f"source commit the release was cut from could not be " + f"resolved via the GitHub Releases API. 
The rebuild " + f"below will almost certainly produce no output — this " + f"action requires manual source review.[/red]", + border_style="red", + title="SOURCE-DETACHED RELEASE TAG (unresolved)", + ) + ) + with tempfile.TemporaryDirectory(prefix="verify-action-") as tmp: work_dir = Path(tmp) (original_dir, rebuilt_dir, action_type, out_dir_name, has_node_modules, original_node_modules, rebuilt_node_modules) = build_in_docker( org, repo, commit_hash, work_dir, sub_path=sub_path, gh=gh, cache=cache, show_build_steps=show_build_steps, + source_commit_hash=source_commit_hash, ) checks_performed.append(("Action type detection", "info", action_type)) + if source_detached_detail: + checks_performed.append(( + "Source-detached release tag", + "info" if source_commit_hash else "warn", + source_detached_detail, + )) is_js_action = action_type.startswith("node") or action_type in ("unknown",)