From e9968cbe793a8ab09720c95c268b49c45c5de49e Mon Sep 17 00:00:00 2001 From: mesutoezdil Date: Sat, 16 May 2026 11:50:03 +0200 Subject: [PATCH] fix(ci): eliminate image-tag race between concurrent workflows - Add publish-manifest input to docker-build.yml (default true); single-arch branch callers set it false so the merge job is skipped and the shared bare :SHA tag in GHCR is never written by branch workflows - branch-kubernetes-e2e: retag :SHA-amd64 to :SHA before kind load so Helm's image.tag matches what is loaded in kind containerd - branch-e2e: pass image-tag as :SHA-arm64 to e2e-test so the arch-specific GHCR tag is used directly without depending on the bare tag - bare :SHA in GHCR is now written only by test-gpu.yml (multi-arch build), eliminating the last-writer-wins race across concurrent workflows --- .github/workflows/branch-e2e.yml | 4 +++- .github/workflows/branch-kubernetes-e2e.yml | 13 +++++++------ .github/workflows/docker-build.yml | 9 +++++++-- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/.github/workflows/branch-e2e.yml b/.github/workflows/branch-e2e.yml index 3d8dd5928..fc65fd87c 100644 --- a/.github/workflows/branch-e2e.yml +++ b/.github/workflows/branch-e2e.yml @@ -34,6 +34,7 @@ jobs: with: component: gateway platform: linux/arm64 + publish-manifest: false build-supervisor: needs: [pr_metadata] @@ -45,6 +46,7 @@ jobs: with: component: supervisor platform: linux/arm64 + publish-manifest: false e2e: needs: [pr_metadata, build-gateway, build-supervisor] @@ -54,5 +56,5 @@ jobs: packages: read uses: ./.github/workflows/e2e-test.yml with: - image-tag: ${{ github.sha }} + image-tag: ${{ github.sha }}-arm64 runner: linux-arm64-cpu8 diff --git a/.github/workflows/branch-kubernetes-e2e.yml b/.github/workflows/branch-kubernetes-e2e.yml index 0c10c1577..dd542fa05 100644 --- a/.github/workflows/branch-kubernetes-e2e.yml +++ b/.github/workflows/branch-kubernetes-e2e.yml @@ -38,6 +38,7 @@ jobs: with: component: gateway platform: linux/amd64 + publish-manifest: false build-supervisor: needs: [pr_metadata] @@ -49,6 +50,7 @@ jobs: with: component: supervisor platform: linux/amd64 + publish-manifest: false kubernetes-e2e: name: Kubernetes E2E (Rust smoke) @@ -106,16 +108,15 @@ jobs: kind get kubeconfig --name "$KIND_CLUSTER_NAME" > "$GITHUB_WORKSPACE/kubeconfig" chmod 600 "$GITHUB_WORKSPACE/kubeconfig" - # Pre-pull and side-load: kind nodes don't have ghcr credentials, and - # tagging IMAGE_TAG to a SHA means the chart's IfNotPresent pull policy - # is satisfied once the image is loaded into the node's containerd. - name: Load gateway and supervisor images into kind run: | set -euo pipefail for component in gateway supervisor; do - image="ghcr.io/nvidia/openshell/${component}:${{ github.sha }}" - docker pull "$image" - kind load docker-image "$image" --name "$KIND_CLUSTER_NAME" + src="ghcr.io/nvidia/openshell/${component}:${{ github.sha }}-amd64" + bare="ghcr.io/nvidia/openshell/${component}:${{ github.sha }}" + docker pull "$src" + docker tag "$src" "$bare" + kind load docker-image "$bare" --name "$KIND_CLUSTER_NAME" done - name: Run Kubernetes E2E (Rust smoke) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 450d6b5c5..f6eb67b81 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -42,6 +42,11 @@ on: required: false type: string default: "" + publish-manifest: + description: "Push the bare-SHA manifest. Set false for single-arch branch workflows." + required: false + type: boolean + default: true env: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -181,7 +186,7 @@ jobs: # inside the container so setup-buildx can read it. - /etc/buildkit:/etc/buildkit:ro env: - IMAGE_TAG: ${{ needs.resolve.outputs.platform_count == '1' && needs.resolve.outputs.image_tag_base || format('{0}-{1}', needs.resolve.outputs.image_tag_base, matrix.arch) }} + IMAGE_TAG: ${{ format('{0}-{1}', needs.resolve.outputs.image_tag_base, matrix.arch) }} IMAGE_REGISTRY: ghcr.io/nvidia/openshell DOCKER_PUSH: ${{ inputs.push && '1' || '0' }} DOCKER_PLATFORM: ${{ matrix.platform }} @@ -257,7 +262,7 @@ jobs: merge: name: Merge ${{ inputs.component }} manifest needs: [resolve, build] - if: ${{ inputs.push && needs.resolve.outputs.platform_count != '1' }} + if: ${{ inputs.push && inputs['publish-manifest'] }} runs-on: linux-amd64-cpu8 timeout-minutes: 10 container: