diff --git a/.github/workflows/agent-evaluation.yml b/.github/workflows/agent-evaluation.yml
index 725fd44c2..6ad10a289 100644
--- a/.github/workflows/agent-evaluation.yml
+++ b/.github/workflows/agent-evaluation.yml
@@ -54,16 +54,17 @@ jobs:
   agent-evaluation:
     name: Agent Quality Evaluation
     runs-on: ubuntu-latest
+    timeout-minutes: 30
     environment: ${{ inputs.environment || 'integration' }}
     permissions:
       contents: read
       id-token: write  # Needed for OIDC → DefaultAzureCredential
 
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
 
       - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
         with:
           python-version: '3.12'
 
diff --git a/.github/workflows/destroy.yml b/.github/workflows/destroy.yml
index 38bfb6a09..ed6d2d995 100644
--- a/.github/workflows/destroy.yml
+++ b/.github/workflows/destroy.yml
@@ -26,10 +26,18 @@ on:
         default: tf
         required: true
 
+# Never run two destroys against the same environment at once, and don't
+# cancel a destroy in progress (interrupting Terraform mid-destroy is unsafe).
+# The group key uses the same fallback as the job's `environment`, so a
+# defaulted run and an explicit 'integration' run serialize together.
+concurrency:
+  group: destroy-${{ inputs.environment || 'integration' }}
+  cancel-in-progress: false
+
 jobs:
   terraform_destroy:
     name: Terraform Destroy
     runs-on: ubuntu-latest
+    timeout-minutes: 30
     environment: ${{ inputs.environment || 'integration' }}
     permissions:
       id-token: write
diff --git a/.github/workflows/docker-application.yml b/.github/workflows/docker-application.yml
index 3f1ab122a..55c13a4b6 100644
--- a/.github/workflows/docker-application.yml
+++ b/.github/workflows/docker-application.yml
@@ -27,13 +27,14 @@ jobs:
   build:
     name: Build & Push Backend Image
     runs-on: ubuntu-latest
+    timeout-minutes: 30
     environment: ${{ inputs.environment || 'integration' }}
     permissions:
       id-token: write
       contents: read
 
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
 
       - name: Azure OIDC Login
         uses: azure/login@v2
diff --git a/.github/workflows/docker-mcp.yml b/.github/workflows/docker-mcp.yml
index 3242142a8..c03be5fcd 100644
--- a/.github/workflows/docker-mcp.yml
+++ b/.github/workflows/docker-mcp.yml
@@ -27,13 +27,14 @@ jobs:
   build:
     name: Build & Push MCP Image
     runs-on: ubuntu-latest
+    timeout-minutes: 30
     environment: ${{ inputs.environment || 'integration' }}
     permissions:
       id-token: write
       contents: read
 
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
 
       - name: Azure OIDC Login
         uses: azure/login@v2
diff --git a/.github/workflows/infrastructure.yml b/.github/workflows/infrastructure.yml
index 00a4f579b..ccf2f7bf1 100644
--- a/.github/workflows/infrastructure.yml
+++ b/.github/workflows/infrastructure.yml
@@ -41,6 +41,7 @@ jobs:
   tf:
     name: Terraform Deployment
     runs-on: ubuntu-latest
+    timeout-minutes: 45
     environment: ${{ inputs.environment }}
     if: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.iac-tool || 'tf') == 'tf' }}
     permissions:
@@ -166,6 +167,7 @@ jobs:
 
   bicep:
     runs-on: ubuntu-latest
+    timeout-minutes: 45
     environment: ${{ inputs.environment }}
     if: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.iac-tool || 'tf') == 'bicep' }}
     permissions:
diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml
index d3332a657..e46bdf57b 100644
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@@ -20,6 +20,14 @@ on:
         required: false
         default: true
         description: 'Whether MCP is internal-only (skip MCP tests)'
+      advisory:
+        type: boolean
+        required: false
+        default: false
+        description: >-
+          When true, a degraded/unreachable shared environment is reported but
+          does not fail the job (used for tests-only PRs to main, where the env
+          is pre-deployed and not built from the PR).
 
   workflow_dispatch:
     inputs:
@@ -35,45 +43,132 @@ on:
         description: 'MCP service endpoint URL (optional if internal)'
         required: false
 
+# Least-privilege: this workflow only reads the repo.
+permissions:
+  contents: read
+
 jobs:
   integration-tests:
     name: Run Integration Tests
     runs-on: ubuntu-latest
+    timeout-minutes: 20
     permissions:
       contents: read
-    # No environment needed - uses repo-level variables
 
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
 
       - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
         with:
           python-version: '3.12'
+          cache: 'pip'
+          cache-dependency-path: tests/requirements.txt
 
       - name: Install test dependencies
+        run: pip install -r tests/requirements.txt
+
+      # ------------------------------------------------------------------
+      # Liveness gate: poll the backend until it responds, instead of a
+      # blind `sleep 30`. Distinguishes "environment unreachable/degraded"
+      # from "tests failed", so a broken *shared* env doesn't masquerade as
+      # a code regression.
+      # ------------------------------------------------------------------
+      - name: Wait for backend to become reachable
+        id: liveness
         run: |
-          pip install -r tests/requirements.txt
+          BE="${{ inputs.backend_endpoint }}"
+          if [ -z "$BE" ]; then
+            echo "reachable=false" >> "$GITHUB_OUTPUT"
+            echo "::warning::No backend endpoint provided"
+            exit 0
+          fi
+
+          echo "Polling $BE for readiness (up to ~90s)..."
+          reachable=false
+          for i in $(seq 1 18); do
+            # With `-w '%{http_code}'` curl itself prints 000 when the connection
+            # fails; `|| true` only keeps `bash -e` from aborting the step.
+            code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 "$BE/" || true)
+            # Any HTTP response below 500 means the app is up and serving.
+            if [ "$code" != "000" ] && [ "$code" -lt 500 ]; then
+              echo "Backend reachable (HTTP $code) after $((i*5))s"
+              reachable=true
+              break
+            fi
+            echo " attempt $i: HTTP $code — retrying in 5s"
+            sleep 5
+          done
+          echo "reachable=$reachable" >> "$GITHUB_OUTPUT"
 
-      - name: Wait for Container Apps to warm up
+      - name: Skip note (environment unreachable)
+        if: steps.liveness.outputs.reachable != 'true'
         run: |
-          echo "Waiting 30 seconds for Container Apps to be ready..."
-          sleep 30
+          {
+            echo "## Integration Tests — SKIPPED (environment unreachable)"
+            echo ""
+            echo "The target backend did not return a healthy response, so the"
+            echo "integration suite was not run. This usually means the shared"
+            echo "environment is degraded (e.g. backend down, model key invalid),"
+            echo "not that this change is broken."
+            echo ""
+            echo "- Backend: \`${{ inputs.backend_endpoint }}\`"
+            echo "- Environment: \`${{ inputs.environment }}\`"
+            echo "- Advisory mode: \`${{ inputs.advisory }}\`"
+          } >> "$GITHUB_STEP_SUMMARY"
+          if [ "${{ inputs.advisory }}" = "true" ]; then
+            echo "Advisory mode: not failing the pipeline for a degraded shared env."
+            exit 0
+          fi
+          echo "::error::Backend unreachable and not in advisory mode."
+          exit 1
 
       - name: Run integration tests
+        if: steps.liveness.outputs.reachable == 'true'
+        id: pytest
+        # In advisory mode (tests-only PRs to main) the shared env is not built
+        # from this PR, so we report results without blocking the PR.
+        # `== true` keeps this a real boolean when `advisory` is unset (workflow_dispatch).
+        continue-on-error: ${{ inputs.advisory == true }}
         run: |
           cd tests
-          pytest -v -m "integration" --tb=short
+          pytest -v -m "integration" --tb=short \
+            --junitxml="junit-integration.xml"
         env:
           BACKEND_API_ENDPOINT: ${{ inputs.backend_endpoint }}
           MCP_ENDPOINT: ${{ inputs.mcp_endpoint }}
           MCP_INTERNAL_ONLY: ${{ inputs.mcp_internal_only && 'true' || 'false' }}
 
-      - name: Test Summary
+      - name: Upload test results
+        if: always() && steps.liveness.outputs.reachable == 'true'
+        uses: actions/upload-artifact@v4
+        with:
+          name: integration-test-results-${{ inputs.environment }}
+          path: tests/junit-integration.xml
+          if-no-files-found: ignore
+          retention-days: 14
+
+      - name: Test summary
         if: always()
         run: |
-          echo "## Integration Test Results" >> $GITHUB_STEP_SUMMARY
-          echo "" >> $GITHUB_STEP_SUMMARY
-          echo "- Backend Endpoint: ${{ inputs.backend_endpoint }}" >> $GITHUB_STEP_SUMMARY
-          echo "- MCP Endpoint: ${{ inputs.mcp_endpoint || 'Internal (skipped)' }}" >> $GITHUB_STEP_SUMMARY
-          echo "- Environment: ${{ inputs.environment }}" >> $GITHUB_STEP_SUMMARY
+          {
+            echo "## Integration Test Results"
+            echo ""
+            echo "- Backend Endpoint: ${{ inputs.backend_endpoint }}"
+            echo "- MCP Endpoint: ${{ inputs.mcp_endpoint || 'Internal (skipped)' }}"
+            echo "- Environment: ${{ inputs.environment }}"
+            echo "- Reachable: ${{ steps.liveness.outputs.reachable }}"
+            echo "- Outcome: ${{ steps.pytest.outcome }}"
+            echo "- Advisory: ${{ inputs.advisory }}"
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      # In advisory mode we surface failures as a warning but keep the job green
+      # so a degraded shared env can't block unrelated PRs to main.
+      - name: Advisory failure note
+        if: >-
+          steps.liveness.outputs.reachable == 'true'
+          && steps.pytest.outcome == 'failure'
+          && inputs.advisory
+        run: |
+          echo "::warning::Integration tests failed against the shared environment, but this is advisory (tests-only PR to main). Not blocking."
+
diff --git a/.github/workflows/orchestrate.yml b/.github/workflows/orchestrate.yml
index d60707512..975863860 100644
--- a/.github/workflows/orchestrate.yml
+++ b/.github/workflows/orchestrate.yml
@@ -95,6 +95,14 @@ permissions:
   pull-requests: write
   id-token: write
 
+# Prevent overlapping runs for the same PR/branch. Never cancel an in-flight
+# run on `main` (cancelling a production deploy mid-apply is dangerous);
+# cancel superseded runs on dev branches / PRs to save minutes and avoid
+# concurrent Terraform state access on the same environment.
+concurrency:
+  group: cicd-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
 jobs:
 
   # ────────────────────────────────────────────────────────────────────
@@ -102,6 +110,9 @@ jobs:
   # ────────────────────────────────────────────────────────────────────
   # ────────────────────────────────────────────────────────────────────
   pipeline-config:
     runs-on: ubuntu-latest
+    timeout-minutes: 5
+    permissions:
+      contents: read
     outputs:
       environment: ${{ steps.config.outputs.environment }}
       full_deploy: ${{ steps.config.outputs.full_deploy }}
@@ -175,6 +186,50 @@ jobs:
           echo "Create PR only: $CREATE_PR_ONLY"
           echo "──────────────────────────────────────"
 
+  # ────────────────────────────────────────────────────────────────────
+  # Fast unit / regression tests (no Azure)
+  # Deterministic, mock-based tests that validate the agent-framework
+  # API surface used by every agent. A cheap, fast signal that catches
+  # breakage (e.g. a dependency upgrade) long before the expensive
+  # build/deploy path. Runs on PRs and pushes; skipped only for the
+  # lightweight *-dev "create PR" pass (the resulting PR re-runs them).
+  # ────────────────────────────────────────────────────────────────────
+  unit-tests:
+    name: Unit & Regression Tests (no Azure)
+    needs: pipeline-config
+    if: needs.pipeline-config.outputs.create_pr_only != 'true'
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+    permissions:
+      contents: read
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+        with:
+          enable-cache: true
+
+      - name: Sync application environment
+        working-directory: agentic_ai/applications
+        run: uv sync
+
+      - name: Run agent-framework regression tests
+        working-directory: agentic_ai/applications
+        run: >-
+          uv run --with pytest python -m pytest
+          ../../tests/test_agent_framework_1_2_1_regression.py
+          -v --junitxml=../../junit-unit.xml
+
+      - name: Upload results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: unit-test-results
+          path: junit-unit.xml
+          if-no-files-found: ignore
+          retention-days: 14
+
   # ────────────────────────────────────────────────────────────────────
   # Step 0.5: Create PR to int-agentic (*-dev push only)
   # When a developer pushes to their *-dev branch, we just create
@@ -185,6 +240,10 @@ jobs:
     needs: pipeline-config
     if: needs.pipeline-config.outputs.create_pr_only == 'true'
     runs-on: ubuntu-latest
+    timeout-minutes: 10
+    permissions:
+      contents: read
+      pull-requests: write
     steps:
       - name: Create or update PR to int-agentic
         env:
@@ -227,6 +286,10 @@ jobs:
     needs: pipeline-config
     if: needs.pipeline-config.outputs.full_deploy == 'true'
     runs-on: ubuntu-latest
+    timeout-minutes: 10
+    permissions:
+      contents: read
+      id-token: write
     environment: ${{ needs.pipeline-config.outputs.environment }}
     steps:
       - name: Azure OIDC Login
@@ -306,6 +369,10 @@ jobs:
       needs.pipeline-config.outputs.full_deploy == 'false'
       && needs.pipeline-config.outputs.create_pr_only == 'false'
     runs-on: ubuntu-latest
+    timeout-minutes: 10
+    permissions:
+      contents: read
+      id-token: write
     environment: ${{ needs.pipeline-config.outputs.environment }}
     outputs:
       backend_endpoint: ${{ steps.lookup.outputs.backend_endpoint }}
@@ -363,6 +430,9 @@ jobs:
       backend_endpoint: ${{ needs.deploy-infrastructure.outputs.backend_endpoint || needs.resolve-endpoints.outputs.backend_endpoint }}
       mcp_endpoint: ${{ needs.deploy-infrastructure.outputs.mcp_endpoint || needs.resolve-endpoints.outputs.mcp_endpoint }}
       mcp_internal_only: true
+      # Tests-only mode (PR → main) runs against a pre-existing shared env that
+      # is NOT built from this PR, so a degraded env must not block the PR.
+      advisory: ${{ needs.pipeline-config.outputs.full_deploy == 'false' }}
     secrets: inherit
 
   # ────────────────────────────────────────────────────────────────────
@@ -396,6 +466,10 @@ jobs:
       && github.event_name == 'pull_request'
       && github.base_ref == 'int-agentic'
     runs-on: ubuntu-latest
+    timeout-minutes: 10
+    permissions:
+      contents: write
+      pull-requests: write
     steps:
       - name: Auto-merge PR into int-agentic
         env:
diff --git a/.github/workflows/promote-to-main.yml b/.github/workflows/promote-to-main.yml
index 3ad54c1bd..a175b5a31 100644
--- a/.github/workflows/promote-to-main.yml
+++ b/.github/workflows/promote-to-main.yml
@@ -36,9 +36,15 @@ permissions:
   contents: read
   pull-requests: write
 
+# Only the most recent promotion attempt matters; supersede older ones.
+concurrency:
+  group: promote-to-main
+  cancel-in-progress: true
+
 jobs:
   promote:
     runs-on: ubuntu-latest
+    timeout-minutes: 10
     steps:
       - name: Checkout
         uses: actions/checkout@v6
diff --git a/.github/workflows/update-containers.yml b/.github/workflows/update-containers.yml
index 294d3aee7..18d06a059 100644
--- a/.github/workflows/update-containers.yml
+++ b/.github/workflows/update-containers.yml
@@ -35,6 +35,7 @@ jobs:
   update-containers:
     name: Update Container Apps
     runs-on: ubuntu-latest
+    timeout-minutes: 20
     environment: ${{ inputs.environment }}
     permissions:
       id-token: write