diff --git a/.github/workflows/ai-eval.yml b/.github/workflows/ai-eval.yml
new file mode 100644
index 00000000..5f9aa679
--- /dev/null
+++ b/.github/workflows/ai-eval.yml
@@ -0,0 +1,66 @@
+# Runs the AI prompt evaluations on a daily schedule and on manual dispatch.
+name: AI Eval
+
+on:
+  schedule:
+    # Every day at 00:00 (GitHub evaluates cron schedules in UTC).
+    - cron: "0 0 * * *"
+  workflow_dispatch:
+
+# Least privilege: the job only needs to read the repository contents.
+permissions:
+  contents: read
+
+jobs:
+  ai-eval:
+    # Skip runs that were started from any ref other than main.
+    if: github.ref == 'refs/heads/main'
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Setup Node.js 22
+        uses: actions/setup-node@v4
+        with:
+          node-version: 22
+          cache: 'npm'
+
+      - name: Install dependencies
+        run: npm install
+
+      - name: Install Claude Code
+        run: npm install -g @anthropic-ai/claude-code
+
+      # Later steps are gated on this step's `available` output.
+      - name: Check Claude authentication
+        id: claude-auth
+        run: node scripts/check-claude-ai-eval-gate.js
+        env:
+          CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
+
+      - name: Run AI prompt evaluations
+        if: steps.claude-auth.outputs.available == 'true'
+        env:
+          CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
+        run: npm run test:ai-eval
+
+      # always() keeps the upload running even when the evaluation step fails.
+      - name: Upload AI eval responses
+        if: always() && steps.claude-auth.outputs.available == 'true'
+        uses: actions/upload-artifact@v4
+        with:
+          name: ai-eval-responses
+          path: ai-evals/*.responses.md
+          retention-days: 14
+
+      - name: Notify Slack on failure
+        if: failure() && steps.claude-auth.outputs.available == 'true'
+        uses: slackapi/slack-github-action@v2.1.0
+        with:
+          method: chat.postMessage
+          token: ${{ secrets.SLACK_BOT_TOKEN }}
+          payload: |
+            channel: "C0A5ZRP7XR5"
+            text: "🔴 AI Eval failed on `${{ github.ref_name }}` — <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View run>"
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 75008374..27a96b30 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -32,43 +32,3 @@ jobs:
 
       - name: Run tests
         run: npm test
-
-  ai-eval:
-    runs-on: ubuntu-latest
-    needs: test
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Setup Node.js 22
-        uses: actions/setup-node@v4
-        with:
-          node-version: 22
-          cache: 'npm'
-
-      - name: Install dependencies
-        run: npm install
-
-      - name: Install Claude Code
-        run: npm install -g @anthropic-ai/claude-code
-
-      - name: Check Claude authentication
-        id: claude-auth
-        run: node scripts/check-claude-ai-eval-gate.js
-        env:
-          CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
-
-      - name: Run AI prompt evaluations
-        if: steps.claude-auth.outputs.available == 'true'
-        env:
-          CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
-        run: npm run test:ai-eval
-
-      - name: Upload AI eval responses
-        if: always() && steps.claude-auth.outputs.available == 'true'
-        uses: actions/upload-artifact@v4
-        with:
-          name: ai-eval-responses
-          path: ai-evals/*.responses.md
-          retention-days: 14
diff --git a/package.json b/package.json
index 53d54bfa..3eae00af 100644
--- a/package.json
+++ b/package.json
@@ -105,7 +105,7 @@
     "prepare": "husky",
     "release": "node release.js",
     "test": "vitest run && echo 'Test complete.' && npm run -s lint && npm run -s typecheck",
-    "test:ai-eval": "riteway ai ai-evals/aidd-review/review-skill-test.sudo --runs 4 --threshold 75 --timeout 600000 --agent claude --color --save-responses",
+    "test:ai-eval": "riteway ai ai-evals/aidd-review/review-skill-test.sudo --runs 1 --threshold 75 --timeout 600000 --agent claude --color --save-responses",
     "test:e2e": "vitest run **/*-e2e.test.js && echo 'E2E tests complete.'",
     "test:unit": "vitest run --exclude '**/*-e2e.test.js' && echo 'Unit tests complete.' && npm run -s lint && npm run -s typecheck",
     "toc": "doctoc README.md",