paralleldrive · janhesters · Apr 15, 2026 · Apr 14, 2026 · Apr 14, 2026 · Apr 14, 2026
diff --git a/.github/workflows/ai-eval.yml b/.github/workflows/ai-eval.yml
@@ -0,0 +1,76 @@
+# Nightly AI prompt evaluation.
+#
+# Runs `npm run test:ai-eval` with the Claude Code CLI on a daily schedule or
+# on manual dispatch, uploads the saved responses as an artifact, and posts
+# to Slack when the run fails.
+name: AI Eval
+
+on:
+  schedule:
+    # Daily at 00:00 (GitHub evaluates cron schedules in UTC).
+    - cron: "0 0 * * *"
+  # Allow manual runs from the Actions UI or API.
+  workflow_dispatch:
+
+# Least privilege for GITHUB_TOKEN: no step visibly needs write access to the
+# repository, and the Slack step authenticates with its own bot token.
+permissions:
+  contents: read
+
+jobs:
+  ai-eval:
+    # A manual dispatch can target any ref; only evaluate main.
+    if: github.ref == 'refs/heads/main'
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Setup Node.js 22
+        uses: actions/setup-node@v4
+        with:
+          node-version: 22
+          cache: 'npm'
+
+      - name: Install dependencies
+        run: npm install
+
+      - name: Install Claude Code
+        run: npm install -g @anthropic-ai/claude-code
+
+      # Publishes the `available` step output that gates every later step.
+      # NOTE(review): presumably 'true' only when the OAuth token is usable —
+      # confirm in scripts/check-claude-ai-eval-gate.js.
+      - name: Check Claude authentication
+        id: claude-auth
+        run: node scripts/check-claude-ai-eval-gate.js
+        env:
+          CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
+
+      - name: Run AI prompt evaluations
+        if: steps.claude-auth.outputs.available == 'true'
+        env:
+          CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
+        run: npm run test:ai-eval
+
+      # always(): keep the responses even when the evaluation step failed.
+      - name: Upload AI eval responses
+        if: always() && steps.claude-auth.outputs.available == 'true'
+        uses: actions/upload-artifact@v4
+        with:
+          name: ai-eval-responses
+          path: ai-evals/*.responses.md
+          retention-days: 14
+
+      # Notifies only when the job failed and the gate reported `available`;
+      # failures where that output is not 'true' stay silent.
+      - name: Notify Slack on failure
+        if: failure() && steps.claude-auth.outputs.available == 'true'
+        uses: slackapi/slack-github-action@v2.1.0
+        with:
+          method: chat.postMessage
+          token: ${{ secrets.SLACK_BOT_TOKEN }}
+          payload: |
+            channel: "C0A5ZRP7XR5"
+            text: "🔴 AI Eval failed on `${{ github.ref_name }}` — <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View run>"
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -32,43 +32,3 @@ jobs:
 
       - name: Run tests
         run: npm test
-
-  ai-eval:
-    runs-on: ubuntu-latest
-    needs: test
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Setup Node.js 22
-        uses: actions/setup-node@v4
-        with:
-          node-version: 22
-          cache: 'npm'
-
-      - name: Install dependencies
-        run: npm install
-
-      - name: Install Claude Code
-        run: npm install -g @anthropic-ai/claude-code
-
-      - name: Check Claude authentication
-        id: claude-auth
-        run: node scripts/check-claude-ai-eval-gate.js
-        env:
-          CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
-
-      - name: Run AI prompt evaluations
-        if: steps.claude-auth.outputs.available == 'true'
-        env:
-          CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
-        run: npm run test:ai-eval
-
-      - name: Upload AI eval responses
-        if: always() && steps.claude-auth.outputs.available == 'true'
-        uses: actions/upload-artifact@v4
-        with:
-          name: ai-eval-responses
-          path: ai-evals/*.responses.md
-          retention-days: 14
diff --git a/package.json b/package.json
@@ -105,7 +105,7 @@
     "prepare": "husky",
     "release": "node release.js",
     "test": "vitest run && echo 'Test complete.' && npm run -s lint && npm run -s typecheck",
-    "test:ai-eval": "riteway ai ai-evals/aidd-review/review-skill-test.sudo --runs 4 --threshold 75 --timeout 600000 --agent claude --color --save-responses",
+    "test:ai-eval": "riteway ai ai-evals/aidd-review/review-skill-test.sudo --runs 1 --threshold 75 --timeout 600000 --agent claude --color --save-responses",
     "test:e2e": "vitest run **/*-e2e.test.js && echo 'E2E tests complete.'",
     "test:unit": "vitest run --exclude '**/*-e2e.test.js' && echo 'Unit tests complete.' && npm run -s lint && npm run -s typecheck",
     "toc": "doctoc README.md",