OpenRouterTeam · mattapperson · May 8, 2026 · perry-the-pr-reviewer · May 8, 2026 · perry-the-pr-reviewer
diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
@@ -0,0 +1,48 @@
+name: Run Skill Evaluations
+
+# Runs the waza skill evaluations on pull requests targeting main and
+# uploads whatever results were produced as a build artifact.
+on:
+  pull_request:
+    branches: [main]
+    paths:
+      - 'evals/**'
+      - 'skills/**'
+      # Config and workflow changes alter how evals run, so re-run on them too.
+      - '.waza.yaml'
+      - '.github/workflows/eval.yml'
+
+permissions:
+  contents: read
+
+# A newer push to the same PR supersedes the in-flight run; cancel the stale
+# one instead of paying for evaluations whose result nobody will read.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  eval:
+    name: Run Evaluations
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install Azure Developer CLI
+        uses: Azure/setup-azd@v2
+      - name: Install waza extension
+        # waza is installed as an azd extension; the alpha extensions feature
+        # is switched on first, then the waza registry is added as a source.
+        run: |
+          azd config set alpha.extensions on
+          azd ext source add -n waza -t url -l https://raw.githubusercontent.com/microsoft/waza/main/registry.json
+          azd ext install microsoft.azd.waza
+      - name: Run evaluations
+        run: azd waza run --output-dir ./results
+      - name: Upload results
+        # always(): keep the results even when the evaluation step failed.
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: eval-results
+          path: ./results
+          retention-days: 30
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,7 @@ build/
 .env.*
 .DS_Store
 .claude/worktrees/
+
+# waza eval outputs and caches (local to each run; not source-of-truth)
+.waza-results/
+.waza-cache/
diff --git a/.waza.yaml b/.waza.yaml
@@ -0,0 +1,31 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/config.schema.json
+# Repository-wide waza configuration (paths, run defaults, cache, server, limits).
+paths:
+  skills: skills
+  evals: evals
+  results: .waza-results  # same directory name as the .waza-results/ entry in .gitignore
+defaults:
+  engine: copilot-sdk
+  model: claude-sonnet-4.6
+  timeout: 300  # presumably seconds; the eval specs set timeout_seconds: 300 — confirm
+  parallel: false
+  workers: 4  # NOTE(review): presumably only relevant when parallel is true — confirm
+  verbose: false
+  sessionLog: false
+cache:
+  enabled: false
+  dir: .waza-cache  # same directory name as the .waza-cache/ entry in .gitignore
+server:
+  port: 3000
+  resultsDir: results/  # NOTE(review): differs from paths.results (.waza-results) — confirm intended
+dev:
+  model: claude-sonnet-4-20250514  # NOTE(review): not the same model id as defaults.model — confirm intended
+  target: medium-high
+  maxIterations: 5
+tokens:
+  warningThreshold: 500
+  fallbackLimit: 1000
+graders:
+  programTimeout: 30  # presumably seconds — TODO confirm
+storage:
+  containerName: waza-results
diff --git a/evals/create-agent-tui/eval.yaml b/evals/create-agent-tui/eval.yaml
@@ -0,0 +1,32 @@
+name: create-agent-tui-eval
+skill: create-agent-tui
+version: '1.0'
+description: |
+  TODO: scaffolding only — tasks are generic stubs. Author real tasks +
+  graders before running baseline. See evals/openrouter-tts for a worked
+  example. Per project memory, this skill's graders need to drive the
+  generated TUI via pilotty, not just assert on file contents.
+config:
+  executor: copilot-sdk
+  model: claude-sonnet-4.6
+  parallel: false
+  trials_per_task: 1
+  timeout_seconds: 300
+metrics:
+  - name: task_completion
+    description: 'Did the skill complete the assigned task?'
+    threshold: 0.8
+    weight: 1.0
+graders:
+  - name: has_output  # placeholder: asserts the output is non-empty
+    type: code
+    config:
+      assertions:
+        - 'len(output) > 0'
+  - name: relevant_content  # placeholder: case-insensitive keyword regex
+    type: text
+    config:
+      regex_match:
+        - '(?i)(explain|describe|analyze|implement)'
+tasks:
+  - 'tasks/*.yaml'  # glob over the YAML task files under tasks/
diff --git a/evals/create-agent-tui/fixtures/sample.py b/evals/create-agent-tui/fixtures/sample.py
@@ -0,0 +1,3 @@
+def hello(name):
+    """Return the greeting "Hello, <name>!" for the given name."""
+    return f"Hello, {name}!"
diff --git a/evals/create-agent-tui/tasks/basic-usage.yaml b/evals/create-agent-tui/tasks/basic-usage.yaml
@@ -0,0 +1,16 @@
+id: basic-usage-001
+name: Basic Usage
+tags:
+  - basic
+  - happy-path
+description: |
+  Test that the skill handles a typical request correctly.
+inputs:
+  files:
+    - path: sample.py  # presumably resolved against this eval's fixtures/ directory
+  prompt: 'Help me with this task'
+expected:
+  outcomes:
+    - type: task_completed
+  output_contains:
+    - 'function'
diff --git a/evals/create-agent-tui/tasks/edge-case.yaml b/evals/create-agent-tui/tasks/edge-case.yaml
@@ -0,0 +1,11 @@
+id: edge-case-001
+name: Edge Case - Empty Input
+tags:
+  - edge-case
+description: |
+  Test that the skill handles edge cases gracefully.
+inputs:
+  prompt: ''  # the empty prompt is the edge case under test
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/create-agent-tui/tasks/should-not-trigger.yaml b/evals/create-agent-tui/tasks/should-not-trigger.yaml
@@ -0,0 +1,13 @@
+id: should-not-trigger-001
+name: Should Not Trigger
+tags:
+  - anti-trigger
+  - negative-test
+description: |
+  Test that the skill does NOT activate on unrelated prompts.
+  This validates trigger specificity.
+inputs:
+  prompt: 'What is the weather today?'  # unrelated to the skill on purpose
+expected:
+  output_not_contains:
+    - 'skill activated'
diff --git a/evals/create-headless-agent/eval.yaml b/evals/create-headless-agent/eval.yaml
@@ -0,0 +1,31 @@
+name: create-headless-agent-eval
+skill: create-headless-agent
+version: '1.0'
+description: |
+  TODO: scaffolding only — tasks are generic stubs. Author real tasks +
+  graders before running baseline. See evals/openrouter-tts for a worked
+  example.
+config:
+  executor: copilot-sdk
+  model: claude-sonnet-4.6
+  parallel: false
+  trials_per_task: 1
+  timeout_seconds: 300
+metrics:
+  - name: task_completion
+    description: 'Did the skill complete the assigned task?'
+    threshold: 0.8
+    weight: 1.0
+graders:
+  - name: has_output  # placeholder: asserts the output is non-empty
+    type: code
+    config:
+      assertions:
+        - 'len(output) > 0'
+  - name: relevant_content  # placeholder: case-insensitive keyword regex
+    type: text
+    config:
+      regex_match:
+        - '(?i)(explain|describe|analyze|implement)'
+tasks:
+  - 'tasks/*.yaml'  # glob over the YAML task files under tasks/
diff --git a/evals/create-headless-agent/fixtures/sample.py b/evals/create-headless-agent/fixtures/sample.py
@@ -0,0 +1,3 @@
+def hello(name):
+    """Return the greeting "Hello, <name>!" for the given name."""
+    return f"Hello, {name}!"
diff --git a/evals/create-headless-agent/tasks/basic-usage.yaml b/evals/create-headless-agent/tasks/basic-usage.yaml
@@ -0,0 +1,16 @@
+id: basic-usage-001
+name: Basic Usage
+tags:
+  - basic
+  - happy-path
+description: |
+  Test that the skill handles a typical request correctly.
+inputs:
+  files:
+    - path: sample.py  # presumably resolved against this eval's fixtures/ directory
+  prompt: 'Help me with this task'
+expected:
+  outcomes:
+    - type: task_completed
+  output_contains:
+    - 'function'
diff --git a/evals/create-headless-agent/tasks/edge-case.yaml b/evals/create-headless-agent/tasks/edge-case.yaml
@@ -0,0 +1,11 @@
+id: edge-case-001
+name: Edge Case - Empty Input
+tags:
+  - edge-case
+description: |
+  Test that the skill handles edge cases gracefully.
+inputs:
+  prompt: ''  # the empty prompt is the edge case under test
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/create-headless-agent/tasks/should-not-trigger.yaml b/evals/create-headless-agent/tasks/should-not-trigger.yaml
@@ -0,0 +1,13 @@
+id: should-not-trigger-001
+name: Should Not Trigger
+tags:
+  - anti-trigger
+  - negative-test
+description: |
+  Test that the skill does NOT activate on unrelated prompts.
+  This validates trigger specificity.
+inputs:
+  prompt: 'What is the weather today?'  # unrelated to the skill on purpose
+expected:
+  output_not_contains:
+    - 'skill activated'
diff --git a/evals/openrouter-agent-migration/eval.yaml b/evals/openrouter-agent-migration/eval.yaml
@@ -0,0 +1,31 @@
+name: openrouter-agent-migration-eval
+skill: openrouter-agent-migration
+version: '1.0'
+description: |
+  TODO: scaffolding only — tasks are generic stubs. Author real tasks +
+  graders before running baseline. See evals/openrouter-tts for a worked
+  example.
+config:
+  executor: copilot-sdk
+  model: claude-sonnet-4.6
+  parallel: false
+  trials_per_task: 1
+  timeout_seconds: 300
+metrics:
+  - name: task_completion
+    description: 'Did the skill complete the assigned task?'
+    threshold: 0.8
+    weight: 1.0
+graders:
+  - name: has_output  # placeholder: asserts the output is non-empty
+    type: code
+    config:
+      assertions:
+        - 'len(output) > 0'
+  - name: relevant_content  # placeholder: case-insensitive keyword regex
+    type: text
+    config:
+      regex_match:
+        - '(?i)(explain|describe|analyze|implement)'
+tasks:
+  - 'tasks/*.yaml'  # glob over the YAML task files under tasks/
diff --git a/evals/openrouter-agent-migration/fixtures/sample.py b/evals/openrouter-agent-migration/fixtures/sample.py
@@ -0,0 +1,3 @@
+def hello(name):
+    """Return the greeting "Hello, <name>!" for the given name."""
+    return f"Hello, {name}!"
diff --git a/evals/openrouter-agent-migration/tasks/basic-usage.yaml b/evals/openrouter-agent-migration/tasks/basic-usage.yaml
@@ -0,0 +1,16 @@
+id: basic-usage-001
+name: Basic Usage
+tags:
+  - basic
+  - happy-path
+description: |
+  Test that the skill handles a typical request correctly.
+inputs:
+  files:
+    - path: sample.py  # presumably resolved against this eval's fixtures/ directory
+  prompt: 'Help me with this task'
+expected:
+  outcomes:
+    - type: task_completed
+  output_contains:
+    - 'function'
diff --git a/evals/openrouter-agent-migration/tasks/edge-case.yaml b/evals/openrouter-agent-migration/tasks/edge-case.yaml
@@ -0,0 +1,11 @@
+id: edge-case-001
+name: Edge Case - Empty Input
+tags:
+  - edge-case
+description: |
+  Test that the skill handles edge cases gracefully.
+inputs:
+  prompt: ''  # the empty prompt is the edge case under test
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-agent-migration/tasks/should-not-trigger.yaml b/evals/openrouter-agent-migration/tasks/should-not-trigger.yaml
@@ -0,0 +1,13 @@
+id: should-not-trigger-001
+name: Should Not Trigger
+tags:
+  - anti-trigger
+  - negative-test
+description: |
+  Test that the skill does NOT activate on unrelated prompts.
+  This validates trigger specificity.
+inputs:
+  prompt: 'What is the weather today?'  # unrelated to the skill on purpose
+expected:
+  output_not_contains:
+    - 'skill activated'
diff --git a/evals/openrouter-images/eval.yaml b/evals/openrouter-images/eval.yaml
@@ -0,0 +1,43 @@
+name: openrouter-images-eval
+description: |
+  Evaluation suite for the openrouter-images skill. Validates that the
+  agent picks the right bundled script (generate.ts for new images,
+  edit.ts for modifications) and invokes it with correct flags.
+skill: openrouter-images
+version: "1.0"
+config:
+  trials_per_task: 1
+  timeout_seconds: 300
+  parallel: false
+  executor: copilot-sdk
+  model: claude-opus-4.7
+metrics:
+  - name: task_completion
+    weight: 1.0
+    threshold: 0.8
+    description: Did the agent pick the right script and flags?
+
+# Copy the skill under test into ~/.agents/skills and install its script
+# dependencies before the run. Paths are derived from the git checkout root
+# and $HOME so the hooks are not tied to one developer's machine.
+hooks:
+  before_run:
+    - command: >-
+        mkdir -p "$HOME/.agents/skills" &&
+        rsync -a --delete
+        "$(git rev-parse --show-toplevel)/skills/openrouter-images/"
+        "$HOME/.agents/skills/openrouter-images/"
+    - command: >-
+        cd "$HOME/.agents/skills/openrouter-images/scripts" &&
+        npm install --silent
+
+# Suite-level sanity check: the output must be longer than 50 characters.
+graders:
+  - type: code
+    name: has_output
+    config:
+      assertions:
+        - "len(output) > 50"
+
+tasks:
+  - "tasks/*.yaml"
diff --git a/evals/openrouter-images/tasks/01-generate-basic.yaml b/evals/openrouter-images/tasks/01-generate-basic.yaml
@@ -0,0 +1,46 @@
+id: generate-basic-001
+name: Generate Basic Image
+description: |
+  Decision tree says "generate from text" → generate.ts. Agent should
+  invoke it, not call the Responses API directly.
+tags:
+  - happy-path
+  - generate
+
+inputs:
+  prompt: |
+    Generate an image of a red panda wearing sunglasses and save it
+    somewhere reasonable.
+
+graders:  # a code check over the bash tool calls, then a prompt-based rubric
+  - type: code
+    name: invoked_generate_script
+    config:
+      language: python
+      assertions:  # each joins the "command" argument of every bash tool call and searches it
+        - '"generate.ts" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+        - '"red panda" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]).lower()'
+
+  - type: prompt
+    name: generate_quality
+    config:
+      model: openai/gpt-chat-latest  # NOTE(review): confirm this model id resolves under the configured executor
+      continue_session: true
+      prompt: |
+        The user asked for a basic image generation. Call
+        set_waza_grade_pass or set_waza_grade_fail once per criterion
+        (3 calls total).
+
+        1) Used generate.ts: invoked the skill's generate.ts script
+           (not edit.ts, not a raw curl to /api/v1/responses).
+
+        2) Correct prompt: passed "a red panda wearing sunglasses" or
+           very close as the script's positional prompt argument.
+
+        3) Reports the result: tells the user the model used and where
+           the image was saved (per the skill's Presenting Results
+           guidance).
+
+expected:
+  outcomes:
+    - type: task_completed