diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
new file mode 100644
index 0000000..a002bd7
--- /dev/null
+++ b/.github/workflows/eval.yml
@@ -0,0 +1,48 @@
+# Runs the waza skill evaluations on pull requests that touch the skills,
+# the eval suites, or the evaluation configuration itself.
+name: Run Skill Evaluations
+
+on:
+  pull_request:
+    branches: [main]
+    paths:
+      - 'evals/**'
+      - 'skills/**'
+      # Config and workflow changes alter evaluation behaviour too.
+      - '.waza.yaml'
+      - '.github/workflows/eval.yml'
+
+permissions:
+  contents: read
+
+# A newer push to the same ref supersedes any evaluation still in flight.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  eval:
+    name: Run Evaluations
+    runs-on: ubuntu-latest
+    # Bound the job so a hung model call cannot hold a runner for the
+    # default 6-hour limit.
+    timeout-minutes: 120
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install Azure Developer CLI
+        uses: Azure/setup-azd@v2
+      - name: Install waza extension
+        run: |
+          azd config set alpha.extensions on
+          azd ext source add -n waza -t url -l https://raw.githubusercontent.com/microsoft/waza/main/registry.json
+          azd ext install microsoft.azd.waza
+      - name: Run evaluations
+        run: azd waza run --output-dir ./results
+      # Upload even when the evaluation step fails so results can be inspected.
+      - name: Upload results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: eval-results
+          path: ./results
+          retention-days: 30
diff --git a/.gitignore b/.gitignore
index 3b9ac34..f955f26 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,7 @@ build/
 .env.*
 .DS_Store
 .claude/worktrees/
+
+# waza eval outputs and caches (local to each run; not source-of-truth)
+.waza-results/
+.waza-cache/
diff --git a/.waza.yaml b/.waza.yaml
new file mode 100644
index 0000000..7dcaaf8
--- /dev/null
+++ b/.waza.yaml
@@ -0,0 +1,31 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/config.schema.json
+
+paths:
+  skills: skills
+  evals: evals
+  results: .waza-results
+defaults:
+  engine: copilot-sdk
+  model: claude-sonnet-4.6
+  timeout: 300
+  parallel: false
+  workers: 4
+  verbose: false
+  sessionLog: false
+cache:
+  enabled: false
+  dir: .waza-cache
+server:
+  port: 3000
+  resultsDir: .waza-results  # same directory as paths.results, so the server reads what runs write
+dev:
+  model: claude-sonnet-4-20250514
+  target: medium-high
+  maxIterations: 5
+tokens:
+  warningThreshold: 500
+  fallbackLimit: 1000
+graders:
+  programTimeout: 30
+storage:
+  containerName: waza-results
diff --git a/evals/create-agent-tui/eval.yaml b/evals/create-agent-tui/eval.yaml
new file mode 100644
index 0000000..de8b6d3
--- /dev/null
+++ b/evals/create-agent-tui/eval.yaml
@@ -0,0 +1,32 @@
+name: create-agent-tui-eval
+description: |
+  TODO: scaffolding only — tasks are generic stubs. Author real tasks +
+  graders before running baseline. See evals/openrouter-tts for a worked
+  example. Per project memory, this skill's graders need to drive the
+  generated TUI via pilotty, not just assert on file contents.
+skill: create-agent-tui
+version: "1.0"
+config:
+  trials_per_task: 1
+  timeout_seconds: 300
+  parallel: false
+  executor: copilot-sdk
+  model: claude-sonnet-4.6
+metrics:
+  - name: task_completion
+    weight: 1.0
+    threshold: 0.8
+    description: Did the skill complete the assigned task?
+graders:
+  - type: code
+    name: has_output
+    config:
+      assertions:
+        - "len(output) > 0"
+  - type: text
+    name: relevant_content
+    config:
+      regex_match:
+        - "(?i)(explain|describe|analyze|implement)"
+tasks:
+  - "tasks/*.yaml"
diff --git a/evals/create-agent-tui/fixtures/sample.py b/evals/create-agent-tui/fixtures/sample.py
new file mode 100644
index 0000000..3f022d1
--- /dev/null
+++ b/evals/create-agent-tui/fixtures/sample.py
@@ -0,0 +1,3 @@
+def hello(name):
+    """Greet someone by name."""
+    return f"Hello, {name}!"
diff --git a/evals/create-agent-tui/tasks/basic-usage.yaml b/evals/create-agent-tui/tasks/basic-usage.yaml
new file mode 100644
index 0000000..a08b1a9
--- /dev/null
+++ b/evals/create-agent-tui/tasks/basic-usage.yaml
@@ -0,0 +1,16 @@
+id: basic-usage-001
+name: Basic Usage
+description: |
+  Test that the skill handles a typical request correctly.
+tags:
+  - basic
+  - happy-path
+inputs:
+  prompt: "Help me with this task"
+  files:
+    - path: sample.py
+expected:
+  output_contains:
+    - "function"
+  outcomes:
+    - type: task_completed
diff --git a/evals/create-agent-tui/tasks/edge-case.yaml b/evals/create-agent-tui/tasks/edge-case.yaml
new file mode 100644
index 0000000..0ff236a
--- /dev/null
+++ b/evals/create-agent-tui/tasks/edge-case.yaml
@@ -0,0 +1,11 @@
+id: edge-case-001
+name: Edge Case - Empty Input
+description: |
+  Test that the skill handles edge cases gracefully.
+tags:
+  - edge-case
+inputs:
+  prompt: ""
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/create-agent-tui/tasks/should-not-trigger.yaml b/evals/create-agent-tui/tasks/should-not-trigger.yaml
new file mode 100644
index 0000000..9d85a0d
--- /dev/null
+++ b/evals/create-agent-tui/tasks/should-not-trigger.yaml
@@ -0,0 +1,13 @@
+id: should-not-trigger-001
+name: Should Not Trigger
+description: |
+  Test that the skill does NOT activate on unrelated prompts.
+  This validates trigger specificity.
+tags:
+  - anti-trigger
+  - negative-test
+inputs:
+  prompt: "What is the weather today?"
+expected:
+  output_not_contains:
+    - "skill activated"
diff --git a/evals/create-headless-agent/eval.yaml b/evals/create-headless-agent/eval.yaml
new file mode 100644
index 0000000..8ed36fd
--- /dev/null
+++ b/evals/create-headless-agent/eval.yaml
@@ -0,0 +1,31 @@
+name: create-headless-agent-eval
+description: |
+  TODO: scaffolding only — tasks are generic stubs. Author real tasks +
+  graders before running baseline. See evals/openrouter-tts for a worked
+  example.
+skill: create-headless-agent
+version: "1.0"
+config:
+  trials_per_task: 1
+  timeout_seconds: 300
+  parallel: false
+  executor: copilot-sdk
+  model: claude-sonnet-4.6
+metrics:
+  - name: task_completion
+    weight: 1.0
+    threshold: 0.8
+    description: Did the skill complete the assigned task?
+graders:
+  - type: code
+    name: has_output
+    config:
+      assertions:
+        - "len(output) > 0"
+  - type: text
+    name: relevant_content
+    config:
+      regex_match:
+        - "(?i)(explain|describe|analyze|implement)"
+tasks:
+  - "tasks/*.yaml"
diff --git a/evals/create-headless-agent/fixtures/sample.py b/evals/create-headless-agent/fixtures/sample.py
new file mode 100644
index 0000000..3f022d1
--- /dev/null
+++ b/evals/create-headless-agent/fixtures/sample.py
@@ -0,0 +1,3 @@
+def hello(name):
+    """Greet someone by name."""
+    return f"Hello, {name}!"
diff --git a/evals/create-headless-agent/tasks/basic-usage.yaml b/evals/create-headless-agent/tasks/basic-usage.yaml
new file mode 100644
index 0000000..a08b1a9
--- /dev/null
+++ b/evals/create-headless-agent/tasks/basic-usage.yaml
@@ -0,0 +1,16 @@
+id: basic-usage-001
+name: Basic Usage
+description: |
+  Test that the skill handles a typical request correctly.
+tags:
+  - basic
+  - happy-path
+inputs:
+  prompt: "Help me with this task"
+  files:
+    - path: sample.py
+expected:
+  output_contains:
+    - "function"
+  outcomes:
+    - type: task_completed
diff --git a/evals/create-headless-agent/tasks/edge-case.yaml b/evals/create-headless-agent/tasks/edge-case.yaml
new file mode 100644
index 0000000..0ff236a
--- /dev/null
+++ b/evals/create-headless-agent/tasks/edge-case.yaml
@@ -0,0 +1,11 @@
+id: edge-case-001
+name: Edge Case - Empty Input
+description: |
+  Test that the skill handles edge cases gracefully.
+tags:
+  - edge-case
+inputs:
+  prompt: ""
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/create-headless-agent/tasks/should-not-trigger.yaml b/evals/create-headless-agent/tasks/should-not-trigger.yaml
new file mode 100644
index 0000000..9d85a0d
--- /dev/null
+++ b/evals/create-headless-agent/tasks/should-not-trigger.yaml
@@ -0,0 +1,13 @@
+id: should-not-trigger-001
+name: Should Not Trigger
+description: |
+  Test that the skill does NOT activate on unrelated prompts.
+  This validates trigger specificity.
+tags:
+  - anti-trigger
+  - negative-test
+inputs:
+  prompt: "What is the weather today?"
+expected:
+  output_not_contains:
+    - "skill activated"
diff --git a/evals/openrouter-agent-migration/eval.yaml b/evals/openrouter-agent-migration/eval.yaml
new file mode 100644
index 0000000..72be69b
--- /dev/null
+++ b/evals/openrouter-agent-migration/eval.yaml
@@ -0,0 +1,31 @@
+name: openrouter-agent-migration-eval
+description: |
+  TODO: scaffolding only — tasks are generic stubs. Author real tasks +
+  graders before running baseline. See evals/openrouter-tts for a worked
+  example.
+skill: openrouter-agent-migration
+version: "1.0"
+config:
+  trials_per_task: 1
+  timeout_seconds: 300
+  parallel: false
+  executor: copilot-sdk
+  model: claude-sonnet-4.6
+metrics:
+  - name: task_completion
+    weight: 1.0
+    threshold: 0.8
+    description: Did the skill complete the assigned task?
+graders:
+  - type: code
+    name: has_output
+    config:
+      assertions:
+        - "len(output) > 0"
+  - type: text
+    name: relevant_content
+    config:
+      regex_match:
+        - "(?i)(explain|describe|analyze|implement)"
+tasks:
+  - "tasks/*.yaml"
diff --git a/evals/openrouter-agent-migration/fixtures/sample.py b/evals/openrouter-agent-migration/fixtures/sample.py
new file mode 100644
index 0000000..3f022d1
--- /dev/null
+++ b/evals/openrouter-agent-migration/fixtures/sample.py
@@ -0,0 +1,3 @@
+def hello(name):
+    """Greet someone by name."""
+    return f"Hello, {name}!"
diff --git a/evals/openrouter-agent-migration/tasks/basic-usage.yaml b/evals/openrouter-agent-migration/tasks/basic-usage.yaml
new file mode 100644
index 0000000..a08b1a9
--- /dev/null
+++ b/evals/openrouter-agent-migration/tasks/basic-usage.yaml
@@ -0,0 +1,16 @@
+id: basic-usage-001
+name: Basic Usage
+description: |
+  Test that the skill handles a typical request correctly.
+tags:
+  - basic
+  - happy-path
+inputs:
+  prompt: "Help me with this task"
+  files:
+    - path: sample.py
+expected:
+  output_contains:
+    - "function"
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-agent-migration/tasks/edge-case.yaml b/evals/openrouter-agent-migration/tasks/edge-case.yaml
new file mode 100644
index 0000000..0ff236a
--- /dev/null
+++ b/evals/openrouter-agent-migration/tasks/edge-case.yaml
@@ -0,0 +1,11 @@
+id: edge-case-001
+name: Edge Case - Empty Input
+description: |
+  Test that the skill handles edge cases gracefully.
+tags:
+  - edge-case
+inputs:
+  prompt: ""
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-agent-migration/tasks/should-not-trigger.yaml b/evals/openrouter-agent-migration/tasks/should-not-trigger.yaml
new file mode 100644
index 0000000..9d85a0d
--- /dev/null
+++ b/evals/openrouter-agent-migration/tasks/should-not-trigger.yaml
@@ -0,0 +1,13 @@
+id: should-not-trigger-001
+name: Should Not Trigger
+description: |
+  Test that the skill does NOT activate on unrelated prompts.
+  This validates trigger specificity.
+tags:
+  - anti-trigger
+  - negative-test
+inputs:
+  prompt: "What is the weather today?"
+expected:
+  output_not_contains:
+    - "skill activated"
diff --git a/evals/openrouter-images/eval.yaml b/evals/openrouter-images/eval.yaml
new file mode 100644
index 0000000..9e5f8b1
--- /dev/null
+++ b/evals/openrouter-images/eval.yaml
@@ -0,0 +1,33 @@
+name: openrouter-images-eval
+description: |
+  Evaluation suite for the openrouter-images skill. Validates that the
+  agent picks the right bundled script (generate.ts for new images,
+  edit.ts for modifications) and invokes it with correct flags.
+skill: openrouter-images
+version: "1.0"
+config:
+  trials_per_task: 1
+  timeout_seconds: 300
+  parallel: false
+  executor: copilot-sdk
+  model: claude-opus-4.7
+metrics:
+  - name: task_completion
+    weight: 1.0
+    threshold: 0.8
+    description: Did the agent pick the right script and flags?
+
+hooks:  # copy this checkout's skill into $HOME/.agents/skills, then install its script deps
+  before_run:  # sources resolve from the git root and targets from $HOME, so nothing is machine-specific
+    - command: 'mkdir -p "$HOME/.agents/skills" && rsync -a --delete "$(git rev-parse --show-toplevel)/skills/openrouter-images/" "$HOME/.agents/skills/openrouter-images/"'
+    - command: 'cd "$HOME/.agents/skills/openrouter-images/scripts" && npm install --silent'
+
+graders:
+  - type: code
+    name: has_output
+    config:
+      assertions:
+        - "len(output) > 50"
+
+tasks:
+  - "tasks/*.yaml"
diff --git a/evals/openrouter-images/tasks/01-generate-basic.yaml b/evals/openrouter-images/tasks/01-generate-basic.yaml
new file mode 100644
index 0000000..8a51fce
--- /dev/null
+++ b/evals/openrouter-images/tasks/01-generate-basic.yaml
@@ -0,0 +1,46 @@
+id: generate-basic-001
+name: Generate Basic Image
+description: |
+  Decision tree says "generate from text" → generate.ts. Agent should
+  invoke it, not call the Responses API directly.
+tags:
+  - happy-path
+  - generate
+
+inputs:
+  prompt: |
+    Generate an image of a red panda wearing sunglasses and save it
+    somewhere reasonable.
+
+graders:
+  - type: code
+    name: invoked_generate_script
+    config:
+      language: python
+      assertions:
+        - '"generate.ts" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+        - '"red panda" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]).lower()'
+
+  - type: prompt
+    name: generate_quality
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true
+      prompt: |
+        The user asked for a basic image generation. Call
+        set_waza_grade_pass or set_waza_grade_fail once per criterion
+        (3 calls total).
+
+        1) Used generate.ts: invoked the skill's generate.ts script
+           (not edit.ts, not a raw curl to /api/v1/responses).
+
+        2) Correct prompt: passed "a red panda wearing sunglasses" or
+           very close as the script's positional prompt argument.
+
+        3) Reports the result: tells the user the model used and where
+           the image was saved (per the skill's Presenting Results
+           guidance).
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-images/tasks/02-generate-with-aspect-ratio.yaml b/evals/openrouter-images/tasks/02-generate-with-aspect-ratio.yaml
new file mode 100644
index 0000000..197cb98
--- /dev/null
+++ b/evals/openrouter-images/tasks/02-generate-with-aspect-ratio.yaml
@@ -0,0 +1,45 @@
+id: generate-aspect-ratio-001
+name: Generate With Aspect Ratio
+description: |
+  User specifies a wide / landscape image. Agent should pass
+  --aspect-ratio 16:9 (or similar) to generate.ts.
+tags:
+  - happy-path
+  - generate
+  - aspect-ratio
+
+inputs:
+  prompt: |
+    Make a wide landscape image of a futuristic city at night, 16:9.
+
+graders:
+  - type: code
+    name: aspect_ratio_flag
+    config:
+      language: python
+      assertions:
+        - '"generate.ts" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+        - '"--aspect-ratio" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]) or "--aspect_ratio" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+        - '"16:9" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+
+  - type: prompt
+    name: aspect_ratio_usage
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true
+      prompt: |
+        The user wanted a 16:9 landscape cityscape. Call
+        set_waza_grade_pass or set_waza_grade_fail once per criterion
+        (2 calls total).
+
+        1) Used flag correctly: passed --aspect-ratio 16:9 to generate.ts.
+           Does NOT hardcode the ratio into the prompt text or resort to
+           a different approach.
+
+        2) Prompt preserved: the prompt positional argument contains
+           "city" / "futuristic" / "night" (user's actual request), not
+           a rewritten description.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-images/tasks/03-edit-image.yaml b/evals/openrouter-images/tasks/03-edit-image.yaml
new file mode 100644
index 0000000..3710f0b
--- /dev/null
+++ b/evals/openrouter-images/tasks/03-edit-image.yaml
@@ -0,0 +1,54 @@
+id: edit-image-001
+name: Edit Existing Image
+description: |
+  User wants to modify an existing image → edit.ts, not generate.ts.
+  Common failure mode: agent uses generate.ts with the edit description.
+tags:
+  - happy-path
+  - edit
+
+inputs:
+  prompt: |
+    I have a file called photo.png. Edit it so the sky is purple.
+
+graders:
+  - type: code
+    name: uses_edit_script
+    config:
+      language: python
+      assertions:  # edit.ts must appear in a bash command and generate.ts must not (the failure mode this task targets)
+        - '"edit.ts" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+        - '"generate.ts" not in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+
+  - type: code
+    name: passes_photo_path
+    config:
+      language: python
+      assertions:
+        - '"photo.png" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+        - '"purple" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]).lower() or "sky" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]).lower()'
+
+  - type: prompt
+    name: edit_quality
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true
+      prompt: |
+        The user asked to edit photo.png (sky → purple). Call
+        set_waza_grade_pass or set_waza_grade_fail once per criterion
+        (3 calls total).
+
+        1) Correct script: used edit.ts (not generate.ts, not a raw API
+           call), per the decision tree "edit existing image → edit.ts".
+
+        2) Passed source path and prompt: first positional arg was
+           photo.png, second positional arg was a prompt about making
+           the sky purple.
+
+        3) Reports result: tells the user the output location and
+           references the source (photo.png), per the skill's presenting
+           guidance for edit operations.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-images/tasks/04-anti-trigger-image-theory.yaml b/evals/openrouter-images/tasks/04-anti-trigger-image-theory.yaml
new file mode 100644
index 0000000..8831530
--- /dev/null
+++ b/evals/openrouter-images/tasks/04-anti-trigger-image-theory.yaml
@@ -0,0 +1,45 @@
+id: anti-trigger-image-theory-001
+name: Anti-Trigger - Image Gen Theory
+description: |
+  Conceptual question about image generation. Should not trigger the
+  bundled scripts.
+tags:
+  - anti-trigger
+  - negative-test
+
+inputs:
+  prompt: |
+    How do diffusion models generate images from text? Explain the basic
+    idea — I'm curious about the technique, not looking to generate
+    anything.
+
+graders:
+  - type: code
+    name: no_scripts_invoked
+    config:
+      language: python
+      assertions:
+        - '"generate.ts" not in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+        - '"edit.ts" not in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+
+  - type: prompt
+    name: stays_educational
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true
+      prompt: |
+        The user asked a conceptual question about diffusion models for
+        image generation. Call set_waza_grade_pass or set_waza_grade_fail
+        once per criterion (2 calls total).
+
+        1) Answers the question: explains the diffusion process (noise →
+           image via iterative denoising guided by a text encoder) at a
+           reasonable depth for someone curious.
+
+        2) Does NOT generate: does not invoke generate.ts or any other
+           script that produces an actual image. The user explicitly
+           said "not looking to generate anything".
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-images/tasks/05-indirect-trigger-blog-hero.yaml b/evals/openrouter-images/tasks/05-indirect-trigger-blog-hero.yaml
new file mode 100644
index 0000000..4850b53
--- /dev/null
+++ b/evals/openrouter-images/tasks/05-indirect-trigger-blog-hero.yaml
@@ -0,0 +1,51 @@
+id: indirect-trigger-blog-hero-001
+name: Indirect Trigger - Blog Hero Image
+description: |
+  User is writing a blog post and needs a hero image — doesn't say
+  "generate an image". The skill should activate and generate.ts should
+  be invoked.
+tags:
+  - happy-path
+  - indirect-trigger
+
+inputs:
+  prompt: |
+    I'm writing a blog post about remote work and need a hero image for
+    the top of the page. Something that captures "working from a quiet
+    home office with good natural light".
+
+graders:
+  - type: code
+    name: invoked_generate
+    config:
+      language: python
+      assertions:
+        - '"generate.ts" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+        - '"home office" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]).lower() or "remote work" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]).lower()'
+
+  - type: prompt
+    name: indirect_assembly
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true
+      prompt: |
+        The user wants a blog hero image. Call set_waza_grade_pass or
+        set_waza_grade_fail once per criterion (3 calls total).
+
+        1) Recognizes this as image-gen: loads the openrouter-images
+           skill and uses generate.ts, even though the user didn't
+           directly say "generate an image".
+
+        2) Prompt faithfully captures the user's intent: the script is
+           invoked with a prompt referencing home-office / quiet /
+           natural light — not a generic placeholder or the user's full
+           sentence verbatim.
+
+        3) Aspect ratio appropriate for a hero image: passes
+           --aspect-ratio with a wide ratio (16:9, 21:9, 3:1, etc.)
+           typical for blog hero banners. Square (1:1) or portrait
+           ratios do not satisfy this criterion.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-models/eval.yaml b/evals/openrouter-models/eval.yaml
new file mode 100644
index 0000000..5b3b70d
--- /dev/null
+++ b/evals/openrouter-models/eval.yaml
@@ -0,0 +1,50 @@
+name: openrouter-models-eval
+description: |
+  Evaluation suite for the openrouter-models skill. Validates that the agent
+  picks the correct script (list-models / search-models / resolve-model /
+  compare-models / get-endpoints) and invokes it with the correct flags, then
+  formats the result per the skill's "Presenting Results" guidance.
+
+  Graders are PER-TASK (task-level graders are additive).
+skill: openrouter-models
+version: "1.0"
+config:
+  trials_per_task: 1
+  timeout_seconds: 300
+  parallel: false
+  executor: copilot-sdk
+  model: claude-opus-4.7
+  # Force the agent to load the worktree copy of this skill, not whatever
+  # stale copy lives in ~/.claude/skills/ from a prior plugin install.
+  skill_directories:
+    - "skills"
+metrics:
+  - name: task_completion
+    weight: 1.0
+    threshold: 0.8
+    description: Did the agent pick the right script and produce a skill-faithful answer?
+
+# Install the skill's script deps once before any tasks run, so each task
+# doesn't pay a 20s+ npm install and we don't false-fail on missing tsx.
+hooks:
+  before_run:
+    # Sync both skills the agent may route to for model-related coding
+    # tasks: the models skill itself, and openrouter-typescript-sdk (which
+    # carries the cross-skill cue pointing at openrouter-models). Sources
+    # are resolved from the git root (working_directory defaults to the
+    # eval's own directory) and targets from $HOME, so no path is machine-specific.
+    - command: 'mkdir -p "$HOME/.agents/skills" && rsync -a --delete "$(git rev-parse --show-toplevel)/skills/openrouter-models/" "$HOME/.agents/skills/openrouter-models/"'
+    - command: 'mkdir -p "$HOME/.agents/skills/openrouter-typescript-sdk" && rsync -a --delete "$(git rev-parse --show-toplevel)/skills/openrouter-typescript-sdk/" "$HOME/.agents/skills/openrouter-typescript-sdk/"'
+    # Install script deps once per run inside the synced location.
+    - command: 'cd "$HOME/.agents/skills/openrouter-models/scripts" && npm install --silent'
+
+graders:
+  # Universal: non-empty response.
+  - type: code
+    name: has_output
+    config:
+      assertions:
+        - "len(output) > 50"
+
+tasks:
+  - "tasks/*.yaml"
diff --git a/evals/openrouter-models/tasks/01-cheapest-models.yaml b/evals/openrouter-models/tasks/01-cheapest-models.yaml
new file mode 100644
index 0000000..19c427c
--- /dev/null
+++ b/evals/openrouter-models/tasks/01-cheapest-models.yaml
@@ -0,0 +1,57 @@
+id: cheapest-models-001
+name: Find Cheapest Models
+description: |
+  Tests the decision-tree mapping "cheapest models" → list-models.ts --sort price.
+  Also validates the skill's "Presenting Results" guidance: convert pricing to
+  per-million-tokens format.
+tags:
+  - happy-path
+  - list-models
+  - sort
+
+inputs:
+  prompt: |
+    What are the five cheapest models available on OpenRouter right now?
+
+graders:
+  # Join all bash commands into one searchable string for substring checks.
+  # tool_calls[i] has keys: name, arguments, result, success.
+  - type: code
+    name: invoked_list_models_sort_price
+    config:
+      language: python
+      assertions:
+        - '"list-models.ts" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+        - '"--sort price" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]) or "--sort=price" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+
+  - type: code
+    name: per_million_tokens_formatting
+    config:
+      language: python
+      assertions:  # NOTE(review): "/m" is a bare substring test, so a model id such as "mistralai/mistral-7b" also satisfies it — consider tightening
+        - '"/m" in output.lower() or "per million" in output.lower() or "/1m" in output.lower() or "1,000,000" in output'
+
+  - type: prompt
+    name: answer_quality
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true
+      prompt: |
+        The user asked for the five cheapest OpenRouter models. Call
+        set_waza_grade_pass or set_waza_grade_fail once per criterion
+        (3 calls total).
+
+        1) Correct tool chosen: the agent ran list-models.ts with a
+           price-sort flag (not a wrong script like compare-models,
+           not a guess without running the script).
+
+        2) Five models returned: the response names about five specific
+           models with their provider/id.
+
+        3) Presentation correct: pricing is shown in per-million-tokens
+           format ("$X/M input", "$X per million", etc.) rather than raw
+           per-token values, per the skill's Presenting Results guidance.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-models/tasks/02-compare-two-models.yaml b/evals/openrouter-models/tasks/02-compare-two-models.yaml
new file mode 100644
index 0000000..d93a779
--- /dev/null
+++ b/evals/openrouter-models/tasks/02-compare-two-models.yaml
@@ -0,0 +1,55 @@
+id: compare-models-001
+name: Compare Two Models
+description: |
+  Tests the decision-tree mapping "compare X vs Y" → compare-models.ts.
+  Skill guidance requires markdown table for comparisons.
+tags:
+  - happy-path
+  - compare-models
+
+inputs:
+  prompt: |
+    How do Claude Sonnet 4 and GPT-4o compare on pricing and context length?
+
+graders:
+  - type: code
+    name: invoked_compare_models
+    config:
+      language: python
+      assertions:
+        - '"compare-models.ts" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+        - '"claude" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]).lower() and ("sonnet" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]).lower())'
+        - '"gpt-4o" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]).lower()'
+
+  - type: code
+    name: markdown_table_output
+    config:
+      language: python
+      assertions:
+        - '"|" in output and "---" in output'
+
+  - type: prompt
+    name: comparison_quality
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true
+      prompt: |
+        The user asked to compare Claude Sonnet 4 and GPT-4o on pricing and
+        context length. Call set_waza_grade_pass or set_waza_grade_fail once
+        per criterion (3 calls total).
+
+        1) Correct tool: the agent ran compare-models.ts with both models
+           as arguments (not list-models + manual diff, not a single-model
+           lookup).
+
+        2) Both dimensions addressed: the response covers pricing AND
+           context length for both models.
+
+        3) Tabular presentation: results shown in a markdown table with
+           models as columns (or a clearly aligned comparison format),
+           per the skill's Presenting Results guidance. Pricing in
+           per-million-tokens format.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-models/tasks/03-resolve-informal-name.yaml b/evals/openrouter-models/tasks/03-resolve-informal-name.yaml
new file mode 100644
index 0000000..2839e10
--- /dev/null
+++ b/evals/openrouter-models/tasks/03-resolve-informal-name.yaml
@@ -0,0 +1,57 @@
+id: resolve-informal-001
+name: Resolve Informal Model Name
+description: |
+  Tests the two-step workflow: resolve-model.ts for an informal name, then
+  feed the resolved ID into another script. The skill's decision tree is
+  explicit that informal names go through resolve first.
+tags:
+  - happy-path
+  - resolve-model
+  - two-step
+
+inputs:
+  prompt: |
+    I want to compare "Claude Opus" against "the latest GPT" on context
+    length and pricing. Can you do that?
+
+graders:
+  - type: code
+    name: invoked_resolve_model
+    config:
+      language: python
+      assertions:
+        - '"resolve-model.ts" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+
+  - type: code
+    name: invoked_compare_after_resolve
+    config:
+      language: python
+      assertions:
+        - '"compare-models.ts" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+
+  - type: prompt
+    name: two_step_workflow
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true
+      prompt: |
+        The user gave informal model names ("Claude Opus", "the latest GPT").
+        Call set_waza_grade_pass or set_waza_grade_fail once per criterion
+        (3 calls total).
+
+        1) Resolved before using: the agent called resolve-model.ts for the
+           informal names BEFORE invoking compare-models.ts. Did not guess
+           exact IDs.
+
+        2) Confidence-aware: if either resolution returned medium/low
+           confidence the agent either confirmed with the user or noted
+           the ambiguity. (If both came back high, a direct comparison is
+           fine and this criterion still passes.)
+
+        3) Comparison delivered: the response ends with an actual comparison
+           of the two resolved models on pricing AND context length,
+           not just a list of candidate model IDs.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-models/tasks/04-fastest-provider.yaml b/evals/openrouter-models/tasks/04-fastest-provider.yaml
new file mode 100644
index 0000000..3286816
--- /dev/null
+++ b/evals/openrouter-models/tasks/04-fastest-provider.yaml
@@ -0,0 +1,49 @@
+id: fastest-provider-001
+name: Fastest Provider for a Model
+description: |
+  Tests get-endpoints.ts for provider-performance questions. Skill guidance
+  says to highlight fastest (lowest p50 latency) and most reliable (highest
+  uptime).
+tags:
+  - happy-path
+  - get-endpoints
+  - provider-performance
+
+inputs:
+  prompt: |
+    Which provider is currently fastest for anthropic/claude-sonnet-4?
+
+graders:
+  - type: code
+    name: invoked_get_endpoints
+    config:
+      language: python
+      assertions:
+        - '"get-endpoints.ts" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+        - '"anthropic/claude-sonnet-4" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+        - '"--sort throughput" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]) or "--sort=throughput" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]) or "--sort latency" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]) or "--sort=latency" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+
+  - type: prompt
+    name: provider_answer_quality
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true
+      prompt: |
+        The user asked which provider is fastest for claude-sonnet-4. Call
+        set_waza_grade_pass or set_waza_grade_fail once per criterion
+        (3 calls total).
+
+        1) Correct tool: agent ran get-endpoints.ts for
+           anthropic/claude-sonnet-4 (not list-models, not a guess).
+
+        2) Names a specific provider: response identifies a specific
+           provider (Anthropic, Google Vertex, AWS Bedrock, etc.) as the
+           fastest, with supporting numbers (p50 latency or throughput).
+
+        3) Includes context: response mentions uptime or reliability for
+           the recommended provider, per the skill's guidance to call out
+           reliable providers.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-models/tasks/05-anti-trigger-weather.yaml b/evals/openrouter-models/tasks/05-anti-trigger-weather.yaml
new file mode 100644
index 0000000..266945c
--- /dev/null
+++ b/evals/openrouter-models/tasks/05-anti-trigger-weather.yaml
@@ -0,0 +1,56 @@
+id: anti-trigger-weather-001
+name: Anti-Trigger - Unrelated Question
+description: |
+  Negative test: unrelated prompt. The skill should not activate, and the
+  response must not invoke any of the bundled scripts or drag OpenRouter
+  model commentary into a weather answer.
+tags:
+  - anti-trigger
+  - negative-test
+  - trigger-specificity
+
+inputs:
+  prompt: |
+    What's the weather going to be in San Francisco tomorrow?
+
+graders:
+  - type: code
+    name: no_skill_scripts_invoked
+    config:
+      language: python
+      assertions:
+        - '"list-models.ts" not in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+        - '"search-models.ts" not in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+        - '"resolve-model.ts" not in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+        - '"compare-models.ts" not in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+        - '"get-endpoints.ts" not in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+
+  - type: code
+    name: no_openrouter_contamination
+    config:
+      language: python
+      assertions:
+        - "output.lower().count('openrouter') <= 1"
+        - "'anthropic/claude' not in output.lower()"
+        - "'openai/gpt' not in output.lower()"
+
+  - type: prompt
+    name: stays_on_topic
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true
+      prompt: |
+        The user asked about San Francisco weather. Call set_waza_grade_pass
+        or set_waza_grade_fail once per criterion (2 calls total).
+
+        1) Addresses weather: the response is about weather or politely
+           explains the agent can't provide live weather and suggests an
+           alternative (weather.com, phone app, checking the forecast).
+
+        2) No OpenRouter contamination: the response does NOT invoke or
+           reference the openrouter-models scripts, does NOT list OpenRouter
+           models, does NOT discuss AI model pricing or providers.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-models/tasks/06-mentioned-model-in-task.yaml b/evals/openrouter-models/tasks/06-mentioned-model-in-task.yaml
new file mode 100644
index 0000000..50aac57
--- /dev/null
+++ b/evals/openrouter-models/tasks/06-mentioned-model-in-task.yaml
@@ -0,0 +1,67 @@
+id: mentioned-model-in-task-001
+name: Model Mentioned In Larger Task
+description: |
+  Tests the subtler trigger: the user doesn't ask about models directly —
+  they NAME a model (informally, partial, or ambiguous) as part of a larger
+  coding task. The skill should activate and resolve the name to an exact
+  OpenRouter ID before proceeding, rather than guessing or using the
+  informal string verbatim.
+tags:
+  - happy-path
+  - resolve-model
+  - indirect-trigger
+
+inputs:
+  prompt: |
+    Write a small Node.js script that sends a prompt to OpenRouter and
+    prints the response. Use GLM as the model.
+
+graders:
+  - type: code
+    name: invoked_resolve_model
+    config:
+      language: python
+      assertions:
+        - '"resolve-model.ts" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])'
+        - '"glm" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]).lower()'
+
+  - type: code
+    name: resolved_id_used_in_output
+    config:
+      language: python
+      assertions:
+        - '"glm" in output.lower()'  # the model family is mentioned at all
+        - '"/glm" in output.lower()'  # vendor-qualified ID; a bare "/" would match any URL
+
+  - type: prompt
+    name: silent_resolve_then_use
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true
+      prompt: |
+        The user asked for a Node.js OpenRouter script "using GLM as the
+        model." Call set_waza_grade_pass or set_waza_grade_fail once per
+        criterion (3 calls total).
+
+        1) Resolved, didn't guess: the agent invoked resolve-model.ts for
+           "glm" before writing code. It did NOT hardcode a made-up string
+           like "glm" or "glm-4" as the model ID — it used the resolver to
+           get an exact OpenRouter-style "vendor/model-id" and put THAT
+           string in the code.
+
+        2) Task completed: the response includes an actual Node.js script
+           that sends a prompt to OpenRouter (fetch to /api/v1/* or SDK
+           call) and prints the response. A snippet that only resolves the
+           model ID without producing the script does NOT pass this
+           criterion.
+
+        3) Noted resolution confidence (soft): if the resolver returned
+           medium/low confidence the agent either picked the best match and
+           mentioned it, OR asked the user to confirm. If the resolver
+           returned high confidence and the agent just used it silently,
+           that also passes this criterion — confirm-or-use is the right
+           behavior either way.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-oauth/eval.yaml b/evals/openrouter-oauth/eval.yaml
new file mode 100644
index 0000000..6c01974
--- /dev/null
+++ b/evals/openrouter-oauth/eval.yaml
@@ -0,0 +1,33 @@
+name: openrouter-oauth-eval
+description: |
+  Evaluation suite for the openrouter-oauth skill. Validates that the agent,
+  when the skill is loaded, produces responses that actually follow the
+  documented PKCE flow and surface skill-specific guarantees (sessionStorage
+  for verifier, S256 challenge, openrouter.ai/auth, callback guard, etc.).
+
+  Graders are defined PER-TASK (task-level graders are additive in waza) —
+  only universal checks live here at the eval level.
+skill: openrouter-oauth
+version: "1.0"
+config:
+  trials_per_task: 1
+  timeout_seconds: 300
+  parallel: false
+  executor: copilot-sdk
+  model: claude-opus-4.7
+metrics:
+  - name: task_completion
+    weight: 1.0
+    threshold: 0.8
+    description: Did the skill produce a correct, skill-faithful response?
+
+graders:
+  # Universal: every task must produce a non-empty response.
+  - type: code
+    name: has_output
+    config:
+      assertions:
+        - "len(output) > 50"
+
+tasks:
+  - "tasks/*.yaml"
diff --git a/evals/openrouter-oauth/tasks/01-happy-path-react.yaml b/evals/openrouter-oauth/tasks/01-happy-path-react.yaml
new file mode 100644
index 0000000..337357c
--- /dev/null
+++ b/evals/openrouter-oauth/tasks/01-happy-path-react.yaml
@@ -0,0 +1,70 @@
+id: happy-path-react-001
+name: Happy Path - React App
+description: |
+  Standard request to add Sign In with OpenRouter to a React app.
+  Exercises the full PKCE flow and the sign-in button section of the skill.
+tags:
+  - happy-path
+  - react
+  - full-flow
+
+inputs:
+  prompt: |
+    I have a React app and I want to add "Sign in with OpenRouter" so users
+    can authorize and my app can make inference calls with their API key.
+    Walk me through the implementation and show me the code I need.
+
+graders:
+  # Full PKCE rubric + structural claims — this task is expected to cover
+  # the whole skill surface.
+  - type: code
+    name: pkce_technical_claims
+    config:
+      language: python
+      assertions:
+        - "'sessionstorage' in output.lower()"
+        - "'s256' in output.lower()"  # case-insensitive, so it also covers 'S256'
+        - "'openrouter.ai/auth' in output.lower() or 'openrouter.ai/api/v1/auth/keys' in output.lower()"
+        - "'verifier' in output.lower()"  # substring also matches code_verifier
+        - "'challenge' in output.lower()"  # substring also matches code_challenge
+
+  - type: prompt
+    name: skill_faithful_rubric
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true
+      prompt: |
+        You are grading an agent's response to a user who asked for
+        "Sign in with OpenRouter" in a React app. The OpenRouter OAuth flow
+        is documented as PKCE with no client registration and no backend.
+
+        Evaluate these four criteria INDEPENDENTLY. Call set_waza_grade_pass
+        or set_waza_grade_fail once for EACH criterion (4 total calls).
+
+        1) Correct PKCE initiation: response describes generating a
+           code_verifier (32 random bytes, base64url encoded), computing an
+           S256 challenge, and redirecting to https://openrouter.ai/auth with
+           callback_url, code_challenge, and code_challenge_method=S256.
+           No invented client_id, client_secret, or app registration.
+
+        2) Correct storage model: code_verifier in sessionStorage (NOT
+           localStorage), final API key in localStorage, and a guard
+           (hasOAuthCallbackPending or equivalent) that checks the verifier
+           exists before processing ?code= params.
+
+        3) Correct key exchange: POST to
+           https://openrouter.ai/api/v1/auth/keys with JSON body containing
+           `code`, `code_verifier`, and `code_challenge_method: "S256"`, then
+           read `key` from the JSON response. No backend proxy claimed.
+
+        4) No fabricated details: does not introduce client_id/secret/app
+           registration/backend as requirements. Does not require installing
+           the OpenRouter SDK specifically for auth (the flow is plain fetch).
+
+        For each criterion: set_waza_grade_pass with description="criterion N: <name>"
+        and a reason if satisfied, otherwise set_waza_grade_fail with the same
+        description and a reason explaining what's missing.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-oauth/tasks/02-programmatic-key.yaml b/evals/openrouter-oauth/tasks/02-programmatic-key.yaml
new file mode 100644
index 0000000..bd181b3
--- /dev/null
+++ b/evals/openrouter-oauth/tasks/02-programmatic-key.yaml
@@ -0,0 +1,55 @@
+id: programmatic-key-001
+name: Programmatic API Key (No Button)
+description: |
+  User wants the API key but isn't building a UI-first flow.
+  The skill should provide the PKCE flow and skip the button/variants section.
+tags:
+  - happy-path
+  - programmatic
+  - no-ui
+
+inputs:
+  prompt: |
+    I need to obtain an OpenRouter API key from the browser for my app
+    programmatically — I'm not building a sign-in button component, just need
+    the OAuth flow to get back a usable key I can store. What do I do?
+
+graders:
+  # PKCE protocol must be complete; skill's "no client registration" claim matters.
+  - type: code
+    name: pkce_technical_claims
+    config:
+      language: python
+      assertions:
+        - "'sessionstorage' in output.lower()"
+        - "'s256' in output.lower()"  # case-insensitive, so it also covers 'S256'
+        - "'openrouter.ai/auth' in output.lower() or 'openrouter.ai/api/v1/auth/keys' in output.lower()"
+        - "'verifier' in output.lower()"  # substring also matches code_verifier
+
+  - type: prompt
+    name: no_button_focus
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true
+      prompt: |
+        The user explicitly asked for the OAuth flow only, no button UI.
+        Evaluate these criteria. Call set_waza_grade_pass or set_waza_grade_fail
+        once per criterion (3 total calls).
+
+        1) PKCE flow complete: covers verifier generation, S256 challenge,
+           redirect to https://openrouter.ai/auth, POST to
+           https://openrouter.ai/api/v1/auth/keys for the exchange, and
+           reading `key` from the response.
+
+        2) No unnecessary UI content: does NOT push a button component with
+           logo SVG, multiple style variants, or Tailwind class tables.
+           Button styling / variant guidance is explicitly out of scope for
+           this request. A minimal "call initiateOAuth() on click" mention is
+           fine; a full button taxonomy is not.
+
+        3) No fabricated requirements: does not claim a client_id, secret,
+           app registration, or backend is needed.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-oauth/tasks/03-callback-guard.yaml b/evals/openrouter-oauth/tasks/03-callback-guard.yaml
new file mode 100644
index 0000000..f2b5afe
--- /dev/null
+++ b/evals/openrouter-oauth/tasks/03-callback-guard.yaml
@@ -0,0 +1,61 @@
+id: callback-guard-001
+name: Callback Guard for Ambiguous ?code= Params
+description: |
+  Tests whether the skill surfaces the callback-guard requirement
+  (hasOAuthCallbackPending / verifier presence check) when the user's app
+  already uses ?code= for other things. This is a concrete, security-relevant
+  detail the skill documents — the task is narrow by design, so graders
+  should focus on guard handling, not on restating the full protocol.
+tags:
+  - edge-case
+  - security
+  - guard
+
+inputs:
+  prompt: |
+    My existing app already uses ?code= query parameters for a different
+    feature (a referral tracking thing). I want to add Sign In with
+    OpenRouter without breaking that. How do I safely handle the OAuth
+    callback so I only consume codes that belong to the OpenRouter flow?
+
+graders:
+  # Narrow, focused graders — the guard mechanism is the point of this task.
+  - type: code
+    name: guard_claims
+    config:
+      language: python
+      assertions:
+        # The skill-native guard function, or at minimum the underlying check.
+        - "'hasoauthcallbackpending' in output.lower() or 'sessionstorage.getitem' in output.lower()"
+        # Must mention sessionStorage as the signal for guard.
+        - "'sessionstorage' in output.lower()"
+        # Must mention the verifier key as the signal.
+        - "'verifier' in output.lower()"
+
+  - type: prompt
+    name: guard_reasoning
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true
+      prompt: |
+        The user already uses ?code= for their own unrelated feature.
+        Evaluate these criteria. Call set_waza_grade_pass or set_waza_grade_fail
+        once per criterion (3 total calls).
+
+        1) Explains the hazard: makes clear that blindly consuming ?code= in
+           the OpenRouter callback handler would clash with the user's
+           existing feature.
+
+        2) Correct guard mechanism: proposes checking for the presence of a
+           code_verifier in sessionStorage (hasOAuthCallbackPending or
+           equivalent) BEFORE invoking handleOAuthCallback. Gates the
+           OpenRouter code path on "did this tab initiate an OAuth flow?"
+
+        3) Doesn't recommend fragile alternatives as primary: does not
+           recommend checking URL path, a custom query param name, or
+           abandoning ?code= as the main fix. A sessionStorage-backed
+           verifier check is the correct approach here.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-oauth/tasks/04-sign-in-button.yaml b/evals/openrouter-oauth/tasks/04-sign-in-button.yaml
new file mode 100644
index 0000000..d231b4f
--- /dev/null
+++ b/evals/openrouter-oauth/tasks/04-sign-in-button.yaml
@@ -0,0 +1,54 @@
+id: sign-in-button-001
+name: Sign-In Button Component
+description: |
+  Narrow request focused on the button section of the skill — logo SVG,
+  variants, sizes, dark mode, loading state, click handler wiring.
+  User has said the OAuth flow is already working, so PKCE protocol details
+  are NOT required (and would be scope bloat).
+tags:
+  - happy-path
+  - ui
+  - button
+
+inputs:
+  prompt: |
+    I already have the OAuth flow working — I can get an API key from
+    OpenRouter. Now I just want a polished "Sign in with OpenRouter" button
+    component I can drop into my app. Multiple visual variants, proper
+    sizing, dark mode support, and a loading state while the exchange is
+    happening.
+
+graders:
+  # Rely on the LLM-judge rubric below rather than a `code` grader that
+  # greps the chat-text output for "<svg". The skill produces the component
+  # by writing a .tsx file to the workspace; the response text summarizes
+  # the work. Grepping the response misses the actual artifact. If we want
+  # structural assertions on the component file we should add a `file`
+  # grader pointed at the generated .tsx path — deferred until we know the
+  # deterministic filename the agents pick.
+  - type: prompt
+    name: button_quality_rubric
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true
+      prompt: |
+        The user wants ONLY a button component — the OAuth flow is done.
+        Evaluate these criteria. Call set_waza_grade_pass or set_waza_grade_fail
+        once per criterion (3 total calls).
+
+        1) Renders the OpenRouter logo: includes an SVG logo (inline <svg> or
+           a clearly-identified logo asset) so the button is branded as
+           "Sign in with OpenRouter", not a generic button.
+
+        2) Multiple variants + sizes: offers at least 3 visual variants (e.g.,
+           default/minimal/branded/icon/cta) AND multiple size options
+           (e.g., sm/default/lg/xl). Hardcoding a single style is not enough.
+
+        3) Interaction state handled: accessible button (not a div), has a
+           loading/disabled state while the auth flow runs, and a clear
+           onClick that triggers the sign-in flow (calling initiateOAuth or
+           equivalent). Dark mode classes or theming are addressed.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-oauth/tasks/05-anti-trigger-google-oauth.yaml b/evals/openrouter-oauth/tasks/05-anti-trigger-google-oauth.yaml
new file mode 100644
index 0000000..c966025
--- /dev/null
+++ b/evals/openrouter-oauth/tasks/05-anti-trigger-google-oauth.yaml
@@ -0,0 +1,67 @@
+id: anti-trigger-google-001
+name: Anti-Trigger - Unrelated OAuth Request
+description: |
+  Negative test: a generic "Google OAuth in Django" request shares the word
+  "OAuth" with this skill but is otherwise unrelated. The openrouter-oauth
+  skill should NOT activate, and the response must not leak OpenRouter-specific
+  details (openrouter.ai/auth, sk-or-, the PKCE code-verifier storage model
+  from this skill) into what should be a Google/Django answer.
+tags:
+  - anti-trigger
+  - negative-test
+  - trigger-specificity
+
+inputs:
+  prompt: |
+    How do I implement Google OAuth sign-in in my Django web app?
+    I need users to log in with their Google accounts.
+
+graders:
+  # Direct leak check: must not contain OpenRouter endpoints or api-key prefix.
+  # We intentionally omit a `trigger` grader here — it measures keyword overlap
+  # between the prompt and the skill's description, which is inherently high
+  # for any OAuth-adjacent prompt and doesn't reflect whether the skill
+  # actually contaminated the answer. The real signal is the content checks
+  # below.
+  - type: code
+    name: no_openrouter_leakage
+    config:
+      language: python
+      assertions:
+        - "'openrouter.ai' not in output.lower()"
+        - "'sk-or-' not in output.lower()"
+        # Allow the word "openrouter" once (as in "this is not about OpenRouter")
+        # but flag repeated mentions that suggest the skill activated.
+        - "output.lower().count('openrouter') <= 1"
+
+  # LLM-judge: evaluate that the response actually answers the Google/Django
+  # question well, rather than being contaminated by OpenRouter content.
+  - type: prompt
+    name: stays_on_topic
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true
+      prompt: |
+        The user asked about Google OAuth in Django. Call set_waza_grade_pass
+        or set_waza_grade_fail once per criterion (3 total calls).
+
+        1) Answers the actual question: the response addresses Google OAuth
+           in a Django context (covers OAuth consent screen, client_id/secret
+           from Google Cloud Console, callback URL, session/user creation).
+           A library like django-allauth or authlib is fine; a raw OAuth
+           implementation is also fine.
+
+        2) No OpenRouter contamination: the response does NOT describe the
+           OpenRouter PKCE flow, does NOT tell the user to go to
+           openrouter.ai/auth, does NOT mention an openrouter API key
+           (sk-or-...), and does NOT apply the skill's sessionStorage
+           verifier model to Google OAuth. A brief disclaimer like "this is
+           different from OpenRouter OAuth" does not count as contamination.
+
+        3) Django-appropriate advice: recommends Django-idiomatic patterns
+           (e.g., a callback view, settings.py config, session-based user
+           auth) rather than a frontend-only PKCE flow.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-stt/eval.yaml b/evals/openrouter-stt/eval.yaml
new file mode 100644
index 0000000..85aadaf
--- /dev/null
+++ b/evals/openrouter-stt/eval.yaml
@@ -0,0 +1,33 @@
+name: openrouter-stt-eval
+description: |
+  Evaluation suite for the openrouter-stt skill. Validates the agent calls
+  OpenRouter's NOT-OpenAI-compatible /api/v1/audio/transcriptions endpoint
+  correctly: JSON body, base64 audio under input_audio.data, format field,
+  direct fetch/requests (NOT the OpenAI SDK).
+skill: openrouter-stt
+version: "1.0"
+config:
+  trials_per_task: 1
+  timeout_seconds: 300
+  parallel: false
+  executor: copilot-sdk
+  model: claude-opus-4.7
+metrics:
+  - name: task_completion
+    weight: 1.0
+    threshold: 0.8
+    description: Did the agent write correct STT code?
+
+hooks:  # copy this repo's openrouter-stt skill into ~/.agents/skills before the run
+  before_run:
+    - command: 'mkdir -p ~/.agents/skills && rsync -a --delete "$(git rev-parse --show-toplevel)/skills/openrouter-stt/" ~/.agents/skills/openrouter-stt/'
+
+graders:
+  - type: code
+    name: has_output
+    config:
+      assertions:
+        - "len(output) > 100"
+
+tasks:
+  - "tasks/*.yaml"
diff --git a/evals/openrouter-stt/tasks/01-happy-path-bash.yaml b/evals/openrouter-stt/tasks/01-happy-path-bash.yaml
new file mode 100644
index 0000000..02e8156
--- /dev/null
+++ b/evals/openrouter-stt/tasks/01-happy-path-bash.yaml
@@ -0,0 +1,74 @@
+id: happy-path-bash-001
+name: Happy Path - Bash Transcription
+description: |
+  Standard request: transcribe an audio file. Agent should write a bash
+  script using curl that POSTs JSON (not multipart) to
+  /api/v1/audio/transcriptions with the audio base64-encoded under
+  input_audio.data.
+tags:
+  - happy-path
+  - bash
+  - curl
+
+inputs:
+  prompt: |
+    Write a bash script that transcribes audio.wav to plain text using
+    OpenRouter. Print only the transcript to stdout. Show me the full
+    script in your response.
+
+graders:
+  - type: code
+    name: endpoint_and_shape
+    config:
+      language: python
+      assertions:
+        - '"/api/v1/audio/transcriptions" in output.lower()'
+        - '"input_audio" in output.lower()'
+        - '"base64" in output.lower() or "b64" in output.lower()'
+
+  - type: code
+    name: json_not_multipart
+    config:
+      language: python
+      assertions:
+        - '"content-type: application/json" in output.lower() or "content-type\":\"application/json" in output.lower() or "\"content-type\": \"application/json\"" in output.lower()'
+        # Agent should NOT be sending multipart/form-data.
+        - '"multipart" not in output.lower() and "form-data" not in output.lower()'
+
+  - type: code
+    name: auth_handled
+    config:
+      language: python
+      assertions:
+        - '"bearer" in output.lower() and "openrouter_api_key" in output.lower()'
+
+  - type: prompt
+    name: correctness_rubric
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true
+      prompt: |
+        The user asked for a bash script that transcribes audio.wav via
+        OpenRouter's STT API. Call set_waza_grade_pass or set_waza_grade_fail
+        once per criterion (4 calls total).
+
+        1) Correct endpoint: POSTs to
+           https://openrouter.ai/api/v1/audio/transcriptions.
+
+        2) Correct body shape: JSON body with model (a real transcription
+           model slug like google/chirp-3 or openai/whisper-1) and
+           input_audio: {data: <base64>, format: "wav"}. NOT
+           multipart/form-data.
+
+        3) Base64 encoding done correctly: uses base64 on the audio file
+           and strips newlines, uses --data-binary @file to avoid ARG_MAX
+           issues with large payloads, does NOT prefix with a data URI
+           ("data:audio/wav;base64,...").
+
+        4) Prints the transcript: parses response JSON and prints only
+           the .text field to stdout (e.g. via jq -r '.text'). Handles
+           non-200 HTTP with a clear error.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-stt/tasks/02-python-requests.yaml b/evals/openrouter-stt/tasks/02-python-requests.yaml
new file mode 100644
index 0000000..a57659a
--- /dev/null
+++ b/evals/openrouter-stt/tasks/02-python-requests.yaml
@@ -0,0 +1,67 @@
+id: python-requests-001
+name: Python Transcription via requests
+description: |
+  Python flow using the requests library (or equivalent). Critical
+  negative test: the agent should NOT use the OpenAI SDK, because the
+  endpoint is not OpenAI-compatible (documented in the skill).
+tags:
+  - happy-path
+  - python
+  - not-openai-sdk
+
+inputs:
+  prompt: |
+    Write a Python script that transcribes a local audio.wav file using
+    OpenRouter and prints the transcript. Show me the complete code in
+    your response.
+
+graders:
+  - type: code
+    name: uses_direct_http
+    config:
+      language: python
+      assertions:
+        - '"requests" in output.lower() or "httpx" in output.lower() or "urllib" in output.lower() or "aiohttp" in output.lower()'
+        - '"/api/v1/audio/transcriptions" in output.lower()'
+
+  - type: code
+    name: avoids_openai_sdk
+    config:
+      language: python
+      assertions:
+        # No OpenAI Python SDK imports; prose like "differs from OpenAI" must still pass.
+        - '"from openai import" not in output.lower() and "import openai" not in output.lower()'
+        - '"client.audio.transcriptions.create" not in output.lower()'
+
+  - type: code
+    name: base64_encoding
+    config:
+      language: python
+      assertions:
+        - '"base64" in output.lower()'
+        - '"input_audio" in output.lower()'
+
+  - type: prompt
+    name: python_quality
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true
+      prompt: |
+        The user asked for a Python STT script. Call set_waza_grade_pass
+        or set_waza_grade_fail once per criterion (3 calls total).
+
+        1) Correct HTTP approach: uses requests / httpx / urllib / aiohttp
+           (NOT the OpenAI SDK) to POST JSON to
+           https://openrouter.ai/api/v1/audio/transcriptions.
+
+        2) Correct body: JSON body with model (real transcription slug),
+           input_audio.data set to base64-encoded file bytes (no data:
+           URI prefix), input_audio.format matching the audio container
+           ("wav").
+
+        3) Prints transcript: on success reads response.json()["text"]
+           and prints it. Handles non-200 with a clear error.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-stt/tasks/03-typescript-fetch.yaml b/evals/openrouter-stt/tasks/03-typescript-fetch.yaml
new file mode 100644
index 0000000..136cb16
--- /dev/null
+++ b/evals/openrouter-stt/tasks/03-typescript-fetch.yaml
@@ -0,0 +1,57 @@
+id: typescript-fetch-001
+name: TypeScript Transcription via fetch
+description: |
+  TypeScript flow using native fetch. Tests that the skill's documented
+  TypeScript example is reproducible by the agent.
+tags:
+  - happy-path
+  - typescript
+  - fetch
+
+inputs:
+  prompt: |
+    Write a TypeScript function that transcribes a local audio file using
+    OpenRouter's STT and returns the transcript as a string. Include the
+    complete code in your response.
+
+graders:
+  - type: code
+    name: ts_shape
+    config:
+      language: python
+      assertions:
+        - '"fetch" in output.lower()'
+        - '"/api/v1/audio/transcriptions" in output.lower()'
+        - '"input_audio" in output.lower()'
+
+  - type: code
+    name: avoids_openai_sdk
+    config:
+      language: python
+      assertions:
+        - '"new openai(" not in output.lower() and "openai.audio.transcriptions" not in output.lower()'
+
+  - type: prompt
+    name: ts_quality
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true
+      prompt: |
+        The user asked for a TypeScript transcription function. Call
+        set_waza_grade_pass or set_waza_grade_fail once per criterion
+        (3 calls total).
+
+        1) Correct endpoint + method: POST to
+           https://openrouter.ai/api/v1/audio/transcriptions with
+           Authorization: Bearer from OPENROUTER_API_KEY env var.
+
+        2) Correct body: JSON body with model (real slug), input_audio:
+           {data, format} where data is base64-encoded file bytes (no
+           data: URI prefix).
+
+        3) Returns transcript: on success returns result.text as a
+           string. Throws or handles non-ok responses clearly.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-stt/tasks/04-anti-trigger-asr-theory.yaml b/evals/openrouter-stt/tasks/04-anti-trigger-asr-theory.yaml
new file mode 100644
index 0000000..892b4ad
--- /dev/null
+++ b/evals/openrouter-stt/tasks/04-anti-trigger-asr-theory.yaml
@@ -0,0 +1,54 @@
+# Anti-trigger task: a conceptual ASR question that must not become STT code.
+id: anti-trigger-asr-theory-001
+name: "Anti-Trigger - ASR Theory Question"
+description: |
+  Negative test: conceptual question about ASR/STT. Shares keywords
+  ("STT", "speech-to-text", "ASR") with the skill but is not a
+  transcription request. The skill should not activate into code
+  generation.
+tags: [anti-trigger, negative-test, trigger-specificity]
+
+inputs:
+  prompt: |
+    Can you explain how modern speech-to-text models like Whisper handle
+    multi-speaker audio? I want to understand if they do speaker
+    diarization out of the box or if that's a separate step.
+
+graders:
+  # Substring checks on the response: no transcription endpoint URL, at most
+  # one mention of input_audio, and no API-key lookups.
+  - name: no_openrouter_api_call
+    type: code
+    config:
+      language: python
+      assertions:
+        - "'openrouter.ai/api/v1/audio/transcriptions' not in output.lower()"
+        - "'input_audio' not in output.lower() or output.lower().count('input_audio') <= 1"
+        - "'$openrouter_api_key' not in output.lower() and 'os.environ[\"openrouter_api_key\"]' not in output.lower()"
+
+  # Rubric graded by a judge model, one pass/fail call per criterion.
+  - name: stays_educational
+    type: prompt
+    config:
+      continue_session: true
+      model: openai/gpt-chat-latest
+      prompt: |
+        The user asked a conceptual question about ASR and speaker
+        diarization. Call set_waza_grade_pass or set_waza_grade_fail once
+        per criterion (2 calls total).
+
+        1) Answers the actual question: explains how Whisper handles
+           multi-speaker input, and clarifies whether speaker diarization
+           is built-in vs a separate step (correct answer: Whisper does
+           NOT do speaker diarization natively — that's a separate task,
+           typically handled with pyannote or similar).
+
+        2) No OpenRouter contamination: does NOT produce curl commands,
+           does NOT write a transcription script, does NOT insert code
+           that calls /api/v1/audio/transcriptions. A brief mention of
+           OpenRouter models is okay; generating an STT client script
+           is not.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-stt/tasks/05-indirect-trigger-meeting-bot.yaml b/evals/openrouter-stt/tasks/05-indirect-trigger-meeting-bot.yaml
new file mode 100644
index 0000000..b843baa
--- /dev/null
+++ b/evals/openrouter-stt/tasks/05-indirect-trigger-meeting-bot.yaml
@@ -0,0 +1,60 @@
+# Indirect-trigger task: STT is one stage of a larger summarization pipeline.
+id: indirect-trigger-meeting-bot-001
+name: "Indirect Trigger - Meeting Transcription As Component"
+description: |
+  Indirect mention: user is building something that needs STT as a piece,
+  but doesn't ask for STT directly. The skill should activate and wire in
+  the correct OpenRouter transcription call.
+tags: [happy-path, indirect-trigger]
+
+inputs:
+  prompt: |
+    Write a Python script that reads a meeting recording (meeting.wav)
+    and generates a summary in summary.md. Show me the full code.
+
+graders:
+  - name: uses_openrouter_stt
+    type: code
+    config:
+      language: python
+      assertions:
+        # Match the path suffix only: agents sometimes build the URL from a
+        # variable such as API_BASE, so the host may not appear literally.
+        - '"/audio/transcriptions" in output.lower()'
+        - '"input_audio" in output.lower()'
+        - '"meeting.wav" in output.lower()'
+
+  - name: writes_summary_md
+    type: code
+    config:
+      language: python
+      assertions:
+        - '"summary.md" in output.lower()'
+
+  - name: pipeline_quality
+    type: prompt
+    config:
+      continue_session: true
+      model: openai/gpt-chat-latest
+      prompt: |
+        The user wants a script that transcribes a meeting and writes a
+        summary to disk. Call set_waza_grade_pass or set_waza_grade_fail
+        once per criterion (3 calls total).
+
+        1) STT step correct: reads meeting.wav, base64-encodes it, POSTs
+           to /api/v1/audio/transcriptions with JSON body
+           {model, input_audio: {data, format: "wav"}}. Does NOT try the
+           OpenAI SDK for this endpoint.
+
+        2) Summarization step present: the transcript is passed to an
+           LLM chat completion endpoint (OpenRouter /api/v1/chat/completions
+           or equivalent) to produce a summary. A pipeline that stops at
+           the transcript does NOT pass this criterion.
+
+        3) Writes summary.md: the LLM-generated summary is written to
+           summary.md on disk using standard file I/O. Does NOT just
+           print to stdout.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-tts/eval.yaml b/evals/openrouter-tts/eval.yaml
new file mode 100644
index 0000000..daee387
--- /dev/null
+++ b/evals/openrouter-tts/eval.yaml
@@ -0,0 +1,40 @@
+name: openrouter-tts-eval
+description: |
+  Evaluation suite for the openrouter-tts skill. Validates that the agent
+  produces code that correctly calls OpenRouter's /api/v1/audio/speech
+  endpoint with proper auth, body shape, response-format handling, and
+  file-extension matching.
+
+  Graders are PER-TASK (task-level graders are additive in waza).
+skill: openrouter-tts
+version: "1.0"
+config:
+  trials_per_task: 1
+  timeout_seconds: 300
+  parallel: false
+  executor: copilot-sdk
+  model: claude-opus-4.7
+metrics:
+  - name: task_completion
+    weight: 1.0
+    threshold: 0.8
+    description: Did the agent write correct TTS code?
+
+# Keep the installed skill copy in sync with what the agent actually reads.
+# The source is resolved from the current checkout's git toplevel instead of a
+# developer-specific absolute path, so the hook also runs in CI and in other
+# contributors' clones or worktrees.
+hooks:
+  before_run:
+    - command: 'mkdir -p ~/.agents/skills && rsync -a --delete "$(git rev-parse --show-toplevel)/skills/openrouter-tts/" ~/.agents/skills/openrouter-tts/'
+
+graders:
+  # Universal: every response must be longer than 100 characters.
+  - type: code
+    name: has_output
+    config:
+      assertions:
+        - "len(output) > 100"
+
+tasks:
+  - "tasks/*.yaml"
diff --git a/evals/openrouter-tts/tasks/01-happy-path-curl.yaml b/evals/openrouter-tts/tasks/01-happy-path-curl.yaml
new file mode 100644
index 0000000..9fff6a0
--- /dev/null
+++ b/evals/openrouter-tts/tasks/01-happy-path-curl.yaml
@@ -0,0 +1,79 @@
+id: happy-path-curl-001
+name: Happy Path - Simple TTS via curl
+description: |
+  Standard request: generate an MP3 from a short phrase. Agent should produce
+  a shell/curl approach that hits /api/v1/audio/speech with the right body
+  fields and saves the raw bytes to an .mp3 file.
+tags:
+  - happy-path
+  - curl
+  - bash
+
+inputs:
+  prompt: |
+    Write a bash script that uses OpenRouter to generate an MP3 file saying
+    "Hello world, this is a test" and saves it to speech.mp3. Show me the
+    complete script in your response so I can see it.
+
+graders:
+  # Cheap substring check: endpoint path, auth header, and API-key variable.
+  - type: code
+    name: endpoint_and_auth
+    config:
+      language: python
+      assertions:
+        - '"/api/v1/audio/speech" in output.lower()'
+        - '"authorization" in output.lower() or "bearer" in output.lower()'
+        - '"openrouter_api_key" in output.lower() or "$OPENROUTER_API_KEY" in output'
+
+  # Request body must name the model, input text, voice, and output format.
+  - type: code
+    name: body_shape
+    config:
+      language: python
+      assertions:
+        - '"model" in output.lower() and "input" in output.lower()'
+        - '"voice" in output.lower()'
+        - '"response_format" in output.lower() or "format" in output.lower()'
+
+  - type: code
+    name: format_extension_match
+    config:
+      language: python
+      assertions:
+        # The prompt names the output file, so require speech.mp3 outright; the
+        # check must not pass just because the response never mentions pcm.
+        - '"speech.mp3" in output'
+
+  # Judge-model rubric; one pass/fail call per numbered criterion.
+  - type: prompt
+    name: correctness_rubric
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true
+      prompt: |
+        The user asked for a bash script that calls OpenRouter's TTS API and
+        saves the result as speech.mp3. Evaluate the agent's response on
+        these criteria. Call set_waza_grade_pass or set_waza_grade_fail once
+        per criterion (4 calls total).
+
+        1) Correct endpoint + method: POST to
+           https://openrouter.ai/api/v1/audio/speech with JSON body.
+
+        2) Correct body fields: JSON body includes model (a real TTS model
+           slug like openai/gpt-4o-mini-tts-2025-12-15), input (the user's
+           text), voice (a real voice like alloy/nova), and
+           response_format set to "mp3".
+
+        3) Raw-bytes handling: the script treats the response body as raw
+           audio bytes, NOT as JSON. It writes bytes to speech.mp3 via
+           --output/-o or equivalent. Does not try to jq-parse the 200
+           response body.
+
+        4) Auth + API key: uses Authorization: Bearer $OPENROUTER_API_KEY
+           from environment. Does not hardcode a key or use the wrong
+           header name.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-tts/tasks/02-python-sdk.yaml b/evals/openrouter-tts/tasks/02-python-sdk.yaml
new file mode 100644
index 0000000..6f790ec
--- /dev/null
+++ b/evals/openrouter-tts/tasks/02-python-sdk.yaml
@@ -0,0 +1,64 @@
+id: python-sdk-001
+name: Python TTS via OpenAI SDK
+description: |
+  Tests the OpenAI-SDK-compatible path. Agent should use the OpenAI Python
+  SDK with base_url override, per the skill's documented pattern.
+tags:
+  - happy-path
+  - python
+  - sdk
+
+inputs:
+  prompt: |
+    I'm working in a Python project. Show me how to use OpenRouter's TTS
+    to narrate the first paragraph of a blog post and save it as
+    narration.mp3. Use the OpenAI Python SDK — I'd rather not shell out.
+    Include the full code in your response.
+
+graders:
+  # The response must import the OpenAI SDK and point base_url at OpenRouter.
+  - type: code
+    name: openai_sdk_with_base_url
+    config:
+      language: python
+      assertions:
+        - '"from openai import openai" in output.lower() or "import openai" in output.lower()'
+        - '"base_url" in output.lower() and "openrouter.ai/api/v1" in output.lower()'
+
+  # The response must call the speech endpoint and write the audio somewhere.
+  - type: code
+    name: audio_speech_call
+    config:
+      language: python
+      assertions:
+        - '"audio.speech" in output.lower() or "audio/speech" in output.lower()'
+        - '"stream_to_file" in output.lower() or "write" in output.lower() or "arraybuffer" in output.lower() or "iter_bytes" in output.lower()'
+
+  # Judge-model rubric; one pass/fail call per numbered criterion.
+  - type: prompt
+    name: python_quality
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true
+      prompt: |
+        The user asked for a Python-SDK-based TTS implementation. Call
+        set_waza_grade_pass or set_waza_grade_fail once per criterion
+        (3 calls total).
+
+        1) Correct SDK usage: uses the OpenAI Python SDK (not a custom HTTP
+           call), initializes the client with base_url pointing at
+           https://openrouter.ai/api/v1 and api_key from
+           os.environ["OPENROUTER_API_KEY"].
+
+        2) Correct TTS call: calls client.audio.speech.create or equivalent
+           with model (real OpenRouter TTS slug), input (the paragraph),
+           voice, and response_format="mp3".
+
+        3) Saves as narration.mp3: the code writes the response bytes to
+           narration.mp3 (e.g. stream_to_file / write_to_file, iter_bytes,
+           or writing response.content) — not trying to parse the body as
+           JSON on success.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-tts/tasks/03-voice-selection.yaml b/evals/openrouter-tts/tasks/03-voice-selection.yaml
new file mode 100644
index 0000000..be55d2e
--- /dev/null
+++ b/evals/openrouter-tts/tasks/03-voice-selection.yaml
@@ -0,0 +1,57 @@
+# Voice-selection task: the user names a specific voice for a TypeScript script.
+id: voice-selection-001
+name: "Voice Selection"
+description: |
+  User asks for a specific voice. The skill documents that voices are
+  provider-namespaced; the agent should either use a known-valid voice or
+  look it up via the models endpoint.
+tags: [happy-path, voice, provider-specific]
+
+inputs:
+  prompt: |
+    I want to generate speech with OpenRouter TTS using the "nova" voice.
+    Write a TypeScript script that does it and saves the output as
+    out.mp3. Include the complete script in your response.
+
+graders:
+  - name: uses_nova_voice
+    type: code
+    config:
+      language: python
+      assertions:
+        - '"nova" in output.lower()'
+        - '"/api/v1/audio/speech" in output.lower() or "audio.speech" in output.lower() or "audio/speech" in output.lower()'
+
+  - name: typescript_approach
+    type: code
+    config:
+      language: python
+      assertions:
+        - '"import" in output.lower() and ("openai" in output.lower() or "fetch" in output.lower())'
+        - '"out.mp3" in output'
+
+  # Judge-model rubric; one pass/fail call per numbered criterion.
+  - name: voice_quality
+    type: prompt
+    config:
+      continue_session: true
+      model: openai/gpt-chat-latest
+      prompt: |
+        The user asked for TypeScript code using the "nova" voice. Call
+        set_waza_grade_pass or set_waza_grade_fail once per criterion
+        (3 calls total).
+
+        1) Uses nova: passes voice: "nova" to the API.
+
+        2) Picks a compatible model: uses an OpenAI TTS model slug (e.g.
+           openai/gpt-4o-mini-tts-2025-12-15) — because nova is an OpenAI
+           voice and will not work on Voxtral/Kokoro. The skill explicitly
+           documents this.
+
+        3) Saves out.mp3 correctly: writes raw bytes to out.mp3 using the
+           OpenAI SDK (arrayBuffer → Buffer → writeFile) OR a direct fetch
+           that extracts the blob/arrayBuffer.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-tts/tasks/04-long-input-splitting.yaml b/evals/openrouter-tts/tasks/04-long-input-splitting.yaml
new file mode 100644
index 0000000..72cd6a0
--- /dev/null
+++ b/evals/openrouter-tts/tasks/04-long-input-splitting.yaml
@@ -0,0 +1,56 @@
+# Long-input task: the article is too long to be treated as a single request.
+id: long-input-001
+name: "Long Input - Chunk and Concat"
+description: |
+  Tests the skill's Long-Inputs guidance: split at sentence/paragraph
+  boundaries, same model+voice per chunk, concatenate audio. Agent should
+  recognize this isn't a single-call scenario.
+tags: [happy-path, long-input, chunking]
+
+inputs:
+  prompt: |
+    I have a 15,000-character article I want to narrate end-to-end with
+    OpenRouter TTS. What's the right approach? Show me the code and
+    explain your strategy inline in the response.
+
+graders:
+  - name: recognizes_chunking_need
+    type: code
+    config:
+      language: python
+      assertions:
+        - '"split" in output.lower() or "chunk" in output.lower() or "paragraph" in output.lower() or "sentence" in output.lower()'
+
+  - name: mentions_concatenation
+    type: code
+    config:
+      language: python
+      assertions:
+        - '"concat" in output.lower() or "ffmpeg" in output.lower() or "combine" in output.lower() or "merge" in output.lower()'
+
+  # Judge-model rubric; one pass/fail call per numbered criterion.
+  - name: long_input_strategy
+    type: prompt
+    config:
+      continue_session: true
+      model: openai/gpt-chat-latest
+      prompt: |
+        The user has a 15k-character article to narrate via TTS. Call
+        set_waza_grade_pass or set_waza_grade_fail once per criterion
+        (3 calls total).
+
+        1) Recognizes per-request limits: explains or implies TTS has a
+           per-request character limit and that the full article must be
+           split, not sent in one call.
+
+        2) Correct splitting strategy: splits at sentence or paragraph
+           boundaries, not mid-word. Keeps model + voice consistent across
+           chunks for prosody continuity.
+
+        3) Concatenation path: describes or implements concatenating the
+           resulting audio (e.g., ffmpeg concat, or collecting buffers and
+           writing them in order). Does NOT skip the concat step.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-tts/tasks/05-anti-trigger-tts-theory.yaml b/evals/openrouter-tts/tasks/05-anti-trigger-tts-theory.yaml
new file mode 100644
index 0000000..07064ae
--- /dev/null
+++ b/evals/openrouter-tts/tasks/05-anti-trigger-tts-theory.yaml
@@ -0,0 +1,53 @@
+# Anti-trigger task: a theory question about TTS that must not become TTS code.
+id: anti-trigger-tts-theory-001
+name: "Anti-Trigger - TTS Theory Question"
+description: |
+  Negative test: a theoretical question about TTS that shares keywords
+  ("TTS", "text-to-speech") but is NOT asking to generate audio. The skill
+  should not activate into code generation, and should not produce
+  OpenRouter-specific TTS API calls in the response.
+tags: [anti-trigger, negative-test, trigger-specificity]
+
+inputs:
+  prompt: |
+    What are the main differences between neural TTS models and the older
+    concatenative synthesis approach? I'm writing a blog post and want to
+    explain it clearly.
+
+graders:
+  - name: no_openrouter_api_call
+    type: code
+    config:
+      language: python
+      assertions:
+        # An educational answer has no reason to include the speech endpoint
+        # URL or any API-key lookup.
+        - "'openrouter.ai/api/v1/audio/speech' not in output.lower()"
+        - "'$openrouter_api_key' not in output.lower() and 'os.environ[\"openrouter_api_key\"]' not in output.lower()"
+        # A single passing mention of response_format is tolerated; more than one is not.
+        - "output.lower().count('response_format') <= 1"
+
+  - name: stays_educational
+    type: prompt
+    config:
+      continue_session: true
+      model: openai/gpt-chat-latest
+      prompt: |
+        The user asked a conceptual/educational question about TTS
+        approaches. Call set_waza_grade_pass or set_waza_grade_fail once
+        per criterion (2 calls total).
+
+        1) Answers the actual question: explains differences between
+           neural TTS (e.g., Tacotron, WaveNet, modern diffusion-based
+           models) and concatenative synthesis (unit selection from
+           recorded speech databases). Covers quality, flexibility,
+           compute requirements, or similar dimensions.
+
+        2) No OpenRouter contamination: does NOT produce a curl command,
+           does NOT write a script that calls /api/v1/audio/speech, does
+           NOT suggest "here's how to do it with OpenRouter" — this is a
+           conceptual question, not a generation request.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-tts/tasks/06-indirect-trigger-voice-assistant.yaml b/evals/openrouter-tts/tasks/06-indirect-trigger-voice-assistant.yaml
new file mode 100644
index 0000000..9d634ee
--- /dev/null
+++ b/evals/openrouter-tts/tasks/06-indirect-trigger-voice-assistant.yaml
@@ -0,0 +1,62 @@
+# Indirect-trigger task: TTS is a component of a greeting script, not the ask.
+id: indirect-trigger-voice-assistant-001
+name: "Indirect Trigger - TTS As Component"
+description: |
+  Tests the indirect-mention pattern documented in the repo's memory:
+  the user is building something that needs TTS as a component (a voice
+  assistant) but hasn't asked for TTS directly. The skill should activate
+  and the agent should wire in /api/v1/audio/speech for the spoken output.
+tags: [happy-path, indirect-trigger]
+
+inputs:
+  prompt: |
+    Write a Node.js script for a simple voice greeting system: it takes
+    a user's name as input and produces an audio file that says
+    "Hello, <name>, welcome!" in a friendly voice. Save the audio as
+    greeting-<name>.mp3. Show me the complete script in your response.
+
+graders:
+  - name: uses_openrouter_tts
+    type: code
+    config:
+      language: python
+      assertions:
+        - '"/api/v1/audio/speech" in output.lower() or "audio.speech" in output.lower()'
+        - '"openrouter_api_key" in output.lower() or "$OPENROUTER_API_KEY" in output'
+
+  - name: dynamic_filename_and_content
+    type: code
+    config:
+      language: python
+      assertions:
+        # Substring proxy only: the filename prefix and greeting words must
+        # appear. Whether the name is interpolated is judged by the rubric.
+        - '"greeting-" in output.lower() or "greeting_" in output.lower()'
+        - '"welcome" in output.lower() or "hello" in output.lower()'
+
+  - name: component_assembly
+    type: prompt
+    config:
+      continue_session: true
+      model: openai/gpt-chat-latest
+      prompt: |
+        The user wants a voice greeting system. Call set_waza_grade_pass
+        or set_waza_grade_fail once per criterion (3 calls total).
+
+        1) Correctly uses TTS: calls OpenRouter's TTS endpoint
+           (/api/v1/audio/speech) with the greeting text and a real voice,
+           writes the response bytes to the .mp3 file. Does NOT try to
+           generate a placeholder or skip the audio step.
+
+        2) Dynamic personalization: the name is injected into BOTH the
+           spoken input ("Hello, Alice, welcome!") AND the output
+           filename (greeting-alice.mp3). A single hardcoded value for
+           both does NOT pass this criterion.
+
+        3) Complete script: the code is runnable as a Node.js script
+           (has imports, an entry point, reads name from argv or function
+           arg, handles the TTS response as raw bytes, writes the file).
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-typescript-sdk/eval.yaml b/evals/openrouter-typescript-sdk/eval.yaml
new file mode 100644
index 0000000..086ecc8
--- /dev/null
+++ b/evals/openrouter-typescript-sdk/eval.yaml
@@ -0,0 +1,31 @@
+# Scaffold suite for the openrouter-typescript-sdk skill (see TODO below).
+name: openrouter-typescript-sdk-eval
+description: |
+  TODO: scaffolding only — tasks are generic stubs. Author real tasks +
+  graders before running baseline. See evals/openrouter-tts for a worked
+  example.
+skill: openrouter-typescript-sdk
+version: "1.0"
+config:
+  executor: copilot-sdk
+  model: claude-sonnet-4.6
+  parallel: false
+  timeout_seconds: 300
+  trials_per_task: 1
+metrics:
+  - name: task_completion
+    description: Did the skill complete the assigned task?
+    threshold: 0.8
+    weight: 1.0
+graders:
+  # Suite-level grader: the response must be non-empty.
+  - name: has_output
+    type: code
+    config:
+      assertions: ["len(output) > 0"]
+  # Suite-level grader: the response must contain one of the listed verbs.
+  - name: relevant_content
+    type: text
+    config:
+      regex_match: ["(?i)(explain|describe|analyze|implement)"]
+tasks: ["tasks/*.yaml"]
diff --git a/evals/openrouter-typescript-sdk/fixtures/sample.py b/evals/openrouter-typescript-sdk/fixtures/sample.py
new file mode 100644
index 0000000..3f022d1
--- /dev/null
+++ b/evals/openrouter-typescript-sdk/fixtures/sample.py
@@ -0,0 +1,3 @@
+def hello(name):
+    """Return the greeting ``Hello, <name>!`` for the given name."""
+    return "Hello, {}!".format(name)
diff --git a/evals/openrouter-typescript-sdk/tasks/basic-usage.yaml b/evals/openrouter-typescript-sdk/tasks/basic-usage.yaml
new file mode 100644
index 0000000..a08b1a9
--- /dev/null
+++ b/evals/openrouter-typescript-sdk/tasks/basic-usage.yaml
@@ -0,0 +1,14 @@
+# Stub happy-path task: generic prompt plus the sample.py fixture.
+id: basic-usage-001
+name: "Basic Usage"
+description: |
+  Test that the skill handles a typical request correctly.
+tags: [basic, happy-path]
+inputs:
+  prompt: 'Help me with this task'
+  files:
+    - path: sample.py
+expected:
+  outcomes:
+    - type: task_completed
+  output_contains: ["function"]
diff --git a/evals/openrouter-typescript-sdk/tasks/edge-case.yaml b/evals/openrouter-typescript-sdk/tasks/edge-case.yaml
new file mode 100644
index 0000000..0ff236a
--- /dev/null
+++ b/evals/openrouter-typescript-sdk/tasks/edge-case.yaml
@@ -0,0 +1,11 @@
+# Stub edge-case task: the prompt is intentionally the empty string.
+id: edge-case-001
+name: "Edge Case - Empty Input"
+description: |
+  Test that the skill handles edge cases gracefully.
+tags: [edge-case]
+inputs:
+  prompt: ''
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-typescript-sdk/tasks/should-not-trigger.yaml b/evals/openrouter-typescript-sdk/tasks/should-not-trigger.yaml
new file mode 100644
index 0000000..9d85a0d
--- /dev/null
+++ b/evals/openrouter-typescript-sdk/tasks/should-not-trigger.yaml
@@ -0,0 +1,11 @@
+# Stub anti-trigger task: an unrelated prompt that should not engage the skill.
+id: should-not-trigger-001
+name: "Should Not Trigger"
+description: |
+  Test that the skill does NOT activate on unrelated prompts.
+  This validates trigger specificity.
+tags: [anti-trigger, negative-test]
+inputs:
+  prompt: 'What is the weather today?'
+expected:
+  output_not_contains: ["skill activated"]
diff --git a/evals/openrouter-video/eval.yaml b/evals/openrouter-video/eval.yaml
new file mode 100644
index 0000000..c7e01f6
--- /dev/null
+++ b/evals/openrouter-video/eval.yaml
@@ -0,0 +1,38 @@
+name: openrouter-video-eval
+description: |
+  Evaluation suite for the openrouter-video skill. Video generation is
+  async: submit → poll → download. Graders verify the agent writes code
+  covering all 3 steps, uses real model slugs, and validates parameters
+  against the models endpoint before submitting.
+skill: openrouter-video
+version: "1.0"
+config:
+  trials_per_task: 1
+  timeout_seconds: 300
+  parallel: false
+  executor: copilot-sdk
+  model: claude-opus-4.7
+metrics:
+  - name: task_completion
+    weight: 1.0
+    threshold: 0.8
+    description: Did the agent produce correct async video code?
+
+# Mirror this checkout's skill content into ~/.agents/skills before each run.
+# The source is resolved from the current checkout's git toplevel instead of a
+# developer-specific absolute path, so the hook also runs in CI and in other
+# contributors' clones or worktrees.
+hooks:
+  before_run:
+    - command: 'mkdir -p ~/.agents/skills && rsync -a --delete "$(git rev-parse --show-toplevel)/skills/openrouter-video/" ~/.agents/skills/openrouter-video/'
+
+graders:
+  # Suite-level grader: every response must be longer than 100 characters.
+  - type: code
+    name: has_output
+    config:
+      assertions:
+        - "len(output) > 100"
+
+tasks:
+  - "tasks/*.yaml"
diff --git a/evals/openrouter-video/tasks/01-full-async-flow.yaml b/evals/openrouter-video/tasks/01-full-async-flow.yaml
new file mode 100644
index 0000000..1769bbd
--- /dev/null
+++ b/evals/openrouter-video/tasks/01-full-async-flow.yaml
@@ -0,0 +1,78 @@
+id: full-async-flow-001
+name: Happy Path - Full 3-Step Video Generation
+description: |
+  Standard request. Agent should write a bash script covering all three
+  steps: POST /api/v1/videos, poll the polling_url until status=completed
+  (handling failed/cancelled/expired terminal states), then GET the content.
+tags:
+  - happy-path
+  - bash
+  - async-flow
+
+inputs:
+  prompt: |
+    Write a bash script that generates a short video from this prompt:
+    "a golden retriever playing fetch on a sunny beach". Save the final
+    MP4 to video.mp4. Show me the complete script in your response.
+
+graders:
+  - type: code
+    name: submit_step
+    config:
+      language: python
+      assertions:
+        # The last alternative is matched against the lowercased output, so its
+        # literal must be lowercase too; an uppercase "POST" there never matches.
+        - '"POST" in output or "-X POST" in output or "method: \"post\"" in output.lower()'
+        - '"/videos" in output.lower()'
+        - '"prompt" in output.lower() and "model" in output.lower()'
+
+  - type: code
+    name: poll_step
+    config:
+      language: python
+      assertions:
+        - '"polling_url" in output.lower() or "poll" in output.lower()'
+        - '"status" in output.lower() and "completed" in output.lower()'
+        - '"failed" in output.lower() and ("cancelled" in output.lower() or "canceled" in output.lower() or "expired" in output.lower())'
+
+  - type: code
+    name: download_step
+    config:
+      language: python
+      assertions:
+        - '"video.mp4" in output.lower()'
+        - '"authorization" in output.lower() or "bearer" in output.lower()'
+
+  # Judge-model rubric; one pass/fail call per numbered criterion.
+  - type: prompt
+    name: async_flow_quality
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true
+      prompt: |
+        The user asked for a bash script that generates a video from a
+        prompt using OpenRouter's async video API. Call set_waza_grade_pass
+        or set_waza_grade_fail once per criterion (4 calls total).
+
+        1) Submit step: POST to https://openrouter.ai/api/v1/videos with
+           JSON body containing model (real slug like google/veo-3.1) and
+           prompt. Auth via Authorization: Bearer $OPENROUTER_API_KEY.
+
+        2) Poll step: polls the polling_url returned from submit (with
+           the auth header) at a reasonable interval (30s or so), breaks
+           on status="completed", exits with an error on "failed",
+           "cancelled", or "expired" and prints the .error field.
+
+        3) Download step: downloads the MP4 from the completed response
+           (unsigned_urls[0] or /api/v1/videos/{id}/content) WITH the
+           Authorization header, saves to video.mp4. Does NOT try to
+           stream output without auth.
+
+        4) Correct overall: recognizes video generation is async and
+           does NOT expect the POST to return video bytes directly.
+           Explains the delay to the user or handles it gracefully.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-video/tasks/02-model-parameter-validation.yaml b/evals/openrouter-video/tasks/02-model-parameter-validation.yaml
new file mode 100644
index 0000000..d700161
--- /dev/null
+++ b/evals/openrouter-video/tasks/02-model-parameter-validation.yaml
@@ -0,0 +1,63 @@
+id: model-parameter-validation-001
+name: Parameter Validation Against Models Endpoint
+description: |
+  User asks for specific video params (duration, aspect ratio). The skill
+  says to fetch model capabilities first and only send values from the
+  returned sets — guessing causes 400s.
+tags:
+  - happy-path
+  - parameters
+  - validation
+
+inputs:
+  prompt: |
+    I want to generate a 6-second vertical (9:16) video with google/veo-3.1
+    from the prompt "a sunrise over the ocean". Write the bash to submit
+    the job. Show me the script inline.
+
+graders:
+  - type: code
+    name: checks_model_capabilities
+    config:
+      language: python
+      assertions:
+        - '"videos/models" in output.lower() or "supported_resolutions" in output.lower() or "supported_aspect_ratios" in output.lower() or "supported_durations" in output.lower()'
+
+  - type: code
+    name: passes_correct_params
+    config:
+      language: python
+      assertions:
+        # "9:16" itself contains a "6", so a bare '"6" in output' is satisfied
+        # by the aspect ratio alone. Strip the ratio tokens before looking for
+        # the duration value.
+        - '"duration" in output.lower() and "6" in output.replace("9:16", "").replace("16:9", "")'
+        - '"9:16" in output or "aspect_ratio" in output.lower()'
+        - '"google/veo-3.1" in output.lower()'
+
+  - type: prompt
+    name: validation_approach
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true
+      prompt: |
+        The user asked for a 6s, 9:16 video with google/veo-3.1. Call
+        set_waza_grade_pass or set_waza_grade_fail once per criterion
+        (3 calls total).
+
+        1) Validates first OR uses known-good values: either fetches
+           /api/v1/videos/models to check supported_durations contains 6
+           and supported_aspect_ratios contains "9:16" before submitting,
+           OR notes the skill's "don't guess" guidance and uses values
+           straight from the model's documented capability set.
+
+        2) Submits with correct params: POST body includes
+           duration: 6, aspect_ratio: "9:16" (or size), model:
+           "google/veo-3.1", prompt: (user's text).
+
+        3) Async-aware: does not expect the POST to return video bytes,
+           either polls or explains the next step is polling.
+
+expected:
+  outcomes:
+    - type: task_completed
diff --git a/evals/openrouter-video/tasks/03-image-to-video.yaml b/evals/openrouter-video/tasks/03-image-to-video.yaml
new file mode 100644
index 0000000..e5dad62
--- /dev/null
+++ b/evals/openrouter-video/tasks/03-image-to-video.yaml
@@ -0,0 +1,70 @@
+id: image-to-video-001
+name: Image-to-Video with frame_images
+description: |  # literal block scalar: scenario summary for maintainers
+  User wants to animate an image. Skill documents that frame_images[]
+  carries {type: "image_url", image_url: {url}, frame_type: "first_frame"}.
+tags:
+  - happy-path
+  - image-to-video
+  - frame_images
+
+inputs:  # the user request; the graders in this file look for its model slug, output filename and "zoom" wording
+  prompt: |
+    I have a local image start.png. Animate it into a 4-second video using
+    google/veo-3.1 with the prompt "camera slowly zooms out". Save as
+    animation.mp4. Include the complete bash script.
+
+graders:
+  - type: code
+    name: frame_images_structure  # looks for the frame_images and image_url keys plus a frame-type marker in the reply
+    config:
+      language: python
+      assertions:  # each entry is a Python expression over `output`, matched case-insensitively via .lower()
+        - '"frame_images" in output.lower()'
+        - '"first_frame" in output.lower() or "frame_type" in output.lower()'
+        - '"image_url" in output.lower()'
+
+  - type: code
+    name: local_image_encoding  # cheap string check that the local file is sent as a base64 data URL
+    config:
+      language: python
+      assertions:
+        # Local image → base64 data URL
+        - '"base64" in output.lower() or "b64" in output.lower()'
+        - '"data:image" in output.lower() or ";base64," in output.lower()'  # a bare "data:" also matches unrelated text such as "metadata:"; ";base64," still covers data:$MIME;base64,... forms
+
+  - type: code
+    name: uses_specified_params  # checks the model slug, output filename and "zoom" wording from the user request are echoed
+    config:
+      language: python
+      assertions:  # all comparisons are against the lower-cased output
+        - '"google/veo-3.1" in output.lower()'
+        - '"animation.mp4" in output.lower()'
+        - '"zoom" in output.lower()'
+
+  - type: prompt  # rubric grader: the prompt below asks the judge model for one pass/fail call per criterion
+    name: i2v_quality
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true  # NOTE(review): presumably lets the judge see the agent's session — confirm against waza docs
+      prompt: |  # literal block scalar: the rubric's line breaks are preserved
+        The user wants to animate start.png into animation.mp4 via
+        google/veo-3.1 with a zoom prompt. Call set_waza_grade_pass or
+        set_waza_grade_fail once per criterion (3 calls total).
+
+        1) Correct frame_images shape: POST body includes frame_images
+           as an array where each entry is
+           {type: "image_url", image_url: {url}, frame_type: "first_frame"}.
+           Does NOT use input_references (that's for reference-to-video
+           style guidance, not for animating an image directly).
+
+        2) Local image encoded correctly: converts start.png to a base64
+           data URL with a proper image MIME type prefix (data:image/png;
+           base64,...), NOT just raw base64 without the prefix.
+
+        3) Full async flow: submits with model, prompt, frame_images;
+           polls for completion; downloads final video as animation.mp4.
+
+expected:
+  outcomes:
+    - type: task_completed  # NOTE(review): presumably requires the agent run to finish — confirm against waza outcome types
diff --git a/evals/openrouter-video/tasks/04-anti-trigger-video-theory.yaml b/evals/openrouter-video/tasks/04-anti-trigger-video-theory.yaml
new file mode 100644
index 0000000..35e6813
--- /dev/null
+++ b/evals/openrouter-video/tasks/04-anti-trigger-video-theory.yaml
@@ -0,0 +1,50 @@
+id: anti-trigger-video-theory-001
+name: Anti-Trigger - Video Gen Theory Question
+description: |  # literal block scalar: scenario summary for maintainers
+  Negative test: conceptual question about video generation models. Shares
+  keywords but is not a generation request.
+tags:
+  - anti-trigger
+  - negative-test
+  - trigger-specificity
+
+inputs:  # a conceptual question; the graders in this file check that no OpenRouter job-submission code appears in the reply
+  prompt: |
+    What's the difference between diffusion-based video models like Sora
+    and older GAN-based approaches for video generation? I'm writing a
+    primer for my blog.
+
+graders:
+  - type: code
+    name: no_openrouter_api_call  # negative check: none of the video-API markers may appear in the reply
+    config:
+      language: python
+      assertions:  # each entry is a Python expression over `output`
+        - "'openrouter.ai/api/v1/videos' not in output.lower()"
+        - "'polling_url' not in output.lower()"
+        - "'$openrouter_api_key' not in output.lower()"  # literal is lower-case because it is compared against output.lower()
+
+  - type: prompt  # rubric grader: the prompt below asks the judge model for one pass/fail call per criterion
+    name: stays_educational
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true  # NOTE(review): presumably lets the judge see the agent's session — confirm against waza docs
+      prompt: |  # literal block scalar: the rubric's line breaks are preserved
+        The user asked a conceptual question about video generation
+        architectures. Call set_waza_grade_pass or set_waza_grade_fail
+        once per criterion (2 calls total).
+
+        1) Answers the question: explains diffusion video models (noise
+           → latent video via iterative denoising, e.g., Sora, Veo) vs
+           GAN-based video (generator/discriminator adversarial training,
+           e.g., VGAN, MoCoGAN), covering differences in training
+           stability, output quality, temporal coherence, or compute.
+
+        2) No OpenRouter contamination: does NOT produce curl commands,
+           does NOT invoke /api/v1/videos, does NOT include any script
+           that submits a generation job. A brief mention of "OpenRouter
+           exposes these models" is fine; generating code is not.
+
+expected:
+  outcomes:
+    - type: task_completed  # NOTE(review): presumably requires the agent run to finish — confirm against waza outcome types
diff --git a/evals/openrouter-video/tasks/05-indirect-trigger-explainer.yaml b/evals/openrouter-video/tasks/05-indirect-trigger-explainer.yaml
new file mode 100644
index 0000000..1982391
--- /dev/null
+++ b/evals/openrouter-video/tasks/05-indirect-trigger-explainer.yaml
@@ -0,0 +1,56 @@
+id: indirect-trigger-explainer-001
+name: Indirect Trigger - Product Explainer Pipeline
+description: |  # literal block scalar: scenario summary for maintainers
+  Indirect mention: user builds a pipeline that needs video as a
+  component. The skill should activate without being asked directly for
+  "video generation".
+tags:
+  - happy-path
+  - indirect-trigger
+
+inputs:  # the user request; the graders in this file look for a /videos call, polling, and the clip.mp4 output name
+  prompt: |
+    Write a Node.js script that takes a product description as input and
+    produces a 4-second explainer clip in clip.mp4. Use OpenRouter.
+    Include the full code.
+
+graders:
+  - type: code
+    name: uses_video_endpoint  # checks the reply references a /videos path and the requested output filename
+    config:
+      language: python
+      assertions:  # each entry is a Python expression over `output`, matched case-insensitively via .lower()
+        - '"/videos" in output.lower()'
+        - '"clip.mp4" in output.lower()'
+
+  - type: code
+    name: handles_async  # checks for polling vocabulary and a "completed" status in the reply
+    config:
+      language: python
+      assertions:
+        - '"polling_url" in output.lower() or ("poll" in output.lower() and "status" in output.lower())'  # either the polling_url field or generic poll + status wording
+        - '"completed" in output.lower()'
+
+  - type: prompt  # rubric grader: the prompt below asks the judge model for one pass/fail call per criterion
+    name: pipeline_quality
+    config:
+      model: openai/gpt-chat-latest
+      continue_session: true  # NOTE(review): presumably lets the judge see the agent's session — confirm against waza docs
+      prompt: |  # literal block scalar: the rubric's line breaks are preserved
+        The user wants a Node.js explainer-video pipeline. Call
+        set_waza_grade_pass or set_waza_grade_fail once per criterion
+        (3 calls total).
+
+        1) Takes description as input: reads the product description
+           from argv, stdin, or function arg — not hardcoded.
+
+        2) Uses video API correctly: submits to /api/v1/videos with the
+           description as the prompt, picks a real video-gen model slug,
+           passes duration: 4. Polls for completion, downloads the MP4.
+
+        3) Writes clip.mp4: saves the resulting video bytes to clip.mp4
+           with the auth header on the download call.
+
+expected:
+  outcomes:
+    - type: task_completed  # NOTE(review): presumably requires the agent run to finish — confirm against waza outcome types
diff --git a/skills/openrouter-typescript-sdk/SKILL.md b/skills/openrouter-typescript-sdk/SKILL.md
index 7fc11cb..b4f4728 100644
--- a/skills/openrouter-typescript-sdk/SKILL.md
+++ b/skills/openrouter-typescript-sdk/SKILL.md
@@ -8,6 +8,12 @@ version: 2.0.0
 
 A comprehensive TypeScript SDK for interacting with OpenRouter's unified API, providing access to 300+ AI models through a single, type-safe interface. This skill enables AI agents to leverage the `callModel` pattern for text generation, tool usage, streaming, and multi-turn conversations.
 
+## Resolving Model Names Before Use
+
+**If the user names a specific model — exact ID, informal alias, or passing mention ("use GLM", "hit the latest Claude") — resolve it to an exact OpenRouter ID BEFORE writing any SDK code that passes the model string.** Load the `openrouter-models` skill and run its `resolve-model.ts` with the user's phrase. Do not guess a model ID, do not query `/api/v1/models` directly, do not hardcode a string like `"glm"` or `"claude-4"` into the `model:` field.
+
+After resolution, use the exact `id` (e.g. `z-ai/glm-4.5`, `anthropic/claude-opus-4.7`) in `callModel({ model: ... })`. If the resolver returns medium/low confidence, surface the chosen match to the user or ask for confirmation.
+
 The SDK is split into two packages:
 - **`@openrouter/agent`** — Agent features: `callModel`, `tool()`, stop conditions, streaming, format converters
 - **`@openrouter/sdk`** — Platform features: model listing, chat completions, credits, OAuth, API key management