diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml new file mode 100644 index 0000000..a002bd7 --- /dev/null +++ b/.github/workflows/eval.yml @@ -0,0 +1,34 @@ +name: Run Skill Evaluations + +on: + pull_request: + branches: [main] + paths: + - 'evals/**' + - 'skills/**' + +permissions: + contents: read + +jobs: + eval: + name: Run Evaluations + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install Azure Developer CLI + uses: Azure/setup-azd@v2 + - name: Install waza extension + run: | + azd config set alpha.extensions on + azd ext source add -n waza -t url -l https://raw.githubusercontent.com/microsoft/waza/main/registry.json + azd ext install microsoft.azd.waza + - name: Run evaluations + run: azd waza run --output-dir ./results + - name: Upload results + if: always() + uses: actions/upload-artifact@v4 + with: + name: eval-results + path: ./results + retention-days: 30 diff --git a/.gitignore b/.gitignore index 3b9ac34..f955f26 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,7 @@ build/ .env.* .DS_Store .claude/worktrees/ + +# waza eval outputs and caches (local to each run; not source-of-truth) +.waza-results/ +.waza-cache/ diff --git a/.waza.yaml b/.waza.yaml new file mode 100644 index 0000000..7dcaaf8 --- /dev/null +++ b/.waza.yaml @@ -0,0 +1,31 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/config.schema.json + +paths: + skills: skills + evals: evals + results: .waza-results +defaults: + engine: copilot-sdk + model: claude-sonnet-4.6 + timeout: 300 + parallel: false + workers: 4 + verbose: false + sessionLog: false +cache: + enabled: false + dir: .waza-cache +server: + port: 3000 + resultsDir: results/ +dev: + model: claude-sonnet-4-20250514 + target: medium-high + maxIterations: 5 +tokens: + warningThreshold: 500 + fallbackLimit: 1000 +graders: + programTimeout: 30 +storage: + containerName: waza-results diff --git a/evals/create-agent-tui/eval.yaml 
b/evals/create-agent-tui/eval.yaml new file mode 100644 index 0000000..de8b6d3 --- /dev/null +++ b/evals/create-agent-tui/eval.yaml @@ -0,0 +1,32 @@ +name: create-agent-tui-eval +description: | + TODO: scaffolding only — tasks are generic stubs. Author real tasks + + graders before running baseline. See evals/openrouter-tts for a worked + example. Per project memory, this skill's graders need to drive the + generated TUI via pilotty, not just assert on file contents. +skill: create-agent-tui +version: "1.0" +config: + trials_per_task: 1 + timeout_seconds: 300 + parallel: false + executor: copilot-sdk + model: claude-sonnet-4.6 +metrics: + - name: task_completion + weight: 1.0 + threshold: 0.8 + description: Did the skill complete the assigned task? +graders: + - type: code + name: has_output + config: + assertions: + - "len(output) > 0" + - type: text + name: relevant_content + config: + regex_match: + - "(?i)(explain|describe|analyze|implement)" +tasks: + - "tasks/*.yaml" diff --git a/evals/create-agent-tui/fixtures/sample.py b/evals/create-agent-tui/fixtures/sample.py new file mode 100644 index 0000000..3f022d1 --- /dev/null +++ b/evals/create-agent-tui/fixtures/sample.py @@ -0,0 +1,3 @@ +def hello(name): + """Greet someone by name.""" + return f"Hello, {name}!" diff --git a/evals/create-agent-tui/tasks/basic-usage.yaml b/evals/create-agent-tui/tasks/basic-usage.yaml new file mode 100644 index 0000000..a08b1a9 --- /dev/null +++ b/evals/create-agent-tui/tasks/basic-usage.yaml @@ -0,0 +1,16 @@ +id: basic-usage-001 +name: Basic Usage +description: | + Test that the skill handles a typical request correctly. 
+tags: + - basic + - happy-path +inputs: + prompt: "Help me with this task" + files: + - path: sample.py +expected: + output_contains: + - "function" + outcomes: + - type: task_completed diff --git a/evals/create-agent-tui/tasks/edge-case.yaml b/evals/create-agent-tui/tasks/edge-case.yaml new file mode 100644 index 0000000..0ff236a --- /dev/null +++ b/evals/create-agent-tui/tasks/edge-case.yaml @@ -0,0 +1,11 @@ +id: edge-case-001 +name: Edge Case - Empty Input +description: | + Test that the skill handles edge cases gracefully. +tags: + - edge-case +inputs: + prompt: "" +expected: + outcomes: + - type: task_completed diff --git a/evals/create-agent-tui/tasks/should-not-trigger.yaml b/evals/create-agent-tui/tasks/should-not-trigger.yaml new file mode 100644 index 0000000..9d85a0d --- /dev/null +++ b/evals/create-agent-tui/tasks/should-not-trigger.yaml @@ -0,0 +1,13 @@ +id: should-not-trigger-001 +name: Should Not Trigger +description: | + Test that the skill does NOT activate on unrelated prompts. + This validates trigger specificity. +tags: + - anti-trigger + - negative-test +inputs: + prompt: "What is the weather today?" +expected: + output_not_contains: + - "skill activated" diff --git a/evals/create-headless-agent/eval.yaml b/evals/create-headless-agent/eval.yaml new file mode 100644 index 0000000..8ed36fd --- /dev/null +++ b/evals/create-headless-agent/eval.yaml @@ -0,0 +1,31 @@ +name: create-headless-agent-eval +description: | + TODO: scaffolding only — tasks are generic stubs. Author real tasks + + graders before running baseline. See evals/openrouter-tts for a worked + example. +skill: create-headless-agent +version: "1.0" +config: + trials_per_task: 1 + timeout_seconds: 300 + parallel: false + executor: copilot-sdk + model: claude-sonnet-4.6 +metrics: + - name: task_completion + weight: 1.0 + threshold: 0.8 + description: Did the skill complete the assigned task? 
+graders: + - type: code + name: has_output + config: + assertions: + - "len(output) > 0" + - type: text + name: relevant_content + config: + regex_match: + - "(?i)(explain|describe|analyze|implement)" +tasks: + - "tasks/*.yaml" diff --git a/evals/create-headless-agent/fixtures/sample.py b/evals/create-headless-agent/fixtures/sample.py new file mode 100644 index 0000000..3f022d1 --- /dev/null +++ b/evals/create-headless-agent/fixtures/sample.py @@ -0,0 +1,3 @@ +def hello(name): + """Greet someone by name.""" + return f"Hello, {name}!" diff --git a/evals/create-headless-agent/tasks/basic-usage.yaml b/evals/create-headless-agent/tasks/basic-usage.yaml new file mode 100644 index 0000000..a08b1a9 --- /dev/null +++ b/evals/create-headless-agent/tasks/basic-usage.yaml @@ -0,0 +1,16 @@ +id: basic-usage-001 +name: Basic Usage +description: | + Test that the skill handles a typical request correctly. +tags: + - basic + - happy-path +inputs: + prompt: "Help me with this task" + files: + - path: sample.py +expected: + output_contains: + - "function" + outcomes: + - type: task_completed diff --git a/evals/create-headless-agent/tasks/edge-case.yaml b/evals/create-headless-agent/tasks/edge-case.yaml new file mode 100644 index 0000000..0ff236a --- /dev/null +++ b/evals/create-headless-agent/tasks/edge-case.yaml @@ -0,0 +1,11 @@ +id: edge-case-001 +name: Edge Case - Empty Input +description: | + Test that the skill handles edge cases gracefully. +tags: + - edge-case +inputs: + prompt: "" +expected: + outcomes: + - type: task_completed diff --git a/evals/create-headless-agent/tasks/should-not-trigger.yaml b/evals/create-headless-agent/tasks/should-not-trigger.yaml new file mode 100644 index 0000000..9d85a0d --- /dev/null +++ b/evals/create-headless-agent/tasks/should-not-trigger.yaml @@ -0,0 +1,13 @@ +id: should-not-trigger-001 +name: Should Not Trigger +description: | + Test that the skill does NOT activate on unrelated prompts. + This validates trigger specificity. 
+tags: + - anti-trigger + - negative-test +inputs: + prompt: "What is the weather today?" +expected: + output_not_contains: + - "skill activated" diff --git a/evals/openrouter-agent-migration/eval.yaml b/evals/openrouter-agent-migration/eval.yaml new file mode 100644 index 0000000..72be69b --- /dev/null +++ b/evals/openrouter-agent-migration/eval.yaml @@ -0,0 +1,31 @@ +name: openrouter-agent-migration-eval +description: | + TODO: scaffolding only — tasks are generic stubs. Author real tasks + + graders before running baseline. See evals/openrouter-tts for a worked + example. +skill: openrouter-agent-migration +version: "1.0" +config: + trials_per_task: 1 + timeout_seconds: 300 + parallel: false + executor: copilot-sdk + model: claude-sonnet-4.6 +metrics: + - name: task_completion + weight: 1.0 + threshold: 0.8 + description: Did the skill complete the assigned task? +graders: + - type: code + name: has_output + config: + assertions: + - "len(output) > 0" + - type: text + name: relevant_content + config: + regex_match: + - "(?i)(explain|describe|analyze|implement)" +tasks: + - "tasks/*.yaml" diff --git a/evals/openrouter-agent-migration/fixtures/sample.py b/evals/openrouter-agent-migration/fixtures/sample.py new file mode 100644 index 0000000..3f022d1 --- /dev/null +++ b/evals/openrouter-agent-migration/fixtures/sample.py @@ -0,0 +1,3 @@ +def hello(name): + """Greet someone by name.""" + return f"Hello, {name}!" diff --git a/evals/openrouter-agent-migration/tasks/basic-usage.yaml b/evals/openrouter-agent-migration/tasks/basic-usage.yaml new file mode 100644 index 0000000..a08b1a9 --- /dev/null +++ b/evals/openrouter-agent-migration/tasks/basic-usage.yaml @@ -0,0 +1,16 @@ +id: basic-usage-001 +name: Basic Usage +description: | + Test that the skill handles a typical request correctly. 
+tags: + - basic + - happy-path +inputs: + prompt: "Help me with this task" + files: + - path: sample.py +expected: + output_contains: + - "function" + outcomes: + - type: task_completed diff --git a/evals/openrouter-agent-migration/tasks/edge-case.yaml b/evals/openrouter-agent-migration/tasks/edge-case.yaml new file mode 100644 index 0000000..0ff236a --- /dev/null +++ b/evals/openrouter-agent-migration/tasks/edge-case.yaml @@ -0,0 +1,11 @@ +id: edge-case-001 +name: Edge Case - Empty Input +description: | + Test that the skill handles edge cases gracefully. +tags: + - edge-case +inputs: + prompt: "" +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-agent-migration/tasks/should-not-trigger.yaml b/evals/openrouter-agent-migration/tasks/should-not-trigger.yaml new file mode 100644 index 0000000..9d85a0d --- /dev/null +++ b/evals/openrouter-agent-migration/tasks/should-not-trigger.yaml @@ -0,0 +1,13 @@ +id: should-not-trigger-001 +name: Should Not Trigger +description: | + Test that the skill does NOT activate on unrelated prompts. + This validates trigger specificity. +tags: + - anti-trigger + - negative-test +inputs: + prompt: "What is the weather today?" +expected: + output_not_contains: + - "skill activated" diff --git a/evals/openrouter-images/eval.yaml b/evals/openrouter-images/eval.yaml new file mode 100644 index 0000000..9e5f8b1 --- /dev/null +++ b/evals/openrouter-images/eval.yaml @@ -0,0 +1,33 @@ +name: openrouter-images-eval +description: | + Evaluation suite for the openrouter-images skill. Validates that the + agent picks the right bundled script (generate.ts for new images, + edit.ts for modifications) and invokes it with correct flags. +skill: openrouter-images +version: "1.0" +config: + trials_per_task: 1 + timeout_seconds: 300 + parallel: false + executor: copilot-sdk + model: claude-opus-4.7 +metrics: + - name: task_completion + weight: 1.0 + threshold: 0.8 + description: Did the agent pick the right script and flags? 
+ +hooks: + before_run: + - command: 'mkdir -p ~/.agents/skills && rsync -a --delete "$(git rev-parse --show-toplevel)/skills/openrouter-images/" ~/.agents/skills/openrouter-images/' + - command: "cd ~/.agents/skills/openrouter-images/scripts && npm install --silent" + +graders: + - type: code + name: has_output + config: + assertions: + - "len(output) > 50" + +tasks: + - "tasks/*.yaml" diff --git a/evals/openrouter-images/tasks/01-generate-basic.yaml b/evals/openrouter-images/tasks/01-generate-basic.yaml new file mode 100644 index 0000000..8a51fce --- /dev/null +++ b/evals/openrouter-images/tasks/01-generate-basic.yaml @@ -0,0 +1,46 @@ +id: generate-basic-001 +name: Generate Basic Image +description: | + Decision tree says "generate from text" → generate.ts. Agent should + invoke it, not call the Responses API directly. +tags: + - happy-path + - generate + +inputs: + prompt: | + Generate an image of a red panda wearing sunglasses and save it + somewhere reasonable. + +graders: + - type: code + name: invoked_generate_script + config: + language: python + assertions: + - '"generate.ts" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + - '"red panda" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]).lower()' + + - type: prompt + name: generate_quality + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user asked for a basic image generation. Call + set_waza_grade_pass or set_waza_grade_fail once per criterion + (3 calls total). + + 1) Used generate.ts: invoked the skill's generate.ts script + (not edit.ts, not a raw curl to /api/v1/responses). + + 2) Correct prompt: passed "a red panda wearing sunglasses" or + very close as the script's positional prompt argument. 
+ + 3) Reports the result: tells the user the model used and where + the image was saved (per the skill's Presenting Results + guidance). + +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-images/tasks/02-generate-with-aspect-ratio.yaml b/evals/openrouter-images/tasks/02-generate-with-aspect-ratio.yaml new file mode 100644 index 0000000..197cb98 --- /dev/null +++ b/evals/openrouter-images/tasks/02-generate-with-aspect-ratio.yaml @@ -0,0 +1,45 @@ +id: generate-aspect-ratio-001 +name: Generate With Aspect Ratio +description: | + User specifies a wide / landscape image. Agent should pass + --aspect-ratio 16:9 (or similar) to generate.ts. +tags: + - happy-path + - generate + - aspect-ratio + +inputs: + prompt: | + Make a wide landscape image of a futuristic city at night, 16:9. + +graders: + - type: code + name: aspect_ratio_flag + config: + language: python + assertions: + - '"generate.ts" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + - '"--aspect-ratio" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]) or "--aspect_ratio" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + - '"16:9" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + + - type: prompt + name: aspect_ratio_usage + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user wanted a 16:9 landscape cityscape. Call + set_waza_grade_pass or set_waza_grade_fail once per criterion + (2 calls total). + + 1) Used flag correctly: passed --aspect-ratio 16:9 to generate.ts. + Does NOT hardcode the ratio into the prompt text or resort to + a different approach. + + 2) Prompt preserved: the prompt positional argument contains + "city" / "futuristic" / "night" (user's actual request), not + a rewritten description. 
+ +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-images/tasks/03-edit-image.yaml b/evals/openrouter-images/tasks/03-edit-image.yaml new file mode 100644 index 0000000..3710f0b --- /dev/null +++ b/evals/openrouter-images/tasks/03-edit-image.yaml @@ -0,0 +1,55 @@ +id: edit-image-001 +name: Edit Existing Image +description: | + User wants to modify an existing image → edit.ts, not generate.ts. + Common failure mode: agent uses generate.ts with the edit description. +tags: + - happy-path + - edit + +inputs: + prompt: | + I have a file called photo.png. Edit it so the sky is purple. + +graders: + - type: code + name: uses_edit_script + config: + language: python + assertions: + - '"edit.ts" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + # generate.ts must not be invoked at all for an edit request; an "or edit.ts was used" escape hatch would be vacuous given the assertion above. + - '"generate.ts" not in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + + - type: code + name: passes_photo_path + config: + language: python + assertions: + - '"photo.png" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + - '"purple" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]).lower() or "sky" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]).lower()' + + - type: prompt + name: edit_quality + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user asked to edit photo.png (sky → purple). Call + set_waza_grade_pass or set_waza_grade_fail once per criterion + (3 calls total). + + 1) Correct script: used edit.ts (not generate.ts, not a raw API + call), per the decision tree "edit existing image → edit.ts". 
+ + 2) Passed source path and prompt: first positional arg was + photo.png, second positional arg was a prompt about making + the sky purple. + + 3) Reports result: tells the user the output location and + references the source (photo.png), per the skill's presenting + guidance for edit operations. + +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-images/tasks/04-anti-trigger-image-theory.yaml b/evals/openrouter-images/tasks/04-anti-trigger-image-theory.yaml new file mode 100644 index 0000000..8831530 --- /dev/null +++ b/evals/openrouter-images/tasks/04-anti-trigger-image-theory.yaml @@ -0,0 +1,45 @@ +id: anti-trigger-image-theory-001 +name: Anti-Trigger - Image Gen Theory +description: | + Conceptual question about image generation. Should not trigger the + bundled scripts. +tags: + - anti-trigger + - negative-test + +inputs: + prompt: | + How do diffusion models generate images from text? Explain the basic + idea — I'm curious about the technique, not looking to generate + anything. + +graders: + - type: code + name: no_scripts_invoked + config: + language: python + assertions: + - '"generate.ts" not in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + - '"edit.ts" not in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + + - type: prompt + name: stays_educational + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user asked a conceptual question about diffusion models for + image generation. Call set_waza_grade_pass or set_waza_grade_fail + once per criterion (2 calls total). + + 1) Answers the question: explains the diffusion process (noise → + image via iterative denoising guided by a text encoder) at a + reasonable depth for someone curious. + + 2) Does NOT generate: does not invoke generate.ts or any other + script that produces an actual image. The user explicitly + said "not looking to generate anything". 
+ +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-images/tasks/05-indirect-trigger-blog-hero.yaml b/evals/openrouter-images/tasks/05-indirect-trigger-blog-hero.yaml new file mode 100644 index 0000000..4850b53 --- /dev/null +++ b/evals/openrouter-images/tasks/05-indirect-trigger-blog-hero.yaml @@ -0,0 +1,51 @@ +id: indirect-trigger-blog-hero-001 +name: Indirect Trigger - Blog Hero Image +description: | + User is writing a blog post and needs a hero image — doesn't say + "generate an image". The skill should activate and generate.ts should + be invoked. +tags: + - happy-path + - indirect-trigger + +inputs: + prompt: | + I'm writing a blog post about remote work and need a hero image for + the top of the page. Something that captures "working from a quiet + home office with good natural light". + +graders: + - type: code + name: invoked_generate + config: + language: python + assertions: + - '"generate.ts" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + - '"home office" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]).lower() or "remote work" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]).lower()' + + - type: prompt + name: indirect_assembly + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user wants a blog hero image. Call set_waza_grade_pass or + set_waza_grade_fail once per criterion (3 calls total). + + 1) Recognizes this as image-gen: loads the openrouter-images + skill and uses generate.ts, even though the user didn't + directly say "generate an image". + + 2) Prompt faithfully captures the user's intent: the script is + invoked with a prompt referencing home-office / quiet / + natural light — not a generic placeholder or the user's full + sentence verbatim. 
+ + 3) Aspect ratio appropriate for a hero image: passes + --aspect-ratio with a wide ratio (16:9, 21:9, 3:1, etc.) + typical for blog hero banners. Square (1:1) or portrait + ratios do not satisfy this criterion. + +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-models/eval.yaml b/evals/openrouter-models/eval.yaml new file mode 100644 index 0000000..5b3b70d --- /dev/null +++ b/evals/openrouter-models/eval.yaml @@ -0,0 +1,50 @@ +name: openrouter-models-eval +description: | + Evaluation suite for the openrouter-models skill. Validates that the agent + picks the correct script (list-models / search-models / resolve-model / + compare-models / get-endpoints) and invokes it with the correct flags, then + formats the result per the skill's "Presenting Results" guidance. + + Graders are PER-TASK (task-level graders are additive). +skill: openrouter-models +version: "1.0" +config: + trials_per_task: 1 + timeout_seconds: 300 + parallel: false + executor: copilot-sdk + model: claude-opus-4.7 + # Force the agent to load the worktree copy of this skill, not whatever + # stale copy lives in ~/.claude/skills/ from a prior plugin install. + skill_directories: + - "skills" +metrics: + - name: task_completion + weight: 1.0 + threshold: 0.8 + description: Did the agent pick the right script and produce a skill-faithful answer? + +# Install the skill's script deps once before any tasks run, so each task +# doesn't pay a 20s+ npm install and we don't false-fail on missing tsx. +hooks: + before_run: + # Sync both skills the agent may route to for model-related coding + # tasks: the models skill itself, and openrouter-typescript-sdk (which + # carries the cross-skill cue pointing at openrouter-models). Paths + # are absolute because working_directory defaults to the eval's own + # directory, not the repo root. 
+ - command: 'mkdir -p ~/.agents/skills && rsync -a --delete "$(git rev-parse --show-toplevel)/skills/openrouter-models/" ~/.agents/skills/openrouter-models/' + - command: 'mkdir -p ~/.agents/skills/openrouter-typescript-sdk && rsync -a --delete "$(git rev-parse --show-toplevel)/skills/openrouter-typescript-sdk/" ~/.agents/skills/openrouter-typescript-sdk/' + # Install script deps once per run inside the synced location. + - command: "cd ~/.agents/skills/openrouter-models/scripts && npm install --silent" + +graders: + # Universal: non-empty response. + - type: code + name: has_output + config: + assertions: + - "len(output) > 50" + +tasks: + - "tasks/*.yaml" diff --git a/evals/openrouter-models/tasks/01-cheapest-models.yaml b/evals/openrouter-models/tasks/01-cheapest-models.yaml new file mode 100644 index 0000000..19c427c --- /dev/null +++ b/evals/openrouter-models/tasks/01-cheapest-models.yaml @@ -0,0 +1,57 @@ +id: cheapest-models-001 +name: Find Cheapest Models +description: | + Tests the decision-tree mapping "cheapest models" → list-models.ts --sort price. + Also validates the skill's "Presenting Results" guidance: convert pricing to + per-million-tokens format. +tags: + - happy-path + - list-models + - sort + +inputs: + prompt: | + What are the five cheapest models available on OpenRouter right now? + +graders: + # Join all bash commands into one searchable string for substring checks. + # tool_calls[i] has keys: name, arguments, result, success. 
+ - type: code + name: invoked_list_models_sort_price + config: + language: python + assertions: + - '"list-models.ts" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + - '"--sort price" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]) or "--sort=price" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + + - type: code + name: per_million_tokens_formatting + config: + language: python + assertions: + - '"/m" in output.lower() or "per million" in output.lower() or "/1m" in output.lower() or "1,000,000" in output' + + - type: prompt + name: answer_quality + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user asked for the five cheapest OpenRouter models. Call + set_waza_grade_pass or set_waza_grade_fail once per criterion + (3 calls total). + + 1) Correct tool chosen: the agent ran list-models.ts with a + price-sort flag (not a wrong script like compare-models, + not a guess without running the script). + + 2) Five models returned: the response names about five specific + models with their provider/id. + + 3) Presentation correct: pricing is shown in per-million-tokens + format ("$X/M input", "$X per million", etc.) rather than raw + per-token values, per the skill's Presenting Results guidance. + +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-models/tasks/02-compare-two-models.yaml b/evals/openrouter-models/tasks/02-compare-two-models.yaml new file mode 100644 index 0000000..d93a779 --- /dev/null +++ b/evals/openrouter-models/tasks/02-compare-two-models.yaml @@ -0,0 +1,55 @@ +id: compare-models-001 +name: Compare Two Models +description: | + Tests the decision-tree mapping "compare X vs Y" → compare-models.ts. + Skill guidance requires markdown table for comparisons. 
+tags: + - happy-path + - compare-models + +inputs: + prompt: | + How do Claude Sonnet 4 and GPT-4o compare on pricing and context length? + +graders: + - type: code + name: invoked_compare_models + config: + language: python + assertions: + - '"compare-models.ts" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + - '"claude" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]).lower() and ("sonnet" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]).lower())' + - '"gpt-4o" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]).lower()' + + - type: code + name: markdown_table_output + config: + language: python + assertions: + - '"|" in output and "---" in output' + + - type: prompt + name: comparison_quality + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user asked to compare Claude Sonnet 4 and GPT-4o on pricing and + context length. Call set_waza_grade_pass or set_waza_grade_fail once + per criterion (3 calls total). + + 1) Correct tool: the agent ran compare-models.ts with both models + as arguments (not list-models + manual diff, not a single-model + lookup). + + 2) Both dimensions addressed: the response covers pricing AND + context length for both models. + + 3) Tabular presentation: results shown in a markdown table with + models as columns (or a clearly aligned comparison format), + per the skill's Presenting Results guidance. Pricing in + per-million-tokens format. 
+ +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-models/tasks/03-resolve-informal-name.yaml b/evals/openrouter-models/tasks/03-resolve-informal-name.yaml new file mode 100644 index 0000000..2839e10 --- /dev/null +++ b/evals/openrouter-models/tasks/03-resolve-informal-name.yaml @@ -0,0 +1,57 @@ +id: resolve-informal-001 +name: Resolve Informal Model Name +description: | + Tests the two-step workflow: resolve-model.ts for an informal name, then + feed the resolved ID into another script. The skill's decision tree is + explicit that informal names go through resolve first. +tags: + - happy-path + - resolve-model + - two-step + +inputs: + prompt: | + I want to compare "Claude Opus" against "the latest GPT" on context + length and pricing. Can you do that? + +graders: + - type: code + name: invoked_resolve_model + config: + language: python + assertions: + - '"resolve-model.ts" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + + - type: code + name: invoked_compare_after_resolve + config: + language: python + assertions: + - '"compare-models.ts" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + + - type: prompt + name: two_step_workflow + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user gave informal model names ("Claude Opus", "the latest GPT"). + Call set_waza_grade_pass or set_waza_grade_fail once per criterion + (3 calls total). + + 1) Resolved before using: the agent called resolve-model.ts for the + informal names BEFORE invoking compare-models.ts. Did not guess + exact IDs. + + 2) Confidence-aware: if either resolution returned medium/low + confidence the agent either confirmed with the user or noted + the ambiguity. (If both came back high, a direct comparison is + fine and this criterion still passes.) 
+ + 3) Comparison delivered: the response ends with an actual comparison + of the two resolved models on pricing AND context length, + not just a list of candidate model IDs. + +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-models/tasks/04-fastest-provider.yaml b/evals/openrouter-models/tasks/04-fastest-provider.yaml new file mode 100644 index 0000000..3286816 --- /dev/null +++ b/evals/openrouter-models/tasks/04-fastest-provider.yaml @@ -0,0 +1,49 @@ +id: fastest-provider-001 +name: Fastest Provider for a Model +description: | + Tests get-endpoints.ts for provider-performance questions. Skill guidance + says to highlight fastest (lowest p50 latency) and most reliable (highest + uptime). +tags: + - happy-path + - get-endpoints + - provider-performance + +inputs: + prompt: | + Which provider is currently fastest for anthropic/claude-sonnet-4? + +graders: + - type: code + name: invoked_get_endpoints + config: + language: python + assertions: + - '"get-endpoints.ts" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + - '"anthropic/claude-sonnet-4" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + - '"--sort throughput" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]) or "--sort=throughput" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]) or "--sort latency" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]) or "--sort=latency" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + + - type: prompt + name: provider_answer_quality + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user asked which provider is fastest for claude-sonnet-4. Call + set_waza_grade_pass or set_waza_grade_fail once per criterion + (3 calls total). 
+ + 1) Correct tool: agent ran get-endpoints.ts for + anthropic/claude-sonnet-4 (not list-models, not a guess). + + 2) Names a specific provider: response identifies a specific + provider (Anthropic, Google Vertex, AWS Bedrock, etc.) as the + fastest, with supporting numbers (p50 latency or throughput). + + 3) Includes context: response mentions uptime or reliability for + the recommended provider, per the skill's guidance to call out + reliable providers. + +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-models/tasks/05-anti-trigger-weather.yaml b/evals/openrouter-models/tasks/05-anti-trigger-weather.yaml new file mode 100644 index 0000000..266945c --- /dev/null +++ b/evals/openrouter-models/tasks/05-anti-trigger-weather.yaml @@ -0,0 +1,56 @@ +id: anti-trigger-weather-001 +name: Anti-Trigger - Unrelated Question +description: | + Negative test: unrelated prompt. The skill should not activate, and the + response must not invoke any of the bundled scripts or drag OpenRouter + model commentary into a weather answer. +tags: + - anti-trigger + - negative-test + - trigger-specificity + +inputs: + prompt: | + What's the weather going to be in San Francisco tomorrow? 
+ +graders: + - type: code + name: no_skill_scripts_invoked + config: + language: python + assertions: + - '"list-models.ts" not in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + - '"search-models.ts" not in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + - '"resolve-model.ts" not in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + - '"compare-models.ts" not in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + - '"get-endpoints.ts" not in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + + - type: code + name: no_openrouter_contamination + config: + language: python + assertions: + - "output.lower().count('openrouter') <= 1" + - "'anthropic/claude' not in output.lower()" + - "'openai/gpt' not in output.lower()" + + - type: prompt + name: stays_on_topic + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user asked about San Francisco weather. Call set_waza_grade_pass + or set_waza_grade_fail once per criterion (2 calls total). + + 1) Addresses weather: the response is about weather or politely + explains the agent can't provide live weather and suggests an + alternative (weather.com, phone app, checking the forecast). + + 2) No OpenRouter contamination: the response does NOT invoke or + reference the openrouter-models scripts, does NOT list OpenRouter + models, does NOT discuss AI model pricing or providers. 
+ +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-models/tasks/06-mentioned-model-in-task.yaml b/evals/openrouter-models/tasks/06-mentioned-model-in-task.yaml new file mode 100644 index 0000000..50aac57 --- /dev/null +++ b/evals/openrouter-models/tasks/06-mentioned-model-in-task.yaml @@ -0,0 +1,67 @@ +id: mentioned-model-in-task-001 +name: Model Mentioned In Larger Task +description: | + Tests the subtler trigger: the user doesn't ask about models directly — + they NAME a model (informally, partial, or ambiguous) as part of a larger + coding task. The skill should activate and resolve the name to an exact + OpenRouter ID before proceeding, rather than guessing or using the + informal string verbatim. +tags: + - happy-path + - resolve-model + - indirect-trigger + +inputs: + prompt: | + Write a small Node.js script that sends a prompt to OpenRouter and + prints the response. Use GLM as the model. + +graders: + - type: code + name: invoked_resolve_model + config: + language: python + assertions: + - '"resolve-model.ts" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"])' + - '"glm" in " ".join([tc["arguments"].get("command", "") for tc in tool_calls if tc["name"] == "bash"]).lower()' + + - type: code + name: resolved_id_used_in_output + config: + language: python + assertions: + - '"glm" in output.lower()' + - '"/" in output' + + - type: prompt + name: silent_resolve_then_use + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user asked for a Node.js OpenRouter script "using GLM as the + model." Call set_waza_grade_pass or set_waza_grade_fail once per + criterion (3 calls total). + + 1) Resolved, didn't guess: the agent invoked resolve-model.ts for + "glm" before writing code. It did NOT hardcode a made-up string + like "glm" or "glm-4" as the model ID — it used the resolver to + get an exact OpenRouter-style "vendor/model-id" and put THAT + string in the code. 
+ + 2) Task completed: the response includes an actual Node.js script + that sends a prompt to OpenRouter (fetch to /api/v1/* or SDK + call) and prints the response. A snippet that only resolves the + model ID without producing the script does NOT pass this + criterion. + + 3) Noted resolution confidence (soft): if the resolver returned + medium/low confidence the agent either picked the best match and + mentioned it, OR asked the user to confirm. If the resolver + returned high confidence and the agent just used it silently, + that also passes this criterion — confirm-or-use is the right + behavior either way. + +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-oauth/eval.yaml b/evals/openrouter-oauth/eval.yaml new file mode 100644 index 0000000..6c01974 --- /dev/null +++ b/evals/openrouter-oauth/eval.yaml @@ -0,0 +1,33 @@ +name: openrouter-oauth-eval +description: | + Evaluation suite for the openrouter-oauth skill. Validates that the agent, + when the skill is loaded, produces responses that actually follow the + documented PKCE flow and surface skill-specific guarantees (sessionStorage + for verifier, S256 challenge, openrouter.ai/auth, callback guard, etc.). + + Graders are defined PER-TASK (task-level graders are additive in waza) — + only universal checks live here at the eval level. +skill: openrouter-oauth +version: "1.0" +config: + trials_per_task: 1 + timeout_seconds: 300 + parallel: false + executor: copilot-sdk + model: claude-opus-4.7 +metrics: + - name: task_completion + weight: 1.0 + threshold: 0.8 + description: Did the skill produce a correct, skill-faithful response? + +graders: + # Universal: every task must produce a non-empty response. 
+ - type: code + name: has_output + config: + assertions: + - "len(output) > 50" + +tasks: + - "tasks/*.yaml" diff --git a/evals/openrouter-oauth/tasks/01-happy-path-react.yaml b/evals/openrouter-oauth/tasks/01-happy-path-react.yaml new file mode 100644 index 0000000..337357c --- /dev/null +++ b/evals/openrouter-oauth/tasks/01-happy-path-react.yaml @@ -0,0 +1,70 @@ +id: happy-path-react-001 +name: Happy Path - React App +description: | + Standard request to add Sign In with OpenRouter to a React app. + Exercises the full PKCE flow and the sign-in button section of the skill. +tags: + - happy-path + - react + - full-flow + +inputs: + prompt: | + I have a React app and I want to add "Sign in with OpenRouter" so users + can authorize and my app can make inference calls with their API key. + Walk me through the implementation and show me the code I need. + +graders: + # Full PKCE rubric + structural claims — this task is expected to cover + # the whole skill surface. + - type: code + name: pkce_technical_claims + config: + language: python + assertions: + - "'sessionstorage' in output.lower()" + - "'S256' in output or 's256' in output.lower()" + - "'openrouter.ai/auth' in output.lower() or 'openrouter.ai/api/v1/auth/keys' in output.lower()" + - "'code_verifier' in output.lower() or 'verifier' in output.lower()" + - "'code_challenge' in output.lower() or 'challenge' in output.lower()" + + - type: prompt + name: skill_faithful_rubric + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + You are grading an agent's response to a user who asked for + "Sign in with OpenRouter" in a React app. The OpenRouter OAuth flow + is documented as PKCE with no client registration and no backend. + + Evaluate these four criteria INDEPENDENTLY. Call set_waza_grade_pass + or set_waza_grade_fail once for EACH criterion (4 total calls). 
+ + 1) Correct PKCE initiation: response describes generating a + code_verifier (32 random bytes, base64url encoded), computing an + S256 challenge, and redirecting to https://openrouter.ai/auth with + callback_url, code_challenge, and code_challenge_method=S256. + No invented client_id, client_secret, or app registration. + + 2) Correct storage model: code_verifier in sessionStorage (NOT + localStorage), final API key in localStorage, and a guard + (hasOAuthCallbackPending or equivalent) that checks the verifier + exists before processing ?code= params. + + 3) Correct key exchange: POST to + https://openrouter.ai/api/v1/auth/keys with JSON body containing + `code`, `code_verifier`, and `code_challenge_method: "S256"`, then + read `key` from the JSON response. No backend proxy claimed. + + 4) No fabricated details: does not introduce client_id/secret/app + registration/backend as requirements. Does not require installing + the OpenRouter SDK specifically for auth (the flow is plain fetch). + + For each criterion: set_waza_grade_pass with description="criterion N: " + and a reason if satisfied, otherwise set_waza_grade_fail with the same + description and a reason explaining what's missing. + +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-oauth/tasks/02-programmatic-key.yaml b/evals/openrouter-oauth/tasks/02-programmatic-key.yaml new file mode 100644 index 0000000..bd181b3 --- /dev/null +++ b/evals/openrouter-oauth/tasks/02-programmatic-key.yaml @@ -0,0 +1,55 @@ +id: programmatic-key-001 +name: Programmatic API Key (No Button) +description: | + User wants the API key but isn't building a UI-first flow. + The skill should provide the PKCE flow and skip the button/variants section. 
+tags: + - happy-path + - programmatic + - no-ui + +inputs: + prompt: | + I need to obtain an OpenRouter API key from the browser for my app + programmatically — I'm not building a sign-in button component, just need + the OAuth flow to get back a usable key I can store. What do I do? + +graders: + # PKCE protocol must be complete; skill's "no client registration" claim matters. + - type: code + name: pkce_technical_claims + config: + language: python + assertions: + - "'sessionstorage' in output.lower()" + - "'S256' in output or 's256' in output.lower()" + - "'openrouter.ai/auth' in output.lower() or 'openrouter.ai/api/v1/auth/keys' in output.lower()" + - "'code_verifier' in output.lower() or 'verifier' in output.lower()" + + - type: prompt + name: no_button_focus + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user explicitly asked for the OAuth flow only, no button UI. + Evaluate these criteria. Call set_waza_grade_pass or set_waza_grade_fail + once per criterion (3 total calls). + + 1) PKCE flow complete: covers verifier generation, S256 challenge, + redirect to https://openrouter.ai/auth, POST to + https://openrouter.ai/api/v1/auth/keys for the exchange, and + reading `key` from the response. + + 2) No unnecessary UI content: does NOT push a button component with + logo SVG, multiple style variants, or Tailwind class tables. + Button styling / variant guidance is explicitly out of scope for + this request. A minimal "call initiateOAuth() on click" mention is + fine; a full button taxonomy is not. + + 3) No fabricated requirements: does not claim a client_id, secret, + app registration, or backend is needed. 
+ +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-oauth/tasks/03-callback-guard.yaml b/evals/openrouter-oauth/tasks/03-callback-guard.yaml new file mode 100644 index 0000000..f2b5afe --- /dev/null +++ b/evals/openrouter-oauth/tasks/03-callback-guard.yaml @@ -0,0 +1,61 @@ +id: callback-guard-001 +name: Callback Guard for Ambiguous ?code= Params +description: | + Tests whether the skill surfaces the callback-guard requirement + (hasOAuthCallbackPending / verifier presence check) when the user's app + already uses ?code= for other things. This is a concrete, security-relevant + detail the skill documents — the task is narrow by design, so graders + should focus on guard handling, not the full protocol restate. +tags: + - edge-case + - security + - guard + +inputs: + prompt: | + My existing app already uses ?code= query parameters for a different + feature (a referral tracking thing). I want to add Sign In with + OpenRouter without breaking that. How do I safely handle the OAuth + callback so I only consume codes that belong to the OpenRouter flow? + +graders: + # Narrow, focused graders — the guard mechanism is the point of this task. + - type: code + name: guard_claims + config: + language: python + assertions: + # The skill-native guard function, or at minimum the underlying check. + - "'hasoauthcallbackpending' in output.lower() or 'sessionstorage.getitem' in output.lower()" + # Must mention sessionStorage as the signal for guard. + - "'sessionstorage' in output.lower()" + # Must mention the verifier key as the signal. + - "'verifier' in output.lower()" + + - type: prompt + name: guard_reasoning + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user already uses ?code= for their own unrelated feature. + Evaluate these criteria. Call set_waza_grade_pass or set_waza_grade_fail + once per criterion (3 total calls). 
+ + 1) Explains the hazard: makes clear that blindly consuming ?code= in + the OpenRouter callback handler would clash with the user's + existing feature. + + 2) Correct guard mechanism: proposes checking for the presence of a + code_verifier in sessionStorage (hasOAuthCallbackPending or + equivalent) BEFORE invoking handleOAuthCallback. Gates the + OpenRouter code path on "did this tab initiate an OAuth flow?" + + 3) Doesn't recommend fragile alternatives as primary: does not + recommend checking URL path, a custom query param name, or + abandoning ?code= as the main fix. A sessionStorage-backed + verifier check is the correct approach here. + +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-oauth/tasks/04-sign-in-button.yaml b/evals/openrouter-oauth/tasks/04-sign-in-button.yaml new file mode 100644 index 0000000..d231b4f --- /dev/null +++ b/evals/openrouter-oauth/tasks/04-sign-in-button.yaml @@ -0,0 +1,54 @@ +id: sign-in-button-001 +name: Sign-In Button Component +description: | + Narrow request focused on the button section of the skill — logo SVG, + variants, sizes, dark mode, loading state, click handler wiring. + User has said the OAuth flow is already working, so PKCE protocol details + are NOT required (and would be scope bloat). +tags: + - happy-path + - ui + - button + +inputs: + prompt: | + I already have the OAuth flow working — I can get an API key from + OpenRouter. Now I just want a polished "Sign in with OpenRouter" button + component I can drop into my app. Multiple visual variants, proper + sizing, dark mode support, and a loading state while the exchange is + happening. + +graders: + # Rely on the LLM-judge rubric below rather than a `code` grader that + # greps the chat-text output for " or + a clearly-identified logo asset) so the button is branded as + "Sign in with OpenRouter", not a generic button. 
+ + 2) Multiple variants + sizes: offers at least 3 visual variants (e.g., + default/minimal/branded/icon/cta) AND multiple size options + (e.g., sm/default/lg/xl). Hardcoding a single style is not enough. + + 3) Interaction state handled: accessible button (not a div), has a + loading/disabled state while the auth flow runs, and a clear + onClick that triggers the sign-in flow (calling initiateOAuth or + equivalent). Dark mode classes or theming are addressed. + +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-oauth/tasks/05-anti-trigger-google-oauth.yaml b/evals/openrouter-oauth/tasks/05-anti-trigger-google-oauth.yaml new file mode 100644 index 0000000..c966025 --- /dev/null +++ b/evals/openrouter-oauth/tasks/05-anti-trigger-google-oauth.yaml @@ -0,0 +1,67 @@ +id: anti-trigger-google-001 +name: Anti-Trigger - Unrelated OAuth Request +description: | + Negative test: a generic "Google OAuth in Django" request shares the word + "OAuth" with this skill but is otherwise unrelated. The openrouter-oauth + skill should NOT activate, and the response must not leak OpenRouter-specific + details (openrouter.ai/auth, sk-or-, the PKCE code-verifier storage model + from this skill) into what should be a Google/Django answer. +tags: + - anti-trigger + - negative-test + - trigger-specificity + +inputs: + prompt: | + How do I implement Google OAuth sign-in in my Django web app? + I need users to log in with their Google accounts. + +graders: + # Direct leak check: must not contain OpenRouter endpoints or api-key prefix. + # We intentionally omit a `trigger` grader here — it measures keyword overlap + # between the prompt and the skill's description, which is inherently high + # for any OAuth-adjacent prompt and doesn't reflect whether the skill + # actually contaminated the answer. The real signal is the content checks + # below. 
+ - type: code + name: no_openrouter_leakage + config: + language: python + assertions: + - "'openrouter.ai' not in output.lower()" + - "'sk-or-' not in output.lower()" + # Allow the word "openrouter" once (as in "this is not about OpenRouter") + # but flag repeated mentions that suggest the skill activated. + - "output.lower().count('openrouter') <= 1" + + # LLM-judge: evaluate that the response actually answers the Google/Django + # question well, rather than being contaminated by OpenRouter content. + - type: prompt + name: stays_on_topic + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user asked about Google OAuth in Django. Call set_waza_grade_pass + or set_waza_grade_fail once per criterion (3 total calls). + + 1) Answers the actual question: the response addresses Google OAuth + in a Django context (covers OAuth consent screen, client_id/secret + from Google Cloud Console, callback URL, session/user creation). + A library like django-allauth or authlib is fine; a raw OAuth + implementation is also fine. + + 2) No OpenRouter contamination: the response does NOT describe the + OpenRouter PKCE flow, does NOT tell the user to go to + openrouter.ai/auth, does NOT mention an openrouter API key + (sk-or-...), and does NOT apply the skill's sessionStorage + verifier model to Google OAuth. A brief disclaimer like "this is + different from OpenRouter OAuth" does not count as contamination. + + 3) Django-appropriate advice: recommends Django-idiomatic patterns + (e.g., a callback view, settings.py config, session-based user + auth) rather than a frontend-only PKCE flow. + +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-stt/eval.yaml b/evals/openrouter-stt/eval.yaml new file mode 100644 index 0000000..85aadaf --- /dev/null +++ b/evals/openrouter-stt/eval.yaml @@ -0,0 +1,33 @@ +name: openrouter-stt-eval +description: | + Evaluation suite for the openrouter-stt skill. 
Validates the agent calls + OpenRouter's NOT-OpenAI-compatible /api/v1/audio/transcriptions endpoint + correctly: JSON body, base64 audio under input_audio.data, format field, + direct fetch/requests (NOT the OpenAI SDK). +skill: openrouter-stt +version: "1.0" +config: + trials_per_task: 1 + timeout_seconds: 300 + parallel: false + executor: copilot-sdk + model: claude-opus-4.7 +metrics: + - name: task_completion + weight: 1.0 + threshold: 0.8 + description: Did the agent write correct STT code? + +hooks: + before_run: + - command: "mkdir -p ~/.agents/skills && rsync -a --delete /Users/matt.apperson/Development/skills/.worktrees/setup-waza/skills/openrouter-stt/ /Users/matt.apperson/.agents/skills/openrouter-stt/" + +graders: + - type: code + name: has_output + config: + assertions: + - "len(output) > 100" + +tasks: + - "tasks/*.yaml" diff --git a/evals/openrouter-stt/tasks/01-happy-path-bash.yaml b/evals/openrouter-stt/tasks/01-happy-path-bash.yaml new file mode 100644 index 0000000..02e8156 --- /dev/null +++ b/evals/openrouter-stt/tasks/01-happy-path-bash.yaml @@ -0,0 +1,74 @@ +id: happy-path-bash-001 +name: Happy Path - Bash Transcription +description: | + Standard request: transcribe an audio file. Agent should write a bash + script using curl that POSTs JSON (not multipart) to + /api/v1/audio/transcriptions with the audio base64-encoded under + input_audio.data. +tags: + - happy-path + - bash + - curl + +inputs: + prompt: | + Write a bash script that transcribes audio.wav to plain text using + OpenRouter. Print only the transcript to stdout. Show me the full + script in your response. 
+ +graders: + - type: code + name: endpoint_and_shape + config: + language: python + assertions: + - '"/api/v1/audio/transcriptions" in output.lower()' + - '"input_audio" in output.lower()' + - '"base64" in output.lower() or "b64" in output.lower()' + + - type: code + name: json_not_multipart + config: + language: python + assertions: + - '"content-type: application/json" in output.lower() or "content-type\":\"application/json" in output.lower() or "\"content-type\": \"application/json\"" in output.lower()' + # Agent should NOT be sending multipart/form-data. + - '"multipart" not in output.lower() and "form-data" not in output.lower()' + + - type: code + name: auth_handled + config: + language: python + assertions: + - '"bearer" in output.lower() and "openrouter_api_key" in output.lower()' + + - type: prompt + name: correctness_rubric + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user asked for a bash script that transcribes audio.wav via + OpenRouter's STT API. Call set_waza_grade_pass or set_waza_grade_fail + once per criterion (4 calls total). + + 1) Correct endpoint: POSTs to + https://openrouter.ai/api/v1/audio/transcriptions. + + 2) Correct body shape: JSON body with model (a real transcription + model slug like google/chirp-3 or openai/whisper-1) and + input_audio: {data: <base64>, format: "wav"}. NOT + multipart/form-data. + + 3) Base64 encoding done correctly: uses base64 on the audio file + and strips newlines, uses --data-binary @file to avoid ARG_MAX + issues with large payloads, does NOT prefix with a data URI + ("data:audio/wav;base64,..."). + + 4) Prints the transcript: parses response JSON and prints only + the .text field to stdout (e.g. via jq -r '.text'). Handles + non-200 HTTP with a clear error. 
+ +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-stt/tasks/02-python-requests.yaml b/evals/openrouter-stt/tasks/02-python-requests.yaml new file mode 100644 index 0000000..a57659a --- /dev/null +++ b/evals/openrouter-stt/tasks/02-python-requests.yaml @@ -0,0 +1,67 @@ +id: python-requests-001 +name: Python Transcription via requests +description: | + Python flow using the requests library (or equivalent). Critical + negative test: the agent should NOT use the OpenAI SDK, because the + endpoint is not OpenAI-compatible (documented in the skill). +tags: + - happy-path + - python + - not-openai-sdk + +inputs: + prompt: | + Write a Python script that transcribes a local audio.wav file using + OpenRouter and prints the transcript. Show me the complete code in + your response. + +graders: + - type: code + name: uses_direct_http + config: + language: python + assertions: + - '"requests" in output.lower() or "httpx" in output.lower() or "urllib" in output.lower() or "aiohttp" in output.lower()' + - '"/api/v1/audio/transcriptions" in output.lower()' + + - type: code + name: avoids_openai_sdk + config: + language: python + assertions: + # Agent must not use the OpenAI Python SDK for this endpoint. + - '"from openai" not in output.lower() and "import openai" not in output.lower()' + - '"client.audio.transcriptions.create" not in output.lower()' + + - type: code + name: base64_encoding + config: + language: python + assertions: + - '"base64" in output.lower()' + - '"input_audio" in output.lower()' + + - type: prompt + name: python_quality + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user asked for a Python STT script. Call set_waza_grade_pass + or set_waza_grade_fail once per criterion (3 calls total). + + 1) Correct HTTP approach: uses requests / httpx / urllib / aiohttp + (NOT the OpenAI SDK) to POST JSON to + https://openrouter.ai/api/v1/audio/transcriptions. 
+ + 2) Correct body: JSON body with model (real transcription slug), + input_audio.data set to base64-encoded file bytes (no data: + URI prefix), input_audio.format matching the audio container + ("wav"). + + 3) Prints transcript: on success reads response.json()["text"] + and prints it. Handles non-200 with a clear error. + +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-stt/tasks/03-typescript-fetch.yaml b/evals/openrouter-stt/tasks/03-typescript-fetch.yaml new file mode 100644 index 0000000..136cb16 --- /dev/null +++ b/evals/openrouter-stt/tasks/03-typescript-fetch.yaml @@ -0,0 +1,57 @@ +id: typescript-fetch-001 +name: TypeScript Transcription via fetch +description: | + TypeScript flow using native fetch. Tests the skill's documented + TypeScript example is reproducible by the agent. +tags: + - happy-path + - typescript + - fetch + +inputs: + prompt: | + Write a TypeScript function that transcribes a local audio file using + OpenRouter's STT and returns the transcript as a string. Include the + complete code in your response. + +graders: + - type: code + name: ts_shape + config: + language: python + assertions: + - '"fetch" in output.lower()' + - '"/api/v1/audio/transcriptions" in output.lower()' + - '"input_audio" in output.lower()' + + - type: code + name: avoids_openai_sdk + config: + language: python + assertions: + - '"new openai(" not in output.lower() and "openai.audio.transcriptions" not in output.lower()' + + - type: prompt + name: ts_quality + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user asked for a TypeScript transcription function. Call + set_waza_grade_pass or set_waza_grade_fail once per criterion + (3 calls total). + + 1) Correct endpoint + method: POST to + https://openrouter.ai/api/v1/audio/transcriptions with + Authorization: Bearer from OPENROUTER_API_KEY env var. 
+ + 2) Correct body: JSON body with model (real slug), input_audio: + {data, format} where data is base64-encoded file bytes (no + data: URI prefix). + + 3) Returns transcript: on success returns result.text as a + string. Throws or handles non-ok responses clearly. + +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-stt/tasks/04-anti-trigger-asr-theory.yaml b/evals/openrouter-stt/tasks/04-anti-trigger-asr-theory.yaml new file mode 100644 index 0000000..892b4ad --- /dev/null +++ b/evals/openrouter-stt/tasks/04-anti-trigger-asr-theory.yaml @@ -0,0 +1,53 @@ +id: anti-trigger-asr-theory-001 +name: Anti-Trigger - ASR Theory Question +description: | + Negative test: conceptual question about ASR/STT. Shares keywords + ("STT", "speech-to-text", "ASR") with the skill but is not a + transcription request. The skill should not activate into code + generation. +tags: + - anti-trigger + - negative-test + - trigger-specificity + +inputs: + prompt: | + Can you explain how modern speech-to-text models like Whisper handle + multi-speaker audio? I want to understand if they do speaker + diarization out of the box or if that's a separate step. + +graders: + - type: code + name: no_openrouter_api_call + config: + language: python + assertions: + - "'openrouter.ai/api/v1/audio/transcriptions' not in output.lower()" + - "'input_audio' not in output.lower() or output.lower().count('input_audio') <= 1" + - "'$openrouter_api_key' not in output.lower() and 'os.environ[\"openrouter_api_key\"]' not in output.lower()" + + - type: prompt + name: stays_educational + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user asked a conceptual question about ASR and speaker + diarization. Call set_waza_grade_pass or set_waza_grade_fail once + per criterion (2 calls total). 
+ + 1) Answers the actual question: explains how Whisper handles + multi-speaker input, and clarifies whether speaker diarization + is built-in vs a separate step (correct answer: Whisper does + NOT do speaker diarization natively — that's a separate task, + typically handled with pyannote or similar). + + 2) No OpenRouter contamination: does NOT produce curl commands, + does NOT write a transcription script, does NOT insert code + that calls /api/v1/audio/transcriptions. A brief mention of + OpenRouter models is okay; generating an STT client script + is not. + +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-stt/tasks/05-indirect-trigger-meeting-bot.yaml b/evals/openrouter-stt/tasks/05-indirect-trigger-meeting-bot.yaml new file mode 100644 index 0000000..b843baa --- /dev/null +++ b/evals/openrouter-stt/tasks/05-indirect-trigger-meeting-bot.yaml @@ -0,0 +1,61 @@ +id: indirect-trigger-meeting-bot-001 +name: Indirect Trigger - Meeting Transcription As Component +description: | + Indirect mention: user is building something that needs STT as a piece, + but doesn't ask for STT directly. The skill should activate and wire in + the correct OpenRouter transcription call. +tags: + - happy-path + - indirect-trigger + +inputs: + prompt: | + Write a Python script that reads a meeting recording (meeting.wav) + and generates a summary in summary.md. Show me the full code. + +graders: + - type: code + name: uses_openrouter_stt + config: + language: python + assertions: + # Accept either the full literal path or the path suffix, since agents + # sometimes use a variable like API_BASE for the host prefix. 
+ - '"/audio/transcriptions" in output.lower()' + - '"input_audio" in output.lower()' + - '"meeting.wav" in output.lower()' + + - type: code + name: writes_summary_md + config: + language: python + assertions: + - '"summary.md" in output.lower()' + + - type: prompt + name: pipeline_quality + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user wants a script that transcribes a meeting and writes a + summary to disk. Call set_waza_grade_pass or set_waza_grade_fail + once per criterion (3 calls total). + + 1) STT step correct: reads meeting.wav, base64-encodes it, POSTs + to /api/v1/audio/transcriptions with JSON body + {model, input_audio: {data, format: "wav"}}. Does NOT try the + OpenAI SDK for this endpoint. + + 2) Summarization step present: the transcript is passed to an + LLM chat completion endpoint (OpenRouter /api/v1/chat/completions + or equivalent) to produce a summary. A pipeline that stops at + the transcript does NOT pass this criterion. + + 3) Writes summary.md: the LLM-generated summary is written to + summary.md on disk using standard file I/O. Does NOT just + print to stdout. + +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-tts/eval.yaml b/evals/openrouter-tts/eval.yaml new file mode 100644 index 0000000..daee387 --- /dev/null +++ b/evals/openrouter-tts/eval.yaml @@ -0,0 +1,37 @@ +name: openrouter-tts-eval +description: | + Evaluation suite for the openrouter-tts skill. Validates that the agent + produces code that correctly calls OpenRouter's /api/v1/audio/speech + endpoint with proper auth, body shape, response-format handling, and + file-extension matching. + + Graders are PER-TASK (task-level graders are additive in waza). 
+skill: openrouter-tts +version: "1.0" +config: + trials_per_task: 1 + timeout_seconds: 300 + parallel: false + executor: copilot-sdk + model: claude-opus-4.7 +metrics: + - name: task_completion + weight: 1.0 + threshold: 0.8 + description: Did the agent write correct TTS code? + +# Keep the worktree skill content in sync with what the agent actually reads. +hooks: + before_run: + - command: "mkdir -p ~/.agents/skills && rsync -a --delete /Users/matt.apperson/Development/skills/.worktrees/setup-waza/skills/openrouter-tts/ /Users/matt.apperson/.agents/skills/openrouter-tts/" + +graders: + # Universal: non-empty response. + - type: code + name: has_output + config: + assertions: + - "len(output) > 100" + +tasks: + - "tasks/*.yaml" diff --git a/evals/openrouter-tts/tasks/01-happy-path-curl.yaml b/evals/openrouter-tts/tasks/01-happy-path-curl.yaml new file mode 100644 index 0000000..9fff6a0 --- /dev/null +++ b/evals/openrouter-tts/tasks/01-happy-path-curl.yaml @@ -0,0 +1,75 @@ +id: happy-path-curl-001 +name: Happy Path - Simple TTS via curl +description: | + Standard request: generate an MP3 from a short phrase. Agent should produce + a shell/curl approach that hits /api/v1/audio/speech with the right body + fields and saves the raw bytes to an .mp3 file. +tags: + - happy-path + - curl + - bash + +inputs: + prompt: | + Write a bash script that uses OpenRouter to generate an MP3 file saying + "Hello world, this is a test" and saves it to speech.mp3. Show me the + complete script in your response so I can see it. 
+ +graders: + - type: code + name: endpoint_and_auth + config: + language: python + assertions: + - '"/api/v1/audio/speech" in output.lower()' + - '"authorization" in output.lower() or "bearer" in output.lower()' + - '"openrouter_api_key" in output.lower() or "$OPENROUTER_API_KEY" in output' + + - type: code + name: body_shape + config: + language: python + assertions: + - '"model" in output.lower() and "input" in output.lower()' + - '"voice" in output.lower()' + - '"response_format" in output.lower() or "format" in output.lower()' + + - type: code + name: format_extension_match + config: + language: python + assertions: + # If the script sets response_format: mp3, output file must also be .mp3. + - '("mp3" in output.lower() and "speech.mp3" in output) or ("pcm" not in output.lower())' + + - type: prompt + name: correctness_rubric + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user asked for a bash script that calls OpenRouter's TTS API and + saves the result as speech.mp3. Evaluate the agent's response on + these criteria. Call set_waza_grade_pass or set_waza_grade_fail once + per criterion (4 calls total). + + 1) Correct endpoint + method: POST to + https://openrouter.ai/api/v1/audio/speech with JSON body. + + 2) Correct body fields: JSON body includes model (a real TTS model + slug like openai/gpt-4o-mini-tts-2025-12-15), input (the user's + text), voice (a real voice like alloy/nova), and + response_format set to "mp3". + + 3) Raw-bytes handling: the script treats the response body as raw + audio bytes, NOT as JSON. It writes bytes to speech.mp3 via + --output/-o or equivalent. Does not try to jq-parse the 200 + response body. + + 4) Auth + API key: uses Authorization: Bearer $OPENROUTER_API_KEY + from environment. Does not hardcode a key or use the wrong + header name. 
+ +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-tts/tasks/02-python-sdk.yaml b/evals/openrouter-tts/tasks/02-python-sdk.yaml new file mode 100644 index 0000000..6f790ec --- /dev/null +++ b/evals/openrouter-tts/tasks/02-python-sdk.yaml @@ -0,0 +1,60 @@ +id: python-sdk-001 +name: Python TTS via OpenAI SDK +description: | + Tests the OpenAI-SDK-compatible path. Agent should use the OpenAI Python + SDK with base_url override, per the skill's documented pattern. +tags: + - happy-path + - python + - sdk + +inputs: + prompt: | + I'm working in a Python project. Show me how to use OpenRouter's TTS + to narrate the first paragraph of a blog post and save it as + narration.mp3. Use the OpenAI Python SDK — I'd rather not shell out. + Include the full code in your response. + +graders: + - type: code + name: openai_sdk_with_base_url + config: + language: python + assertions: + - '"from openai import openai" in output.lower() or "import openai" in output.lower()' + - '"base_url" in output.lower() and "openrouter.ai/api/v1" in output.lower()' + + - type: code + name: audio_speech_call + config: + language: python + assertions: + - '"audio.speech" in output.lower() or "audio/speech" in output.lower()' + - '"stream_to_file" in output.lower() or "write" in output.lower() or "arraybuffer" in output.lower() or "iter_bytes" in output.lower()' + + - type: prompt + name: python_quality + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user asked for a Python-SDK-based TTS implementation. Call + set_waza_grade_pass or set_waza_grade_fail once per criterion + (3 calls total). + + 1) Correct SDK usage: uses the OpenAI Python SDK (not a custom HTTP + call), initializes the client with base_url pointing at + https://openrouter.ai/api/v1 and api_key from + os.environ["OPENROUTER_API_KEY"]. 
+ + 2) Correct TTS call: calls client.audio.speech.create or equivalent + with model (real OpenRouter TTS slug), input (the paragraph), + voice, and response_format="mp3". + + 3) Saves as narration.mp3: the code writes the response bytes to + narration.mp3, using stream_to_file, iter_bytes, or a binary write — not trying to + parse the body as JSON on success. + +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-tts/tasks/03-voice-selection.yaml b/evals/openrouter-tts/tasks/03-voice-selection.yaml new file mode 100644 index 0000000..be55d2e --- /dev/null +++ b/evals/openrouter-tts/tasks/03-voice-selection.yaml @@ -0,0 +1,58 @@ +id: voice-selection-001 +name: Voice Selection +description: | + User asks for a specific voice. The skill documents that voices are + provider-namespaced; the agent should either use a known-valid voice or + look it up via the models endpoint. +tags: + - happy-path + - voice + - provider-specific + +inputs: + prompt: | + I want to generate speech with OpenRouter TTS using the "nova" voice. + Write a TypeScript script that does it and saves the output as + out.mp3. Include the complete script in your response. + +graders: + - type: code + name: uses_nova_voice + config: + language: python + assertions: + - '"nova" in output.lower()' + - '"/api/v1/audio/speech" in output.lower() or "audio.speech" in output.lower() or "audio/speech" in output.lower()' + + - type: code + name: typescript_approach + config: + language: python + assertions: + - '"import" in output.lower() and ("openai" in output.lower() or "fetch" in output.lower())' + - '"out.mp3" in output' + + - type: prompt + name: voice_quality + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user asked for TypeScript code using the "nova" voice. Call + set_waza_grade_pass or set_waza_grade_fail once per criterion + (3 calls total). + + 1) Uses nova: passes voice: "nova" to the API. + + 2) Picks a compatible model: uses an OpenAI TTS model slug (e.g. 
+ openai/gpt-4o-mini-tts-2025-12-15) — because nova is an OpenAI + voice and will not work on Voxtral/Kokoro. The skill explicitly + documents this. + + 3) Saves out.mp3 correctly: writes raw bytes to out.mp3 using the + OpenAI SDK (arrayBuffer → Buffer → writeFile) OR a direct fetch + that extracts the blob/arrayBuffer. + +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-tts/tasks/04-long-input-splitting.yaml b/evals/openrouter-tts/tasks/04-long-input-splitting.yaml new file mode 100644 index 0000000..72cd6a0 --- /dev/null +++ b/evals/openrouter-tts/tasks/04-long-input-splitting.yaml @@ -0,0 +1,57 @@ +id: long-input-001 +name: Long Input - Chunk and Concat +description: | + Tests the skill's Long-Inputs guidance: split at sentence/paragraph + boundaries, same model+voice per chunk, concatenate audio. Agent should + recognize this isn't a single-call scenario. +tags: + - happy-path + - long-input + - chunking + +inputs: + prompt: | + I have a 15,000-character article I want to narrate end-to-end with + OpenRouter TTS. What's the right approach? Show me the code and + explain your strategy inline in the response. + +graders: + - type: code + name: recognizes_chunking_need + config: + language: python + assertions: + - '"split" in output.lower() or "chunk" in output.lower() or "paragraph" in output.lower() or "sentence" in output.lower()' + + - type: code + name: mentions_concatenation + config: + language: python + assertions: + - '"concat" in output.lower() or "ffmpeg" in output.lower() or "combine" in output.lower() or "merge" in output.lower()' + + - type: prompt + name: long_input_strategy + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user has a 15k-character article to narrate via TTS. Call + set_waza_grade_pass or set_waza_grade_fail once per criterion + (3 calls total). 
+ + 1) Recognizes per-request limits: explains or implies TTS has a + per-request character limit and that the full article must be + split, not sent in one call. + + 2) Correct splitting strategy: splits at sentence or paragraph + boundaries, not mid-word. Keeps model + voice consistent across + chunks for prosody continuity. + + 3) Concatenation path: describes or implements concatenating the + resulting audio (e.g., ffmpeg concat, or collecting buffers and + writing them in order). Does NOT skip the concat step. + +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-tts/tasks/05-anti-trigger-tts-theory.yaml b/evals/openrouter-tts/tasks/05-anti-trigger-tts-theory.yaml new file mode 100644 index 0000000..07064ae --- /dev/null +++ b/evals/openrouter-tts/tasks/05-anti-trigger-tts-theory.yaml @@ -0,0 +1,55 @@ +id: anti-trigger-tts-theory-001 +name: Anti-Trigger - TTS Theory Question +description: | + Negative test: a theoretical question about TTS that shares keywords + ("TTS", "text-to-speech") but is NOT asking to generate audio. The skill + should not activate into code generation, and should not produce + OpenRouter-specific TTS API calls in the response. +tags: + - anti-trigger + - negative-test + - trigger-specificity + +inputs: + prompt: | + What are the main differences between neural TTS models and the older + concatenative synthesis approach? I'm writing a blog post and want to + explain it clearly. + +graders: + - type: code + name: no_openrouter_api_call + config: + language: python + assertions: + # Response should not contain OpenRouter endpoints or API-key references, + # since this is an educational question, not a generation task. + - "'openrouter.ai/api/v1/audio/speech' not in output.lower()" + - "'$openrouter_api_key' not in output.lower() and 'os.environ[\"openrouter_api_key\"]' not in output.lower()" + # Mentioning OpenRouter by name once in context ("OpenRouter supports various models") is fine; full code is not. 
+ - "output.lower().count('response_format') <= 1" + + - type: prompt + name: stays_educational + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user asked a conceptual/educational question about TTS + approaches. Call set_waza_grade_pass or set_waza_grade_fail once + per criterion (2 calls total). + + 1) Answers the actual question: explains differences between + neural TTS (e.g., Tacotron, WaveNet, modern diffusion-based + models) and concatenative synthesis (unit selection from + recorded speech databases). Covers quality, flexibility, + compute requirements, or similar dimensions. + + 2) No OpenRouter contamination: does NOT produce a curl command, + does NOT write a script that calls /api/v1/audio/speech, does + NOT suggest "here's how to do it with OpenRouter" — this is a + conceptual question, not a generation request. + +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-tts/tasks/06-indirect-trigger-voice-assistant.yaml b/evals/openrouter-tts/tasks/06-indirect-trigger-voice-assistant.yaml new file mode 100644 index 0000000..9d634ee --- /dev/null +++ b/evals/openrouter-tts/tasks/06-indirect-trigger-voice-assistant.yaml @@ -0,0 +1,62 @@ +id: indirect-trigger-voice-assistant-001 +name: Indirect Trigger - TTS As Component +description: | + Tests the indirect-mention pattern documented in the repo's memory: + the user is building something that needs TTS as a component (a voice + assistant) but hasn't asked for TTS directly. The skill should activate + and the agent should wire in /api/v1/audio/speech for the spoken output. +tags: + - happy-path + - indirect-trigger + +inputs: + prompt: | + Write a Node.js script for a simple voice greeting system: it takes + a user's name as input and produces an audio file that says + "Hello, <name>, welcome!" in a friendly voice. Save the audio as + greeting-<name>.mp3. Show me the complete script in your response. 
+ +graders: + - type: code + name: uses_openrouter_tts + config: + language: python + assertions: + - '"/api/v1/audio/speech" in output.lower() or "audio.speech" in output.lower()' + - '"openrouter_api_key" in output.lower() or "$OPENROUTER_API_KEY" in output' + + - type: code + name: dynamic_filename_and_content + config: + language: python + assertions: + # Script must use the name variable in both the spoken text AND the filename. + - '"greeting-" in output.lower() or "greeting_" in output.lower()' + - '"welcome" in output.lower() or "hello" in output.lower()' + + - type: prompt + name: component_assembly + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user wants a voice greeting system. Call set_waza_grade_pass + or set_waza_grade_fail once per criterion (3 calls total). + + 1) Correctly uses TTS: calls OpenRouter's TTS endpoint + (/api/v1/audio/speech) with the greeting text and a real voice, + writes the response bytes to the .mp3 file. Does NOT try to + generate a placeholder or skip the audio step. + + 2) Dynamic personalization: the name is injected into BOTH the + spoken input ("Hello, Alice, welcome!") AND the output + filename (greeting-alice.mp3). A single hardcoded value for + both does NOT pass this criterion. + + 3) Complete script: the code is runnable as a Node.js script + (has imports, an entry point, reads name from argv or function + arg, handles the TTS response as raw bytes, writes the file). + +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-typescript-sdk/eval.yaml b/evals/openrouter-typescript-sdk/eval.yaml new file mode 100644 index 0000000..086ecc8 --- /dev/null +++ b/evals/openrouter-typescript-sdk/eval.yaml @@ -0,0 +1,31 @@ +name: openrouter-typescript-sdk-eval +description: | + TODO: scaffolding only — tasks are generic stubs. Author real tasks + + graders before running baseline. See evals/openrouter-tts for a worked + example. 
+skill: openrouter-typescript-sdk +version: "1.0" +config: + trials_per_task: 1 + timeout_seconds: 300 + parallel: false + executor: copilot-sdk + model: claude-sonnet-4.6 +metrics: + - name: task_completion + weight: 1.0 + threshold: 0.8 + description: Did the skill complete the assigned task? +graders: + - type: code + name: has_output + config: + assertions: + - "len(output) > 0" + - type: text + name: relevant_content + config: + regex_match: + - "(?i)(explain|describe|analyze|implement)" +tasks: + - "tasks/*.yaml" diff --git a/evals/openrouter-typescript-sdk/fixtures/sample.py b/evals/openrouter-typescript-sdk/fixtures/sample.py new file mode 100644 index 0000000..3f022d1 --- /dev/null +++ b/evals/openrouter-typescript-sdk/fixtures/sample.py @@ -0,0 +1,3 @@ +def hello(name): + """Greet someone by name.""" + return f"Hello, {name}!" diff --git a/evals/openrouter-typescript-sdk/tasks/basic-usage.yaml b/evals/openrouter-typescript-sdk/tasks/basic-usage.yaml new file mode 100644 index 0000000..a08b1a9 --- /dev/null +++ b/evals/openrouter-typescript-sdk/tasks/basic-usage.yaml @@ -0,0 +1,16 @@ +id: basic-usage-001 +name: Basic Usage +description: | + Test that the skill handles a typical request correctly. +tags: + - basic + - happy-path +inputs: + prompt: "Help me with this task" + files: + - path: sample.py +expected: + output_contains: + - "function" + outcomes: + - type: task_completed diff --git a/evals/openrouter-typescript-sdk/tasks/edge-case.yaml b/evals/openrouter-typescript-sdk/tasks/edge-case.yaml new file mode 100644 index 0000000..0ff236a --- /dev/null +++ b/evals/openrouter-typescript-sdk/tasks/edge-case.yaml @@ -0,0 +1,11 @@ +id: edge-case-001 +name: Edge Case - Empty Input +description: | + Test that the skill handles edge cases gracefully. 
+tags: + - edge-case +inputs: + prompt: "" +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-typescript-sdk/tasks/should-not-trigger.yaml b/evals/openrouter-typescript-sdk/tasks/should-not-trigger.yaml new file mode 100644 index 0000000..9d85a0d --- /dev/null +++ b/evals/openrouter-typescript-sdk/tasks/should-not-trigger.yaml @@ -0,0 +1,13 @@ +id: should-not-trigger-001 +name: Should Not Trigger +description: | + Test that the skill does NOT activate on unrelated prompts. + This validates trigger specificity. +tags: + - anti-trigger + - negative-test +inputs: + prompt: "What is the weather today?" +expected: + output_not_contains: + - "skill activated" diff --git a/evals/openrouter-video/eval.yaml b/evals/openrouter-video/eval.yaml new file mode 100644 index 0000000..c7e01f6 --- /dev/null +++ b/evals/openrouter-video/eval.yaml @@ -0,0 +1,33 @@ +name: openrouter-video-eval +description: | + Evaluation suite for the openrouter-video skill. Video generation is + async: submit → poll → download. Graders verify the agent writes code + covering all 3 steps, uses real model slugs, and validates parameters + against the models endpoint before submitting. +skill: openrouter-video +version: "1.0" +config: + trials_per_task: 1 + timeout_seconds: 300 + parallel: false + executor: copilot-sdk + model: claude-opus-4.7 +metrics: + - name: task_completion + weight: 1.0 + threshold: 0.8 + description: Did the agent produce correct async video code? 
+ +hooks: + before_run: + - command: "mkdir -p ~/.agents/skills && rsync -a --delete /Users/matt.apperson/Development/skills/.worktrees/setup-waza/skills/openrouter-video/ /Users/matt.apperson/.agents/skills/openrouter-video/" + +graders: + - type: code + name: has_output + config: + assertions: + - "len(output) > 100" + +tasks: + - "tasks/*.yaml" diff --git a/evals/openrouter-video/tasks/01-full-async-flow.yaml b/evals/openrouter-video/tasks/01-full-async-flow.yaml new file mode 100644 index 0000000..1769bbd --- /dev/null +++ b/evals/openrouter-video/tasks/01-full-async-flow.yaml @@ -0,0 +1,75 @@ +id: full-async-flow-001 +name: Happy Path - Full 3-Step Video Generation +description: | + Standard request. Agent should write a bash script covering all three + steps: POST /api/v1/videos, poll the polling_url until status=completed + (handling failed/cancelled/expired terminal states), then GET the content. +tags: + - happy-path + - bash + - async-flow + +inputs: + prompt: | + Write a bash script that generates a short video from this prompt: + "a golden retriever playing fetch on a sunny beach". Save the final + MP4 to video.mp4. Show me the complete script in your response. 
+ +graders: + - type: code + name: submit_step + config: + language: python + assertions: + - '"POST" in output or "-X POST" in output or "method: \"POST\"" in output.lower()' + - '"/videos" in output.lower()' + - '"prompt" in output.lower() and "model" in output.lower()' + + - type: code + name: poll_step + config: + language: python + assertions: + - '"polling_url" in output.lower() or "poll" in output.lower()' + - '"status" in output.lower() and "completed" in output.lower()' + - '"failed" in output.lower() and ("cancelled" in output.lower() or "canceled" in output.lower() or "expired" in output.lower())' + + - type: code + name: download_step + config: + language: python + assertions: + - '"video.mp4" in output.lower()' + - '"authorization" in output.lower() or "bearer" in output.lower()' + + - type: prompt + name: async_flow_quality + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user asked for a bash script that generates a video from a + prompt using OpenRouter's async video API. Call set_waza_grade_pass + or set_waza_grade_fail once per criterion (4 calls total). + + 1) Submit step: POST to https://openrouter.ai/api/v1/videos with + JSON body containing model (real slug like google/veo-3.1) and + prompt. Auth via Authorization: Bearer $OPENROUTER_API_KEY. + + 2) Poll step: polls the polling_url returned from submit (with + the auth header) at a reasonable interval (30s or so), breaks + on status="completed", exits with an error on "failed", + "cancelled", or "expired" and prints the .error field. + + 3) Download step: downloads the MP4 from the completed response + (unsigned_urls[0] or /api/v1/videos/{id}/content) WITH the + Authorization header, saves to video.mp4. Does NOT try to + stream output without auth. + + 4) Correct overall: recognizes video generation is async and + does NOT expect the POST to return video bytes directly. + Explains the delay to the user or handles it gracefully. 
+ +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-video/tasks/02-model-parameter-validation.yaml b/evals/openrouter-video/tasks/02-model-parameter-validation.yaml new file mode 100644 index 0000000..d700161 --- /dev/null +++ b/evals/openrouter-video/tasks/02-model-parameter-validation.yaml @@ -0,0 +1,60 @@ +id: model-parameter-validation-001 +name: Parameter Validation Against Models Endpoint +description: | + User asks for specific video params (duration, aspect ratio). The skill + says to fetch model capabilities first and only send values from the + returned sets — guessing causes 400s. +tags: + - happy-path + - parameters + - validation + +inputs: + prompt: | + I want to generate a 6-second vertical (9:16) video with google/veo-3.1 + from the prompt "a sunrise over the ocean". Write the bash to submit + the job. Show me the script inline. + +graders: + - type: code + name: checks_model_capabilities + config: + language: python + assertions: + - '"videos/models" in output.lower() or "supported_resolutions" in output.lower() or "supported_aspect_ratios" in output.lower() or "supported_durations" in output.lower()' + + - type: code + name: passes_correct_params + config: + language: python + assertions: + - '"duration" in output.lower() and "6" in output' + - '"9:16" in output or "aspect_ratio" in output.lower()' + - '"google/veo-3.1" in output.lower()' + + - type: prompt + name: validation_approach + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user asked for a 6s, 9:16 video with google/veo-3.1. Call + set_waza_grade_pass or set_waza_grade_fail once per criterion + (3 calls total). + + 1) Validates first OR uses known-good values: either fetches + /api/v1/videos/models to check supported_durations contains 6 + and supported_aspect_ratios contains "9:16" before submitting, + OR notes the skill's "don't guess" guidance and uses values + straight from the model's documented capability set. 
+ + 2) Submits with correct params: POST body includes + duration: 6, aspect_ratio: "9:16" (or size), model: + "google/veo-3.1", prompt: (user's text). + + 3) Async-aware: does not expect the POST to return video bytes, + either polls or explains the next step is polling. + +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-video/tasks/03-image-to-video.yaml b/evals/openrouter-video/tasks/03-image-to-video.yaml new file mode 100644 index 0000000..e5dad62 --- /dev/null +++ b/evals/openrouter-video/tasks/03-image-to-video.yaml @@ -0,0 +1,70 @@ +id: image-to-video-001 +name: Image-to-Video with frame_images +description: | + User wants to animate an image. Skill documents that frame_images[] + carries {type: "image_url", image_url: {url}, frame_type: "first_frame"}. +tags: + - happy-path + - image-to-video + - frame_images + +inputs: + prompt: | + I have a local image start.png. Animate it into a 4-second video using + google/veo-3.1 with the prompt "camera slowly zooms out". Save as + animation.mp4. Include the complete bash script. + +graders: + - type: code + name: frame_images_structure + config: + language: python + assertions: + - '"frame_images" in output.lower()' + - '"first_frame" in output.lower() or "frame_type" in output.lower()' + - '"image_url" in output.lower()' + + - type: code + name: local_image_encoding + config: + language: python + assertions: + # Local image → base64 data URL + - '"base64" in output.lower() or "b64" in output.lower()' + - '"data:image" in output.lower() or "data:" in output' + + - type: code + name: uses_specified_params + config: + language: python + assertions: + - '"google/veo-3.1" in output.lower()' + - '"animation.mp4" in output.lower()' + - '"zoom" in output.lower()' + + - type: prompt + name: i2v_quality + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user wants to animate start.png into animation.mp4 via + google/veo-3.1 with a zoom prompt. 
Call set_waza_grade_pass or + set_waza_grade_fail once per criterion (3 calls total). + + 1) Correct frame_images shape: POST body includes frame_images + as an array where each entry is + {type: "image_url", image_url: {url}, frame_type: "first_frame"}. + Does NOT use input_references (that's for reference-to-video + style guidance, not for animating an image directly). + + 2) Local image encoded correctly: converts start.png to a base64 + data URL with a proper image MIME type prefix + (data:image/png;base64,...), NOT just raw base64 without the prefix. + + 3) Full async flow: submits with model, prompt, frame_images; + polls for completion; downloads final video as animation.mp4. + +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-video/tasks/04-anti-trigger-video-theory.yaml b/evals/openrouter-video/tasks/04-anti-trigger-video-theory.yaml new file mode 100644 index 0000000..35e6813 --- /dev/null +++ b/evals/openrouter-video/tasks/04-anti-trigger-video-theory.yaml @@ -0,0 +1,50 @@ +id: anti-trigger-video-theory-001 +name: Anti-Trigger - Video Gen Theory Question +description: | + Negative test: conceptual question about video generation models. Shares + keywords but is not a generation request. +tags: + - anti-trigger + - negative-test + - trigger-specificity + +inputs: + prompt: | + What's the difference between diffusion-based video models like Sora + and older GAN-based approaches for video generation? I'm writing a + primer for my blog. + +graders: + - type: code + name: no_openrouter_api_call + config: + language: python + assertions: + - "'openrouter.ai/api/v1/videos' not in output.lower()" + - "'polling_url' not in output.lower()" + - "'$openrouter_api_key' not in output.lower()" + + - type: prompt + name: stays_educational + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user asked a conceptual question about video generation + architectures. 
Call set_waza_grade_pass or set_waza_grade_fail + once per criterion (2 calls total). + + 1) Answers the question: explains diffusion video models (noise + → latent video via iterative denoising, e.g., Sora, Veo) vs + GAN-based video (generator/discriminator adversarial training, + e.g., VGAN, MoCoGAN), covering differences in training + stability, output quality, temporal coherence, or compute. + + 2) No OpenRouter contamination: does NOT produce curl commands, + does NOT invoke /api/v1/videos, does NOT include any script + that submits a generation job. A brief mention of "OpenRouter + exposes these models" is fine; generating code is not. + +expected: + outcomes: + - type: task_completed diff --git a/evals/openrouter-video/tasks/05-indirect-trigger-explainer.yaml b/evals/openrouter-video/tasks/05-indirect-trigger-explainer.yaml new file mode 100644 index 0000000..1982391 --- /dev/null +++ b/evals/openrouter-video/tasks/05-indirect-trigger-explainer.yaml @@ -0,0 +1,56 @@ +id: indirect-trigger-explainer-001 +name: Indirect Trigger - Product Explainer Pipeline +description: | + Indirect mention: user builds a pipeline that needs video as a + component. The skill should activate without being asked directly for + "video generation". +tags: + - happy-path + - indirect-trigger + +inputs: + prompt: | + Write a Node.js script that takes a product description as input and + produces a 4-second explainer clip in clip.mp4. Use OpenRouter. + Include the full code. 
+ +graders: + - type: code + name: uses_video_endpoint + config: + language: python + assertions: + - '"/videos" in output.lower()' + - '"clip.mp4" in output.lower()' + + - type: code + name: handles_async + config: + language: python + assertions: + - '"polling_url" in output.lower() or ("poll" in output.lower() and "status" in output.lower())' + - '"completed" in output.lower()' + + - type: prompt + name: pipeline_quality + config: + model: openai/gpt-chat-latest + continue_session: true + prompt: | + The user wants a Node.js explainer-video pipeline. Call + set_waza_grade_pass or set_waza_grade_fail once per criterion + (3 calls total). + + 1) Takes description as input: reads the product description + from argv, stdin, or function arg — not hardcoded. + + 2) Uses video API correctly: submits to /api/v1/videos with the + description as the prompt, picks a real video-gen model slug, + passes duration: 4. Polls for completion, downloads the MP4. + + 3) Writes clip.mp4: saves the resulting video bytes to clip.mp4 + with the auth header on the download call. + +expected: + outcomes: + - type: task_completed diff --git a/skills/openrouter-typescript-sdk/SKILL.md b/skills/openrouter-typescript-sdk/SKILL.md index 7fc11cb..b4f4728 100644 --- a/skills/openrouter-typescript-sdk/SKILL.md +++ b/skills/openrouter-typescript-sdk/SKILL.md @@ -8,6 +8,12 @@ version: 2.0.0 A comprehensive TypeScript SDK for interacting with OpenRouter's unified API, providing access to 300+ AI models through a single, type-safe interface. This skill enables AI agents to leverage the `callModel` pattern for text generation, tool usage, streaming, and multi-turn conversations. 
+## Resolving Model Names Before Use + +**If the user names a specific model — exact ID, informal alias, or passing mention ("use GLM", "hit the latest Claude") — resolve it to an exact OpenRouter ID BEFORE writing any SDK code that passes the model string.** Load the `openrouter-models` skill and run its `resolve-model.ts` with the user's phrase. Do not guess a model ID, do not query `/api/v1/models` directly, do not hardcode a string like `"glm"` or `"claude-4"` into the `model:` field. + +After resolution, use the exact `id` (e.g. `z-ai/glm-4.5`, `anthropic/claude-opus-4.7`) in `callModel({ model: ... })`. If the resolver returns medium/low confidence, surface the chosen match to the user or ask for confirmation. + The SDK is split into two packages: - **`@openrouter/agent`** — Agent features: `callModel`, `tool()`, stop conditions, streaming, format converters - **`@openrouter/sdk`** — Platform features: model listing, chat completions, credits, OAuth, API key management