/**
 * React Query mutation hook that launches automated research for a scenario.
 *
 * The mutation resolves a Tangent OpenCode instance, opens a new session
 * titled after the scenario's plan name, builds the autoresearch prompt and
 * sends it to that session. A toast reports success or failure.
 *
 * @returns The `useMutation` result; call `mutate(scenario)` to start a run.
 */
export function useRunAutomatedResearch() {
  const notify = useToastNotification();

  // Steps run strictly in sequence: each one needs the previous step's id.
  async function startResearch(scenario: ScenarioEntry) {
    const instanceId = await resolveInstanceId();
    const sessionTitle = `Autoresearch: ${scenario.plan.name}`;
    const sessionId = await createOpencodeSession(instanceId, sessionTitle);
    await sendAutoresearchMessage(
      instanceId,
      sessionId,
      buildAutoresearchPrompt(scenario),
    );
  }

  return useMutation({
    mutationFn: startResearch,
    onSuccess() {
      notify("Automated research started", "success");
    },
    onError(error) {
      notify(`Failed to start automated research: ${error}`, "error");
    },
  });
}
/**
 * Workspace directory the OpenCode agent runs in. OpenCode scopes sessions to
 * a directory, passed as the `directory` query param (base64
 * `L3Jvb3Qvd29ya3NwYWNl` in the web UI URL).
 */
const OPENCODE_WORKSPACE_DIR = "/root/workspace";

/** Narrow an unknown value to a non-null object so its properties can be probed. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null;
}

/** Optional query string and JSON body forwarded with a proxied POST. */
interface OpencodeProxyPostOptions {
  query?: Record<string, unknown>;
  body?: unknown;
}

/**
 * POST through the backend OpenCode reverse proxy
 * (`/api/tangent/instances/{instanceId}/opencode/api/{path}`), which forwards
 * the path, query string, and body to the in-pod OpenCode server.
 *
 * We call the hey-api client directly rather than the generated proxy fn so the
 * `{path}` segment keeps its slashes (the generated fn would URL-encode them)
 * and so we can attach a query and body.
 *
 * @param instanceId Tangent instance id; URL-encoded here so it always stays a
 *   single path segment.
 * @param path OpenCode API path. Inserted verbatim (slashes preserved), so
 *   callers must encode any dynamic segment they put into it.
 * @returns The response payload as returned by the client (unvalidated).
 * @throws Error when the client reports an error for the request.
 */
async function opencodeProxyPost(
  instanceId: string,
  path: string,
  { query, body }: OpencodeProxyPostOptions = {},
): Promise<unknown> {
  const { data, error } = await client.post({
    url: `/api/tangent/instances/${encodeURIComponent(instanceId)}/opencode/api/${path}`,
    query,
    body,
  });

  if (error) {
    throw new Error(
      `OpenCode request failed (${path}): ${JSON.stringify(error)}`,
    );
  }

  return data;
}

/**
 * Resolve a Tangent OpenCode instance to talk to, mirroring the backend
 * `/api/tangent/go` logic: reuse the earliest existing instance, otherwise
 * create one.
 *
 * @returns The id of the reused or newly created instance.
 * @throws Error when listing fails, or when creation fails or returns no data.
 */
export async function resolveInstanceId(): Promise<string> {
  const { data, error } = await listInstancesApiTangentInstancesGet();
  if (error) {
    throw new Error(
      `Failed to list Tangent instances: ${JSON.stringify(error)}`,
    );
  }

  // NOTE(review): ids are ordered lexicographically; this only picks the
  // "earliest" instance if ids sort by creation time — confirm against backend.
  const [firstInstanceId] = (data?.instances ?? [])
    .map((instance) => instance.instance_id)
    .sort();
  if (firstInstanceId !== undefined) {
    return firstInstanceId;
  }

  const created = await createInstanceApiTangentInstancesPost();
  if (created.error || !created.data) {
    // An empty response with no error would otherwise be reported as "undefined".
    const reason = created.error
      ? JSON.stringify(created.error)
      : "response did not include an instance";
    throw new Error(`Failed to create Tangent instance: ${reason}`);
  }
  return created.data.instance_id;
}

/**
 * Create a fresh OpenCode session in the workspace directory and return its id.
 *
 * @param instanceId Tangent instance that hosts the OpenCode server.
 * @param title Human-readable session title.
 * @throws Error when the request fails or the response carries no string `id`.
 */
export async function createOpencodeSession(
  instanceId: string,
  title: string,
): Promise<string> {
  const data = await opencodeProxyPost(instanceId, "session", {
    query: { directory: OPENCODE_WORKSPACE_DIR },
    body: { title },
  });

  if (!isRecord(data) || typeof data.id !== "string") {
    throw new Error("OpenCode session response did not include an id");
  }
  return data.id;
}

/**
 * Send a prompt to an OpenCode session without waiting for the agent to finish
 * (fire-and-forget via `prompt_async`).
 *
 * @param instanceId Tangent instance that hosts the OpenCode server.
 * @param sessionId Target session; URL-encoded before being placed in the path.
 * @param text Prompt text, sent as a single `text` part.
 * @throws Error when the proxy request fails.
 */
export async function sendAutoresearchMessage(
  instanceId: string,
  sessionId: string,
  text: string,
): Promise<void> {
  await opencodeProxyPost(
    instanceId,
    `session/${encodeURIComponent(sessionId)}/prompt_async`,
    {
      query: { directory: OPENCODE_WORKSPACE_DIR },
      body: { parts: [{ type: "text", text }] },
    },
  );
}
/**
 * Subset of the legacy prompt-builder `p` object, derived from a scenario.
 * `metric` and `metricBaseline` are optional; the builder in this module does
 * not populate them, so the baseline line falls back to "unknown".
 */
interface AutoresearchParams {
  id: string;
  runId: string;
  ideas: ScenarioIdea[];
  metric?: string;
  metricBaseline?: string;
}

/**
 * Synthesize a minimal `scenario.yaml` from the saved scenario plan. Populated
 * plan fields are passed through; unknown fields are emitted as explicit
 * UNVERIFIED placeholders so the agent's pre-flight checks resolve them.
 *
 * @param scenario Saved scenario whose plan is serialized.
 * @param name Value for the top-level `name` key; defaults to the plan name.
 *   It is set on the object before dumping so the YAML serializer handles any
 *   quoting, instead of patching the dumped text afterwards.
 * @returns The YAML document as produced by `yaml.dump` (keys in insertion order).
 */
function buildScenarioYaml(
  scenario: ScenarioEntry,
  name: string = scenario.plan.name,
): string {
  const { plan } = scenario;

  const yamlObject: Record<string, unknown> = {
    name,
    description: plan.description,
    pipeline: plan.pipeline ?? {
      path: UNVERIFIED,
      baseline_run_id: scenario.run.runId,
    },
    metrics: plan.metrics ?? { target: { path: UNVERIFIED } },
    search_space: plan.search_space ?? {},
  };

  // Optional sections are emitted only when the plan provides them.
  if (plan.experiment_actions) {
    yamlObject.experiment_actions = plan.experiment_actions;
  }
  if (plan.research) yamlObject.research = plan.research;
  if (plan.budget) yamlObject.budget = plan.budget;
  if (plan.timing) yamlObject.timing = plan.timing;
  if (plan.failure_playbook) {
    yamlObject.failure_playbook = plan.failure_playbook;
  }

  return yaml.dump(yamlObject, { sortKeys: false });
}

/**
 * Initial MEMORY.md scaffolding for a scenario that has not run yet.
 * Lists at most the first three ideas as hypotheses.
 */
function initialMemoryMd(p: AutoresearchParams): string {
  const metricLine = p.metric
    ? `Baseline ${p.metric}: ${p.metricBaseline ?? "unknown"}`
    : "Baseline metric: unknown";
  const ideasSection =
    p.ideas.length > 0
      ? p.ideas
          .slice(0, 3)
          .map((idea, i) => `${i + 1}. ${idea.title} [${idea.impact}]`)
          .join("\n")
      : "No ideas yet — researcher will generate in Step 1.";

  return `# MEMORY.md

## Best Config
No experiments run yet — round 1 has not completed.
${metricLine}

## Key Lessons
- Starting fresh. Researcher will analyze baseline metrics in Step 1.

## Top Hypotheses to Test
${ideasSection}

## Active Runs
(none yet — will be filled after round 1 submission)

## Experiment Log
(none)
`;
}

/** Tunables for {@link buildAutoresearchPrompt}; every field has a default. */
interface BuildPromptOptions {
  /** Total rounds planned across sessions (only one runs per session). */
  rounds?: number;
  /** When true, tell the agent to take the top hypothesis without asking. */
  autoApprove?: boolean;
  /** GCS prefix under which `<scenario id>/` is written. */
  gcsBasePath?: string;
}

/** Default GCS prefix for scenario state (todo: source this from configuration). */
const DEFAULT_AUTORESEARCH_GCS_BASE =
  "gs://shopify-discovery-relevance/tangent/scenarios";

/**
 * Build the autoresearch prompt for a scenario, ported from the legacy
 * `generateRiverMessage`. The agent is asked to run `tangent auto` for one round
 * and sync state to GCS.
 *
 * @param scenario Scenario whose id, baseline run, ideas and plan are embedded.
 * @param options See {@link BuildPromptOptions}.
 * @returns Markdown prompt containing the instructions, the synthesized
 *   `scenario.yaml` and the initial MEMORY.md.
 */
export function buildAutoresearchPrompt(
  scenario: ScenarioEntry,
  {
    rounds = 3,
    autoApprove = true,
    gcsBasePath = DEFAULT_AUTORESEARCH_GCS_BASE,
  }: BuildPromptOptions = {},
): string {
  const p: AutoresearchParams = {
    id: scenario.id,
    runId: scenario.run.runId,
    ideas: scenario.ideas,
  };

  const gcsPath = `${gcsBasePath.replace(/\/+$/, "")}/${p.id}/`;
  const approveNote = autoApprove
    ? "Auto-approve the first hypothesis — use the top researcher suggestion without waiting for human input."
    : "Pause for hypothesis approval before submitting runs.";
  const memory = initialMemoryMd(p).trim();

  // Ensure the scenario.yaml name matches the pipeline ID to avoid ambiguity.
  // Passing the id into the serializer (rather than regex-patching the dumped
  // text) keeps `$` sequences literal and lets YAML quote the value if needed.
  const yamlFixed = buildScenarioYaml(scenario, p.id).trim();

  // `window` does not exist outside the browser (tests, SSR); don't throw there.
  const currentLocation =
    typeof window !== "undefined" ? window.location.href : "unknown";

  return `Please run \`tangent auto\` for the following scenario — **1 round in this session** (${rounds} total planned across sessions).
This is a multi-session workflow: run round 1, sync state to GCS, then hand off cleanly. The next session resumes from "Active Runs" in MEMORY.md.
${approveNote}

**Current URL:** \`${currentLocation}\`
**Baseline run ID:** \`${p.runId}\`
**Scenario ID (canonical):** \`${p.id}\`
**GCS output path:** \`${gcsPath}\`
Fork the pipeline from the baseline run — do not touch the mainline. After round 1 completes, write MEMORY.md, sessions/, and logs/ to the GCS path above.

**Pre-flight checks (resolve before submitting any runs):**
- [ ] All \`search_space[*].current\` values match the actual baseline run config (not guessed)
- [ ] All \`search_space[*].range\` brackets extend at or above the baseline value for quality-lift experiments
- [ ] All categorical \`choices\` are confirmed against source code validation (no guessed enum values)
- [ ] \`metrics.target.path\` key exists in a downloaded baseline metrics artifact
- [ ] Any UNVERIFIED fields in the YAML are resolved or explicitly noted in MEMORY.md before runs are submitted

**Auto-approvals — no need to confirm these with me:**
- Fix \`score_transform\` (or any categorical) choices to match what the pipeline code actually accepts.
- If baseline HPs fall outside declared search space ranges, widen the ranges to include the baseline.
- Resolve all metric paths from the latest baseline run artifact — override any placeholder names in the YAML.
- If the primary metric is monotonically dominated by a boundary value, apply option B: add a hard floor at 10% above the lower bound and continue.
- Submit sentinel immediately without waiting for confirmation.
- Stage round 1 immediately after sentinel submission — do not wait for it to complete before planning.
- Hold and ask only if: pipeline export fails, auth is broken, sentinel metric deviates >5% from stated baseline, or a guard metric key is missing from the artifact entirely.

---
**scenario.yaml:**
\`\`\`yaml
${yamlFixed}
\`\`\`

---
**MEMORY.md:**
\`\`\`
${memory}
\`\`\``;
}
ScenarioDetailProps) { +function ScenarioDetail({ + scenario, + onBack, + onRunResearch, + isResearchPending, +}: ScenarioDetailProps) { return ( @@ -151,6 +180,13 @@ function ScenarioDetail({ scenario, onBack }: ScenarioDetailProps) { {scenario.plan.name} + {scenario.rationale} @@ -183,6 +219,8 @@ export function MlExperimentPlannerContent({ selectedScenarioId, }: MlExperimentPlannerContentProps) { const { scenarios } = useRunScenarios(runId); + const { mutate: runResearch, isPending: isResearchPending } = + useRunAutomatedResearch(); const [selectedId, setSelectedId] = useState( selectedScenarioId ?? null, ); @@ -211,6 +249,8 @@ export function MlExperimentPlannerContent({ setSelectedId(null)} + onRunResearch={() => runResearch(selectedScenario)} + isResearchPending={isResearchPending} /> ); @@ -226,6 +266,7 @@ export function MlExperimentPlannerContent({ Name Ideas Created + Actions @@ -234,6 +275,8 @@ export function MlExperimentPlannerContent({ key={scenario.id} scenario={scenario} onSelect={() => setSelectedId(scenario.id)} + onRunResearch={() => runResearch(scenario)} + isResearchPending={isResearchPending} /> ))}