From 3bde499c7212e83941d45d31e285bf315c83dcbb Mon Sep 17 00:00:00 2001 From: Shunsuke Date: Tue, 9 Jun 2026 18:51:11 +0800 Subject: [PATCH] refactor: make EnvAdapter.reflect a shared default (fixes dropped reflect kwargs) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All six adapters duplicated an identical reflect() that delegates to run_minibatch_reflect. The copies had drifted: OfficeQA/DocVQA silently dropped meta_skill_context and ALFWorld dropped update_mode, so those analysts ran without inputs every other benchmark receives (active under the default use_meta_skill: true). Move the delegation into EnvAdapter.reflect as one default that forwards all kwargs uniformly, and delete the six overrides. reflect is no longer abstract — adapters inherit it and override only for custom logic. Net -225 lines. Behavior change: OfficeQA/DocVQA/ALFWorld reflect now receive the kwargs they previously dropped; the three already-correct benchmarks are unaffected. Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/guide/new-benchmark.md | 34 ++--------- skillopt/envs/_template/README.md | 8 +-- skillopt/envs/_template/env_template.py | 57 ++----------------- skillopt/envs/alfworld/adapter.py | 31 ---------- skillopt/envs/base.py | 34 ++++++++--- skillopt/envs/docvqa/adapter.py | 25 -------- .../envs/livemathematicianbench/adapter.py | 33 ----------- skillopt/envs/officeqa/adapter.py | 23 -------- skillopt/envs/searchqa/adapter.py | 33 ----------- skillopt/envs/spreadsheetbench/adapter.py | 33 ----------- 10 files changed, 43 insertions(+), 268 deletions(-) diff --git a/docs/guide/new-benchmark.md b/docs/guide/new-benchmark.md index 6d2f009a..d9e314b5 100644 --- a/docs/guide/new-benchmark.md +++ b/docs/guide/new-benchmark.md @@ -161,13 +161,10 @@ Two design points worth flagging: ```python from __future__ import annotations -import os - from skillopt.datasets.base import BatchSpec from skillopt.envs.base import EnvAdapter from skillopt.envs.docfaithful.loader import DocFaithfulDataLoader from skillopt.envs.docfaithful.rollout import run_batch -from skillopt.gradient.reflect import run_minibatch_reflect class DocFaithfulAdapter(EnvAdapter): @@ -234,7 +231,7 @@ class DocFaithfulAdapter(EnvAdapter): ) return self.build_env_from_batch(batch, **kwargs) - # ── The two real action methods ───────────────────────────────────── + # ── The rollout method (reflect is inherited) ─────────────────────── def rollout(self, env_manager, skill_content: str, out_dir: str, **kwargs) -> list[dict]: @@ -247,27 +244,9 @@ class DocFaithfulAdapter(EnvAdapter): max_completion_tokens=self.max_completion_tokens, ) - def reflect(self, results: list[dict], skill_content: str, - out_dir: str, **kwargs) -> list[dict | None]: - return run_minibatch_reflect( - results=results, - skill_content=skill_content, - prediction_dir=kwargs.get( - "prediction_dir", os.path.join(out_dir, "predictions") - ), - patches_dir=kwargs.get( - "patches_dir", os.path.join(out_dir, "patches") - ), - workers=self.analyst_workers, - failure_only=self.failure_only, - minibatch_size=self.minibatch_size, - edit_budget=self.edit_budget, - random_seed=kwargs.get("random_seed"), - error_system=self.get_error_minibatch_prompt(), - success_system=self.get_success_minibatch_prompt(), - step_buffer_context=kwargs.get("step_buffer_context", ""), - update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"), - ) + # reflect() is inherited from EnvAdapter — it delegates to + # run_minibatch_reflect with your analyst_error_* / analyst_success_* + # prompts. Override it only if you need custom reflection logic. def get_task_types(self) -> list[str]: seen: list[str] = [] @@ -373,9 +352,8 @@ If you get `ValueError: Unknown environment 'docfaithful'. Available: [...]`, you forgot Step 5. If you get `TypeError: Can't instantiate abstract class DocFaithfulAdapter`, -you forgot to implement one of the five abstract methods on `EnvAdapter`: -`build_train_env`, `build_eval_env`, `rollout`, `reflect`, -`get_task_types`. +you forgot to implement one of the four abstract methods on `EnvAdapter`: +`build_train_env`, `build_eval_env`, `rollout`, `get_task_types`. ## Tips diff --git a/skillopt/envs/_template/README.md b/skillopt/envs/_template/README.md index 787efe24..7830bbc7 100644 --- a/skillopt/envs/_template/README.md +++ b/skillopt/envs/_template/README.md @@ -5,8 +5,8 @@ This directory provides scaffold files for adding a new benchmark to SkillOpt. ## Files - `env_template.py` — Environment adapter template (subclasses - `EnvAdapter`; implements the 5 abstract methods so the file is - instantiable out of the box). + `EnvAdapter`; implements the 4 abstract methods so the file is + instantiable out of the box — `reflect` is inherited). - `loader_template.py` — Data loader template (subclasses `SplitDataLoader`; implements `load_split_items` for `.json`/`.jsonl`). - `config_template.yaml` — Config file template. @@ -28,8 +28,8 @@ This directory provides scaffold files for adding a new benchmark to SkillOpt. `TemplateBenchmarkLoader → YourBenchmarkLoader`) and fix the cross-import in `adapter.py`. 3. **Implement the TODO blocks** inside `adapter.py:rollout` and the - `_normalize_item` helper in `loader.py`. If you want real reflection, - uncomment the `run_minibatch_reflect` block in `adapter.py:reflect`. + `_normalize_item` helper in `loader.py`. (`reflect` is inherited from + `EnvAdapter`; override it only for custom reflection logic.) 4. **Register** the adapter — add a `try / except ImportError` block in `scripts/train.py`'s `_register_builtins()` mapping the registry key to your `YourBenchmarkAdapter` class. There is no diff --git a/skillopt/envs/_template/env_template.py b/skillopt/envs/_template/env_template.py index 63a70b19..330b9533 100644 --- a/skillopt/envs/_template/env_template.py +++ b/skillopt/envs/_template/env_template.py @@ -14,13 +14,9 @@ """ from __future__ import annotations -import os - from skillopt.datasets.base import BatchSpec from skillopt.envs.base import EnvAdapter from skillopt.envs._template.loader_template import TemplateBenchmarkLoader -# When you wire in real reflection, also import: -# from skillopt.gradient.reflect import run_minibatch_reflect class TemplateBenchmarkEnv(EnvAdapter): @@ -131,53 +127,12 @@ def rollout( ) return results - # ── Reflect: turn rollout results into patch dicts ───────────────── - - def reflect( - self, - results: list[dict], - skill_content: str, - out_dir: str, - **kwargs, - ) -> list[dict | None]: - """ - Turn rollouts into a list of raw patch dicts (or None to drop). - - Each non-None dict MUST have: - - "patch": {"edits": [...]} a Patch.to_dict() payload - - "source_type": "failure" | "success" - - Most benchmarks delegate to - :func:`skillopt.gradient.reflect.run_minibatch_reflect` which - will call the optimizer model with the - ``analyst_error_*`` / ``analyst_success_*`` prompts. To enable it, - uncomment the import above and call: - - from skillopt.gradient.reflect import run_minibatch_reflect - return run_minibatch_reflect( - results=results, - skill_content=skill_content, - prediction_dir=kwargs.get( - "prediction_dir", os.path.join(out_dir, "predictions") - ), - patches_dir=kwargs.get( - "patches_dir", os.path.join(out_dir, "patches") - ), - workers=self.analyst_workers, - failure_only=self.failure_only, - minibatch_size=self.minibatch_size, - edit_budget=self.edit_budget, - random_seed=kwargs.get("random_seed"), - error_system=self.get_error_minibatch_prompt(), - success_system=self.get_success_minibatch_prompt(), - step_buffer_context=kwargs.get("step_buffer_context", ""), - update_mode=getattr(self, "_cfg", {}).get( - "skill_update_mode", "patch" - ), - ) - """ - # Template default: produce no patches (no-op trainer step). - return [None for _ in results] + # ── Reflect (inherited) ───────────────────────────────────────────── + # + # ``reflect`` is inherited from ``EnvAdapter``: the default delegates to + # ``skillopt.gradient.reflect.run_minibatch_reflect`` using your + # ``analyst_error_*`` / ``analyst_success_*`` prompts. You do NOT need to + # implement it — override only if your benchmark needs custom reflection. # ── Stratification hint ──────────────────────────────────────────── diff --git a/skillopt/envs/alfworld/adapter.py b/skillopt/envs/alfworld/adapter.py index e6891692..18db01b0 100644 --- a/skillopt/envs/alfworld/adapter.py +++ b/skillopt/envs/alfworld/adapter.py @@ -17,7 +17,6 @@ run_alfworld_batch, TASKS, ) -from skillopt.gradient.reflect import run_minibatch_reflect from skillopt.utils import compute_score @@ -425,35 +424,5 @@ def _run_batch( all_results.extend(chunk_results) return all_results - def reflect( - self, - results: list[dict], - skill_content: str, - out_dir: str, - **kwargs, - ) -> list[dict | None]: - prediction_dir = kwargs.get("prediction_dir", os.path.join(out_dir, "predictions")) - patches_dir = kwargs.get("patches_dir", os.path.join(out_dir, "patches")) - random_seed = kwargs.get("random_seed") - step_buffer_context = kwargs.get("step_buffer_context", "") - meta_skill_context = kwargs.get("meta_skill_context", "") - - return run_minibatch_reflect( - results=results, - skill_content=skill_content, - prediction_dir=prediction_dir, - patches_dir=patches_dir, - workers=self.analyst_workers, - failure_only=self.failure_only, - minibatch_size=self.minibatch_size, - edit_budget=self.edit_budget, - random_seed=random_seed, - error_system=self.get_error_minibatch_prompt(), - success_system=self.get_success_minibatch_prompt(), - step_buffer_context=step_buffer_context, - meta_skill_context=meta_skill_context, - ) - - def get_task_types(self) -> list[str]: return list(TASKS) diff --git a/skillopt/envs/base.py b/skillopt/envs/base.py index c2e57eaa..243c2b78 100644 --- a/skillopt/envs/base.py +++ b/skillopt/envs/base.py @@ -231,7 +231,6 @@ def rollout( (float 0-1). May include env-specific fields. """ - @abstractmethod def reflect( self, results: list[dict], @@ -241,15 +240,36 @@ def reflect( ) -> list[dict | None]: """Analyze rollout results and produce patches. + Default implementation: delegate to the shared minibatch reflect + stage. Every built-in benchmark uses this unchanged — override only + if your environment needs custom reflection logic. + Each returned dict conforms to :class:`~skillopt.types.RawPatch`: ``"patch"`` (with ``"edits"`` list) + ``"source_type"`` - (``"failure"`` or ``"success"``). - - Returns - ------- - list[dict | None] - Raw analyst outputs; ``None`` entries are filtered out. + (``"failure"`` or ``"success"``); ``None`` entries are filtered out. """ + from skillopt.gradient.reflect import run_minibatch_reflect + + return run_minibatch_reflect( + results=results, + skill_content=skill_content, + prediction_dir=kwargs.get( + "prediction_dir", os.path.join(out_dir, "predictions") + ), + patches_dir=kwargs.get( + "patches_dir", os.path.join(out_dir, "patches") + ), + workers=self.analyst_workers, + failure_only=self.failure_only, + minibatch_size=self.minibatch_size, + edit_budget=self.edit_budget, + random_seed=kwargs.get("random_seed"), + error_system=self.get_error_minibatch_prompt(), + success_system=self.get_success_minibatch_prompt(), + step_buffer_context=kwargs.get("step_buffer_context", ""), + meta_skill_context=kwargs.get("meta_skill_context", ""), + update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"), + ) @abstractmethod def get_task_types(self) -> list[str]: diff --git a/skillopt/envs/docvqa/adapter.py b/skillopt/envs/docvqa/adapter.py index 91849061..ddf1dbf0 100644 --- a/skillopt/envs/docvqa/adapter.py +++ b/skillopt/envs/docvqa/adapter.py @@ -1,12 +1,9 @@ from __future__ import annotations -import os - from skillopt.datasets.base import BatchSpec from skillopt.envs.base import EnvAdapter from skillopt.envs.docvqa.dataloader import DocVQADataLoader from skillopt.envs.docvqa.rollout import run_batch -from skillopt.gradient.reflect import run_minibatch_reflect class DocVQAAdapter(EnvAdapter): @@ -84,28 +81,6 @@ def rollout(self, env_manager, skill_content: str, out_dir: str, **kwargs) -> li task_timeout=self.exec_timeout, ) - def reflect(self, results: list[dict], skill_content: str, out_dir: str, **kwargs) -> list[dict | None]: - prediction_dir = kwargs.get("prediction_dir", os.path.join(out_dir, "predictions")) - patches_dir = kwargs.get("patches_dir", os.path.join(out_dir, "patches")) - random_seed = kwargs.get("random_seed") - step_buffer_context = kwargs.get("step_buffer_context", "") - return run_minibatch_reflect( - results=results, - skill_content=skill_content, - prediction_dir=prediction_dir, - patches_dir=patches_dir, - workers=self.analyst_workers, - failure_only=self.failure_only, - minibatch_size=self.minibatch_size, - edit_budget=self.edit_budget, - random_seed=random_seed, - error_system=self.get_error_minibatch_prompt(), - success_system=self.get_success_minibatch_prompt(), - step_buffer_context=step_buffer_context, - update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"), - ) - - def get_task_types(self) -> list[str]: seen: list[str] = [] for item in self.dataloader.train_items + self.dataloader.val_items + self.dataloader.test_items: diff --git a/skillopt/envs/livemathematicianbench/adapter.py b/skillopt/envs/livemathematicianbench/adapter.py index 554b0675..ef96c864 100644 --- a/skillopt/envs/livemathematicianbench/adapter.py +++ b/skillopt/envs/livemathematicianbench/adapter.py @@ -2,10 +2,8 @@ from __future__ import annotations import json -import os from skillopt.datasets.base import BatchSpec -from skillopt.gradient.reflect import run_minibatch_reflect from skillopt.envs.base import EnvAdapter from skillopt.envs.livemathematicianbench.dataloader import LiveMathematicianBenchDataLoader from skillopt.envs.livemathematicianbench.rollout import run_batch @@ -127,36 +125,5 @@ def rollout( task_timeout=self.exec_timeout, ) - def reflect( - self, - results: list[dict], - skill_content: str, - out_dir: str, - **kwargs, - ) -> list[dict | None]: - prediction_dir = kwargs.get("prediction_dir", os.path.join(out_dir, "predictions")) - patches_dir = kwargs.get("patches_dir", os.path.join(out_dir, "patches")) - random_seed = kwargs.get("random_seed") - step_buffer_context = kwargs.get("step_buffer_context", "") - meta_skill_context = kwargs.get("meta_skill_context", "") - - return run_minibatch_reflect( - results=results, - skill_content=skill_content, - prediction_dir=prediction_dir, - patches_dir=patches_dir, - workers=self.analyst_workers, - failure_only=self.failure_only, - minibatch_size=self.minibatch_size, - edit_budget=self.edit_budget, - random_seed=random_seed, - error_system=self.get_error_minibatch_prompt(), - success_system=self.get_success_minibatch_prompt(), - step_buffer_context=step_buffer_context, - meta_skill_context=meta_skill_context, - update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"), - ) - - def get_task_types(self) -> list[str]: return self.dataloader.get_task_types() diff --git a/skillopt/envs/officeqa/adapter.py b/skillopt/envs/officeqa/adapter.py index ba2e6f1c..63419d48 100644 --- a/skillopt/envs/officeqa/adapter.py +++ b/skillopt/envs/officeqa/adapter.py @@ -6,7 +6,6 @@ from skillopt.envs.base import EnvAdapter from skillopt.envs.officeqa.dataloader import OfficeQADataLoader from skillopt.envs.officeqa.rollout import run_batch -from skillopt.gradient.reflect import run_minibatch_reflect class OfficeQAAdapter(EnvAdapter): @@ -104,28 +103,6 @@ def rollout(self, env_manager, skill_content: str, out_dir: str, **kwargs) -> li diagnostic_instruction=kwargs.get("diagnostic_instruction", ""), ) - def reflect(self, results: list[dict], skill_content: str, out_dir: str, **kwargs) -> list[dict | None]: - prediction_dir = kwargs.get("prediction_dir", os.path.join(out_dir, "predictions")) - patches_dir = kwargs.get("patches_dir", os.path.join(out_dir, "patches")) - random_seed = kwargs.get("random_seed") - step_buffer_context = kwargs.get("step_buffer_context", "") - return run_minibatch_reflect( - results=results, - skill_content=skill_content, - prediction_dir=prediction_dir, - patches_dir=patches_dir, - workers=self.analyst_workers, - failure_only=self.failure_only, - minibatch_size=self.minibatch_size, - edit_budget=self.edit_budget, - random_seed=random_seed, - error_system=self.get_error_minibatch_prompt(), - success_system=self.get_success_minibatch_prompt(), - step_buffer_context=step_buffer_context, - update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"), - ) - - def get_task_types(self) -> list[str]: seen: list[str] = [] for item in self.dataloader.train_items + self.dataloader.val_items + self.dataloader.test_items: diff --git a/skillopt/envs/searchqa/adapter.py b/skillopt/envs/searchqa/adapter.py index 2253ebe5..d173b966 100644 --- a/skillopt/envs/searchqa/adapter.py +++ b/skillopt/envs/searchqa/adapter.py @@ -2,13 +2,11 @@ from __future__ import annotations import json -import os from skillopt.datasets.base import BatchSpec from skillopt.envs.base import EnvAdapter from skillopt.envs.searchqa.dataloader import SearchQADataLoader from skillopt.envs.searchqa.rollout import run_batch -from skillopt.gradient.reflect import run_minibatch_reflect from skillopt.model import get_target_backend @@ -94,36 +92,5 @@ def rollout( task_timeout=self.exec_timeout, ) - def reflect( - self, - results: list[dict], - skill_content: str, - out_dir: str, - **kwargs, - ) -> list[dict | None]: - prediction_dir = kwargs.get("prediction_dir", os.path.join(out_dir, "predictions")) - patches_dir = kwargs.get("patches_dir", os.path.join(out_dir, "patches")) - random_seed = kwargs.get("random_seed") - step_buffer_context = kwargs.get("step_buffer_context", "") - meta_skill_context = kwargs.get("meta_skill_context", "") - - return run_minibatch_reflect( - results=results, - skill_content=skill_content, - prediction_dir=prediction_dir, - patches_dir=patches_dir, - workers=self.analyst_workers, - failure_only=self.failure_only, - minibatch_size=self.minibatch_size, - edit_budget=self.edit_budget, - random_seed=random_seed, - error_system=self.get_error_minibatch_prompt(), - success_system=self.get_success_minibatch_prompt(), - step_buffer_context=step_buffer_context, - meta_skill_context=meta_skill_context, - update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"), - ) - - def get_task_types(self) -> list[str]: return ["qa"] diff --git a/skillopt/envs/spreadsheetbench/adapter.py b/skillopt/envs/spreadsheetbench/adapter.py index 5b2b6782..16e7856f 100644 --- a/skillopt/envs/spreadsheetbench/adapter.py +++ b/skillopt/envs/spreadsheetbench/adapter.py @@ -16,7 +16,6 @@ run_spreadsheet_batch, run_spreadsheet_batch_codegen, ) -from skillopt.gradient.reflect import run_minibatch_reflect from skillopt.model import get_target_backend, is_target_exec_backend @@ -156,37 +155,5 @@ def rollout( return results - def reflect( - self, - results: list[dict], - skill_content: str, - out_dir: str, - **kwargs, - ) -> list[dict | None]: - """Analyze rollout results and produce patches (minibatch mode).""" - prediction_dir = kwargs.get("prediction_dir", os.path.join(out_dir, "predictions")) - patches_dir = kwargs.get("patches_dir", os.path.join(out_dir, "patches")) - random_seed = kwargs.get("random_seed") - step_buffer_context = kwargs.get("step_buffer_context", "") - meta_skill_context = kwargs.get("meta_skill_context", "") - - return run_minibatch_reflect( - results=results, - skill_content=skill_content, - prediction_dir=prediction_dir, - patches_dir=patches_dir, - workers=self.analyst_workers, - failure_only=self.failure_only, - minibatch_size=self.minibatch_size, - edit_budget=self.edit_budget, - random_seed=random_seed, - error_system=self.get_error_minibatch_prompt(), - success_system=self.get_success_minibatch_prompt(), - step_buffer_context=step_buffer_context, - meta_skill_context=meta_skill_context, - update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"), - ) - - def get_task_types(self) -> list[str]: return list(TASK_TYPES)