Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 6 additions & 28 deletions docs/guide/new-benchmark.md
Original file line number Diff line number Diff line change
Expand Up @@ -161,13 +161,10 @@ Two design points worth flagging:
```python
from __future__ import annotations

import os

from skillopt.datasets.base import BatchSpec
from skillopt.envs.base import EnvAdapter
from skillopt.envs.docfaithful.loader import DocFaithfulDataLoader
from skillopt.envs.docfaithful.rollout import run_batch
from skillopt.gradient.reflect import run_minibatch_reflect


class DocFaithfulAdapter(EnvAdapter):
Expand Down Expand Up @@ -234,7 +231,7 @@ class DocFaithfulAdapter(EnvAdapter):
)
return self.build_env_from_batch(batch, **kwargs)

# ── The two real action methods ─────────────────────────────────────
# ── The rollout method (reflect is inherited) ───────────────────────

def rollout(self, env_manager, skill_content: str,
out_dir: str, **kwargs) -> list[dict]:
Expand All @@ -247,27 +244,9 @@ class DocFaithfulAdapter(EnvAdapter):
max_completion_tokens=self.max_completion_tokens,
)

def reflect(self, results: list[dict], skill_content: str,
out_dir: str, **kwargs) -> list[dict | None]:
return run_minibatch_reflect(
results=results,
skill_content=skill_content,
prediction_dir=kwargs.get(
"prediction_dir", os.path.join(out_dir, "predictions")
),
patches_dir=kwargs.get(
"patches_dir", os.path.join(out_dir, "patches")
),
workers=self.analyst_workers,
failure_only=self.failure_only,
minibatch_size=self.minibatch_size,
edit_budget=self.edit_budget,
random_seed=kwargs.get("random_seed"),
error_system=self.get_error_minibatch_prompt(),
success_system=self.get_success_minibatch_prompt(),
step_buffer_context=kwargs.get("step_buffer_context", ""),
update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"),
)
# reflect() is inherited from EnvAdapter — it delegates to
# run_minibatch_reflect with your analyst_error_* / analyst_success_*
# prompts. Override it only if you need custom reflection logic.

def get_task_types(self) -> list[str]:
seen: list[str] = []
Expand Down Expand Up @@ -373,9 +352,8 @@ If you get `ValueError: Unknown environment 'docfaithful'. Available: [...]`,
you forgot Step 5.

If you get `TypeError: Can't instantiate abstract class DocFaithfulAdapter`,
you forgot to implement one of the five abstract methods on `EnvAdapter`:
`build_train_env`, `build_eval_env`, `rollout`, `reflect`,
`get_task_types`.
you forgot to implement one of the four abstract methods on `EnvAdapter`:
`build_train_env`, `build_eval_env`, `rollout`, `get_task_types`.

## Tips

Expand Down
8 changes: 4 additions & 4 deletions skillopt/envs/_template/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ This directory provides scaffold files for adding a new benchmark to SkillOpt.
## Files

- `env_template.py` — Environment adapter template (subclasses
`EnvAdapter`; implements the 5 abstract methods so the file is
instantiable out of the box).
`EnvAdapter`; implements the 4 abstract methods so the file is
instantiable out of the box — `reflect` is inherited).
- `loader_template.py` — Data loader template (subclasses
`SplitDataLoader`; implements `load_split_items` for `.json`/`.jsonl`).
- `config_template.yaml` — Config file template.
Expand All @@ -28,8 +28,8 @@ This directory provides scaffold files for adding a new benchmark to SkillOpt.
`TemplateBenchmarkLoader → YourBenchmarkLoader`)
and fix the cross-import in `adapter.py`.
3. **Implement the TODO blocks** inside `adapter.py:rollout` and the
`_normalize_item` helper in `loader.py`. If you want real reflection,
uncomment the `run_minibatch_reflect` block in `adapter.py:reflect`.
`_normalize_item` helper in `loader.py`. (`reflect` is inherited from
`EnvAdapter`; override it only for custom reflection logic.)
4. **Register** the adapter — add a `try / except ImportError` block in
`scripts/train.py`'s `_register_builtins()` mapping the registry key
to your `YourBenchmarkAdapter` class. There is no
Expand Down
57 changes: 6 additions & 51 deletions skillopt/envs/_template/env_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,9 @@
"""
from __future__ import annotations

import os

from skillopt.datasets.base import BatchSpec
from skillopt.envs.base import EnvAdapter
from skillopt.envs._template.loader_template import TemplateBenchmarkLoader
# When you wire in real reflection, also import:
# from skillopt.gradient.reflect import run_minibatch_reflect


class TemplateBenchmarkEnv(EnvAdapter):
Expand Down Expand Up @@ -131,53 +127,12 @@ def rollout(
)
return results

# ── Reflect: turn rollout results into patch dicts ─────────────────

def reflect(
self,
results: list[dict],
skill_content: str,
out_dir: str,
**kwargs,
) -> list[dict | None]:
"""
Turn rollouts into a list of raw patch dicts (or None to drop).

Each non-None dict MUST have:
- "patch": {"edits": [...]} a Patch.to_dict() payload
- "source_type": "failure" | "success"

Most benchmarks delegate to
:func:`skillopt.gradient.reflect.run_minibatch_reflect` which
will call the optimizer model with the
``analyst_error_*`` / ``analyst_success_*`` prompts. To enable it,
uncomment the import above and call:

from skillopt.gradient.reflect import run_minibatch_reflect
return run_minibatch_reflect(
results=results,
skill_content=skill_content,
prediction_dir=kwargs.get(
"prediction_dir", os.path.join(out_dir, "predictions")
),
patches_dir=kwargs.get(
"patches_dir", os.path.join(out_dir, "patches")
),
workers=self.analyst_workers,
failure_only=self.failure_only,
minibatch_size=self.minibatch_size,
edit_budget=self.edit_budget,
random_seed=kwargs.get("random_seed"),
error_system=self.get_error_minibatch_prompt(),
success_system=self.get_success_minibatch_prompt(),
step_buffer_context=kwargs.get("step_buffer_context", ""),
update_mode=getattr(self, "_cfg", {}).get(
"skill_update_mode", "patch"
),
)
"""
# Template default: produce no patches (no-op trainer step).
return [None for _ in results]
# ── Reflect (inherited) ─────────────────────────────────────────────
#
# ``reflect`` is inherited from ``EnvAdapter``: the default delegates to
# ``skillopt.gradient.reflect.run_minibatch_reflect`` using your
# ``analyst_error_*`` / ``analyst_success_*`` prompts. You do NOT need to
# implement it — override only if your benchmark needs custom reflection.

# ── Stratification hint ────────────────────────────────────────────

Expand Down
31 changes: 0 additions & 31 deletions skillopt/envs/alfworld/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
run_alfworld_batch,
TASKS,
)
from skillopt.gradient.reflect import run_minibatch_reflect
from skillopt.utils import compute_score


Expand Down Expand Up @@ -425,35 +424,5 @@ def _run_batch(
all_results.extend(chunk_results)
return all_results

def reflect(
self,
results: list[dict],
skill_content: str,
out_dir: str,
**kwargs,
) -> list[dict | None]:
prediction_dir = kwargs.get("prediction_dir", os.path.join(out_dir, "predictions"))
patches_dir = kwargs.get("patches_dir", os.path.join(out_dir, "patches"))
random_seed = kwargs.get("random_seed")
step_buffer_context = kwargs.get("step_buffer_context", "")
meta_skill_context = kwargs.get("meta_skill_context", "")

return run_minibatch_reflect(
results=results,
skill_content=skill_content,
prediction_dir=prediction_dir,
patches_dir=patches_dir,
workers=self.analyst_workers,
failure_only=self.failure_only,
minibatch_size=self.minibatch_size,
edit_budget=self.edit_budget,
random_seed=random_seed,
error_system=self.get_error_minibatch_prompt(),
success_system=self.get_success_minibatch_prompt(),
step_buffer_context=step_buffer_context,
meta_skill_context=meta_skill_context,
)


def get_task_types(self) -> list[str]:
return list(TASKS)
34 changes: 27 additions & 7 deletions skillopt/envs/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,6 @@ def rollout(
(float 0-1). May include env-specific fields.
"""

@abstractmethod
def reflect(
self,
results: list[dict],
Expand All @@ -241,15 +240,36 @@ def reflect(
) -> list[dict | None]:
"""Analyze rollout results and produce patches.

Default implementation: delegate to the shared minibatch reflect
stage. Every built-in benchmark uses this unchanged — override only
if your environment needs custom reflection logic.

Each returned dict conforms to :class:`~skillopt.types.RawPatch`:
``"patch"`` (with ``"edits"`` list) + ``"source_type"``
(``"failure"`` or ``"success"``).

Returns
-------
list[dict | None]
Raw analyst outputs; ``None`` entries are filtered out.
(``"failure"`` or ``"success"``); ``None`` entries are filtered out.
"""
from skillopt.gradient.reflect import run_minibatch_reflect

return run_minibatch_reflect(
results=results,
skill_content=skill_content,
prediction_dir=kwargs.get(
"prediction_dir", os.path.join(out_dir, "predictions")
),
patches_dir=kwargs.get(
"patches_dir", os.path.join(out_dir, "patches")
),
workers=self.analyst_workers,
failure_only=self.failure_only,
minibatch_size=self.minibatch_size,
edit_budget=self.edit_budget,
random_seed=kwargs.get("random_seed"),
error_system=self.get_error_minibatch_prompt(),
success_system=self.get_success_minibatch_prompt(),
step_buffer_context=kwargs.get("step_buffer_context", ""),
meta_skill_context=kwargs.get("meta_skill_context", ""),
update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"),
)

@abstractmethod
def get_task_types(self) -> list[str]:
Expand Down
25 changes: 0 additions & 25 deletions skillopt/envs/docvqa/adapter.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
from __future__ import annotations

import os

from skillopt.datasets.base import BatchSpec
from skillopt.envs.base import EnvAdapter
from skillopt.envs.docvqa.dataloader import DocVQADataLoader
from skillopt.envs.docvqa.rollout import run_batch
from skillopt.gradient.reflect import run_minibatch_reflect


class DocVQAAdapter(EnvAdapter):
Expand Down Expand Up @@ -84,28 +81,6 @@ def rollout(self, env_manager, skill_content: str, out_dir: str, **kwargs) -> li
task_timeout=self.exec_timeout,
)

def reflect(self, results: list[dict], skill_content: str, out_dir: str, **kwargs) -> list[dict | None]:
prediction_dir = kwargs.get("prediction_dir", os.path.join(out_dir, "predictions"))
patches_dir = kwargs.get("patches_dir", os.path.join(out_dir, "patches"))
random_seed = kwargs.get("random_seed")
step_buffer_context = kwargs.get("step_buffer_context", "")
return run_minibatch_reflect(
results=results,
skill_content=skill_content,
prediction_dir=prediction_dir,
patches_dir=patches_dir,
workers=self.analyst_workers,
failure_only=self.failure_only,
minibatch_size=self.minibatch_size,
edit_budget=self.edit_budget,
random_seed=random_seed,
error_system=self.get_error_minibatch_prompt(),
success_system=self.get_success_minibatch_prompt(),
step_buffer_context=step_buffer_context,
update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"),
)


def get_task_types(self) -> list[str]:
seen: list[str] = []
for item in self.dataloader.train_items + self.dataloader.val_items + self.dataloader.test_items:
Expand Down
33 changes: 0 additions & 33 deletions skillopt/envs/livemathematicianbench/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,8 @@
from __future__ import annotations

import json
import os

from skillopt.datasets.base import BatchSpec
from skillopt.gradient.reflect import run_minibatch_reflect
from skillopt.envs.base import EnvAdapter
from skillopt.envs.livemathematicianbench.dataloader import LiveMathematicianBenchDataLoader
from skillopt.envs.livemathematicianbench.rollout import run_batch
Expand Down Expand Up @@ -127,36 +125,5 @@ def rollout(
task_timeout=self.exec_timeout,
)

def reflect(
self,
results: list[dict],
skill_content: str,
out_dir: str,
**kwargs,
) -> list[dict | None]:
prediction_dir = kwargs.get("prediction_dir", os.path.join(out_dir, "predictions"))
patches_dir = kwargs.get("patches_dir", os.path.join(out_dir, "patches"))
random_seed = kwargs.get("random_seed")
step_buffer_context = kwargs.get("step_buffer_context", "")
meta_skill_context = kwargs.get("meta_skill_context", "")

return run_minibatch_reflect(
results=results,
skill_content=skill_content,
prediction_dir=prediction_dir,
patches_dir=patches_dir,
workers=self.analyst_workers,
failure_only=self.failure_only,
minibatch_size=self.minibatch_size,
edit_budget=self.edit_budget,
random_seed=random_seed,
error_system=self.get_error_minibatch_prompt(),
success_system=self.get_success_minibatch_prompt(),
step_buffer_context=step_buffer_context,
meta_skill_context=meta_skill_context,
update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"),
)


def get_task_types(self) -> list[str]:
return self.dataloader.get_task_types()
23 changes: 0 additions & 23 deletions skillopt/envs/officeqa/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from skillopt.envs.base import EnvAdapter
from skillopt.envs.officeqa.dataloader import OfficeQADataLoader
from skillopt.envs.officeqa.rollout import run_batch
from skillopt.gradient.reflect import run_minibatch_reflect


class OfficeQAAdapter(EnvAdapter):
Expand Down Expand Up @@ -104,28 +103,6 @@ def rollout(self, env_manager, skill_content: str, out_dir: str, **kwargs) -> li
diagnostic_instruction=kwargs.get("diagnostic_instruction", ""),
)

def reflect(self, results: list[dict], skill_content: str, out_dir: str, **kwargs) -> list[dict | None]:
prediction_dir = kwargs.get("prediction_dir", os.path.join(out_dir, "predictions"))
patches_dir = kwargs.get("patches_dir", os.path.join(out_dir, "patches"))
random_seed = kwargs.get("random_seed")
step_buffer_context = kwargs.get("step_buffer_context", "")
return run_minibatch_reflect(
results=results,
skill_content=skill_content,
prediction_dir=prediction_dir,
patches_dir=patches_dir,
workers=self.analyst_workers,
failure_only=self.failure_only,
minibatch_size=self.minibatch_size,
edit_budget=self.edit_budget,
random_seed=random_seed,
error_system=self.get_error_minibatch_prompt(),
success_system=self.get_success_minibatch_prompt(),
step_buffer_context=step_buffer_context,
update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"),
)


def get_task_types(self) -> list[str]:
seen: list[str] = []
for item in self.dataloader.train_items + self.dataloader.val_items + self.dataloader.test_items:
Expand Down
Loading