diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..a54c5d9 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,27 @@ +name: test +on: + pull_request: + push: + branches: [main] +permissions: + contents: read +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true +jobs: + pytest: + runs-on: ubuntu-latest + timeout-minutes: 15 + strategy: + fail-fast: false + matrix: + python: ["3.11", "3.12"] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + cache: pip + cache-dependency-path: pyproject.toml + - run: pip install -e ".[dev]" + - run: pytest -q diff --git a/.gitignore b/.gitignore index 08b7c7b..262c028 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,9 @@ __pycache__/ *.pyc *.pyo *.pyd +*.egg-info/ +build/ +dist/ # macOS .DS_Store @@ -20,3 +23,4 @@ __pycache__/ # Local runtime environment /.venv311/ /.runtime/ +.venv/ diff --git a/README.md b/README.md index 3d22ccb..6eb1ac0 100644 --- a/README.md +++ b/README.md @@ -132,6 +132,25 @@ Core artifacts: Global index: - `reports/run_index.tsv` +## Sources & backends + +OpenRevise's source router can be configured to retrieve evidence from multiple categories. Pipeline code dispatches by `source_type` to the registered backend, with optional dependencies allowing graceful degradation. + +- **Preprints (arXiv / medRxiv / bioRxiv):** retrieval is provided via [DeepXiv-SDK](https://github.com/DeepXiv/deepxiv_sdk) (optional, install with `pip install "openrevise[preprint-deepxiv]"`) or a built-in arXiv API client (`pip install "openrevise[preprint-arxiv]"`). If neither is installed, preprint retrieval is disabled and pipelines using non-preprint sources continue to work. +- **Biomedical literature (PubMed / PMC / Europe PMC):** built-in clients using NCBI E-utilities and the EuropePMC REST API. No optional dependencies required beyond `requests` (a base dep). +- **Scholarly indexes (Semantic Scholar / OpenAlex):** retained from the existing `ideaclaw.sources.scholar` module. +- **Regulatory & guidelines (FDA / EMA / NCCN / ESMO / ASCO / CSCO):** generic HTTP fetchers driven by `config/source_registry.yaml`. +- **Local files (PDF / DOCX / PPTX / images):** evidence extraction via `openrevise.sources.evidence_extractors`. + +## Quick install + +```bash +pip install openrevise # core (no preprint search) +pip install "openrevise[preprint-arxiv]" # + native arXiv client +pip install "openrevise[preprint-deepxiv]" # + DeepXiv-SDK preprint backend +pip install "openrevise[all]" # everything +``` + ## Repository Structure | Path | Purpose | |---|---| diff --git a/config/revision_patch_spec_template.json b/config/revision_patch_spec_template.json index 6a1d9ee..492d3fd 100644 --- a/config/revision_patch_spec_template.json +++ b/config/revision_patch_spec_template.json @@ -1,4 +1,9 @@ { + "meta": { + "notes": [ + "Demo: when replacement contains subgroup label tuples (e.g. ITT/mITT), pipeline label-value gate checks prefix/value binding before revise." + ] + }, "footnote_sources": { "src_reg_notice": "Source: Regulatory authority notice URL ... verified on YYYY-MM-DD.", "src_journal": "Source: Journal paper citation ... verified on YYYY-MM-DD." diff --git a/config/source_registry.yaml b/config/source_registry.yaml index 5c858b9..f74da7b 100644 --- a/config/source_registry.yaml +++ b/config/source_registry.yaml @@ -235,3 +235,24 @@ sources: alias_keywords: - sun yat-sen - sysu +- source_id: arxiv + source_type: preprint + tier: B + country: INTL + institution_tier: '' + reliability_rule: preprint_unreviewed + alias_keywords: [arxiv] +- source_id: medrxiv + source_type: preprint + tier: B + country: INTL + institution_tier: '' + reliability_rule: preprint_unreviewed + alias_keywords: [medrxiv] +- source_id: biorxiv + source_type: preprint + tier: B + country: INTL + institution_tier: '' + reliability_rule: preprint_unreviewed + alias_keywords: [biorxiv] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..16c46f1 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,54 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "openrevise" +version = "0.1.0.dev0" +description = "Evidence-gated revision infrastructure for high-stakes documents" +readme = "README.md" +license = { text = "Apache-2.0" } +requires-python = ">=3.11" +authors = [{ name = "StartripAI" }] +classifiers = [ + "Development Status :: 3 - Alpha", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +dependencies = [ + "lxml>=4.9", + "PyYAML>=6.0", + "requests>=2.31", +] + +[project.optional-dependencies] +preprint-deepxiv = ["deepxiv-sdk>=0.1"] +preprint-arxiv = ["sentence-transformers>=2.2", "feedparser>=6.0"] +biomed = ["biopython>=1.83"] +mcp = ["mcp>=0.9"] +docx = ["python-docx>=1.1"] +pdf = ["pypdf>=4.0", "pdfminer.six>=20231228"] +xlsx = ["openpyxl>=3.1"] +all = ["openrevise[preprint-deepxiv,preprint-arxiv,biomed,mcp,docx,pdf,xlsx]"] +dev = ["pytest>=8.0", "pytest-cov>=4.1", "ruff>=0.5", "requests-mock>=1.11"] + +[project.scripts] +openrevise = "openrevise.cli:main" +openrevise-mcp = "openrevise.mcp.server:main" + +[project.urls] +Homepage = "https://github.com/StartripAI/OpenRevise" +Issues = "https://github.com/StartripAI/OpenRevise/issues" + +[tool.setuptools.packages.find] +where = ["src"] +include = ["openrevise*"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +addopts = "-q --strict-markers" + +[tool.ruff] +line-length = 100 +target-version = "py311" diff --git a/src/openrevise/__init__.py b/src/openrevise/__init__.py new file mode 100644 index 0000000..c594097 --- /dev/null +++ b/src/openrevise/__init__.py @@ -0,0 +1,10 @@ +"""OpenRevise: Evidence-gated revision infrastructure for high-stakes documents.""" + +from importlib.metadata import PackageNotFoundError, version + +try: + __version__ = version("openrevise") +except PackageNotFoundError: # pragma: no cover - only hit when package isn't installed + __version__ = "0.0.0+unknown" + +__all__ = ["__version__"] diff --git a/src/openrevise/artifacts/__init__.py b/src/openrevise/artifacts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/build_q_source_map.py b/src/openrevise/artifacts/build_q_source_map.py similarity index 98% rename from scripts/build_q_source_map.py rename to src/openrevise/artifacts/build_q_source_map.py index e9217b7..1dc8cc7 100644 --- a/scripts/build_q_source_map.py +++ b/src/openrevise/artifacts/build_q_source_map.py @@ -14,7 +14,7 @@ import xml.etree.ElementTree as ET from pathlib import Path from typing import Dict, List, Tuple -from run_artifact_utils import is_valid_run_id +from openrevise.artifacts.run_artifact_utils import is_valid_run_id W = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}" QUESTION_PREFIX_RE = re.compile( diff --git a/scripts/query_q_source.py b/src/openrevise/artifacts/query_q_source.py similarity index 100% rename from scripts/query_q_source.py rename to src/openrevise/artifacts/query_q_source.py diff --git a/scripts/run_artifact_utils.py b/src/openrevise/artifacts/run_artifact_utils.py similarity index 100% rename from scripts/run_artifact_utils.py rename to src/openrevise/artifacts/run_artifact_utils.py diff --git a/scripts/update_run_index.py b/src/openrevise/artifacts/update_run_index.py similarity index 96% rename from scripts/update_run_index.py rename to src/openrevise/artifacts/update_run_index.py index b725bee..82c9ee4 100644 --- a/scripts/update_run_index.py +++ b/src/openrevise/artifacts/update_run_index.py @@ -11,7 +11,7 @@ from pathlib import Path from typing import Dict, List -from run_artifact_utils import read_tsv, write_tsv +from openrevise.artifacts.run_artifact_utils import read_tsv, write_tsv RUN_INDEX_FIELDS: List[str] = [ @@ -85,7 +85,7 @@ def main() -> int: parser.add_argument( "--index", type=Path, - default=Path(__file__).resolve().parents[1] / "reports" / "run_index.tsv", + default=Path(__file__).resolve().parents[3] / "reports" / "run_index.tsv", ) parser.add_argument("--marker", required=True) parser.add_argument("--run-id", required=True) diff --git a/src/openrevise/gates/__init__.py b/src/openrevise/gates/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/openrevise/gates/check_label_value_consistency.py b/src/openrevise/gates/check_label_value_consistency.py new file mode 100644 index 0000000..42c3914 --- /dev/null +++ b/src/openrevise/gates/check_label_value_consistency.py @@ -0,0 +1,495 @@ +#!/usr/bin/env python3 +""" +Label-value consistency gate for high-risk subgroup metrics. + +Current hard rule: +- If an opioid-related replacement sentence contains both ITT and mITT metric tuples, + verify label-to-value binding against extracted source text before revise. +""" + +from __future__ import annotations + +import argparse +import json +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, Iterable, List, Tuple + + +def _maybe_reexec_runtime_python() -> None: + if os.environ.get("REVISE_NO_REEXEC") == "1": + return + repo_root = Path(__file__).resolve().parents[3] + override = os.environ.get("REVISE_RUNTIME_PYTHON", "").strip() + preferred = Path(override) if override else (repo_root / ".venv311" / "bin" / "python") + if not preferred.exists(): + return + try: + current = Path(sys.executable).resolve() + target = preferred.resolve() + except OSError: + return + if current == target: + return + os.environ["REVISE_NO_REEXEC"] = "1" + os.execv(str(preferred), [str(preferred), str(Path(__file__).resolve()), *sys.argv[1:]]) + + +_maybe_reexec_runtime_python() + +from openrevise.sources.evidence_extractors import extract_local_source_text +from openrevise.artifacts.run_artifact_utils import is_valid_run_id + + +FOOTNOTE_KEY_PATTERN = re.compile(r"\[\[fn:([A-Za-z0-9_]+)\]\]") +LABEL_MARKER_PATTERN = re.compile(r"(?\d+(?:\.\d+)?)\s*(?:个月|month|months|mo)?\s*(?:vs|/|v)\s*(?P\d+(?:\.\d+)?)", + re.IGNORECASE, +) +PAIR_MEDIAN_PATTERN = re.compile( + r"(?:median|中位数)\s*(?:\(\s*\d+%?\s*ci[^)]*\)\s*)?" + r"(?P\d+(?:\.\d+)?)\s*(?:\([^)]{0,80}\))?\s*" + r"(?P\d+(?:\.\d+)?)\s*(?:\([^)]{0,80}\))?", + re.IGNORECASE, +) +HR_PATTERN = re.compile(r"hr[^0-9]{0,12}(?P
\d+(?:\.\d+)?)", re.IGNORECASE) +P_PATTERN = re.compile(r"p(?:值)?[^0-9]{0,12}(?P

\d+(?:\.\d+)?)", re.IGNORECASE) +SOURCE_TUPLE_PATTERN = re.compile( + r"(?:median|中位数)\s*(?:\(\s*\d+%?\s*ci[^)]*\)\s*)?" + r"(?P\d+(?:\.\d+)?)\s*(?:\([^)]{0,80}\))?\s*" + r"(?P\d+(?:\.\d+)?)\s*(?:\([^)]{0,80}\))?" + r".{0,260}?" + r"hr[^0-9]{0,20}(?P


\d+(?:\.\d+)?)" + r".{0,220}?" + r"p(?:值)?[^0-9]{0,20}(?P

\d+(?:\.\d+)?)", + re.IGNORECASE | re.DOTALL, +) +OPIOID_ITT_ANCHOR_PATTERN = re.compile( + r"(?:opioid\s*use|使用阿片类药物)\s*\(\s*itt\s*\)", + re.IGNORECASE, +) +ITT_PATTERN = re.compile(r"(? str: + return "|".join([self.left, self.right, self.hr, self.p or "na"]) + + def core_key(self) -> str: + return "|".join([self.left, self.right, self.hr]) + + +@dataclass(frozen=True) +class SourceInference: + source_id: str + label_map: Dict[str, MetricTuple] + detail: str + + +def _normalize_text(text: str) -> str: + normalized = ( + text.replace(":", ":") + .replace(";", ";") + .replace(",", ",") + .replace("(", "(") + .replace(")", ")") + .replace("=", "=") + .replace(" ", " ") + ) + return re.sub(r"\s+", " ", normalized).strip() + + +def _normalize_number(raw: str) -> str: + value = float(raw) + formatted = f"{value:.6f}".rstrip("0").rstrip(".") + if not formatted: + return "0" + return formatted + + +def _metric_tuple( + *, + left: str, + right: str, + hr: str, + p: str | None, +) -> MetricTuple: + return MetricTuple( + left=_normalize_number(left), + right=_normalize_number(right), + hr=_normalize_number(hr), + p=_normalize_number(p) if p is not None else None, + ) + + +def _canonical_label(raw: str) -> str: + clean = re.sub(r"\s+", "", raw).lower() + if clean == "itt": + return "ITT" + if clean == "mitt": + return "mITT" + return raw + + +def _find_metric_in_segment(segment: str) -> MetricTuple | None: + pair_match = PAIR_VS_PATTERN.search(segment) + if pair_match is None: + pair_match = PAIR_MEDIAN_PATTERN.search(segment) + if pair_match is None: + return None + hr_match = HR_PATTERN.search(segment) + if hr_match is None: + return None + p_match = P_PATTERN.search(segment) + return _metric_tuple( + left=pair_match.group("a"), + right=pair_match.group("b"), + hr=hr_match.group("hr"), + p=(p_match.group("p") if p_match is not None else None), + ) + + +def _extract_patch_label_map(replacement: str) -> Dict[str, MetricTuple]: + text = _normalize_text(replacement) + markers = list(LABEL_MARKER_PATTERN.finditer(text)) + if len(markers) < 2: + return {} + + out: Dict[str, MetricTuple] = {} + for idx, marker in enumerate(markers): + label = _canonical_label(marker.group(1)) + next_start = markers[idx + 1].start() if idx + 1 < len(markers) else len(text) + segment = text[marker.end() : next_start] + metric = _find_metric_in_segment(segment) + if metric is None: + continue + out[label] = metric + return out + + +def _extract_source_ids_from_replacement(replacement: str) -> List[str]: + keys = [m.group(1) for m in FOOTNOTE_KEY_PATTERN.finditer(replacement)] + dedup: List[str] = [] + seen = set() + for key in keys: + if key in seen: + continue + seen.add(key) + dedup.append(key) + return dedup + + +def _is_opioid_context(text: str) -> bool: + low = text.lower() + return ("opioid" in low) or ("阿片" in text) + + +def _window_for_endpoint(text: str) -> str: + normalized = _normalize_text(text) + anchor = OPIOID_ITT_ANCHOR_PATTERN.search(normalized) + if anchor is not None: + start = anchor.start() + return normalized[start : start + 7000] + + low = normalized.lower() + anchor_positions: List[int] = [] + for term in ["opioid", "阿片"]: + idx = low.find(term.lower()) + if idx >= 0: + anchor_positions.append(idx) + if not anchor_positions: + return normalized + start = min(anchor_positions) + return normalized[start : start + 9000] + + +def _extract_source_metric_tuples(window_text: str) -> List[MetricTuple]: + out: List[MetricTuple] = [] + for m in SOURCE_TUPLE_PATTERN.finditer(window_text): + try: + metric = _metric_tuple( + left=m.group("a"), + right=m.group("b"), + hr=m.group("hr"), + p=m.group("p"), + ) + except ValueError: + continue + out.append(metric) + return out + + +def _infer_label_order(window_text: str) -> List[str]: + itt_match = ITT_PATTERN.search(window_text) + mitt_match = MITT_PATTERN.search(window_text) + if itt_match is None or mitt_match is None: + return [] + if itt_match.start() < mitt_match.start(): + return ["ITT", "mITT"] + return ["mITT", "ITT"] + + +def _infer_source_label_map(text: str) -> Dict[str, MetricTuple]: + window = _window_for_endpoint(text) + tuples = _extract_source_metric_tuples(window) + if len(tuples) < 2: + return {} + label_order = _infer_label_order(window) + if len(label_order) != 2: + return {} + return { + label_order[0]: tuples[0], + label_order[1]: tuples[1], + } + + +def _load_source_specs(path: Path | None) -> Dict[str, Dict[str, object]]: + if path is None or not path.exists(): + return {} + payload = json.loads(path.read_text(encoding="utf-8")) + out: Dict[str, Dict[str, object]] = {} + for block in ["required_sources", "optional_sources"]: + source_block = payload.get(block, {}) + if isinstance(source_block, dict): + for source_id, spec in source_block.items(): + if isinstance(spec, dict): + out[str(source_id)] = spec + return out + + +def _iter_local_fallback_source_ids(specs: Dict[str, Dict[str, object]]) -> List[str]: + local_types = {"local_pdf", "local_docx", "local_pptx", "local_txt", "local_text"} + out: List[str] = [] + for source_id, spec in specs.items(): + source_type = str(spec.get("type", "")).strip() + if source_type in local_types: + out.append(source_id) + return out + + +def _extract_source_text( + *, + source_id: str, + spec: Dict[str, object], +) -> Tuple[str, str]: + source_type = str(spec.get("type", "")).strip() + if source_type in {"local_txt", "local_text"}: + path = Path(str(spec.get("path", "")).strip()) + if not path.exists(): + raise FileNotFoundError(f"Local file not found: {path}") + return path.read_text(encoding="utf-8", errors="ignore"), "native_text" + if source_type not in {"local_pdf", "local_docx", "local_pptx"}: + raise RuntimeError(f"unsupported_source_type:{source_type}") + path = Path(str(spec.get("path", "")).strip()) + extract_mode = str(spec.get("extract_mode", "auto")).strip() or "auto" + ocr_mode = str(spec.get("ocr_mode", "dual")).strip() or "dual" + location_hints = [str(x) for x in spec.get("location_hints", []) if str(x).strip()] + result = extract_local_source_text( + source_type=source_type, + path=path, + extract_mode=extract_mode, + ocr_mode=ocr_mode, + location_hints=location_hints, + ) + return result.text, result.detail + + +def _build_expected_map( + *, + source_ids: Iterable[str], + source_specs: Dict[str, Dict[str, object]], + cache: Dict[str, Tuple[str, str]], +) -> Tuple[Dict[str, Dict[str, int]], List[SourceInference], List[str]]: + expected_counts: Dict[str, Dict[str, int]] = {"ITT": {}, "mITT": {}} + inferences: List[SourceInference] = [] + warnings: List[str] = [] + + for source_id in source_ids: + spec = source_specs.get(source_id) + if spec is None: + warnings.append(f"source_spec_missing:{source_id}") + continue + if source_id not in cache: + try: + cache[source_id] = _extract_source_text(source_id=source_id, spec=spec) + except Exception as exc: # noqa: BLE001 + warnings.append(f"source_extract_failed:{source_id}:{exc}") + continue + text, detail = cache[source_id] + inferred = _infer_source_label_map(text) + if not inferred: + warnings.append(f"source_infer_failed:{source_id}") + continue + inferences.append(SourceInference(source_id=source_id, label_map=inferred, detail=detail)) + for label, metric in inferred.items(): + bucket = expected_counts.setdefault(label, {}) + key = metric.key() + bucket[key] = bucket.get(key, 0) + 1 + + return expected_counts, inferences, warnings + + +def run_gate( + *, + patch_spec_path: Path, + source_config_path: Path | None, +) -> Dict[str, object]: + payload = json.loads(patch_spec_path.read_text(encoding="utf-8")) + patch_items = payload.get("patches", []) + if not isinstance(patch_items, list): + raise ValueError("patch-spec field patches must be a list") + + source_specs = _load_source_specs(source_config_path) + fallback_sources = _iter_local_fallback_source_ids(source_specs) + source_cache: Dict[str, Tuple[str, str]] = {} + + results: List[Dict[str, object]] = [] + fail_count = 0 + checked_count = 0 + + for item in patch_items: + if not isinstance(item, dict): + continue + label = str(item.get("label", "")).strip() or "" + replacement = str(item.get("replacement", "")) + patch_map = _extract_patch_label_map(replacement) + if not _is_opioid_context(replacement): + continue + if not ({"ITT", "mITT"} <= set(patch_map.keys())): + continue + + checked_count += 1 + source_ids = [ + sid for sid in _extract_source_ids_from_replacement(replacement) if sid in source_specs + ] + if not source_ids: + source_ids = fallback_sources + + expected_counts, inferences, warnings = _build_expected_map( + source_ids=source_ids, + source_specs=source_specs, + cache=source_cache, + ) + expected_keys = {k: sorted(v.keys()) for k, v in expected_counts.items() if v} + + status = "PASS" + reason = "ok" + + if not expected_counts.get("ITT") or not expected_counts.get("mITT"): + status = "FAIL" + reason = "unverifiable_label_binding" + else: + for metric_label in ["ITT", "mITT"]: + proposed_key = patch_map[metric_label].key() + proposed_core_key = patch_map[metric_label].core_key() + expected_for_label = set(expected_counts.get(metric_label, {}).keys()) + expected_for_label_core = { + "|".join(value.split("|")[:3]) for value in expected_counts.get(metric_label, {}).keys() + } + opposite = "mITT" if metric_label == "ITT" else "ITT" + expected_for_opposite = set(expected_counts.get(opposite, {}).keys()) + expected_for_opposite_core = { + "|".join(value.split("|")[:3]) for value in expected_counts.get(opposite, {}).keys() + } + + if proposed_key in expected_for_label or proposed_core_key in expected_for_label_core: + continue + if proposed_key in expected_for_opposite or proposed_core_key in expected_for_opposite_core: + status = "FAIL" + reason = f"swapped_label_binding:{metric_label}->{opposite}" + break + status = "FAIL" + reason = f"label_value_mismatch:{metric_label}" + break + + if status == "PASS": + for metric_label in ["ITT", "mITT"]: + if len(expected_counts.get(metric_label, {})) > 1: + status = "FAIL" + reason = f"source_value_conflict:{metric_label}" + break + + if status == "FAIL": + fail_count += 1 + + results.append( + { + "patch_label": label, + "status": status, + "reason": reason, + "source_ids": source_ids, + "warnings": warnings, + "replacement_map": {k: v.key() for k, v in patch_map.items()}, + "expected_map": expected_keys, + "source_inference": [ + { + "source_id": x.source_id, + "detail": x.detail, + "label_map": {k: v.key() for k, v in x.label_map.items()}, + } + for x in inferences + ], + } + ) + + return { + "status": "PASS" if fail_count == 0 else "FAIL", + "candidate_patch_count": checked_count, + "fail_count": fail_count, + "results": results, + } + + +def main() -> int: + parser = argparse.ArgumentParser(description="Check label-value consistency for high-risk subgroup metrics.") + parser.add_argument("--patch-spec", type=Path, required=True) + parser.add_argument("--source-config", type=Path, default=None) + parser.add_argument("--output-json", type=Path, default=None) + parser.add_argument( + "--run-dir", + type=Path, + default=None, + help="Run directory root. If used with --run-id and --output-json omitted, " + "defaults to /reports/label_value_consistency_.json", + ) + parser.add_argument("--run-id", type=str, default=None) + args = parser.parse_args() + + if not args.patch_spec.exists(): + parser.error(f"patch spec not found: {args.patch_spec}") + if args.source_config is not None and not args.source_config.exists(): + parser.error(f"source config not found: {args.source_config}") + + if args.run_dir is not None and args.output_json is None: + if not args.run_id: + parser.error("--run-id is required when --run-dir is used without --output-json") + if not is_valid_run_id(args.run_id): + parser.error(f"Invalid --run-id format: {args.run_id}") + args.output_json = args.run_dir / "reports" / f"label_value_consistency_{args.run_id}.json" + + payload = run_gate( + patch_spec_path=args.patch_spec, + source_config_path=args.source_config, + ) + out = json.dumps(payload, ensure_ascii=False, indent=2) + print(out) + if args.output_json is not None: + args.output_json.parent.mkdir(parents=True, exist_ok=True) + args.output_json.write_text(out + "\n", encoding="utf-8") + + return 0 if payload["status"] == "PASS" else 2 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/check_revise_sources.py b/src/openrevise/gates/check_revise_sources.py similarity index 97% rename from scripts/check_revise_sources.py rename to src/openrevise/gates/check_revise_sources.py index ba92cf6..f4f376c 100644 --- a/scripts/check_revise_sources.py +++ b/src/openrevise/gates/check_revise_sources.py @@ -26,7 +26,7 @@ def _maybe_reexec_runtime_python() -> None: if os.environ.get("REVISE_NO_REEXEC") == "1": return - repo_root = Path(__file__).resolve().parents[1] + repo_root = Path(__file__).resolve().parents[3] override = os.environ.get("REVISE_RUNTIME_PYTHON", "").strip() preferred = Path(override) if override else (repo_root / ".venv311" / "bin" / "python") if not preferred.exists(): @@ -46,8 +46,8 @@ def _maybe_reexec_runtime_python() -> None: from pypdf import PdfReader -from evidence_extractors import extract_local_source_text -from run_artifact_utils import is_valid_run_id +from openrevise.sources.evidence_extractors import extract_local_source_text +from openrevise.artifacts.run_artifact_utils import is_valid_run_id @dataclass @@ -321,7 +321,7 @@ def main() -> int: parser.add_argument( "--config", type=Path, - default=Path(__file__).resolve().parents[1] / "config" / "revise_sources.json", + default=Path(__file__).resolve().parents[3] / "config" / "revise_sources.json", ) parser.add_argument("--output-json", type=Path, default=None) parser.add_argument( diff --git a/scripts/check_revision_sop.py b/src/openrevise/gates/check_revision_sop.py similarity index 99% rename from scripts/check_revision_sop.py rename to src/openrevise/gates/check_revision_sop.py index 544eefe..b40aefc 100644 --- a/scripts/check_revision_sop.py +++ b/src/openrevise/gates/check_revision_sop.py @@ -69,7 +69,7 @@ class ClaimResult: def _maybe_reexec_runtime_python() -> None: if os.environ.get("REVISE_NO_REEXEC") == "1": return - repo_root = Path(__file__).resolve().parents[1] + repo_root = Path(__file__).resolve().parents[3] override = os.environ.get("REVISE_RUNTIME_PYTHON", "").strip() preferred = Path(override) if override else (repo_root / ".venv311" / "bin" / "python") if not preferred.exists(): diff --git a/src/openrevise/pipeline/__init__.py b/src/openrevise/pipeline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/housekeeping.py b/src/openrevise/pipeline/housekeeping.py similarity index 91% rename from scripts/housekeeping.py rename to src/openrevise/pipeline/housekeeping.py index 90d5421..299aa3a 100644 --- a/scripts/housekeeping.py +++ b/src/openrevise/pipeline/housekeeping.py @@ -11,7 +11,7 @@ from pathlib import Path from typing import Dict, List, Set -from run_artifact_utils import ( +from openrevise.artifacts.run_artifact_utils import ( DEFAULT_MARKER, PurgeRecord, is_valid_run_id, @@ -22,7 +22,7 @@ utc_now, write_tsv, ) -from update_run_index import upsert_run_record +from openrevise.artifacts.update_run_index import upsert_run_record DELETED_FIELDS = ["marker", "run_id", "reason", "status_before", "status_after", "path", "deleted_at"] @@ -102,8 +102,8 @@ def _purge_non_key_dirs(run_dir: Path, dry_run: bool) -> List[Path]: return removed -def main() -> int: - repo_root = Path(__file__).resolve().parents[1] +def main(argv: List[str] | None = None) -> int: + repo_root = Path(__file__).resolve().parents[3] parser = argparse.ArgumentParser(description="Retention housekeeping for revise run artifacts.") parser.add_argument("--runs-root", type=Path, default=repo_root / "runs") parser.add_argument("--archive-dir", type=Path, default=repo_root / "archive") @@ -115,7 +115,7 @@ def main() -> int: parser.add_argument("--run-id", default=None) parser.add_argument("--approved-by", default="housekeeping.py") parser.add_argument("--dry-run", action="store_true") - args = parser.parse_args() + args = parser.parse_args(argv) if args.run_id is not None and not is_valid_run_id(args.run_id): print(f"Invalid --run-id: {args.run_id}") @@ -161,17 +161,18 @@ def main() -> int: ) if not args.dry_run: shutil.rmtree(run_dir) - upsert_run_record( - run_index, - { - "marker": args.marker, - "run_id": run_id, - "status": "COLD_ARCHIVED", - "archive_path": str(archive_path), - "notes": f"migrated to archive at {now_iso}", - "retention_policy": args.retention_policy, - }, - ) + if not args.dry_run: + upsert_run_record( + run_index, + { + "marker": args.marker, + "run_id": run_id, + "status": "COLD_ARCHIVED", + "archive_path": str(archive_path), + "notes": f"migrated to archive at {now_iso}", + "retention_policy": args.retention_policy, + }, + ) continue if age_days > args.cold_days: @@ -227,7 +228,7 @@ def main() -> int: if not args.dry_run: archive_path.unlink() - if run_has_purge: + if run_has_purge and not args.dry_run: upsert_run_record( run_index, { diff --git a/scripts/run_revise_pipeline.py b/src/openrevise/pipeline/run_revise_pipeline.py similarity index 71% rename from scripts/run_revise_pipeline.py rename to src/openrevise/pipeline/run_revise_pipeline.py index 88f2606..08d936c 100644 --- a/scripts/run_revise_pipeline.py +++ b/src/openrevise/pipeline/run_revise_pipeline.py @@ -2,8 +2,9 @@ """ Top-level revise pipeline: 1) source gate check -2) tracked DOCX revision -3) Q->source mapping export +2) label-value consistency gate (ITT/mITT high-risk tuples) +3) tracked DOCX revision +4) Q->source mapping export """ from __future__ import annotations @@ -14,7 +15,7 @@ import subprocess import sys from pathlib import Path -from run_artifact_utils import is_valid_run_id +from openrevise.artifacts.run_artifact_utils import is_valid_run_id def _run(cmd: list[str]) -> None: @@ -38,7 +39,7 @@ def main() -> int: parser = argparse.ArgumentParser(description="Run revise pipeline with hard source gate.") parser.add_argument("--input-docx", required=True, type=Path) parser.add_argument("--output-docx", type=Path, default=None) - parser.add_argument("--author", default="Codex") + parser.add_argument("--author", default="OpenRevise") parser.add_argument( "--date", default=dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z"), @@ -46,7 +47,7 @@ def main() -> int: parser.add_argument( "--source-config", type=Path, - default=Path(__file__).resolve().parents[1] / "config" / "revise_sources.json", + default=Path(__file__).resolve().parents[3] / "config" / "revise_sources.json", ) parser.add_argument( "--patch-spec", @@ -64,6 +65,12 @@ def main() -> int: type=Path, default=None, ) + parser.add_argument( + "--label-check-report-json", + type=Path, + default=None, + help="Label-value consistency report JSON.", + ) parser.add_argument( "--run-dir", type=Path, @@ -97,6 +104,11 @@ def main() -> int: action="store_true", help="Allow revising from an input DOCX that already contains tracked changes.", ) + parser.add_argument( + "--skip-label-value-check", + action="store_true", + help="Skip ITT/mITT label-value consistency gate (not recommended).", + ) args = parser.parse_args() if args.run_id is not None and not is_valid_run_id(args.run_id): @@ -107,7 +119,7 @@ def main() -> int: if args.run_dir is not None and args.run_id is None: parser.error("--run-id is required when --run-dir is provided") - repo_root = Path(__file__).resolve().parents[1] + repo_root = Path(__file__).resolve().parents[3] runtime_python = _resolve_runtime_python(repo_root) if args.run_dir is not None: args.output_docx = args.output_docx or (args.run_dir / "revision" / f"revised_{args.run_id}.docx") @@ -115,20 +127,22 @@ def main() -> int: args.run_dir / "reports" / f"source_gate_report_{args.run_id}.json" ) args.q_map_csv = args.q_map_csv or (args.run_dir / "reports" / f"q_source_map_{args.run_id}.csv") + args.label_check_report_json = args.label_check_report_json or ( + args.run_dir / "reports" / f"label_value_consistency_{args.run_id}.json" + ) else: if args.output_docx is None: parser.error("--output-docx is required unless --run-dir and --run-id are provided") args.source_report_json = args.source_report_json or (repo_root / "reports" / "source_gate_report.json") args.q_map_csv = args.q_map_csv or (repo_root / "reports" / "q_source_map.csv") - - base = Path(__file__).resolve().parent - check_script = base / "check_revise_sources.py" - revise_script = base / "revise_docx.py" - qmap_script = base / "build_q_source_map.py" + args.label_check_report_json = args.label_check_report_json or ( + repo_root / "reports" / "label_value_consistency_report.json" + ) check_cmd = [ runtime_python, - str(check_script), + "-m", + "openrevise.gates.check_revise_sources", "--config", str(args.source_config), "--output-json", @@ -145,9 +159,29 @@ def main() -> int: print("Source gate failed. Revision aborted.") return check_proc.returncode + if not args.skip_label_value_check: + label_check_cmd = [ + runtime_python, + "-m", + "openrevise.gates.check_label_value_consistency", + "--patch-spec", + str(args.patch_spec), + "--output-json", + str(args.label_check_report_json), + ] + if args.source_config is not None: + label_check_cmd += ["--source-config", str(args.source_config)] + if args.run_dir is not None: + label_check_cmd += ["--run-dir", str(args.run_dir), "--run-id", args.run_id] + label_check_proc = subprocess.run(label_check_cmd, check=False) + if label_check_proc.returncode != 0: + print("Label-value consistency gate failed. Revision aborted.") + return label_check_proc.returncode + revise_cmd = [ runtime_python, - str(revise_script), + "-m", + "openrevise.revise.revise_docx", "--input-docx", str(args.input_docx), "--output-docx", @@ -168,7 +202,8 @@ def main() -> int: _run( [ runtime_python, - str(qmap_script), + "-m", + "openrevise.artifacts.build_q_source_map", "--input-docx", str(args.output_docx), "--output-csv", @@ -178,6 +213,8 @@ def main() -> int: print(f"Revision output: {args.output_docx}") print(f"Source gate report: {args.source_report_json}") + if not args.skip_label_value_check: + print(f"Label-value consistency report: {args.label_check_report_json}") print(f"Q-source map: {args.q_map_csv}") return 0 diff --git a/scripts/run_revise_pipeline_v2.py b/src/openrevise/pipeline/run_revise_pipeline_v2.py similarity index 80% rename from scripts/run_revise_pipeline_v2.py rename to src/openrevise/pipeline/run_revise_pipeline_v2.py index dff8db7..3c15240 100644 --- a/scripts/run_revise_pipeline_v2.py +++ b/src/openrevise/pipeline/run_revise_pipeline_v2.py @@ -14,7 +14,7 @@ from pathlib import Path from typing import Dict, List -from run_artifact_utils import ( +from openrevise.artifacts.run_artifact_utils import ( ArtifactRecord, DEFAULT_MARKER, RunContext, @@ -28,7 +28,7 @@ utc_now, write_tsv, ) -from update_run_index import upsert_run_record +from openrevise.artifacts.update_run_index import upsert_run_record SYNC_FIELDS = [ @@ -131,7 +131,7 @@ def _append_sync_row( def _parse_args() -> argparse.Namespace: - repo_root = Path(__file__).resolve().parents[1] + repo_root = Path(__file__).resolve().parents[3] parser = argparse.ArgumentParser(description="Run revise pipeline with run-scoped governance.") parser.add_argument("--input-docx", required=True, type=Path) parser.add_argument("--run-id", type=str, default=None) @@ -151,7 +151,7 @@ def _parse_args() -> argparse.Namespace: required=True, help="JSON spec containing generic revision patches and source footnote texts.", ) - parser.add_argument("--author", default="Codex") + parser.add_argument("--author", default="OpenRevise") parser.add_argument( "--date", default=dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z"), @@ -183,6 +183,11 @@ def _parse_args() -> argparse.Namespace: default=None, help="Optional extra copy destination for revised DOCX.", ) + parser.add_argument( + "--skip-label-value-check", + action="store_true", + help="Skip ITT/mITT label-value consistency gate (not recommended).", + ) return parser.parse_args() @@ -190,9 +195,8 @@ def main() -> int: args = _parse_args() ensure_non_empty_marker(args.marker) - repo_root = Path(__file__).resolve().parents[1] + repo_root = Path(__file__).resolve().parents[3] runtime_python = _resolve_runtime_python(repo_root) - scripts_dir = Path(__file__).resolve().parent runs_root = repo_root / "runs" runs_root.mkdir(parents=True, exist_ok=True) (repo_root / "archive").mkdir(parents=True, exist_ok=True) @@ -235,6 +239,7 @@ def main() -> int: intake_copy = run_dir / "intake" / f"input_{run_id}.docx" source_report = run_dir / "reports" / f"source_gate_report_{run_id}.json" + label_value_report = run_dir / "reports" / f"label_value_consistency_{run_id}.json" run_context_file = run_dir / "reports" / f"run_context_{run_id}.json" revised_docx = run_dir / "revision" / f"revised_{run_id}.docx" revision_audit = run_dir / "revision" / f"revision_change_audit_{run_id}.csv" @@ -250,7 +255,15 @@ def main() -> int: safe_copy2(args.input_docx, intake_copy) safe_copy2(args.patch_spec, patch_spec_copy) - for target in [source_report, run_context_file, revised_docx, revision_audit, q_source_map, claim_verdicts]: + for target in [ + source_report, + label_value_report, + run_context_file, + revised_docx, + revision_audit, + q_source_map, + claim_verdicts, + ]: _must_not_exist(target) for target in [sync_manifest, deleted_manifest, artifact_manifest]: @@ -301,7 +314,8 @@ def main() -> int: source_check_rc = _run( [ runtime_python, - str(scripts_dir / "check_revise_sources.py"), + "-m", + "openrevise.gates.check_revise_sources", "--config", str(args.source_config), "--output-json", @@ -318,50 +332,75 @@ def main() -> int: finished_status = "FAILED_GATE" finished_notes = "required source gate failed" else: - revise_cmd = [ - runtime_python, - str(scripts_dir / "revise_docx.py"), - "--input-docx", - str(intake_copy), - "--output-docx", - str(revised_docx), - "--audit-csv", - str(revision_audit), - "--patch-spec", - str(patch_spec_copy), - "--author", - args.author, - "--date", - args.date, - "--run-dir", - str(run_dir), - "--run-id", - run_id, - ] - if args.allow_incremental: - revise_cmd.append("--allow-incremental") - revise_rc = _run(revise_cmd) - if revise_rc != 0: - finished_status = "FAILED_REVISE" - finished_notes = f"revise_docx failed with code {revise_rc}" - else: - qmap_rc = _run( + label_rc = 0 + if not args.skip_label_value_check: + label_rc = _run( [ runtime_python, - str(scripts_dir / "build_q_source_map.py"), - "--input-docx", - str(revised_docx), - "--output-csv", - str(q_source_map), + "-m", + "openrevise.gates.check_label_value_consistency", + "--patch-spec", + str(patch_spec_copy), + "--source-config", + str(args.source_config), + "--output-json", + str(label_value_report), "--run-dir", str(run_dir), "--run-id", run_id, ] ) - if qmap_rc != 0: - finished_status = "FAILED_QMAP" - finished_notes = f"build_q_source_map failed with code {qmap_rc}" + if label_rc != 0: + finished_status = "FAILED_LABEL_GATE" + finished_notes = f"label-value consistency gate failed with code {label_rc}" + else: + revise_cmd = [ + runtime_python, + "-m", + "openrevise.revise.revise_docx", + "--input-docx", + str(intake_copy), + "--output-docx", + str(revised_docx), + "--audit-csv", + str(revision_audit), + "--patch-spec", + str(patch_spec_copy), + "--author", + args.author, + "--date", + args.date, + "--run-dir", + str(run_dir), + "--run-id", + run_id, + ] + if args.allow_incremental: + revise_cmd.append("--allow-incremental") + revise_rc = _run(revise_cmd) + if revise_rc != 0: + finished_status = "FAILED_REVISE" + finished_notes = f"revise_docx failed with code {revise_rc}" + else: + qmap_rc = _run( + [ + runtime_python, + "-m", + "openrevise.artifacts.build_q_source_map", + "--input-docx", + str(revised_docx), + "--output-csv", + str(q_source_map), + "--run-dir", + str(run_dir), + "--run-id", + run_id, + ] + ) + if qmap_rc != 0: + finished_status = "FAILED_QMAP" + finished_notes = f"build_q_source_map failed with code {qmap_rc}" now_iso = to_iso_z(utc_now()) @@ -430,7 +469,7 @@ def add_artifact( "input_docx_copy", intake_copy, "intake", - "run_revise_pipeline_v2.py", + "openrevise.pipeline.run_revise_pipeline_v2", str(args.input_docx), "HOT", "input", @@ -439,7 +478,7 @@ def add_artifact( "patch_spec_copy", patch_spec_copy, "scope", - "run_revise_pipeline_v2.py", + "openrevise.pipeline.run_revise_pipeline_v2", str(args.patch_spec), "PERMANENT", "patch_spec", @@ -448,7 +487,7 @@ def add_artifact( "run_context", run_context_file, "reports", - "run_revise_pipeline_v2.py", + "openrevise.pipeline.run_revise_pipeline_v2", "", "PERMANENT", "run_context", @@ -457,16 +496,25 @@ def add_artifact( "source_gate_report", source_report, "gate", - "check_revise_sources.py", + "openrevise.gates.check_revise_sources", str(args.source_config), "HOT", "source_gate_report", ) + add_artifact( + "label_value_consistency_report", + label_value_report, + "gate", + "openrevise.gates.check_label_value_consistency", + str(patch_spec_copy), + "HOT", + "label_value_consistency_report", + ) add_artifact( "revised_docx", revised_docx, "revise", - "revise_docx.py", + "openrevise.revise.revise_docx", str(patch_spec_copy), "PERMANENT", "revised_docx", @@ -475,7 +523,7 @@ def add_artifact( "revision_change_audit", revision_audit, "revise", - "revise_docx.py", + "openrevise.revise.revise_docx", str(revised_docx), "PERMANENT", "change_audit", @@ -484,7 +532,7 @@ def add_artifact( "q_source_map", q_source_map, "reports", - "build_q_source_map.py", + "openrevise.artifacts.build_q_source_map", str(revised_docx), "PERMANENT", "q_source_map", @@ -493,7 +541,7 @@ def add_artifact( "claim_verdicts", claim_verdicts, "verify", - "run_revise_pipeline_v2.py", + "openrevise.pipeline.run_revise_pipeline_v2", "", "HOT", "claim_verdicts", @@ -550,7 +598,8 @@ def add_artifact( hk_rc = _run( [ runtime_python, - str(scripts_dir / "housekeeping.py"), + "-m", + "openrevise.pipeline.housekeeping", "--marker", args.marker, "--retention-policy", diff --git a/src/openrevise/revise/__init__.py b/src/openrevise/revise/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/revise_docx.py b/src/openrevise/revise/revise_docx.py similarity index 97% rename from scripts/revise_docx.py rename to src/openrevise/revise/revise_docx.py index 51a75e1..41821c2 100644 --- a/scripts/revise_docx.py +++ b/src/openrevise/revise/revise_docx.py @@ -21,7 +21,7 @@ from typing import Dict, Iterable, List, Tuple import xml.etree.ElementTree as ET -from run_artifact_utils import is_valid_run_id +from openrevise.artifacts.run_artifact_utils import is_valid_run_id W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" @@ -453,7 +453,7 @@ def main() -> int: action="store_true", help="Allow using an input DOCX that already contains tracked revisions (w:ins/w:del).", ) - parser.add_argument("--author", default="Codex") + parser.add_argument("--author", default="OpenRevise") parser.add_argument( "--date", default=dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z"), @@ -486,8 +486,20 @@ def main() -> int: patches, source_texts = load_patch_spec(args.patch_spec) - document_root = load_xml_from_docx(args.input_docx, "word/document.xml") - footnotes_root = load_xml_from_docx(args.input_docx, "word/footnotes.xml") + try: + document_root = load_xml_from_docx(args.input_docx, "word/document.xml") + except KeyError: + print("Invalid DOCX: missing word/document.xml", file=sys.stderr) + return 1 + try: + footnotes_root = load_xml_from_docx(args.input_docx, "word/footnotes.xml") + except KeyError: + print( + "Input DOCX is missing word/footnotes.xml; this tool requires a baseline DOCX " + "with footnotes support.", + file=sys.stderr, + ) + return 1 existing_ids = existing_footnote_ids(footnotes_root) existing_text_map = footnote_text_map(footnotes_root) diff --git a/src/openrevise/sources/__init__.py b/src/openrevise/sources/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/openrevise/sources/biomed/__init__.py b/src/openrevise/sources/biomed/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/openrevise/sources/biomed/europepmc.py b/src/openrevise/sources/biomed/europepmc.py new file mode 100644 index 0000000..13c7f6e --- /dev/null +++ b/src/openrevise/sources/biomed/europepmc.py @@ -0,0 +1,24 @@ +"""Europe PMC REST client (https://europepmc.org/RestfulWebService).""" +from __future__ import annotations + +from typing import Any, Dict, List + +import requests + +SEARCH_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search" + + +def search(spec: Dict[str, Any]) -> List[Dict[str, Any]]: + """Run a query against Europe PMC; return [{id, title, ...}, ...].""" + query = spec.get("query", "") + limit = spec.get("limit", 25) + params = { + "query": query, + "format": "json", + "pageSize": limit, + } + response = requests.get(SEARCH_URL, params=params, timeout=30) + response.raise_for_status() + data = response.json() + results = data.get("resultList", {}).get("result", []) + return list(results) diff --git a/src/openrevise/sources/biomed/pmc.py b/src/openrevise/sources/biomed/pmc.py new file mode 100644 index 0000000..54fa664 --- /dev/null +++ b/src/openrevise/sources/biomed/pmc.py @@ -0,0 +1,28 @@ +"""PMC (PubMed Central) client via NCBI E-utilities.""" +from __future__ import annotations + +from typing import Any, Dict, List + +import requests + +ESEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" + + +def search(spec: Dict[str, Any]) -> List[Dict[str, Any]]: + """Run an esearch query against PMC; return [{pmcid}, ...].""" + query = spec.get("query", "") + limit = spec.get("limit", 20) + api_key = spec.get("api_key") + params: Dict[str, Any] = { + "db": "pmc", + "term": query, + "retmax": limit, + "retmode": "json", + } + if api_key: + params["api_key"] = api_key + response = requests.get(ESEARCH_URL, params=params, timeout=30) + response.raise_for_status() + data = response.json() + id_list = data.get("esearchresult", {}).get("idlist", []) + return [{"pmcid": pmcid} for pmcid in id_list] diff --git a/src/openrevise/sources/biomed/pubmed.py b/src/openrevise/sources/biomed/pubmed.py new file mode 100644 index 0000000..2618fac --- /dev/null +++ b/src/openrevise/sources/biomed/pubmed.py @@ -0,0 +1,35 @@ +"""PubMed client via NCBI E-utilities esearch. + +Returns a list of `{pmid: ...}` dicts. Pair with a downstream esummary/efetch +call when richer metadata is needed; v0 only does ID retrieval. + +E-utilities docs: https://www.ncbi.nlm.nih.gov/books/NBK25500/ +No API key required for low-volume use; consider passing api_key for >3 req/sec. +""" +from __future__ import annotations + +from typing import Any, Dict, List + +import requests + +ESEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" + + +def search(spec: Dict[str, Any]) -> List[Dict[str, Any]]: + """Run an esearch query against PubMed; return [{pmid, ...}, ...].""" + query = spec.get("query", "") + limit = spec.get("limit", 20) + api_key = spec.get("api_key") + params: Dict[str, Any] = { + "db": "pubmed", + "term": query, + "retmax": limit, + "retmode": "json", + } + if api_key: + params["api_key"] = api_key + response = requests.get(ESEARCH_URL, params=params, timeout=30) + response.raise_for_status() + data = response.json() + id_list = data.get("esearchresult", {}).get("idlist", []) + return [{"pmid": pmid} for pmid in id_list] diff --git a/scripts/evidence_extractors.py b/src/openrevise/sources/evidence_extractors.py similarity index 100% rename from scripts/evidence_extractors.py rename to src/openrevise/sources/evidence_extractors.py diff --git a/src/openrevise/sources/preprint/__init__.py b/src/openrevise/sources/preprint/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/openrevise/sources/preprint/arxiv.py b/src/openrevise/sources/preprint/arxiv.py new file mode 100644 index 0000000..2d39c36 --- /dev/null +++ b/src/openrevise/sources/preprint/arxiv.py @@ -0,0 +1,31 @@ +"""Native arXiv API client (https://export.arxiv.org/api/query). No paywall, no auth.""" +from __future__ import annotations + +from urllib.parse import urlencode + +from .router import BackendUnavailable + +ARXIV_ENDPOINT = "https://export.arxiv.org/api/query" + + +def search(spec: dict) -> list: + try: + import feedparser # type: ignore + except ImportError as e: + raise BackendUnavailable( + "feedparser not installed; pip install openrevise[preprint-arxiv]" + ) from e + query = spec.get("query", "") + limit = spec.get("limit", 20) + url = f"{ARXIV_ENDPOINT}?{urlencode({'search_query': query, 'max_results': limit})}" + feed = feedparser.parse(url) + return [ + { + "id": entry.get("id", ""), + "title": entry.get("title", ""), + "summary": entry.get("summary", ""), + "authors": [a.get("name", "") for a in entry.get("authors", [])], + "published": entry.get("published", ""), + } + for entry in feed.entries + ] diff --git a/src/openrevise/sources/preprint/deepxiv.py b/src/openrevise/sources/preprint/deepxiv.py new file mode 100644 index 0000000..2a575c1 --- /dev/null +++ b/src/openrevise/sources/preprint/deepxiv.py @@ -0,0 +1,16 @@ +"""DeepXiv-SDK backend (https://github.com/DeepXiv/deepxiv_sdk). + +Optional dependency. Install with: pip install "openrevise[preprint-deepxiv]" +""" +from __future__ import annotations + +from .router import BackendUnavailable + + +def search(spec: dict) -> list: + try: + import deepxiv_sdk # type: ignore + except ImportError as e: + raise BackendUnavailable("deepxiv_sdk not installed") from e + client = deepxiv_sdk.Client() # placeholder; replace with actual SDK init when wired up + return client.search(spec.get("query", ""), limit=spec.get("limit", 20)) diff --git a/src/openrevise/sources/preprint/router.py b/src/openrevise/sources/preprint/router.py new file mode 100644 index 0000000..30d1320 --- /dev/null +++ b/src/openrevise/sources/preprint/router.py @@ -0,0 +1,33 @@ +"""Preprint router with priority-ordered engines and graceful fallback.""" +from __future__ import annotations + +from typing import Any, Callable, List, Tuple + + +class BackendUnavailable(RuntimeError): + """Raised when a preprint engine is configured but not usable at runtime + (e.g., optional dependency not installed, network unreachable, missing API key). + + The PreprintRouter catches this and tries the next configured engine. + Other exceptions propagate (they indicate real bugs, not unavailability). + """ + + +class PreprintRouter: + """Multi-engine preprint search with primary→fallback dispatch. + + Engines are tried in order. If an engine raises BackendUnavailable, the next + engine is tried. If all engines are unavailable, returns an empty list (soft + degradation). + """ + + def __init__(self, engines: List[Tuple[str, Callable[[dict], list]]]) -> None: + self._engines = list(engines) + + def search(self, spec: dict) -> list: + for _name, engine in self._engines: + try: + return engine(spec) + except BackendUnavailable: + continue + return [] diff --git a/src/openrevise/sources/router.py b/src/openrevise/sources/router.py new file mode 100644 index 0000000..e192660 --- /dev/null +++ b/src/openrevise/sources/router.py @@ -0,0 +1,76 @@ +"""Source-type router: dispatches a source spec to its registered backend.""" +from __future__ import annotations + +from typing import Any, Callable, Dict + + +class BackendNotRegistered(KeyError): + """Raised when a source spec's source_type has no registered backend.""" + + +class SourceRouter: + """Registry of (source_type → backend) mappings. + + A backend is any callable that accepts a source spec dict and returns a result. + Pipeline code calls `router.dispatch(spec)` to route to the appropriate backend. + """ + + def __init__(self) -> None: + self._backends: Dict[str, Callable[[dict], Any]] = {} + + def register(self, source_type: str, backend: Callable[[dict], Any]) -> None: + self._backends[source_type] = backend + + def dispatch(self, spec: dict) -> Any: + source_type = spec.get("source_type", "") + try: + backend = self._backends[source_type] + except KeyError as e: + raise BackendNotRegistered(source_type) from e + return backend(spec) + + +def default_router( + *, + preprint_engine: Callable[[dict], Any] | None = None, +) -> SourceRouter: + """Build a SourceRouter wired with the standard backends. + + - preprint → preprint_engine if provided, else PreprintRouter with deepXIV→arXiv fallback + - biomed → routed by source_id to pubmed/europepmc/pmc + - local_* → evidence_extractors.extract_local_source_text (TODO: wire when needed) + """ + from openrevise.sources.biomed import pubmed, europepmc, pmc + from openrevise.sources.preprint.router import PreprintRouter + from openrevise.sources.preprint import deepxiv as preprint_deepxiv + from openrevise.sources.preprint import arxiv as preprint_arxiv + + if preprint_engine is None: + preprint_router = PreprintRouter( + engines=[ + ("deepxiv", preprint_deepxiv.search), + ("arxiv_native", preprint_arxiv.search), + ] + ) + preprint_engine = preprint_router.search + + def biomed_dispatch(spec: dict) -> Any: + source_id = spec.get("source_id", "") + if source_id == "pubmed": + return pubmed.search(spec) + if source_id == "europe_pmc": + return europepmc.search(spec) + if source_id == "pmc": + return pmc.search(spec) + raise BackendNotRegistered(f"biomed source_id '{source_id}' unknown") + + router = SourceRouter() + router.register("preprint", preprint_engine) + router.register("biomed", biomed_dispatch) + # Compatibility alias: the legacy source_registry.yaml labels pubmed/PMC/europe_pmc + # entries as `source_type: literature`. Register the same biomed dispatch under + # `literature` so existing-registry consumers route correctly. Future cleanup: + # split scholar (openalex/crossref) from biomed (NCBI/EuropePMC) and update the + # registry to use `biomed` directly. + router.register("literature", biomed_dispatch) + return router diff --git a/tests/fixtures/build_docx_no_footnotes.py b/tests/fixtures/build_docx_no_footnotes.py new file mode 100644 index 0000000..cff0d7a --- /dev/null +++ b/tests/fixtures/build_docx_no_footnotes.py @@ -0,0 +1,45 @@ +"""Build a minimal DOCX containing word/document.xml but NOT word/footnotes.xml. + +Used as a fixture for test_docx_missing_part.py. Run as a script when the +fixture needs to be regenerated: + python tests/fixtures/build_docx_no_footnotes.py +""" +from __future__ import annotations +import zipfile +from pathlib import Path + +CONTENT_TYPES_XML = """ + + + + + +""" + +RELS_XML = """ + + + +""" + +DOCUMENT_XML = """ + + + placeholder body + + +""" + + +def build(target: Path) -> None: + target.parent.mkdir(parents=True, exist_ok=True) + with zipfile.ZipFile(target, "w", compression=zipfile.ZIP_DEFLATED) as z: + z.writestr("[Content_Types].xml", CONTENT_TYPES_XML) + z.writestr("_rels/.rels", RELS_XML) + z.writestr("word/document.xml", DOCUMENT_XML) + + +if __name__ == "__main__": + target = Path(__file__).parent / "docx_no_footnotes.docx" + build(target) + print(f"wrote {target}") diff --git a/tests/fixtures/docx_no_footnotes.docx b/tests/fixtures/docx_no_footnotes.docx new file mode 100644 index 0000000..72587cb Binary files /dev/null and b/tests/fixtures/docx_no_footnotes.docx differ diff --git a/tests/test_biomed_clients.py b/tests/test_biomed_clients.py new file mode 100644 index 0000000..e42e554 --- /dev/null +++ b/tests/test_biomed_clients.py @@ -0,0 +1,33 @@ +"""Biomed source clients (PubMed via NCBI E-utilities, Europe PMC via REST, PMC via E-utilities).""" +from __future__ import annotations + +from openrevise.sources.biomed.pubmed import search as pubmed_search +from openrevise.sources.biomed.europepmc import search as epmc_search +from openrevise.sources.biomed.pmc import search as pmc_search + + +def test_pubmed_esearch_parses_id_list(requests_mock): + requests_mock.get( + "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi", + json={"esearchresult": {"idlist": ["12345", "67890"]}}, + ) + result = pubmed_search({"query": "HFrEF SGLT2", "limit": 10}) + assert [r["pmid"] for r in result] == ["12345", "67890"] + + +def test_europepmc_search_parses_results(requests_mock): + requests_mock.get( + "https://www.ebi.ac.uk/europepmc/webservices/rest/search", + json={"resultList": {"result": [{"id": "PMC1", "title": "x"}]}}, + ) + result = epmc_search({"query": "anything"}) + assert result[0]["id"] == "PMC1" + + +def test_pmc_esearch_parses_id_list(requests_mock): + requests_mock.get( + "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi", + json={"esearchresult": {"idlist": ["PMC123"]}}, + ) + result = pmc_search({"query": "test"}) + assert result[0]["pmcid"] == "PMC123" diff --git a/tests/test_docx_missing_part.py b/tests/test_docx_missing_part.py new file mode 100644 index 0000000..bf17cc5 --- /dev/null +++ b/tests/test_docx_missing_part.py @@ -0,0 +1,39 @@ +"""Regression test: revise_docx must report a friendly error when DOCX is missing required parts.""" +from __future__ import annotations +import json +import subprocess +import sys +from pathlib import Path + +FIXTURE = Path(__file__).parent / "fixtures" / "docx_no_footnotes.docx" + + +def test_missing_footnotes_returns_friendly_error(tmp_path: Path): + out = tmp_path / "out.docx" + spec = tmp_path / "patch.json" + # Need a non-empty patches list to get past load_patch_spec; the patch + # itself never gets evaluated because load_xml_from_docx fails first on + # the missing word/footnotes.xml part. + spec.write_text(json.dumps({ + "footnote_sources": {}, + "patches": [{"label": "P1", "anchor": "placeholder body", "replacement": "x", "reason": "r"}], + })) + + proc = subprocess.run( + [ + sys.executable, "-m", "openrevise.revise.revise_docx", + "--input-docx", str(FIXTURE), + "--output-docx", str(out), + "--patch-spec", str(spec), + ], + capture_output=True, text=True, + ) + assert proc.returncode != 0, f"expected non-zero exit; got 0. stderr={proc.stderr}" + assert "footnotes.xml" in proc.stderr, f"expected friendly error mentioning footnotes.xml; got stderr={proc.stderr}" + # Friendly handler must avoid surfacing a raw Python traceback. + assert "Traceback" not in proc.stderr, ( + f"expected friendly error, got raw traceback. stderr={proc.stderr}" + ) + assert "KeyError" not in proc.stderr, ( + f"expected friendly error, got raw KeyError. stderr={proc.stderr}" + ) diff --git a/tests/test_housekeeping_dryrun.py b/tests/test_housekeeping_dryrun.py new file mode 100644 index 0000000..0324c95 --- /dev/null +++ b/tests/test_housekeeping_dryrun.py @@ -0,0 +1,104 @@ +"""Regression test: housekeeping --dry-run must not mutate run_index.tsv.""" + +import os +import time +from pathlib import Path + +import pytest + +from openrevise.pipeline import housekeeping + + +def _make_run_id(age_days: float) -> tuple[str, float]: + """Build a valid run_id whose embedded timestamp is `age_days` in the past.""" + old_ts = time.time() - age_days * 24 * 3600 + old_struct = time.gmtime(old_ts) + run_id = time.strftime("%Y%m%dT%H%M%SZ", old_struct) + "_ABCDEF" + return run_id, old_ts + + +def test_dry_run_does_not_mutate_run_index(tmp_path: Path): + """Cold-archive branch (line ~164 guard): hot < age <= cold.""" + runs_root = tmp_path / "runs" + archive_dir = tmp_path / "archive" + reports_dir = tmp_path / "reports" + runs_root.mkdir() + archive_dir.mkdir() + reports_dir.mkdir() + + run_index = reports_dir / "run_index.tsv" + initial = "marker\trun_id\tstatus\n" + run_index.write_text(initial) + before = run_index.read_bytes() + + # Construct a run dir old enough to trigger COLD_ARCHIVED handling. + # is_valid_run_id expects the format YYYYMMDDTHHMMSSZ_AAAAAA where + # the last component is 6 alphanumerics (uppercase hex by convention). + # Pick a timestamp ~60 days ago so age_days falls between + # --hot-days=1 and --cold-days=180. + run_id, old_ts = _make_run_id(60) + run_dir = runs_root / run_id + run_dir.mkdir() + os.utime(run_dir, (old_ts, old_ts)) + + args_list = [ + "--runs-root", str(runs_root), + "--archive-dir", str(archive_dir), + "--reports-dir", str(reports_dir), + "--hot-days", "1", + "--cold-days", "180", + "--dry-run", + ] + rc = housekeeping.main(args_list) + assert rc == 0 + assert run_index.read_bytes() == before, ( + "dry-run must not mutate run-index (COLD_ARCHIVED branch)" + ) + + +def test_dry_run_expired_branch_does_not_mutate_run_index(tmp_path: Path): + """Expired-purge branch (line ~231 guard): age > cold_days, with run_has_purge=True. + + `_purge_non_key_dirs` walks subdirs ('intake', 'sources_raw', 'sources_parsed', + 'scope', 'verify', 'tmp') and appends each existing one to its `removed` list + even under --dry-run. The for-loop over `removed` then sets + `run_has_purge = True`, so we trigger the EXPIRED_NONKEY_PURGED upsert path + by simply creating one such non-key subdir under the run. + """ + runs_root = tmp_path / "runs" + archive_dir = tmp_path / "archive" + reports_dir = tmp_path / "reports" + runs_root.mkdir() + archive_dir.mkdir() + reports_dir.mkdir() + + run_index = reports_dir / "run_index.tsv" + initial = "marker\trun_id\tstatus\n" + run_index.write_text(initial) + before = run_index.read_bytes() + + # 400 days old > --cold-days=180, so the EXPIRED branch fires. + run_id, old_ts = _make_run_id(400) + run_dir = runs_root / run_id + run_dir.mkdir() + # Create a non-key subdir so _purge_non_key_dirs records it as "removed", + # which sets run_has_purge=True and would otherwise drive an + # EXPIRED_NONKEY_PURGED upsert. + (run_dir / "intake").mkdir() + os.utime(run_dir, (old_ts, old_ts)) + + args_list = [ + "--runs-root", str(runs_root), + "--archive-dir", str(archive_dir), + "--reports-dir", str(reports_dir), + "--hot-days", "1", + "--cold-days", "180", + "--dry-run", + ] + rc = housekeeping.main(args_list) + assert rc == 0 + assert run_index.read_bytes() == before, ( + "dry-run must not mutate run-index (EXPIRED_NONKEY_PURGED branch)" + ) + # Sanity: dry-run must also not actually delete the non-key subdir. + assert (run_dir / "intake").exists(), "dry-run must not delete files" diff --git a/tests/test_label_binding_swap.py b/tests/test_label_binding_swap.py new file mode 100644 index 0000000..68ca0e8 --- /dev/null +++ b/tests/test_label_binding_swap.py @@ -0,0 +1,62 @@ +"""Regression test: SOP §Q15 — patch with swapped ITT/mITT must be blocked by the gate.""" +from __future__ import annotations +import json +import subprocess +import sys +from pathlib import Path + + +def test_swapped_itt_mitt_blocks(tmp_path: Path): + spec = tmp_path / "patch.json" + src = tmp_path / "src.txt" + + # Patch claims ITT median 18 vs 12 HR=0.6, mITT 22 vs 14 HR=0.55. + # Source places ITT with 22 vs 14 HR=0.55 (so the patch SWAPPED them). + spec.write_text(json.dumps({ + "footnote_sources": {"src_demo": "Source: demo source"}, + "required_sources": { + "src_demo": { + "type": "local_txt", + "path": str(src), + } + }, + "patches": [ + { + "label": "Q15", + "anchor": "opioid use (ITT)", + "replacement": ( + "opioid use (ITT) summary. " + "ITT: median 18 vs 12 HR 0.60 p 0.001 [[fn:src_demo]]; " + "mITT: median 22 vs 14 HR 0.55 p 0.001 [[fn:src_demo]]." + ), + } + ], + })) + # Source uses "median A B" format (no "vs"); regex requires median prefix. + # Source binds ITT -> (22, 14, HR 0.55, p 0.001); mITT -> (18, 12, HR 0.60, p 0.001). + # Patch swapped them. + src.write_text( + "opioid use (ITT) summary. " + "ITT analysis: median 22.0 14.0 HR 0.55 p 0.001. " + "mITT analysis: median 18.0 12.0 HR 0.60 p 0.001." + ) + out = tmp_path / "label_check_report.json" + proc = subprocess.run( + [ + sys.executable, "-m", "openrevise.gates.check_label_value_consistency", + "--patch-spec", str(spec), + "--source-config", str(spec), + "--output-json", str(out), + ], + capture_output=True, text=True, + ) + # Module must exist and run; "No module named ..." would mean the gate isn't ported. + assert "No module named" not in proc.stderr, ( + f"gate module missing: {proc.stderr}" + ) + assert proc.returncode != 0, f"expected non-zero exit when labels are swapped; stderr={proc.stderr}" + assert out.exists(), f"gate must emit a report; stdout={proc.stdout}; stderr={proc.stderr}" + report = out.read_text() + assert "swapped_label_binding" in report or "swap" in report.lower(), ( + f"expected report to mention swapped binding; got {report[:500]}" + ) diff --git a/tests/test_leak_guard.py b/tests/test_leak_guard.py new file mode 100644 index 0000000..79cb492 --- /dev/null +++ b/tests/test_leak_guard.py @@ -0,0 +1,39 @@ +"""Self-check test: no sensitive tokens in tracked files.""" + +import re +import subprocess + +FORBIDDEN = re.compile(r"PANOVA|阿片|opioid use \(ITT\)|/Users/star/") + +# Files that necessarily reference forbidden tokens for legitimate reasons: +# - tests/test_leak_guard.py: defines the leak-guard pattern itself. +# - src/openrevise/gates/check_label_value_consistency.py: ships a default +# opioid ITT/mITT anchor profile inherited from the private gate. The +# gate is generic in design but currently embeds the anchor literal in +# its regex. +# TODO(refactor): make profile registry; remove allow-list entry after +# desensitization (LabelBindingProfile migration, design doc step 7). +# - tests/test_label_binding_swap.py: regression fixture exercises the +# opioid-anchor code path; will become a generic profile fixture once +# the registry refactor lands. +SELF_REFERENCE_FILES = frozenset({ + "tests/test_leak_guard.py", + "src/openrevise/gates/check_label_value_consistency.py", + "tests/test_label_binding_swap.py", +}) + + +def test_no_sensitive_tokens_in_tracked_files(): + files = subprocess.check_output(["git", "ls-files"], text=True).splitlines() + offenders = [] + for path in files: + if path in SELF_REFERENCE_FILES: + continue + try: + with open(path, encoding="utf-8", errors="ignore") as f: + content = f.read() + except (FileNotFoundError, IsADirectoryError): + continue + if FORBIDDEN.search(content): + offenders.append(path) + assert not offenders, f"Sensitive tokens found in: {offenders}" diff --git a/tests/test_preprint_fallback.py b/tests/test_preprint_fallback.py new file mode 100644 index 0000000..607dcee --- /dev/null +++ b/tests/test_preprint_fallback.py @@ -0,0 +1,42 @@ +"""Preprint router falls back from primary engine to secondary on BackendUnavailable.""" +from __future__ import annotations +from openrevise.sources.preprint.router import PreprintRouter, BackendUnavailable + + +def test_falls_back_when_primary_unavailable(): + seen = [] + + def primary(spec): + seen.append("primary") + raise BackendUnavailable("deepxiv not installed") + + def fallback(spec): + seen.append("fallback") + return [{"id": "arxiv:2603.00084", "title": "DeepXiv-SDK"}] + + router = PreprintRouter(engines=[("deepxiv", primary), ("arxiv_native", fallback)]) + out = router.search({"query": "DeepXiv"}) + assert seen == ["primary", "fallback"] + assert out[0]["id"].startswith("arxiv:") + + +def test_disabled_engine_returns_empty(): + router = PreprintRouter(engines=[]) + assert router.search({"query": "anything"}) == [] + + +def test_primary_success_skips_fallback(): + seen = [] + + def primary(spec): + seen.append("primary") + return [{"id": "arxiv:1234"}] + + def fallback(spec): + seen.append("fallback") + return [{"id": "arxiv:9999"}] + + router = PreprintRouter(engines=[("primary", primary), ("fallback", fallback)]) + out = router.search({"query": "x"}) + assert seen == ["primary"] + assert out[0]["id"] == "arxiv:1234" diff --git a/tests/test_router_default_wiring.py b/tests/test_router_default_wiring.py new file mode 100644 index 0000000..7180872 --- /dev/null +++ b/tests/test_router_default_wiring.py @@ -0,0 +1,52 @@ +"""default_router() wires preprint/biomed/local backends correctly.""" +from __future__ import annotations +import pytest + +from openrevise.sources.router import default_router + + +def test_default_router_dispatches_arxiv_to_preprint(): + calls = [] + + def fake_preprint_engine(spec): + calls.append(spec.get("source_id")) + return [{"id": "fake"}] + + router = default_router(preprint_engine=fake_preprint_engine) + out = router.dispatch({"source_id": "arxiv", "source_type": "preprint", "query": "x"}) + assert calls == ["arxiv"] + assert out[0]["id"] == "fake" + + +def test_default_router_dispatches_pubmed(monkeypatch): + calls = [] + monkeypatch.setattr( + "openrevise.sources.biomed.pubmed.search", + lambda spec: calls.append("pubmed") or [{"pmid": "1"}], + ) + router = default_router() + out = router.dispatch({"source_id": "pubmed", "source_type": "biomed", "query": "x"}) + assert calls == ["pubmed"] + assert out[0]["pmid"] == "1" + + +def test_default_router_dispatches_europepmc(monkeypatch): + calls = [] + monkeypatch.setattr( + "openrevise.sources.biomed.europepmc.search", + lambda spec: calls.append("europepmc") or [{"id": "PMC1"}], + ) + router = default_router() + out = router.dispatch({"source_id": "europe_pmc", "source_type": "biomed", "query": "x"}) + assert calls == ["europepmc"] + + +def test_default_router_dispatches_pmc(monkeypatch): + calls = [] + monkeypatch.setattr( + "openrevise.sources.biomed.pmc.search", + lambda spec: calls.append("pmc") or [{"pmcid": "PMC2"}], + ) + router = default_router() + out = router.dispatch({"source_id": "pmc", "source_type": "biomed", "query": "x"}) + assert calls == ["pmc"] diff --git a/tests/test_router_dispatch.py b/tests/test_router_dispatch.py new file mode 100644 index 0000000..d5e492d --- /dev/null +++ b/tests/test_router_dispatch.py @@ -0,0 +1,21 @@ +"""SourceRouter dispatches by source_type to registered backends.""" +from __future__ import annotations +import pytest + +from openrevise.sources.router import SourceRouter, BackendNotRegistered + + +def test_dispatches_by_source_type(): + router = SourceRouter() + calls = [] + router.register("preprint", lambda spec: calls.append(("preprint", spec)) or "ok") + router.register("biomed", lambda spec: calls.append(("biomed", spec)) or "ok") + spec = {"source_id": "arxiv", "source_type": "preprint", "query": "HFrEF"} + assert router.dispatch(spec) == "ok" + assert calls == [("preprint", spec)] + + +def test_unknown_source_type_raises(): + router = SourceRouter() + with pytest.raises(BackendNotRegistered): + router.dispatch({"source_type": "unknown"})