diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2ab3bcf..4f5565b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,6 +18,12 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + with: + # Phase-5 Track D needs origin/main reachable so + # check-schema-compat can diff against the PR base. Default + # checkout fetches only HEAD; full history is cheap on this + # repo (mostly markdown + small Python). + fetch-depth: 0 - uses: actions/setup-python@v5 with: @@ -92,6 +98,15 @@ jobs: if: github.event_name != 'schedule' run: make check-licenses + - name: Schema-version policing (Phase-5 Track D — per-PR only) + # Per phase5-plan.md §5 D3: this gate fires per-PR only — a + # weekly cron firing wouldn't add signal because schema_compat + # bumps land via PRs by definition. Diffs against origin/main + # so the gate has a stable base ref regardless of which branch + # opened the PR. + if: github.event_name != 'schedule' + run: make check-schema-compat + handshake: # Phase-3 Track D — discovery-protocol handshake. Runs the 8-step # external-agent walk-through end-to-end. On push/PR uses bundled diff --git a/Makefile b/Makefile index 5cc5037..0ccf430 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: catalog validate-catalog check-catalog check-repo-meta phase0-smoke check-docs-prose recipes-check handshake check-freshness check-links check-licenses +.PHONY: catalog validate-catalog check-catalog check-repo-meta phase0-smoke check-docs-prose recipes-check handshake check-freshness check-links check-licenses check-schema-compat # Phase-1 Track B's generator. Fetches each TIER_1+TIER_2+TIER_3 repo's # dist/repo.meta.json, validates it, translates it into a `tools.` @@ -124,3 +124,12 @@ check-links: # substring signature (≥ 2 markers per license). check-licenses: python3 profile/build/check-licenses.py --offline + +# Phase-5 Track D: schema-version policing gate. 
Per-PR only — diffs +# tools.schema.json + task_index.schema.json between the PR base and +# HEAD; fails if schema_compat bumped without a matching +# schema-changelog.md edit, or if a non-additive change (removed +# required field, removed enum value, additionalProperties tightened +# true→false) landed without a schema_compat bump. +check-schema-compat: + python3 profile/build/check-schema-compat.py diff --git a/docs/docs-discoverability/README.md b/docs/docs-discoverability/README.md new file mode 100644 index 0000000..25e9b94 --- /dev/null +++ b/docs/docs-discoverability/README.md @@ -0,0 +1,683 @@ +--- +created: 2026-05-11 +last_modified: 2026-05-11 +revisions: 0 +doc_type: [PROPOSAL, DESIGN, PLAN] +lifecycle: active +owner: rmrich5 +status: accepted +accepted_on: 2026-05-11 +--- + +# Docs Discoverability — m-dev-tools + +> Accepted standard for treating every document in the m-dev-tools org +> as a first-class, CI-enforced artifact. Anchors the next phase of the +> org's docs-as-code stack on top of the existing manifest / catalog / +> link-check infrastructure in `profile/`. + +This document records the **accepted** org-wide documentation standard, +the CI checks that enforce it, and a phased remediation plan for the +108 existing docs across the seven tier-1 and tier-2 repos. The seven +open questions raised during the proposal phase were resolved on +2026-05-11 and folded into §4–§9. The full Q&A is preserved in §13 as +the design record. Execution progress is tracked in +[`phases-tracker.md`](phases-tracker.md). + +--- + +## 1. Why this is needed + +The org already has strong infrastructure for *machine-readable* +artifacts: + +- `profile/repo.meta.schema.json` — per-repo manifest schema. +- `profile/tools.json` + `task_index.json` — catalog consumed by agents. +- `profile/build/validate-catalog.py`, `check-links.py`, + `check-freshness.py` — CI gates that already run per-PR and weekly. 
+- Per-repo `make check-docs-prose` — enforces that `docs/` holds only + prose. + +What is *not yet* enforced is anything about the prose inside `docs/`: + +| Concern | Today | Gap | +|---|---|---| +| Doc has a stated *type* | Frontmatter exists on m-stdlib modules only | No org-wide schema, no enforcement | +| Doc is discoverable | First-pass `docs/README.md` indexes exist in 7 repos | Hand-curated; can drift silently | +| Filename describes content | Mixed: `evolution.md`, `linter-profiles-guide.md` (actually a proposal), `discoveries.md` | No convention; misleading names persist | +| Doc has an owner | Implicit (CODEOWNERS at repo level only) | No per-doc owner; orphaned docs accumulate | +| Doc has a lifecycle | None | Active plans and superseded plans look identical | +| Doc is fresh | `check-freshness.py` exists for manifests | Not extended to prose | +| Cross-repo refs are valid | `check-links.py` covers catalog URLs | Doesn't yet crawl `docs/` markdown links | +| Plan and execution are distinct files | Often merged (`PLAN, BUILD-LOG` combinations seen in 6 docs) | No norm forcing split | + +The org has already done the hard work on machine-readable artifacts; +prose docs are the last unguarded surface. This proposal extends the +existing pipeline rather than starting a parallel one. + +--- + +## 2. Goals & non-goals + +### Goals + +1. **Every doc declares what it is.** Frontmatter with a typed + `doc_type` from a fixed vocabulary. Combinations allowed and + meaningful. +2. **Every doc is reachable from the org catalog.** Org-level builder + walks each repo's `docs/` and produces a single index keyed by type, + repo, lifecycle, and recency. +3. **Filenames tell humans what's inside without opening the file.** + Suffix or numeric-prefix conventions tied to doc_type. +4. **CI rejects drift.** The same per-PR + weekly cron pipeline that + guards `tools.json` guards `docs/`. +5. 
**Existing docs are remediated, not abandoned.** Phased rollout with + a warn-only period so legacy docs can be migrated without a flag day. + +### Non-goals + +- **Prose style guide.** This proposal pins *structure and metadata*, + not voice, tone, line length, or heading style. Those belong in a + separate (later) doc and are partly handled by `markdownlint`. +- **Replacing repo-level READMEs.** Top-level `README.md` remains the + public face of each repo; this proposal governs `docs/` only. +- **Authoring guidance.** "How to write a good ADR" is out of scope + here — referenced templates handle that. +- **Generated-doc enforcement.** Anything regenerated from + `dist/*.json` is already covered by the manifest drift gates. + +--- + +## 3. Principles + +1. **Docs-as-code.** Same review, same CI, same blast radius as source. + No separate "docs platform" with its own auth and storage. +2. **Schema before tooling.** Every check is a thin runner over a + versioned JSON schema in `profile/`. Schemas are the contract; + scripts are replaceable. +3. **Single source of truth per topic.** If two docs cover the same + subject, one is canonical and the other links to it. The catalog + surfaces duplicates so they get merged or one gets marked + `superseded`. +4. **Lifecycle is explicit.** A frozen plan from six months ago and an + active plan from yesterday must look different *in metadata*, not + only by reading-and-dating. +5. **Naming follows content, not history.** A file named `*-guide.md` + that is structurally a design proposal is a bug. CI catches it. +6. **The index is generated.** Hand-curated indexes drift the day after + they're written. `docs/README.md` is rebuilt from frontmatter. +7. **Combinations are first-class refactoring signals.** `[PLAN, + BUILD-LOG]` is allowed today, but the catalog flags it as a + split-candidate. Same for `[PLAN, ROADMAP]`, `[SURVEY, PLAN]`. +8. 
**Warn before block.** Every new gate ships warn-only first; we let + one weekly cron pass before flipping to blocking. + +--- + +## 4. The vocabulary + +Established in a first pass (2026-05-11) and now stamped into every +doc's `doc_type` field. Reproduced here for reference; the canonical +list lives in `profile/docs.schema.json` (proposed below). + +**Types** (23) — `HISTORY` · `ARCHITECTURE` · `DESIGN` · `ADR` · `SPEC` +· `REFERENCE` · `GUIDE` · `TUTORIAL` · `ROADMAP` · `PLAN` · `RESEARCH` +· `SURVEY` · `GAP-ANALYSIS` · `STATUS` · `EXPLAINER` · `NOTES` · +`WORKED-EXAMPLE` · `SETUP` · `INTEGRATION` · `PROPOSAL` · `BUILD-LOG` +· `CHANGELOG` · `POSTMORTEM`. + +**Connections** (6, lowercase) — `history` · `function` · `design` · +`architecture` · `planning` · `implementation`. + +Connections are **optional** in frontmatter (Phase 1 decision). The +first-pass labeling already captured connections per-doc in each +repo's `docs/README.md`; they'll be harvested retroactively into +frontmatter when the doc catalog builder lands in Phase 3. Whether +to make them required is revisited then. + +The four Diátaxis quadrants map cleanly onto a subset, which is a +useful sanity check that the taxonomy is coherent: + +| Diátaxis | Our type | +|---|---| +| Tutorial (learning-oriented) | `TUTORIAL` | +| How-to (task-oriented) | `GUIDE` | +| Reference (information-oriented) | `REFERENCE` | +| Explanation (understanding-oriented) | `EXPLAINER` | + +The remaining 19 types cover artifacts Diátaxis doesn't address +explicitly: decisions (`ADR`, `PROPOSAL`), forward-looking work +(`PLAN`, `ROADMAP`), backward-looking artifacts (`HISTORY`, +`BUILD-LOG`, `CHANGELOG`, `POSTMORTEM`), analytical work (`SURVEY`, +`RESEARCH`, `GAP-ANALYSIS`), state snapshots (`STATUS`), and the +inevitable bucket (`NOTES`). + +### Combinations and split-candidates + +Combinations are allowed but flagged. 
The first-pass labeling surfaced +these patterns: + +| Combination | Count | Refactor heuristic | +|---|---|---| +| `[PLAN, BUILD-LOG]` | 1 | Split into a frozen plan + a live log | +| `[PLAN, ROADMAP]` | 2 | Demote one to a section of the other | +| `[SURVEY, GAP-ANALYSIS]` | 4 | Often correct; survey describes, gap prescribes; consider whether two docs are clearer | +| `[GAP-ANALYSIS, PLAN]` | 2 | Split: gap is descriptive, plan is prescriptive | +| `[HISTORY, BUILD-LOG]` | 2 | Build-log freezes per release; history is the narrative | +| `[REFERENCE, STATUS]` | 1 | Reference is timeless; status is dated. Split | +| `[GUIDE, REFERENCE]` | 2 | Usually fine; pick one as primary, link the other | + +These are *targets for future refactor*, not immediate CI failures. + +--- + +## 5. Frontmatter schema + +Lands as `profile/docs.schema.json` (new, sibling of the existing +manifest schemas). Versioned via `schema-changelog.md`. Validated by a +new `profile/build/validate-docs.py`. + +```yaml +--- +# Required +created: 2026-04-15 # ISO date, first git commit +last_modified: 2026-05-09 # ISO date, latest git commit +revisions: 7 # int, commit count on this file +doc_type: [ADR, DESIGN] # one or more from the 23-vocab +lifecycle: active # draft | active | frozen | superseded | deprecated + +# Strongly encouraged +title: "ADR-0007: Use STDASSERT as the test protocol" +owner: rmrich5 # GitHub handle of accountable owner +connections: [design, architecture] # 0–3 from the 6-vocab + +# Optional, semantic links between docs +replaces: [adr/0003-...md] # this doc supersedes those +supersedes: [adr/0003-...md] # alias of `replaces` +superseded_by: adr/0011-...md # this doc was retired by that +related: [docs/guides/m-tdd-guide.md] + +# Optional, lifecycle +freeze_after: 2026-09-01 # for plans that should be frozen at a date +review_after: 2026-12-01 # surface in the freshness report after this + +# Optional, exclusion +generated: true # if true, this doc is machine-generated; + # 
see §5.1 — docs-QA CI skips it +--- +``` + +Notes: + +- `created`, `last_modified`, `revisions` are populated by tooling from + `git log --follow`, never hand-edited. CI verifies they match. +- `doc_type` is the type vocabulary list. Combinations remain explicit. +- `lifecycle` is the new field this proposal introduces. Default for + the org-wide backfill: `active`. +- `owner` defaults to the repo's top contributor if unset; the warning + surfaces in the catalog so it can be reassigned. + +### Lifecycle states + +| State | Meaning | Visible in catalog | Counts toward freshness gate | +|---|---|---|---| +| `draft` | In progress, not yet authoritative | Yes, flagged | No | +| `active` | Current and authoritative | Yes | Yes (warning at `review_after`) | +| `frozen` | Locked snapshot (e.g., a release plan) | Yes, marked | No | +| `superseded` | Retired in favor of another doc | Yes, with link | No | +| `deprecated` | Kept for archaeology only; do not act on it | Folded into history view | No | + +This is the single biggest authoring change: forward-looking work +(`PLAN`, `ROADMAP`, `PROPOSAL`) gets `lifecycle: draft` while being +written, flips to `active` at acceptance, and to `frozen` when locked. +Live build-logs and trackers stay `active` indefinitely; release plans +flip to `frozen` at GA. + +### 5.1 Generated documents — excluded from docs QA + +**Rule.** Documents that are produced by a generator (e.g., m-stdlib's +`docs/modules/std*.md` written by `make manifest`) are **out of scope +for docs QA/CI**. Their lifecycle is machine-driven: the generator is +the source of truth, and the existing manifest drift gate +(`make check-manifest` in each generating repo) already guards their +integrity. + +**How a doc is marked generated.** Frontmatter field +`generated: true`. Set by the generator on every regeneration; never +hand-edited. + +**What the docs validator does with `generated: true` docs.** + +1. 
Skips all docs-QA checks listed in §9 (frontmatter validation, + filename rules, required sections, README index membership, + markdownlint). +2. Asserts only that: + - `generated: true` is present + - The file path is declared in the repo's `repo.meta.json` under a + new `docs.generated_paths:` field (so a typo or stray marker + can't accidentally hide a hand-written doc from CI) + - The file's `last_modified` matches the latest generator run + (drift gate; identical guarantee to the existing + `dist/*.json` drift check, applied to generated prose) + +**Where the rule lives.** `profile/docs.schema.json` carries the +`generated` field; `profile/repo.meta.schema.json` is extended with +`docs.generated_paths` (a list of path globs) so the exclusion is +declared at repo-manifest time, not silently per-file. + +**What today's generated docs look like.** The 32 `m-stdlib/docs/modules/std*.md` +files are the only known generated docs in the org as of 2026-05-11. +Their existing frontmatter already carries `module:`, `tag:`, `phase:`, +`stable:`, `since:`, `synopsis:`, etc. The generator simply adds +`generated: true` on the next `make manifest` run, and the +docs-QA gate stops trying to validate them against the prose schema. + +--- + +## 6. Standard repo layout for `docs/` + +Already partly enforced by `make check-docs-prose`. 
This proposal pins +the subtree convention: + +``` +/docs/ +├── README.md # generated index — DO NOT hand-edit +├── adr/ # ADRs (numbered NNNN-slug.md), self-contained +│ └── README.md # generated index of ADRs (optional) +├── plans/ # PLAN, PROPOSAL, ROADMAP +├── guides/ # GUIDE, TUTORIAL, EXPLAINER, SETUP, INTEGRATION +│ # — organised by Diátaxis quadrant (see below) +├── reference/ # REFERENCE (often auto-generated; see §5.1) +├── research/ # RESEARCH, SURVEY, GAP-ANALYSIS +├── history/ # HISTORY, BUILD-LOG, CHANGELOG, POSTMORTEM +└── status/ # STATUS, dated trackers +``` + +Subdirs are *recommended*, not mandatory — repos with only a handful +of docs (e.g., `tree-sitter-m-vscode`) keep them at top-level. The +CI gate enforces that *if a subdir exists, the docs inside it have a +matching `doc_type`*. A `PLAN` doc may not live in `guides/`; a `GUIDE` +doc may not live in `plans/`. + +**`guides/` follows Diátaxis.** Within `guides/`, each doc belongs to +exactly one of the four quadrants defined by the [Diátaxis +framework](https://diataxis.fr): + +| Quadrant | Our type | Audience question | Filename pattern | +|---|---|---|---| +| Tutorial (learning) | `TUTORIAL` | "I'm new — teach me." | `*-tutorial.md` | +| How-to (task) | `GUIDE` | "I have a job — help me do it." | `*-guide.md` or `*-how-to.md` | +| Reference (info) | `REFERENCE` | "I need a fact — show me." | content-derived (no suffix required) | +| Explanation | `EXPLAINER` | "I want to understand — explain it." | `*-explainer.md` | + +`SETUP` and `INTEGRATION` docs are a subspecies of how-to and live in +`guides/` too (`*-setup.md`, `*-integration.md`). The CI gate enforces +the Diátaxis filename pattern within `guides/`. + +This is a major reshuffle for `m-cli/docs/plans/` (which today contains +surveys, status reports, history, and design proposals labeled +`PLAN`). Migration is part of the remediation plan in §10. + +--- + +## 7. 
Filename conventions + +Goal: the filename alone tells a reader what the doc is, without +opening it. + +| Doc type | Filename pattern | Examples | +|---|---|---| +| `ADR` | `NNNN-slug.md` (4-digit) | `0007-stdassert-as-test-protocol.md` | +| `PLAN` | `*-plan.md` | `m-stdlib-implementation-plan.md` | +| `ROADMAP` | `*-roadmap.md` | `m-libraries-roadmap.md` | +| `PROPOSAL` | `*-proposal.md` | `m-environment-tool-proposal.md` | +| `SPEC` | `*-spec.md` or `spec.md` | `spec.md`, `m-doc-grammar-spec.md` | +| `GUIDE` | `*-guide.md` or `*-how-to.md` (Diátaxis how-to) | `m-linting-user-guide.md` | +| `TUTORIAL` | `*-tutorial.md` (Diátaxis tutorial) | `accsum-tutorial.md` | +| `SURVEY` | `*-survey.md` | `m-linting-survey.md` | +| `GAP-ANALYSIS` | `*-gaps.md` | `ydb-dev-tools-gaps.md` | +| `STATUS` (snapshot) | `*-status-YYYY-MM-DD.md` (dated) | `m-linter-status-2026-04-30.md` | +| `STATUS` (live tracker) | `*-tracker.md` (no date — continually updated) | `phases-tracker.md`, `module-tracker.md` | +| `POSTMORTEM` | `postmortem-YYYY-MM-DD-slug.md` | `postmortem-2026-04-29-cache-stampede.md` | +| `BUILD-LOG` | `build-log.md` or `*-build-log.md` | `build-log.md` | +| `CHANGELOG` | `changelog.md` (always at repo root or `tracking/`) | | +| `HISTORY` | `*-history.md` or `evolution.md` | `m-cli-history.md` | +| `WORKED-EXAMPLE` | `*-worked-example.md` or `example-*.md` | `worked-example-accsum.md` | +| `SETUP` | `*-setup.md` | `lsp-setup.md` | +| `INTEGRATION` | `*-integration.md` | `pre-commit-integration.md` | +| `NOTES` | `*-notes.md` | `tree-sitter-notes.md` | +| `EXPLAINER` | `*-explainer.md` or content-derived | `vista-meta-bootstrap-explainer.md` | +| `REFERENCE` | content-derived; no required suffix | `stdjson.md`, `commands.md` | + +Rules: + +- **No `and` in filenames.** `gap-analysis-and-remediation-strategy.md` + is two docs in a trench coat. Split. +- **kebab-case.** No underscores, no spaces, no camelCase. 
+- **No filename collisions across `doc_type` families.** A `*-guide.md` + may not appear under `plans/`; if its content is a design proposal, + rename to `*-proposal.md` and move it. +- **Status snapshots are date-stamped.** Two `m-linter-status.md` files + shouldn't replace each other — they should sit next to each other + with their dates. (Live `*-tracker.md` docs are undated; see table.) + +The CI gate enforces filename ↔ doc_type consistency. + +--- + +## 8. Per-doc-type required sections + +Each doc_type gets a minimal required-structure check. Templates live +in `.github/docs/templates/` (proposed). CI checks heading presence, +not heading order or contents. + +| Doc type | Required H2 sections | +|---|---| +| `ADR` | Status · Context · Decision · Consequences | +| `PLAN` | Goal · Scope · Phases · Success criteria | +| `ROADMAP` | Horizon · Themes · Sequence | +| `PROPOSAL` | Problem · Proposal · Alternatives considered · Open questions | +| `SURVEY` | Scope · Method · Findings · Synthesis | +| `GAP-ANALYSIS` | Target · Current state · Gaps · Recommendations | +| `POSTMORTEM` | Summary · Timeline · Impact · Root cause · Action items | +| `STATUS` | As of · Summary · Per-area state · Next | +| `BUILD-LOG` | (none — chronological entries) | +| `GUIDE` | Audience · Prerequisites · Walkthrough | +| `TUTORIAL` | What you'll build · Prerequisites · Steps · Next | +| `EXPLAINER` | (none — narrative) | +| `SPEC` | Scope · Definitions · Normative requirements · Examples | +| `REFERENCE` | (frontmatter `synopsis:` field carries the summary) | + +These mirror well-established templates (Michael Nygard's ADRs, +Google's SRE postmortem template, the Diátaxis types). + +--- + +## 9. CI enforcement + +A new `profile/build/validate-docs.py` script runs per-PR on every +repo, mirroring how `validate-repo-meta.py` runs today. Lives as a +make target `make check-docs` in each repo's Makefile. + +### Phase 1 — frontmatter and indexing (warn-only, then blocking) + +0. 
**Generated-doc gate (always first).** If `generated: true`, the + validator runs only the §5.1 sanity checks (declared path, drift + vs. generator) and skips every check below. This rule is + non-overridable. +1. **Frontmatter present.** Every non-generated `.md` under `docs/` + has a valid YAML frontmatter block. +2. **Required keys present.** `created`, `last_modified`, `revisions`, + `doc_type`, `lifecycle`. +3. **`doc_type` values valid.** Each entry in the list is one of the 23 + known types. +4. **`lifecycle` value valid.** One of the five known states. +5. **`created` / `last_modified` / `revisions` match git.** Tooling + recomputes and compares; mismatch is auto-fixable. +6. **`docs/README.md` exists.** Top-level index per repo. +7. **No orphans.** Every `.md` file under `docs/` is referenced by + `docs/README.md`. +8. **No dangling refs.** Every link in `docs/README.md` resolves. + +### Phase 2 — naming and structure (warn-only, then blocking) + +9. **Filename matches doc_type.** Per the table in §7. +10. **Filename is content-derived.** No `and`; kebab-case; no + `temp`/`draft`/`new`/`copy` prefixes. +11. **Required H2 sections present.** Per the table in §8. +12. **`markdownlint` clean.** Standard ruleset, repo-overridable. + +### Phase 3 — cross-repo and freshness (extend existing weekly cron) + +13. **Cross-repo links valid.** Extend the existing + `check-links.py` to crawl `docs/`. +14. **Freshness gate.** `lifecycle: active` docs whose + `last_modified` is older than `review_after` (or 12 months by + default) surface as warnings. +15. **Supersession integrity.** If A says `superseded_by: B`, then B + must say `replaces: [A]`. Bidirectional check. +16. **Combination warnings.** Docs with refactor-candidate + combinations (table in §4) surface as warnings in the catalog. + +### Phase 4 — org-level catalog (new builder) + +17. **`build-doc-catalog.py`** — new sibling of `build-catalog.py`. 
+ Walks every repo's `docs/`, harvests frontmatter, produces + `profile/docs.json` keyed by type / repo / lifecycle / freshness. + Runs in the same weekly cron as the live catalog drift gate. +18. **`docs.schema.json`** — pinned schema for the catalog output, so + downstream consumers (LLM agents, the m-cli `m doc` family) can + rely on a stable contract. + +Every check is opt-in via repo's `make check-docs` target, mirroring +how `check-docs-prose` already works. Per-PR run uses `--offline`; +weekly cron does the cross-repo walk. + +--- + +## 10. Remediation plan + +The bulk work is already done — see §11 — but the existing corpus +still needs cleanup. + +### Phase 0 — DONE (2026-05-11) + +- Vocabulary established (23 types, 6 connections). +- `docs/README.md` written in every repo (7 indexes, 108 entries). +- Frontmatter applied to every existing doc (108 files). +- Initial cross-repo refactor candidates surfaced (see §4 table). +- Standard accepted (this document) and 7 open questions resolved + (see §13). + +### Phase 1 — Schema + warn-only CI (target: 2026-Q2 weeks 1–4) + +- Land `profile/docs.schema.json`. +- Land `profile/build/validate-docs.py` + tests. +- Land `make check-docs` in each repo's Makefile. +- Wire into per-repo CI as **warn-only** for two cron cycles. +- Backfill `lifecycle` field across all 108 docs (default `active`). +- Backfill `owner` field across all 108 docs (default: repo's primary + committer per `git shortlog`). + +### Phase 2 — Block on new docs (target: 2026-Q2 weeks 5–8) + +- Flip CI to **blocking** for any new `.md` added under `docs/`. +- Legacy docs remain warn-only. +- First batch of legacy remediations: + - Rename mis-named files per §7 (estimated ~15 files org-wide + based on the first-pass labeling). + - Split confirmed combinations: + `m-cli/docs/plans/m-linting-implementation-plan.md` + → `m-linting-implementation-plan.md` (frozen) + `m-linting-build-log.md` (live). 
+ Similar for `tree-sitter-m/docs/build-log.md` (which is + `[BUILD-LOG, HISTORY]`). + - Move mis-placed docs out of `plans/` in m-cli (the 4 docs that + are surveys, history, or design proposals). + +### Phase 3 — Block everywhere; org catalog (target: 2026-Q3) + +- Flip CI to **blocking** on legacy docs too. Any doc that hasn't been + remediated by this point gets a `lifecycle: deprecated` stamp and + drops out of the freshness gate. +- Land `profile/build/build-doc-catalog.py` + `profile/docs.json`. +- Publish a rendered HTML view in the `.github` repo (mirroring how + `profile/README.md` surfaces today). + +### Phase 4 — Cross-repo and freshness (target: 2026-Q4) + +- Extend `check-links.py` to crawl `docs/`. +- Wire freshness warnings into the weekly cron. +- Auto-PR for `last_modified` / `revisions` drift (since these are + derived from git, they should auto-update on commit; a hook in each + repo can do this). + +### Remediation budget + +| Phase | Approx. effort | +|---|---| +| Phase 1 (schema + tooling + warn-only) | ~3 dev-days | +| Phase 2 (block-on-new + first remediation batch) | ~2 dev-days | +| Phase 3 (block-on-legacy + catalog) | ~3 dev-days | +| Phase 4 (cross-repo + freshness) | ~2 dev-days | +| **Total** | **~10 dev-days** spread across Q2–Q4 | + +This is small because the org's existing pipeline absorbs most of the +work — there are no new auth surfaces, no new storage, no new +deployment targets. Doc-lint is one more validator under +`profile/build/`. + +--- + +## 11. Mapping to existing infrastructure + +| Existing piece | What we extend | +|---|---| +| `profile/repo.meta.schema.json` | Reference, no change. The new docs schema is a sibling. | +| `profile/tools.json` / `task_index.json` | No change. Catalog stays canonical for *tool* discovery. | +| `profile/build/validate-catalog.py` | Pattern to copy for `validate-docs.py`. | +| `profile/build/check-links.py` | Extend to crawl `docs/` markdown. 
| +| `profile/build/check-freshness.py` | Extend to honor doc-level `review_after`. | +| `profile/build/build-catalog.py` | Pattern to copy for `build-doc-catalog.py`. | +| `make check-docs-prose` (per-repo) | Co-located with new `make check-docs`. | +| `.github/.github/workflows/ci.yml` | One new step: `make check-docs`. | +| Weekly cron handshake | One new job: docs catalog drift + freshness. | + +No new languages, no new dependencies beyond `pyyaml` (already a +transitive dep of `jsonschema`-based validators). + +--- + +## 12. Industry references + +The pattern below is a synthesis from public-facing engineering org +practices. Direct lifts and inspirations: + +- **Spotify Backstage TechDocs** — docs-as-code shipped from each + service's repo; central catalog harvests metadata. The + org-catalog-via-per-repo-manifest model here mirrors that. +- **Michael Nygard's ADR template** — Status / Context / Decision / + Consequences. Adopted verbatim for the ADR required-sections row. +- **Google SRE Workbook — postmortem template** — Summary / Timeline / + Impact / Root cause / Action items. Adopted verbatim. +- **Diátaxis (Daniele Procida)** — Tutorial / How-to / Reference / + Explanation. Used as a sanity check on our type vocabulary (§4). +- **Stripe API docs** — single source of truth, generated catalog, + freshness flags. Pattern for §9 Phase 3. +- **GitHub's `.github` repo convention** — already in use here. This + proposal adds doc-level metadata to that surface. +- **Daniele Procida's "Documentation system" talk (PyCon AU 2017)** — + background for the lifecycle states distinction. +- **Cosmic Python (Harry Percival & Bob Gregory) ADR practice** — + per-decision file, numbered, never edited after acceptance. + +The combinable-types approach (§4) is less standard. Most orgs pick a +single canonical type per doc; we explicitly allow combinations because +they surface refactoring debt rather than hiding it. 
The catalog will +flag combinations as split-candidates, not errors. + +--- + +## 13. Resolved decisions (2026-05-11) + +All seven open questions raised during the proposal phase were resolved +on 2026-05-11. The resolutions are folded into the body of this +document (sections noted below); the Q&A is preserved here as the +design record. + +### Q1 — Adopt Diátaxis as the layout for `guides/`? + +**Decision: YES.** Every doc under `guides/` belongs to exactly one +Diátaxis quadrant. Filename and required-section rules tighten +accordingly. *(Folded into §6, §7, §8.)* + +### Q2 — Per-doc OWNERS vs. frontmatter `owner`? + +**Decision: frontmatter `owner` for now.** A `.github/CODEOWNERS`-style +file remains available as an escalation path if review routing becomes +painful, but is not part of Phase 1. *(Folded into §5.)* + +### Q3 — Should `frozen` plans move to `docs/history/`? + +**Decision: leave in place; mark in frontmatter only.** Moving frozen +docs would break inbound links across repos and across the existing +catalog. The catalog renders frozen docs in a separate view based on +`lifecycle: frozen`. *(Folded into §5 — lifecycle table.)* + +### Q4 — Connections (`connections: […]`) — required or optional? + +**Decision: optional. Harvest retroactively. Revisit later.** Phase 1 +leaves the field optional in the frontmatter schema. Phase 3's catalog +builder will harvest connections from each repo's existing +`docs/README.md` (which already captured them in the first pass) into +the central catalog. Whether to make them required is revisited when +the catalog is in production. *(Folded into §4.)* + +### Q5 — Generated documents + +**Decision: generated documents are excluded from docs QA/CI.** They +have a machine-driven lifecycle and their integrity is already +guarded by the existing manifest drift gates. The exclusion is a +**rule, not a guideline** — it is enforced by: + +1. 
The `generated: true` frontmatter field, which the validator honors + as a hard short-circuit. +2. A new `docs.generated_paths` array in `repo.meta.json` declaring + which path globs are generated, so the marker can't be applied to + hand-written docs to silence CI. +3. A drift check that the file's `last_modified` matches the latest + generator run. + +*(Folded into §5.1 as the canonical statement of the rule. §9 check #0 +implements it.)* + +### Q6 — Cross-repo doc links + +**Decision: full URLs.** Same convention as the existing catalog. An +`org:/` shorthand is deferred indefinitely; the cost of maintaining +full URLs is small relative to the cost of adopting a non-standard +link syntax that breaks every existing markdown renderer. + +### Q7 — Migration of `m-cli/docs/plans/` + +**Decision: one PR with movement-only changes, then per-doc PRs for +substantive splits.** This isolates filesystem moves (reviewable as a +diff of paths) from content changes (reviewable as a diff of text). +The movement-only PR is mechanical; the split PRs each address one +combination from the §4 refactor-candidate table. *(Reflected in §10 +Phase 2.)* + +--- + +## Appendix A — comparison to the first-pass output + +The first-pass index applied 23 types and 6 connections to 108 docs. +The distribution: + +- 32 docs are `[REFERENCE]` (one-third — m-stdlib modules dominate). +- 9 docs are `[ADR]` (m-standard 6, tree-sitter-m 6 — both numbered). +- 11 docs carry combinations from §4's table (refactor candidates). +- 9 docs are STATUS-typed (live trackers, mostly in m-stdlib). + +That distribution is healthy: most docs are single-typed, the +combinations are concentrated in the planning/execution split that +the §6 layout standard separates, and the live trackers are already +in `tracking/`. No huge surprises; the remediation work is small and +mostly mechanical. 
+ +--- + +## Appendix B — what this proposal does NOT do + +- It does not change the existing manifest schemas (`repo.meta`, + `tools.json`, `task_index.json`). Those are for machine-readable + artifacts; this is for prose. +- It does not require existing docs to be rewritten — only re-stamped + with frontmatter (already done) and possibly renamed/moved. +- It does not introduce a separate docs build system. Same Python, + same `make`, same CI as everything else. +- It does not require an external service. The org catalog is one + more JSON file in `profile/`. + +--- + +*End of document. Accepted 2026-05-11; Phase 1 progress is tracked in [`phases-tracker.md`](phases-tracker.md).* diff --git a/docs/docs-discoverability/phases-tracker.md b/docs/docs-discoverability/phases-tracker.md new file mode 100644 index 0000000..6ff5862 --- /dev/null +++ b/docs/docs-discoverability/phases-tracker.md @@ -0,0 +1,176 @@ +--- +created: 2026-05-11 +last_modified: 2026-05-11 +revisions: 0 +doc_type: [STATUS] +lifecycle: active +owner: rmrich5 +title: "Docs Discoverability — Phases Tracker" +--- + +# Docs Discoverability — Phases Tracker + +> Live progress tracker for the org-wide documentation standard +> accepted on 2026-05-11. The standard itself is in +> [`README.md`](README.md). This document tracks execution; the +> README is the contract. + +**Status values** — `done` · `in-progress` · `not-started` · `blocked` · `deferred` + +**Update protocol** — this tracker is updated *in the same commit* as +any change that moves a row's status. Cadence is event-driven, not +scheduled. Closed items stay in the table marked `done`; history is +one of the legal values. + +--- + +## 1. 
Phase summary + +| Phase | Theme | Target | Status | Reference | +|---|---|---|---|---| +| 0 | Vocabulary, frontmatter, indexes, acceptance | 2026-05-11 | done | [§10 P0](README.md#phase-0--done-2026-05-11) | +| 1 | Schema + warn-only CI | 2026-Q2 weeks 1–4 | not-started | [§10 P1](README.md#phase-1--schema--warn-only-ci-target-2026-q2-weeks-14) | +| 2 | Block on new docs; first remediations | 2026-Q2 weeks 5–8 | not-started | [§10 P2](README.md#phase-2--block-on-new-docs-target-2026-q2-weeks-58) | +| 3 | Block everywhere; org catalog | 2026-Q3 | not-started | [§10 P3](README.md#phase-3--block-everywhere-org-catalog-target-2026-q3) | +| 4 | Cross-repo + freshness | 2026-Q4 | not-started | [§10 P4](README.md#phase-4--cross-repo-and-freshness-target-2026-q4) | + +A phase flips to `in-progress` when any of its rows below leave +`not-started`, and to `done` when every row reaches `done`. + +--- + +## 2. Phase 0 — DONE (2026-05-11) + +| ID | Item | Status | Date | Notes | +|---|---|---|---|---| +| P0.1 | Vocabulary established (23 types + 6 connections) | done | 2026-05-11 | Folded into README §4 | +| P0.2 | `docs/README.md` indexes generated in every repo | done | 2026-05-11 | 7 repos, 108 entries | +| P0.3 | Frontmatter applied to every existing doc | done | 2026-05-11 | 108 files; script is idempotent | +| P0.4 | Standard accepted; 7 open questions resolved | done | 2026-05-11 | README §13 captures the Q&A | +| P0.5 | Phases tracker created (this file) | done | 2026-05-11 | | + +--- + +## 3. Phase 1 — Schema + warn-only CI + +Target: **2026-Q2 weeks 1–4**. Owner default: rmrich5 unless reassigned. 
+ +| ID | Item | Status | Target | Blocked-by | Notes | +|---|---|---|---|---|---| +| P1.1 | Land `profile/docs.schema.json` | not-started | Q2 wk 1 | — | Sibling of repo.meta.schema.json | +| P1.2 | Extend `profile/repo.meta.schema.json` with `docs.generated_paths` array | not-started | Q2 wk 1 | — | Enables §5.1 exclusion rule | +| P1.3 | Land `profile/build/validate-docs.py` + tests | not-started | Q2 wk 2 | P1.1, P1.2 | Copy pattern from validate-catalog.py | +| P1.4 | Add `make check-docs` target to each of the 7 repos | not-started | Q2 wk 2 | P1.3 | One PR per repo | +| P1.5 | Wire CI step into per-repo workflows (warn-only) | not-started | Q2 wk 3 | P1.4 | Don't block PRs yet | +| P1.6 | Backfill `lifecycle: active` across all 108 docs | not-started | Q2 wk 3 | P1.1 | Default value; one-time pass | +| P1.7 | Backfill `owner:` across all 108 docs | not-started | Q2 wk 3 | P1.1 | From `git shortlog -sn` per repo | +| P1.8 | Add `generated: true` to m-stdlib `docs/modules/std*.md` | not-started | Q2 wk 4 | P1.2 | Done by `make manifest` regeneration | +| P1.9 | Declare m-stdlib's `docs/modules/` in `docs.generated_paths` | not-started | Q2 wk 4 | P1.2 | One-line edit to repo.meta.json | +| P1.10 | Run weekly cron once with warn-only CI; review noise | not-started | end of Q2 wk 4 | P1.5 | Decision gate before Phase 2 | + +--- + +## 4. CI checks — implementation status + +The 19 checks (#0–#18) defined in [README §9](README.md#9-ci-enforcement) are +listed below. Each check lands in the indicated phase. 
+ +| # | Check | Phase | Status | Implementation note | +|---|---|---|---|---| +| 0 | Generated-doc gate (skip if `generated: true`) | 1 | not-started | Hard short-circuit; first check evaluated | +| 1 | Frontmatter present | 1 | not-started | YAML block at top | +| 2 | Required keys present | 1 | not-started | created, last_modified, revisions, doc_type, lifecycle | +| 3 | `doc_type` values valid | 1 | not-started | From the 23-vocab in docs.schema.json | +| 4 | `lifecycle` value valid | 1 | not-started | One of 5 states | +| 5 | created/last_modified/revisions match git | 1 | not-started | Auto-fixable; tooling regenerates | +| 6 | `docs/README.md` exists | 1 | not-started | Per-repo gate | +| 7 | No orphans (every `.md` in index) | 1 | not-started | Bidirectional check | +| 8 | No dangling refs in `README.md` | 1 | not-started | Local link resolution | +| 9 | Filename matches doc_type | 2 | not-started | Per README §7 table | +| 10 | Filename content-derived (no `and`, kebab-case) | 2 | not-started | | +| 11 | Required H2 sections per doc_type | 2 | not-started | Per README §8 table | +| 12 | markdownlint clean | 2 | not-started | Standard ruleset, repo-overridable | +| 13 | Cross-repo links valid (extend `check-links.py`) | 3 | not-started | Reuses existing weekly cron | +| 14 | Freshness gate (`review_after` honored) | 3 | not-started | Extends `check-freshness.py` | +| 15 | Supersession bidirectional (A.superseded_by ↔ B.replaces) | 3 | not-started | Catalog-time check | +| 16 | Combination warnings in catalog | 3 | not-started | Refactor-candidate surface | +| 17 | `build-doc-catalog.py` + `profile/docs.json` | 4 | not-started | Sibling of build-catalog.py | +| 18 | `docs.schema.json` pinned for catalog output | 4 | not-started | Contract for downstream agents | + +--- + +## 5. Legacy remediation backlog (Phase 2) + +Per [Q7 decision](README.md#q7--migration-of-m-clidocsplans): one PR for +movement-only changes, then per-doc PRs for substantive splits. 
+ +### 5.1 Renames (filename ↔ doc_type mismatch) + +| ID | Repo | From | To | Reason | Status | +|---|---|---|---|---|---| +| R1 | m-cli | `docs/plans/linter-profiles-guide.md` | `docs/plans/linter-profiles-proposal.md` | `[DESIGN, PROPOSAL]` mislabeled "guide" | not-started | +| R2 | m-cli | `docs/evolution.md` | `docs/history/m-cli-history.md` | Generic name; also moves into `history/` per §6 | not-started | +| R3 | m-cli | `docs/plans/m-cli-history-and-evolution.md` | `docs/history/m-cli-evolution.md` | Has `and`; doc_type is `[HISTORY, EXPLAINER]`; also moves | not-started | +| R4 | m-tools | `docs/m-tool-gap-analysis.md` | `docs/m-tool-gaps.md` | Align with `*-gaps.md` convention | not-started | +| R5 | m-tools | `docs/ydb-dev-tools-gap-analysis.md` | `docs/ydb-dev-tools-gaps.md` | Same; file is also a redirect stub | not-started | +| R6 | m-tools | `docs/gap-analysis-and-remediation-strategy.md` | (split — see S4 below) | Has `and`; superseded by split | not-started | + +### 5.2 Moves (subdir ↔ doc_type mismatch — m-cli `docs/plans/`) + +Single PR per [§10 Phase 2](README.md#phase-2--block-on-new-docs-target-2026-q2-weeks-58) +movement-only convention. 
+ +| ID | File (currently in `m-cli/docs/plans/`) | Move to | doc_type | Status | +|---|---|---|---|---| +| M1 | `language-cli-survey.md` | `docs/research/` | `[SURVEY, GAP-ANALYSIS]` | not-started | +| M2 | `m-corpus-catalog.md` | `docs/reference/` | `[REFERENCE, RESEARCH]` | not-started | +| M3 | `m-linter-status-2026-04-30.md` | `docs/status/` | `[STATUS, POSTMORTEM]` | not-started | +| M4 | `m-linting-survey.md` | `docs/research/` | `[SURVEY, GAP-ANALYSIS]` | not-started | +| M5 | `iris-ydb-portability.md` | (stays in `plans/`) | `[PLAN, RESEARCH]` | n/a | +| M6 | `m-env-implementation-plan.md` | (stays in `plans/`) | `[PLAN]` | n/a | +| M7 | `m-environment-tool.md` | (stays in `plans/`) | `[PROPOSAL, DESIGN]` | n/a | + +### 5.3 Splits (combination doc_types — per-doc PRs) + +Per [README §4 refactor heuristics](README.md#combinations-and-split-candidates). + +| ID | File | Combination | Split target | Status | +|---|---|---|---|---| +| S1 | m-cli `docs/plans/m-linting-implementation-plan.md` | `[PLAN, BUILD-LOG]` | frozen plan + live build-log | not-started | +| S2 | tree-sitter-m `docs/build-log.md` | `[BUILD-LOG, HISTORY]` | live build-log + `docs/history/tree-sitter-m-history.md` | not-started | +| S3 | tree-sitter-m `docs/vista-parse-error-categories.md` | `[GAP-ANALYSIS, PLAN]` | gaps doc + remediation plan | not-started | +| S4 | m-tools `docs/gap-analysis-and-remediation-strategy.md` | `[GAP-ANALYSIS, PLAN]` | `docs/m-tools-gaps.md` + `docs/m-tools-remediation-plan.md` | not-started | +| S5 | m-tools `docs/implementation.md` | `[REFERENCE, STATUS]` | timeless reference + dated status | not-started | +| S6 | m-cli `docs/evolution.md` | `[HISTORY, BUILD-LOG]` | history (R2) + per-release build-logs | not-started | + +### 5.4 Reviews (combinations flagged "usually fine; confirm before splitting") + +| ID | File | Combination | Action | Status | +|---|---|---|---|---| +| V1 | m-cli `docs/guide.md` | `[GUIDE, REFERENCE]` | confirm primary; link the secondary 
| not-started | +| V2 | m-standard `docs/m-standards-guide.md` | `[GUIDE, REFERENCE]` | same | not-started | +| V3 | m-standard `docs/m-libraries-remediation.md` | `[PLAN, ROADMAP]` | demote one to a section of the other | not-started | +| V4 | m-cli `docs/plans/language-cli-survey.md` | `[SURVEY, GAP-ANALYSIS]` | typical pair; confirm whether two docs are clearer | not-started | +| V5 | m-cli `docs/plans/m-linting-survey.md` | `[SURVEY, GAP-ANALYSIS]` | same | not-started | +| V6 | m-stdlib `docs/testing/modern-m-corpus-test-results.md` | `[RESEARCH, GAP-ANALYSIS]` | same | not-started | +| V7 | m-tools `docs/m-tool-gap-analysis.md` | `[GAP-ANALYSIS, SURVEY]` | same | not-started | + +--- + +## 6. Phase 2–4 (placeholder) + +Detailed rows are added when the prior phase reaches `done`. Phase 2 +will inherit the legacy remediation backlog above as concrete rows; +Phase 3 will add the catalog builder and freshness gate rows; Phase 4 +will add the cross-repo link extension and the freshness-cron rows. + +--- + +## 7. Open issues encountered during execution + +Captured here as a running log so they survive across owner changes. +Empty for now. + +| Date | Issue | Phase | Resolution | +|---|---|---|---| +| — | — | — | — | diff --git a/profile/build/check-schema-compat.py b/profile/build/check-schema-compat.py new file mode 100644 index 0000000..5314acf --- /dev/null +++ b/profile/build/check-schema-compat.py @@ -0,0 +1,311 @@ +#!/usr/bin/env python3 +"""Phase 5 Track D — schema-version policing gate. + +Diffs ``profile/tools.schema.json`` + ``profile/task_index.schema.json`` +between a base ref (default ``origin/main``) and a head ref (default +``HEAD``). Enforces: + +* If ``schema_compat`` bumps in either schema, ``schema-changelog.md`` + must be modified in the same diff. Otherwise: + ``MISSING_CHANGELOG_ROW``. +* If a non-additive change happens *without* a ``schema_compat`` bump, + surface ``NON_ADDITIVE_WITHOUT_BUMP``. 
def _check_required_shrunk(base: dict, head: dict, details: list[str]) -> bool:
    """Report top-level ``required`` entries that exist in base but not head.

    Only the schema's top-level ``required`` array is compared; a missing
    or ``null`` value is treated as an empty list. One message per removed
    field is appended to ``details`` in sorted order.

    Returns True if at least one field was removed.
    """
    removed = set(base.get("required") or []) - set(head.get("required") or [])
    for field in sorted(removed):
        details.append(
            f"required field {field!r} removed (consumers producing it "
            f"may now fail validation)"
        )
    return bool(removed)


def _check_enums_shrunk(base: dict, head: dict, details: list[str]) -> bool:
    """Report enum values dropped from any top-level property.

    Walks ``properties`` one level deep. A property is compared only when
    it is a dict with a list-valued ``enum`` in *both* schemas; properties
    that were removed outright, or that gained/lost the ``enum`` keyword,
    are not reported by this heuristic.

    Returns True if any compared enum lost a value.
    """
    base_props = base.get("properties") or {}
    head_props = head.get("properties") or {}
    broke = False
    for prop_name, base_prop in base_props.items():
        head_prop = head_props.get(prop_name)
        if not (isinstance(base_prop, dict) and isinstance(head_prop, dict)):
            continue
        base_enum = base_prop.get("enum")
        head_enum = head_prop.get("enum")
        if not (isinstance(base_enum, list) and isinstance(head_enum, list)):
            continue
        # Membership tests instead of set arithmetic: JSON enum members
        # may be arrays or objects, which are unhashable in Python.
        removed: list[Any] = []
        for val in base_enum:
            if val not in head_enum and val not in removed:
                removed.append(val)
        for val in sorted(map(str, removed)):
            details.append(
                f"enum value {val!r} removed from {prop_name}.enum "
                f"(consumers producing it now reject)"
            )
            broke = True
    return broke


def _check_additional_properties_tightened(
    base: dict, head: dict, details: list[str]
) -> bool:
    """Report a top-level ``additionalProperties`` change to ``false``.

    An absent keyword is treated as ``True``. Any base value other than
    ``False`` (``True``, unset, or a sub-schema dict) followed by a head
    value of ``False`` counts as tightening. The reverse direction is a
    loosening and is not reported.
    """
    base_ap = base.get("additionalProperties", True)
    head_ap = head.get("additionalProperties", True)
    if head_ap is not False or base_ap is False:
        return False
    if base_ap is True:
        details.append(
            "additionalProperties tightened from true to false at the "
            "top level (extras previously accepted now reject)"
        )
    else:
        # Base was a sub-schema (dict form) rather than a plain boolean.
        details.append(
            "additionalProperties tightened to false (extras previously "
            "accepted now reject)"
        )
    return True


def _is_non_additive(base: dict, head: dict, details: list[str]) -> bool:
    """Apply the three heuristics; True if any fired.

    Every heuristic runs (no short-circuit) so ``details`` lists all
    findings, not just the first one.
    """
    heuristics = (
        _check_required_shrunk,
        _check_enums_shrunk,
        _check_additional_properties_tightened,
    )
    outcomes = [heuristic(base, head, details) for heuristic in heuristics]
    return any(outcomes)


def check_schema_compat_impl(
    *,
    pairs: list[tuple[str, dict, dict]],
    changelog_modified: bool,
) -> dict[str, Any]:
    """Pure-function analyzer.

    ``pairs``: list of ``(filename, base_dict, head_dict)`` tuples.
    ``changelog_modified``: did ``schema-changelog.md`` appear in the
    PR's file diff?

    A file counts as *bumped* when ``schema_compat`` is an int at both
    refs and the head value is larger. A non-additive change is excused
    only by a bump in the *same* file: bumping one schema does not cover
    a breaking change in another.

    Returns:
        ``{"status": "OK" | "MISSING_CHANGELOG_ROW" | "NON_ADDITIVE_WITHOUT_BUMP",
           "bumped_files": [str],
           "non_additive_files": [str],
           "details": [str]}``

    ``MISSING_CHANGELOG_ROW`` takes precedence when both failures apply.
    """
    bumped_files: list[str] = []
    non_additive_files: list[str] = []
    details: list[str] = []

    for name, base, head in pairs:
        base_compat = base.get("schema_compat")
        head_compat = head.get("schema_compat")
        if (
            isinstance(base_compat, int)
            and isinstance(head_compat, int)
            and head_compat > base_compat
        ):
            bumped_files.append(name)
            details.append(
                f"{name}: schema_compat {base_compat} → {head_compat}"
            )

        file_details: list[str] = []
        if _is_non_additive(base, head, file_details):
            non_additive_files.append(name)
            details.extend(f"{name}: {d}" for d in file_details)

    # Per-file pairing: each breaking file needs its own bump.
    unbumped_breaks = [n for n in non_additive_files if n not in bumped_files]

    if bumped_files and not changelog_modified:
        status = "MISSING_CHANGELOG_ROW"
    elif unbumped_breaks:
        status = "NON_ADDITIVE_WITHOUT_BUMP"
    else:
        status = "OK"

    return {
        "status": status,
        "bumped_files": bumped_files,
        "non_additive_files": non_additive_files,
        "details": details,
    }
def _git_show(repo: Path, ref: str, path: str) -> str | None:
    """Return file contents at ``ref:path``, or None if ``git show`` fails.

    The intended None case is a file that doesn't exist at that ref
    (e.g. it's brand-new in HEAD). Any other non-zero exit from git is
    also reported as None; stderr is captured and discarded.
    """
    result = subprocess.run(
        ["git", "show", f"{ref}:{path}"],
        cwd=repo,
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        return None
    return result.stdout


def _git_files_changed(repo: Path, base: str, head: str) -> set[str]:
    """Return the set of paths changed between ``base`` and ``head``.

    Raises ``subprocess.CalledProcessError`` if ``git diff`` exits
    non-zero.
    """
    result = subprocess.run(
        ["git", "diff", "--name-only", base, head],
        cwd=repo,
        capture_output=True,
        text=True,
        check=True,
    )
    stripped = (line.strip() for line in result.stdout.splitlines())
    return {line for line in stripped if line}


def _load_pair(repo: Path, base: str, head: str, relpath: str) -> tuple[dict, dict] | None:
    """Load ``relpath`` at both refs as JSON.

    Returns ``None`` if the file doesn't exist at either ref (there is
    nothing to compare).

    Raises ``ValueError`` if the file exists at both refs but one side is
    not valid JSON or is not a JSON object. Skipping such a file would let
    a corrupted schema pass the gate unexamined.
    """
    base_text = _git_show(repo, base, relpath)
    head_text = _git_show(repo, head, relpath)
    if base_text is None or head_text is None:
        return None

    parsed: list[dict] = []
    for ref, text in ((base, base_text), (head, head_text)):
        try:
            doc = json.loads(text)
        except json.JSONDecodeError as exc:
            raise ValueError(
                f"{relpath} at {ref} is not valid JSON: {exc}"
            ) from exc
        if not isinstance(doc, dict):
            raise ValueError(f"{relpath} at {ref} is not a JSON object")
        parsed.append(doc)
    return parsed[0], parsed[1]


# ---- CLI -------------------------------------------------------------------


def main(argv: list[str] | None = None) -> int:
    """Run the gate and return the process exit code.

    Returns 0 when the gate passes, 1 when it fails, and 2 when a schema
    cannot be parsed or ``git diff`` fails. A Markdown summary is printed
    to stdout; failure and error lines go to stderr.
    """
    parser = argparse.ArgumentParser(
        description=__doc__.splitlines()[0] if __doc__ else "",
    )
    parser.add_argument(
        "--base",
        default="origin/main",
        help="Base ref to diff from (default: origin/main).",
    )
    parser.add_argument(
        "--head",
        default="HEAD",
        help="Head ref to diff to (default: HEAD).",
    )
    parser.add_argument(
        "--repo",
        type=Path,
        default=REPO_ROOT,
        help="Path to the git repo (default: this repo's root).",
    )
    args = parser.parse_args(argv)

    pairs: list[tuple[str, dict, dict]] = []
    try:
        for name in SCHEMA_FILES:
            relpath = f"{PROFILE_REL}/{name}"
            loaded = _load_pair(args.repo, args.base, args.head, relpath)
            if loaded is None:
                # Absent at one of the refs: nothing to diff for this schema.
                continue
            base_d, head_d = loaded
            pairs.append((name, base_d, head_d))
    except ValueError as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        return 2

    try:
        changed = _git_files_changed(args.repo, args.base, args.head)
    except subprocess.CalledProcessError as exc:
        print(f"ERROR: git diff failed: {exc}", file=sys.stderr)
        return 2

    changelog_modified = CHANGELOG_REL in changed

    result = check_schema_compat_impl(
        pairs=pairs, changelog_modified=changelog_modified
    )

    # Markdown summary on stdout (CI-friendly).
    print("### Schema-version policing")
    print()
    print(f"* base: `{args.base}`")
    print(f"* head: `{args.head}`")
    print(f"* schemas inspected: {[name for name, _, _ in pairs]}")
    print(f"* schema-changelog.md modified: {changelog_modified}")
    print(f"* bumped: {result['bumped_files']}")
    print(f"* non-additive: {result['non_additive_files']}")
    if result["details"]:
        print()
        print("Details:")
        for d in result["details"]:
            print(f"  * {d}")
    print()

    if result["status"] != "OK":
        print(f"check-schema-compat: FAIL ({result['status']})", file=sys.stderr)
        return 1
    print("check-schema-compat: clean")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
REPO_ROOT = Path(__file__).resolve().parents[2]
BUILD = REPO_ROOT / "profile" / "build"

# The script's filename contains hyphens, so it is loaded by path rather
# than with a regular ``import`` statement.
_path = BUILD / "check-schema-compat.py"
_spec = importlib.util.spec_from_file_location("_check_schema_compat", _path)
# Guard before first use: module_from_spec() below dereferences the spec.
assert _spec and _spec.loader
_check_schema_compat = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(_check_schema_compat)


# ----------------------------------------------------------- fixture helpers


def _base_schema(schema_compat: int = 1) -> dict:
    """Minimal-shape baseline; ``additionalProperties: false``, two
    required keys, one enum."""
    return {
        "$schema": "https://json-schema.org/draft/2020-12/schema",
        "type": "object",
        "additionalProperties": False,
        "required": ["id", "license"],
        "schema_compat": schema_compat,
        "schema_version": "1.0",
        "properties": {
            "id": {"type": "string"},
            "license": {"enum": ["AGPL-3.0", "MIT"]},
            "optional_field": {"type": "string"},
        },
    }


# --------------------------------------------------------- pure function


def test_no_schema_change_is_ok() -> None:
    base = _base_schema()
    head = _base_schema()
    result = _check_schema_compat.check_schema_compat_impl(
        pairs=[("tools.schema.json", base, head)],
        changelog_modified=False,
    )
    assert result["status"] == "OK"


def test_schema_compat_bump_with_changelog_modified_is_ok() -> None:
    """schema_compat 1→2 + the same PR modified schema-changelog.md → OK."""
    base = _base_schema(schema_compat=1)
    head = _base_schema(schema_compat=2)
    result = _check_schema_compat.check_schema_compat_impl(
        pairs=[("tools.schema.json", base, head)],
        changelog_modified=True,
    )
    assert result["status"] == "OK"
    assert result["bumped_files"] == ["tools.schema.json"]


def test_schema_compat_bump_without_changelog_fails() -> None:
    """schema_compat bumped but the changelog was NOT modified in this PR
    → MISSING_CHANGELOG_ROW."""
    base = _base_schema(schema_compat=1)
    head = _base_schema(schema_compat=2)
    result = _check_schema_compat.check_schema_compat_impl(
        pairs=[("tools.schema.json", base, head)],
        changelog_modified=False,
    )
    assert result["status"] == "MISSING_CHANGELOG_ROW"


def test_non_additive_required_removal_without_bump_fails() -> None:
    """A required field was removed without bumping schema_compat
    → NON_ADDITIVE_WITHOUT_BUMP, and the removed field is named in
    ``details``."""
    base = _base_schema()
    head = _base_schema()
    head["required"] = ["id"]  # "license" removed
    result = _check_schema_compat.check_schema_compat_impl(
        pairs=[("tools.schema.json", base, head)],
        changelog_modified=False,
    )
    assert result["status"] == "NON_ADDITIVE_WITHOUT_BUMP"
    assert any("license" in d for d in result["details"])


def test_non_additive_enum_value_removed_without_bump_fails() -> None:
    """Dropping an enum value without a bump → NON_ADDITIVE_WITHOUT_BUMP."""
    base = _base_schema()
    head = _base_schema()
    head["properties"]["license"]["enum"] = ["AGPL-3.0"]  # "MIT" removed
    result = _check_schema_compat.check_schema_compat_impl(
        pairs=[("tools.schema.json", base, head)],
        changelog_modified=False,
    )
    assert result["status"] == "NON_ADDITIVE_WITHOUT_BUMP"
    assert any("enum" in d.lower() and "MIT" in d for d in result["details"])


def test_non_additive_additional_properties_tightened_without_bump_fails() -> None:
    """additionalProperties: true → false is a consumer-breaking change
    (extra fields previously accepted now reject)."""
    base = _base_schema()
    base["additionalProperties"] = True
    head = _base_schema()
    head["additionalProperties"] = False
    result = _check_schema_compat.check_schema_compat_impl(
        pairs=[("tools.schema.json", base, head)],
        changelog_modified=False,
    )
    assert result["status"] == "NON_ADDITIVE_WITHOUT_BUMP"
    assert any("additionalProperties" in d for d in result["details"])


def test_additive_change_without_bump_is_ok() -> None:
    """Adding an optional field is additive — no bump needed."""
    base = _base_schema()
    head = _base_schema()
    head["properties"]["new_optional"] = {"type": "string"}
    result = _check_schema_compat.check_schema_compat_impl(
        pairs=[("tools.schema.json", base, head)],
        changelog_modified=False,
    )
    assert result["status"] == "OK"


def test_additive_enum_added_without_bump_is_ok() -> None:
    """Adding an enum value is additive."""
    base = _base_schema()
    head = _base_schema()
    head["properties"]["license"]["enum"] = ["AGPL-3.0", "MIT", "Apache-2.0"]
    result = _check_schema_compat.check_schema_compat_impl(
        pairs=[("tools.schema.json", base, head)],
        changelog_modified=False,
    )
    assert result["status"] == "OK"


def test_non_additive_with_bump_and_changelog_is_ok() -> None:
    """A breaking change is acceptable IF schema_compat bumped AND the
    changelog was modified. This is the happy path for a real-world
    schema migration."""
    base = _base_schema(schema_compat=1)
    head = _base_schema(schema_compat=2)
    head["required"] = ["id"]
    result = _check_schema_compat.check_schema_compat_impl(
        pairs=[("tools.schema.json", base, head)],
        changelog_modified=True,
    )
    assert result["status"] == "OK"


def test_two_schemas_one_bumps_one_doesnt() -> None:
    """tools.schema.json bumps + changelog modified. task_index.schema.json
    untouched. Result: OK."""
    base_tools = _base_schema(schema_compat=1)
    head_tools = _base_schema(schema_compat=2)
    base_ti = _base_schema(schema_compat=1)
    head_ti = _base_schema(schema_compat=1)
    result = _check_schema_compat.check_schema_compat_impl(
        pairs=[
            ("tools.schema.json", base_tools, head_tools),
            ("task_index.schema.json", base_ti, head_ti),
        ],
        changelog_modified=True,
    )
    assert result["status"] == "OK"


# --------------------------------------------------------- CLI smoke


def _init_git_repo(path: Path) -> None:
    """Initialise a throwaway repo with an identity and signing disabled."""
    subprocess.run(["git", "init", "-q"], cwd=path, check=True)
    subprocess.run(["git", "config", "user.email", "test@example.invalid"], cwd=path, check=True)
    subprocess.run(["git", "config", "user.name", "Test"], cwd=path, check=True)
    subprocess.run(["git", "config", "commit.gpgsign", "false"], cwd=path, check=True)


def _commit_all(path: Path, message: str) -> None:
    """Stage everything under ``path`` and commit it."""
    subprocess.run(["git", "add", "-A"], cwd=path, check=True)
    subprocess.run(["git", "commit", "-q", "-m", message], cwd=path, check=True)


def _seed_repo(path: Path) -> tuple[Path, str]:
    """Create a repo whose first commit holds the baseline schema and an
    empty changelog.

    Returns the ``profile`` directory and the SHA of that baseline commit.
    """
    _init_git_repo(path)
    profile = path / "profile"
    profile.mkdir()
    (profile / "tools.schema.json").write_text(json.dumps(_base_schema()), encoding="utf-8")
    (profile / "schema-changelog.md").write_text("# Schema changelog\n", encoding="utf-8")
    _commit_all(path, "base")
    base_ref = subprocess.run(
        ["git", "rev-parse", "HEAD"], cwd=path, capture_output=True, text=True, check=True
    ).stdout.strip()
    return profile, base_ref


def _run_gate(repo: Path, base_ref: str) -> int:
    """Invoke the CLI entry point against ``repo``, diffing base_ref..HEAD."""
    return _check_schema_compat.main(
        ["--base", base_ref, "--head", "HEAD", "--repo", str(repo)]
    )


def test_cli_no_change_returns_0(tmp_path: Path) -> None:
    _, base_ref = _seed_repo(tmp_path)

    # No second commit needed; HEAD == base → no change.
    assert _run_gate(tmp_path, base_ref) == 0


def test_cli_bump_without_changelog_returns_1(tmp_path: Path) -> None:
    profile, base_ref = _seed_repo(tmp_path)

    # Bump schema_compat WITHOUT touching the changelog.
    bumped = _base_schema()
    bumped["schema_compat"] = 2
    (profile / "tools.schema.json").write_text(json.dumps(bumped), encoding="utf-8")
    _commit_all(tmp_path, "bump")

    assert _run_gate(tmp_path, base_ref) == 1


def test_cli_bump_with_changelog_returns_0(tmp_path: Path) -> None:
    profile, base_ref = _seed_repo(tmp_path)

    bumped = _base_schema()
    bumped["schema_compat"] = 2
    (profile / "tools.schema.json").write_text(json.dumps(bumped), encoding="utf-8")
    (profile / "schema-changelog.md").write_text(
        "# Schema changelog\n\n## v2\n\nBumped to v2.\n", encoding="utf-8"
    )
    _commit_all(tmp_path, "bump + changelog")

    assert _run_gate(tmp_path, base_ref) == 0


def test_cli_against_committed_main_returns_0() -> None:
    """Smoke: run the CLI against this repository with base and head both
    set to ``HEAD``. The diff is empty, so the gate must be a no-op; this
    pins that the committed schemas load and the gate does not
    false-positive on the current state."""
    rc = _check_schema_compat.main(["--base", "HEAD", "--head", "HEAD"])
    assert rc == 0