hackcode/prd.json at dev · itwizardo/hackcode · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
{
  "version": "1.0",
  "description": "Clawable Coding Harness - Clear roadmap stories and commit each",
  "stories": [
    {
      "id": "US-001",
      "title": "Phase 1.6 - startup-no-evidence evidence bundle + classifier",
      "description": "When startup times out, emit a typed worker.startup_no_evidence event with an evidence bundle that includes the last known worker lifecycle state, pane command, prompt-send timestamp, prompt-acceptance state, trust-prompt detection result, and transport/MCP health summary. The classifier should then narrow the generic timeout down to a specific failure class (trust_required, prompt_misdelivery, prompt_acceptance_timeout, transport_dead, worker_crashed), falling back to unknown.",
      "acceptanceCriteria": [
        "worker.startup_no_evidence event emitted on startup timeout with evidence bundle",
        "Evidence bundle includes: last lifecycle state, pane command, prompt-send timestamp, prompt-acceptance state, trust-prompt detection, transport/MCP health",
        "Classifier attempts to categorize into: trust_required, prompt_misdelivery, prompt_acceptance_timeout, transport_dead, worker_crashed, or unknown",
        "Tests verify evidence bundle structure and classifier behavior"
      ],
      "passes": true,
      "priority": "P0"
    },
    {
      "id": "US-002",
      "title": "Phase 2 - Canonical lane event schema (4.x series)",
      "description": "Define typed events for lane lifecycle: lane.started, lane.ready, lane.prompt_misdelivery, lane.blocked, lane.red, lane.green, lane.commit.created, lane.pr.opened, lane.merge.ready, lane.finished, lane.failed, branch.stale_against_main. Also implement event ordering, reconciliation, provenance, deduplication, and projection contracts.",
      "acceptanceCriteria": [
        "LaneEvent enum with all required variants defined",
        "Event ordering with monotonic sequence metadata attached",
        "Event provenance labels (live_lane, test, healthcheck, replay, transport)",
        "Session identity completeness at creation (title, workspace, purpose)",
        "Duplicate terminal-event suppression with fingerprinting",
        "Lane ownership/scope binding in events",
        "Nudge acknowledgment with dedupe contract",
        "clawhip consumes typed lane events instead of pane scraping"
      ],
      "passes": true,
      "priority": "P0"
    },
    {
      "id": "US-003",
      "title": "Phase 3 - Stale-branch detection before broad verification",
      "description": "Before broad test runs, compare current branch to main and detect if known fixes are missing. Emit branch.stale_against_main event and suggest/auto-run rebase/merge-forward.",
      "acceptanceCriteria": [
        "Branch freshness comparison against main implemented",
        "branch.stale_against_main event emitted when behind",
        "Auto-rebase/merge-forward policy integration",
        "Avoid misclassifying stale-branch failures as new regressions"
      ],
      "passes": true,
      "priority": "P1"
    },
    {
      "id": "US-004",
      "title": "Phase 3 - Recovery recipes with ledger",
      "description": "Encode automatic recoveries for common failures (trust prompt, prompt misdelivery, stale branch, compile red, MCP startup). Expose recovery attempt ledger with recipe id, attempt count, state, timestamps, failure summary.",
      "acceptanceCriteria": [
        "Recovery recipes defined for: trust_prompt_unresolved, prompt_delivered_to_shell, stale_branch, compile_red_after_refactor, MCP_handshake_failure, partial_plugin_startup",
        "Recovery attempt ledger with: recipe id, attempt count, state, timestamps, failure summary, escalation reason",
        "One automatic recovery attempt before escalation",
        "Ledger emitted as structured event data"
      ],
      "passes": true,
      "priority": "P1"
    },
    {
      "id": "US-005",
      "title": "Phase 4 - Typed task packet format",
      "description": "Define structured task packet with fields: objective, scope, repo/worktree, branch policy, acceptance tests, commit policy, reporting contract, escalation policy.",
      "acceptanceCriteria": [
        "TaskPacket struct with all required fields",
        "TaskScope resolution (workspace/module/single-file/custom)",
        "Validation and serialization support",
        "Integration into tools/src/lib.rs"
      ],
      "passes": true,
      "priority": "P1"
    },
    {
      "id": "US-006",
      "title": "Phase 4 - Policy engine for autonomous coding",
      "description": "Encode automation rules: if green + scoped diff + review passed -> merge to dev; if stale branch -> merge-forward before broad tests; if startup blocked -> recover once, then escalate; if lane completed -> emit closeout and cleanup session.",
      "acceptanceCriteria": [
        "Policy rules engine implemented",
        "Rules: green + scoped diff + review -> merge",
        "Rules: stale branch -> merge-forward before tests",
        "Rules: startup blocked -> recover once, then escalate",
        "Rules: lane completed -> closeout and cleanup"
      ],
      "passes": true,
      "priority": "P2"
    },
    {
      "id": "US-007",
      "title": "Phase 5 - Plugin/MCP lifecycle maturity",
      "description": "First-class plugin/MCP lifecycle contract: config validation, startup healthcheck, discovery result, degraded-mode behavior, shutdown/cleanup. Close gaps in end-to-end lifecycle.",
      "acceptanceCriteria": [
        "Plugin/MCP config validation contract",
        "Startup healthcheck with structured results",
        "Discovery result reporting",
        "Degraded-mode behavior documented and implemented",
        "Shutdown/cleanup contract",
        "Partial startup and per-server failures reported structurally"
      ],
      "passes": true,
      "priority": "P2"
    },
    {
      "id": "US-008",
      "title": "Fix kimi-k2.5 model API compatibility",
      "description": "The kimi-k2.5 model (and other kimi models) rejects API requests that contain the is_error field in tool result messages. The OpenAI-compatible provider currently includes is_error for every model, so the field must be made conditional on whether the target model supports it.",
      "acceptanceCriteria": [
        "translate_message function accepts model parameter",
        "is_error field excluded for kimi models (kimi-k2.5, kimi-k1.5, etc.)",
        "is_error field included for models that support it (openai, grok, xai, etc.)",
        "build_chat_completion_request passes model to translate_message",
        "Tests verify is_error presence/absence based on model",
        "cargo test passes",
        "cargo clippy passes",
        "cargo fmt passes"
      ],
      "passes": true,
      "priority": "P0"
    },
    {
      "id": "US-009",
      "title": "Add unit tests for kimi model compatibility fix",
      "description": "During dogfooding we discovered that the existing test coverage for model-specific is_error handling is insufficient. Add dedicated tests for the model_rejects_is_error_field function and for translate_message behavior across different models.",
      "acceptanceCriteria": [
        "Test model_rejects_is_error_field identifies kimi-k2.5, kimi-k1.5, dashscope/kimi-k2.5",
        "Test translate_message includes is_error for gpt-4, grok-3, claude models",
        "Test translate_message excludes is_error for kimi models",
        "Test build_chat_completion_request produces correct payload for kimi vs non-kimi",
        "All new tests pass",
        "cargo test --package api passes"
      ],
      "passes": true,
      "priority": "P1"
    },
    {
      "id": "US-010",
      "title": "Add model compatibility documentation",
      "description": "Document which models require special handling (is_error exclusion, reasoning model tuning param stripping, etc.) in a MODEL_COMPATIBILITY.md file for operators and contributors.",
      "acceptanceCriteria": [
        "MODEL_COMPATIBILITY.md created in docs/ or repo root",
        "Document kimi models is_error exclusion",
        "Document reasoning models (o1, o3, grok-3-mini) tuning param stripping",
        "Document gpt-5 max_completion_tokens requirement",
        "Document qwen model routing through dashscope",
        "Cross-reference with existing code comments"
      ],
      "passes": true,
      "priority": "P2"
    },
    {
      "id": "US-011",
      "title": "Performance optimization: reduce API request serialization overhead",
      "description": "The translate_message function creates intermediate JSON Value objects that could be optimized. Profile and optimize the hot path for API request building, especially for conversations with many tool results.",
      "acceptanceCriteria": [
        "Profile current request building with criterion or similar",
        "Identify bottlenecks in translate_message and build_chat_completion_request",
        "Implement optimizations (Vec pre-allocation, reduced cloning, etc.)",
        "Benchmark before/after showing improvement",
        "No functional changes or API breakage"
      ],
      "passes": true,
      "priority": "P2"
    },
    {
      "id": "US-012",
      "title": "Trust prompt resolver with allowlist auto-trust",
      "description": "Add allowlisted auto-trust behavior for known repos/worktrees. Trust prompts currently block TUI startup and require manual intervention. Implement automatic trust resolution for pre-approved repositories.",
      "acceptanceCriteria": [
        "TrustAllowlist config structure with repo patterns",
        "Auto-trust behavior for allowlisted repos/worktrees",
        "trust_required event emitted when trust prompt detected",
        "trust_resolved event emitted when trust is granted",
        "Non-allowlisted repos remain gated (manual trust required)",
        "Integration with worker boot lifecycle",
        "Tests for allowlist matching and event emission"
      ],
      "passes": true,
      "priority": "P1"
    },
    {
      "id": "US-013",
      "title": "Phase 2 - Session event ordering + terminal-state reconciliation",
      "description": "When the same session emits contradictory lifecycle events (idle, error, completed, transport/server-down) in close succession, expose a single deterministic final state. Attach monotonic sequence/causal ordering metadata, classify terminal vs advisory events, and reconcile duplicate/out-of-order terminal events into one canonical lane outcome.",
      "acceptanceCriteria": [
        "Monotonic sequence / causal ordering metadata attached to session lifecycle events",
        "Terminal vs advisory event classification implemented",
        "Reconcile duplicate or out-of-order terminal events into one canonical outcome",
        "Distinguish 'session terminal state unknown because transport died' from real 'completed'",
        "Tests verify reconciliation behavior with out-of-order event bursts"
      ],
      "passes": true,
      "priority": "P1"
    },
    {
      "id": "US-014",
      "title": "Phase 2 - Event provenance / environment labeling",
      "description": "Every emitted event should declare its source (live_lane, test, healthcheck, replay, transport) so claws do not mistake test noise for production truth. Include environment/channel label, emitter identity, and confidence/trust level.",
      "acceptanceCriteria": [
        "EventProvenance enum with live_lane, test, healthcheck, replay, transport variants",
        "Environment/channel label attached to all events",
        "Emitter identity field on events",
        "Confidence/trust level field for downstream automation",
        "Tests verify provenance labeling and filtering"
      ],
      "passes": true,
      "priority": "P1"
    },
    {
      "id": "US-015",
      "title": "Phase 2 - Session identity completeness at creation time",
      "description": "A newly created session should emit a stable title, workspace/worktree path, and lane/session purpose at creation time. If any field is not yet known, emit an explicit typed placeholder reason rather than a bare 'unknown' string.",
      "acceptanceCriteria": [
        "Session creation emits stable title, workspace/worktree path, purpose immediately",
        "Explicit typed placeholder when fields unknown (not bare 'unknown' strings)",
        "Later-enriched metadata reconciles onto same session identity without ambiguity",
        "Tests verify session identity completeness and placeholder handling"
      ],
      "passes": true,
      "priority": "P1"
    },
    {
      "id": "US-016",
      "title": "Phase 2 - Duplicate terminal-event suppression",
      "description": "When the same session emits repeated completed/failed/terminal notifications, collapse duplicates before they trigger repeated downstream reactions. Attach canonical terminal-event fingerprint per lane/session outcome.",
      "acceptanceCriteria": [
        "Canonical terminal-event fingerprint attached per lane/session outcome",
        "Suppress/coalesce repeated terminal notifications within reconciliation window",
        "Preserve raw event history for audit while exposing one actionable outcome downstream",
        "Surface when later duplicate materially differs from original terminal payload",
        "Tests verify deduplication and material difference detection"
      ],
      "passes": true,
      "priority": "P2"
    },
    {
      "id": "US-017",
      "title": "Phase 2 - Lane ownership / scope binding",
      "description": "Each session and lane event should declare who owns it and what workflow scope it belongs to. Attach owner/assignee identity, workflow scope (claw-code-dogfood, external-git-maintenance, infra-health, manual-operator), and mark whether watcher is expected to act, observe only, or ignore.",
      "acceptanceCriteria": [
        "Owner/assignee identity attached to sessions and lane events",
        "Workflow scope field (claw-code-dogfood, external-git-maintenance, etc.)",
        "Watcher action expectation field (act, observe-only, ignore)",
        "Preserve scope through session restarts, resumes, and late terminal events",
        "Tests verify ownership and scope binding"
      ],
      "passes": true,
      "priority": "P2"
    },
    {
      "id": "US-018",
      "title": "Phase 2 - Nudge acknowledgment / dedupe contract",
      "description": "Periodic clawhip nudges should carry nudge id/cycle id and delivery timestamp. Expose whether claw has already acknowledged or responded for that cycle. Distinguish new nudge, retry nudge, and stale duplicate.",
      "acceptanceCriteria": [
        "Nudge id / cycle id and delivery timestamp attached",
        "Acknowledgment state exposed (already acknowledged or not)",
        "Distinguish new nudge vs retry nudge vs stale duplicate",
        "Allow downstream summaries to bind reported pinpoint back to triggering nudge id",
        "Tests verify nudge deduplication and acknowledgment tracking"
      ],
      "passes": true,
      "priority": "P2"
    },
    {
      "id": "US-019",
      "title": "Phase 2 - Stable roadmap-id assignment for newly filed pinpoints",
      "description": "When a claw records a new pinpoint/follow-up, assign or expose a stable tracking id immediately. Expose that id in structured event/report payload and preserve across edits, reorderings, and summary compression.",
      "acceptanceCriteria": [
        "Canonical roadmap id assigned at filing time",
        "Roadmap id exposed in structured event/report payload",
        "Same id preserved across edits, reorderings, summary compression",
        "Distinguish 'new roadmap filing' from 'update to existing roadmap item'",
        "Tests verify stable id assignment and update detection"
      ],
      "passes": true,
      "priority": "P2"
    },
    {
      "id": "US-020",
      "title": "Phase 2 - Roadmap item lifecycle state contract",
      "description": "Each roadmap pinpoint should carry machine-readable lifecycle state (filed, acknowledged, in_progress, blocked, done, superseded). Attach last state-change timestamp and preserve lineage when one pinpoint supersedes or merges into another.",
      "acceptanceCriteria": [
        "Lifecycle state enum with filed, acknowledged, in_progress, blocked, done, superseded",
        "Last state-change timestamp attached",
        "New report can declare first filing, status update, or closure",
        "Preserve lineage when one pinpoint supersedes or merges into another",
        "Tests verify lifecycle state transitions"
      ],
      "passes": true,
      "priority": "P2"
    },
    {
      "id": "US-021",
      "title": "Request body size pre-flight check for OpenAI-compatible provider",
      "description": "Implement pre-flight request body size estimation to prevent 400 Bad Request errors from API gateways that enforce size limits. Dogfood testing with kimi-k2.5 found that the DashScope API has a 6MB request body limit, which large system prompts exceeded.",
      "acceptanceCriteria": [
        "Pre-flight size estimation before sending requests to OpenAI-compatible providers",
        "Clear error message when request exceeds provider-specific size limit",
        "Configuration for different provider limits (6MB DashScope, 100MB OpenAI, etc.)",
        "Unit tests for size estimation and limit checking",
        "Integration with existing error handling for actionable user messages"
      ],
      "passes": true,
      "priority": "P1"
    },
    {
      "id": "US-022",
      "title": "Enhanced error context for API failures",
      "description": "Add structured error context to API failures including request ID tracking across retries, provider-specific error code mapping, and suggested user actions based on error type (e.g., 'Reduce prompt size' for 413, 'Check API key' for 401).",
      "acceptanceCriteria": [
        "Request ID tracking across retries with full context in error messages",
        "Provider-specific error code mapping with actionable suggestions",
        "Suggested user actions for common error types (401, 403, 413, 429, 500, 502-504)",
        "Unit tests for error context extraction",
        "All existing tests pass and clippy is clean"
      ],
      "passes": true,
      "priority": "P1"
    },
    {
      "id": "US-023",
      "title": "Add automatic routing for kimi models to DashScope",
      "description": "Dogfood testing with kimi-k2.5 showed that users must manually type the prefixed name dashscope/kimi-k2.5 instead of just kimi-k2.5. Add automatic routing of kimi/ and kimi- prefixed models to DashScope (similar to qwen models), and add a 'kimi' alias to the model registry.",
      "acceptanceCriteria": [
        "kimi/ and kimi- prefix routing to DashScope in metadata_for_model()",
        "'kimi' alias in MODEL_REGISTRY that resolves to 'kimi-k2.5'",
        "resolve_model_alias() handles the kimi alias correctly",
        "Unit tests for kimi routing (similar to qwen routing tests)",
        "All tests pass and clippy is clean"
      ],
      "passes": true,
      "priority": "P1"
    },
    {
      "id": "US-024",
      "title": "Add token limit metadata for kimi models",
      "description": "The model_token_limit() function has no entries for kimi-k2.5 or kimi-k1.5, causing preflight context window validation to skip these models. Add token limit metadata to enable preflight checks and accurate max token defaults. Per Moonshot AI documentation, kimi-k2.5 supports 256K context window and 16K max output tokens.",
      "acceptanceCriteria": [
        "model_token_limit('kimi-k2.5') returns Some(ModelTokenLimit { max_output_tokens: 16384, context_window_tokens: 256000 })",
        "model_token_limit('kimi-k1.5') returns appropriate limits",
        "model_token_limit('kimi') follows alias chain (kimi → kimi-k2.5) and returns k2.5 limits",
        "preflight_message_request() validates context window for kimi models (via generic preflight, no provider-specific code needed)",
        "Unit tests verify limits and preflight behavior for kimi models",
        "All tests pass and clippy is clean"
      ],
      "passes": true,
      "priority": "P1"
    }
  ],
  "metadata": {
    "lastUpdated": "2026-04-17",
    "completedStories": ["US-001", "US-002", "US-003", "US-004", "US-005", "US-006", "US-007", "US-008", "US-009", "US-010", "US-011", "US-012", "US-013", "US-014", "US-015", "US-016", "US-017", "US-018", "US-019", "US-020", "US-021", "US-022", "US-023", "US-024"],
    "inProgressStories": [],
    "totalStories": 24,
    "status": "completed"
  }
}