From 265250fc5ce59a804a9840fc705601330384027b Mon Sep 17 00:00:00 2001 From: "g. nicholas d'andrea" Date: Thu, 16 Apr 2026 03:51:33 -0400 Subject: [PATCH 1/3] format: add transform context for compiler optimizations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new context type annotating instructions with the compiler transformations that produced them. The value is an array of short identifiers; the list may repeat the same identifier when the transformation has been applied multiple times (e.g., ["inline", "inline"] for doubly-inlined code). Transform is *additional* annotation. The invoke/return contexts for the logical call are still emitted at the call boundary so debuggers see the source-level call stack; the transform context tells debuggers how the call was physically realized. Consumers that ignore transform contexts get a sound source-level view from the semantic contexts alone. v1 identifiers: - "inline": marked instruction is part of an inlined function body; surrounding invoke/return contexts name the inlined callee. - "tailcall": marked instruction is a tail-call-optimized back-edge JUMP or continuation, where the call was realized without pushing/popping a full activation. The identifier set is extensible. Debuggers unfamiliar with a given identifier should preserve it as an opaque label. Order in the array is not semantically significant — the multiset is what matters. Unblocks the final shape of TCO back-edge annotations in bugc (#210): a tail-call-optimized JUMP can now carry `gather: [return, invoke, transform: ["tailcall"]]`. Includes: - schemas/program/context/transform.schema.yaml - schemas/program/context.schema.yaml: wire into the if/$ref union. - packages/format/src/types/program/context.ts: Context.Transform interface, isTransform guard, and Transform.Identifier union preserving autocomplete for known values. 
- packages/format/src/types/program/context.test.ts: register Context.isTransform with the schema guard test harness. - packages/web/spec/program/context/transform.mdx: spec page covering role, v1 identifiers, repetition/composition, and interaction with gather. --- .../format/src/types/program/context.test.ts | 4 + packages/format/src/types/program/context.ts | 25 ++++- .../web/spec/program/context/transform.mdx | 91 +++++++++++++++++++ schemas/program/context.schema.yaml | 8 ++ schemas/program/context/transform.schema.yaml | 62 +++++++++++++ 5 files changed, 189 insertions(+), 1 deletion(-) create mode 100644 packages/web/spec/program/context/transform.mdx create mode 100644 schemas/program/context/transform.schema.yaml diff --git a/packages/format/src/types/program/context.test.ts b/packages/format/src/types/program/context.test.ts index 4470a322d..a36e1e4c7 100644 --- a/packages/format/src/types/program/context.test.ts +++ b/packages/format/src/types/program/context.test.ts @@ -46,4 +46,8 @@ testSchemaGuards("ethdebug/format/program/context", [ schema: "schema:ethdebug/format/program/context/function/revert", guard: Context.isRevert, }, + { + schema: "schema:ethdebug/format/program/context/transform", + guard: Context.isTransform, + }, ] as const); diff --git a/packages/format/src/types/program/context.ts b/packages/format/src/types/program/context.ts index 104f27196..24c92a915 100644 --- a/packages/format/src/types/program/context.ts +++ b/packages/format/src/types/program/context.ts @@ -11,7 +11,8 @@ export type Context = | Context.Frame | Context.Invoke | Context.Return - | Context.Revert; + | Context.Revert + | Context.Transform; export const isContext = (value: unknown): value is Context => [ @@ -24,6 +25,7 @@ export const isContext = (value: unknown): value is Context => Context.isInvoke, Context.isReturn, Context.isRevert, + Context.isTransform, ].some((guard) => guard(value)); export namespace Context { @@ -274,4 +276,25 @@ export namespace Context { 
(!("reason" in value) || Function.isPointerRef(value.reason)) && (!("panic" in value) || typeof value.panic === "number"); } + + export interface Transform { + transform: Transform.Identifier[]; + } + + export const isTransform = (value: unknown): value is Transform => + typeof value === "object" && + !!value && + "transform" in value && + Array.isArray(value.transform) && + value.transform.length > 0 && + value.transform.every( + (item) => typeof item === "string" && item.length > 0, + ); + + export namespace Transform { + // Recognized v1 identifiers. Unknown strings are permitted + // (the identifier set is extensible); the union preserves + // autocomplete for known values. + export type Identifier = "inline" | "tailcall" | (string & {}); + } } diff --git a/packages/web/spec/program/context/transform.mdx b/packages/web/spec/program/context/transform.mdx new file mode 100644 index 000000000..81d88677b --- /dev/null +++ b/packages/web/spec/program/context/transform.mdx @@ -0,0 +1,91 @@ +--- +sidebar_position: 8 +--- + +import SchemaViewer from "@site/src/components/SchemaViewer"; + +# Transform contexts + +A transform context annotates an instruction with the compiler +transformations that produced it. The value is a list of short +identifiers; the list may repeat the same identifier when the +transformation has been applied multiple times—for example, +doubly-inlined code carries `transform: ["inline", "inline"]`. + + + +## Role: additional annotation + +A transform context does not replace semantic contexts. When the +compiler inlines a function, the caller's debug info should still +carry invoke/return contexts naming the inlined callee at the +call boundary—so the debugger's logical call stack reflects the +source-level structure. The transform context is _additional_ +information telling the debugger **how** the call was realized. 
+ +Consumers are free to ignore transform contexts entirely; the +invoke/return contexts alone always give a sound source-level +view. Consumers that understand transform contexts can offer +optimization-aware presentations: + +- Render inlined code as a collapsible block tied to the + original callee's source location. +- Show which call sites were tail-call-optimized vs. realized as + full call/return sequences. +- Explain apparent anomalies in the trace (e.g., a JUMP that + carries an invoke context is a TCO back-edge). + +## v1 identifiers + +Two identifiers are recognized in v1: + +- **`"inline"`** — the marked instruction is part of an inlined + function body. Surrounding invoke/return contexts name the + inlined callee; this marker tells the debugger the physical + code does not correspond to a separate activation record. +- **`"tailcall"`** — the marked instruction is a + tail-call-optimized back-edge JUMP or continuation, where the + call was realized without pushing/popping a full activation. + A JUMP carrying a `tailcall` transform often co-occurs with a + `gather` context combining a return (from the previous + iteration) and an invoke (of the new iteration). + +The identifier set is extensible. Compilers may emit additional +identifiers for optimizations not yet standardized; debuggers +should preserve unfamiliar identifiers as opaque labels rather +than rejecting them. + +## Repetition and composition + +Identifiers may repeat. A function inlined into another inlined +function produces `transform: ["inline", "inline"]`. Different +transformations compose: +`transform: ["inline", "tailcall"]` marks an instruction inside +an inlined body that was itself a TCO back-edge in the callee. + +Order in the array is not semantically significant—only the +multiset of identifiers matters. + +## Composing with other contexts + +To carry a transform context alongside other contexts at the +same instruction, use `gather`. 
A TCO back-edge JUMP, for +example, typically combines three facts: + +```yaml +gather: + - return: + identifier: "fact" + declaration: { ... } + - invoke: + jump: true + identifier: "fact" + target: { pointer: { location: code, offset: ... } } + - transform: ["tailcall"] +``` + +The return and invoke state the source-level facts (iteration N +returned, iteration N+1 was invoked); the transform explains +how the compiler realized that pair as a single JUMP. diff --git a/schemas/program/context.schema.yaml b/schemas/program/context.schema.yaml index a57fce654..1a82e76df 100644 --- a/schemas/program/context.schema.yaml +++ b/schemas/program/context.schema.yaml @@ -89,6 +89,14 @@ allOf: description: | Indicates association with a function revert. $ref: "schema:ethdebug/format/program/context/function/revert" + - if: + required: ["transform"] + then: + description: | + Compiler transformations applied to produce this instruction + (e.g., inlining, tail-call optimization). Additional + annotation — does not replace semantic contexts. + $ref: "schema:ethdebug/format/program/context/transform" unevaluatedProperties: false diff --git a/schemas/program/context/transform.schema.yaml b/schemas/program/context/transform.schema.yaml new file mode 100644 index 000000000..ea9ab4aaf --- /dev/null +++ b/schemas/program/context/transform.schema.yaml @@ -0,0 +1,62 @@ +$schema: "https://json-schema.org/draft/2020-12/schema" +$id: "schema:ethdebug/format/program/context/transform" + +title: ethdebug/format/program/context/transform +description: | + Annotates an instruction with compiler transformations that + produced it. The value is a list of short identifiers naming + each transformation; the list may repeat an identifier when + the same transformation has been applied more than once (e.g., + `["inline", "inline"]` for doubly-inlined code). + + A transform context is *additional* annotation — it does not + replace semantic contexts. 
When the compiler inlines a + function, the invoke/return contexts for the logical call + should still be emitted at the call boundary so the debugger's + source-level call stack remains coherent. The transform + context tells debuggers **how** the call was realized. + + Consumers that ignore transform contexts still get a sound + source-level view from the invoke/return contexts alone. + Consumers that understand transform contexts can offer + optimization-aware presentations — e.g., rendering inlined + code as a collapsible block, or reconciling tail-call-optimized + back-edges with the logical call stack. + + The identifier set is extensible. v1 defines: + + - `"inline"` — the marked instruction is part of an inlined + function body. Surrounding invoke/return contexts name the + inlined callee. + - `"tailcall"` — the marked instruction is a + tail-call-optimized back-edge JUMP or continuation, where + the call was realized as a direct jump (or reuse of the + caller's frame) rather than a standard call/return sequence. + + Debuggers unfamiliar with a given identifier should preserve + it as an opaque label. + + Order in the array is not semantically significant — only the + multiset of identifiers matters. + +type: object +properties: + transform: + title: Applied transformations + description: | + List of transformation identifiers. Identifiers may + repeat; order is not semantically significant. + type: array + items: + type: string + minLength: 1 + minItems: 1 + +required: + - transform + +examples: + - transform: ["inline"] + - transform: ["tailcall"] + - transform: ["inline", "inline"] + - transform: ["inline", "tailcall"] From 9562543291e3df2aa20705be1343165a777116d5 Mon Sep 17 00:00:00 2001 From: "g. 
nicholas d'andrea" Date: Thu, 16 Apr 2026 03:54:38 -0400 Subject: [PATCH 2/3] format: expand transform v1 vocabulary with fold and coalesce MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds two more identifiers to the v1 transform context vocabulary, based on bugc optimizer's audit of transformations the compiler currently performs or will perform: - "fold" — compile-time constant folding. The marked instruction carries the result (typically a PUSH) replacing a compute sequence that appeared in source. - "coalesce" — read-write merging. The marked instruction is part of a SHL/OR sequence (or similar) introduced by the compiler to combine adjacent source-level reads or writes, such as packing narrower fields into a single storage slot. Together with the previously-defined "inline" and "tailcall", this covers the four transformations bugc emits today or will emit in the near term (inline once a function inlining pass lands). Propagate was considered for v1 and deferred as borderline. Updates: - transform.schema.yaml: description enumerates the four v1 identifiers; examples include single-identifier cases for each plus combinations ["inline", "fold"], ["coalesce", "coalesce"]. - context.ts: Transform.Identifier union extended with "fold" and "coalesce" (still keeps `string & {}` for extensibility and autocomplete). - transform.mdx: subsection for each identifier with a concrete EVM-level example, updated repetition/composition section with new combinations. 
--- packages/format/src/types/program/context.ts | 7 ++++- .../web/spec/program/context/transform.mdx | 28 ++++++++++++++++--- schemas/program/context/transform.schema.yaml | 12 ++++++++ 3 files changed, 42 insertions(+), 5 deletions(-) diff --git a/packages/format/src/types/program/context.ts b/packages/format/src/types/program/context.ts index 24c92a915..d9c96c23f 100644 --- a/packages/format/src/types/program/context.ts +++ b/packages/format/src/types/program/context.ts @@ -295,6 +295,11 @@ export namespace Context { // Recognized v1 identifiers. Unknown strings are permitted // (the identifier set is extensible); the union preserves // autocomplete for known values. - export type Identifier = "inline" | "tailcall" | (string & {}); + export type Identifier = + | "inline" + | "tailcall" + | "fold" + | "coalesce" + | (string & {}); } } diff --git a/packages/web/spec/program/context/transform.mdx b/packages/web/spec/program/context/transform.mdx index 81d88677b..b52341d21 100644 --- a/packages/web/spec/program/context/transform.mdx +++ b/packages/web/spec/program/context/transform.mdx @@ -39,7 +39,7 @@ optimization-aware presentations: ## v1 identifiers -Two identifiers are recognized in v1: +Four identifiers are recognized in v1: - **`"inline"`** — the marked instruction is part of an inlined function body. Surrounding invoke/return contexts name the @@ -51,6 +51,21 @@ Two identifiers are recognized in v1: A JUMP carrying a `tailcall` transform often co-occurs with a `gather` context combining a return (from the previous iteration) and an invoke (of the new iteration). +- **`"fold"`** — the marked instruction carries the result of + a compile-time constant fold. Typically a PUSH of the folded + value replacing a compute sequence (e.g., `ADD` over two + known constants) that appeared in source. The instruction's + surrounding `code` context, if present, points to the + original expression. 
+- **`"coalesce"`** — the marked instruction is part of a + read-write merging sequence the compiler introduced to + combine adjacent source-level reads or writes. Common + examples include SHL/OR sequences that pack narrower fields + into a single storage slot, or wider loads split into + narrower field extractions. The user did not write these + instructions directly; the `coalesce` marker lets a debugger + present the sequence as one source-level operation rather + than stepping through each byte-shuffling opcode. The identifier set is extensible. Compilers may emit additional identifiers for optimizations not yet standardized; debuggers @@ -60,10 +75,15 @@ than rejecting them. ## Repetition and composition Identifiers may repeat. A function inlined into another inlined -function produces `transform: ["inline", "inline"]`. Different -transformations compose: +function produces `transform: ["inline", "inline"]`. A coalesce +sequence nested inside another coalesced region produces +`transform: ["coalesce", "coalesce"]`. + +Different transformations compose: `transform: ["inline", "tailcall"]` marks an instruction inside -an inlined body that was itself a TCO back-edge in the callee. +an inlined body that was itself a TCO back-edge in the callee; +`transform: ["inline", "fold"]` marks a constant-folded PUSH +sitting inside an inlined body. Order in the array is not semantically significant—only the multiset of identifiers matters. diff --git a/schemas/program/context/transform.schema.yaml b/schemas/program/context/transform.schema.yaml index ea9ab4aaf..313843da4 100644 --- a/schemas/program/context/transform.schema.yaml +++ b/schemas/program/context/transform.schema.yaml @@ -32,6 +32,14 @@ description: | tail-call-optimized back-edge JUMP or continuation, where the call was realized as a direct jump (or reuse of the caller's frame) rather than a standard call/return sequence. 
+ - `"fold"` — the marked instruction carries the result of a + compile-time constant fold. Typically a PUSH of the folded + value, replacing a compute sequence that appeared in source. + - `"coalesce"` — the marked instruction is part of a + read-write merging sequence (e.g., SHL/OR sequences packing + narrower fields into a wider word) that the user did not + explicitly write; the compiler introduced it to combine + adjacent source-level reads or writes. Debuggers unfamiliar with a given identifier should preserve it as an opaque label. @@ -58,5 +66,9 @@ required: examples: - transform: ["inline"] - transform: ["tailcall"] + - transform: ["fold"] + - transform: ["coalesce"] - transform: ["inline", "inline"] - transform: ["inline", "tailcall"] + - transform: ["inline", "fold"] + - transform: ["coalesce", "coalesce"] From 7ac333d72ec0bff0ff8d7cb6ee0d24ff54a004d7 Mon Sep 17 00:00:00 2001 From: "g. nicholas d'andrea" Date: Thu, 16 Apr 2026 05:05:03 -0400 Subject: [PATCH 3/3] format: prefer flat context composition, document gather scope The context schema's discriminator keys combine via allOf of if/then rules, so a single context object can carry multiple keys at once (e.g., `invoke`, `return`, and `transform` all side by side). Use gather only when two contexts would collide on the same key. 
- transform spec: switch the TCO back-edge example from gather to the flat form; revise the tailcall bullet accordingly - transform schema: note in the description that flat composition is preferred; gather is for key collisions - gather spec: add a "When to use" section flagging the flat form as the default and listing the canonical collision cases (multiple frames, multiple variables blocks) --- packages/web/spec/program/context/gather.mdx | 28 ++++++++++++ .../web/spec/program/context/transform.mdx | 44 +++++++++++-------- schemas/program/context/transform.schema.yaml | 5 +++ 3 files changed, 59 insertions(+), 18 deletions(-) diff --git a/packages/web/spec/program/context/gather.mdx b/packages/web/spec/program/context/gather.mdx index eb9cc3613..0501ed70d 100644 --- a/packages/web/spec/program/context/gather.mdx +++ b/packages/web/spec/program/context/gather.mdx @@ -6,6 +6,34 @@ import SchemaViewer from "@site/src/components/SchemaViewer"; # Gather multiple contexts +A `gather` context asserts that every one of its child contexts +holds at the marked instruction. It is the tool for composing +multiple context facts that cannot coexist as sibling keys on a +single object. + + +## When to use `gather` + +The context schema is compositional: a single context object may carry +any number of discriminator keys together — `code`, `variables`, +`invoke`, `return`, `transform`, and so on all compose as +siblings on the same object. Prefer the flat form when it +works. + +Reach for `gather` only when two or more facts would collide on +the same key. The canonical cases are: + +- **Multiple `frame`s** — an instruction that maps + simultaneously to an IR step and a source step needs one + entry per frame, each with its own `code` range. +- **Multiple `variables` blocks** — when separate pipeline + passes each contribute variable information (e.g., one + names the variable, the other supplies its pointer), each + set lives in its own context. 
+ +If every child context uses a different discriminator key, a +`gather` can be collapsed into a single flat object with the +same meaning — and that flat form is the preferred style. diff --git a/packages/web/spec/program/context/transform.mdx b/packages/web/spec/program/context/transform.mdx index b52341d21..427e42eb6 100644 --- a/packages/web/spec/program/context/transform.mdx +++ b/packages/web/spec/program/context/transform.mdx @@ -48,9 +48,9 @@ Four identifiers are recognized in v1: - **`"tailcall"`** — the marked instruction is a tail-call-optimized back-edge JUMP or continuation, where the call was realized without pushing/popping a full activation. - A JUMP carrying a `tailcall` transform often co-occurs with a - `gather` context combining a return (from the previous - iteration) and an invoke (of the new iteration). + A JUMP carrying a `tailcall` transform typically sits on a + context that also carries both a `return` (from the previous + iteration) and an `invoke` (of the new iteration). - **`"fold"`** — the marked instruction carries the result of a compile-time constant fold. Typically a PUSH of the folded value replacing a compute sequence (e.g., `ADD` over two @@ -90,22 +90,30 @@ multiset of identifiers matters. ## Composing with other contexts -To carry a transform context alongside other contexts at the -same instruction, use `gather`. A TCO back-edge JUMP, for -example, typically combines three facts: +A context object can carry several discriminator keys at once — +`code`, `variables`, `invoke`, `return`, `transform`, and so on +all live in the same object. A TCO back-edge JUMP, for example, +typically combines three facts as sibling keys on a single +context: ```yaml -gather: - - return: - identifier: "fact" - declaration: { ... } - - invoke: - jump: true - identifier: "fact" - target: { pointer: { location: code, offset: ... } } - - transform: ["tailcall"] +return: + identifier: "fact" + declaration: { ... 
} +invoke: + jump: true + identifier: "fact" + target: { pointer: { location: code, offset: ... } } +transform: ["tailcall"] ``` -The return and invoke state the source-level facts (iteration N -returned, iteration N+1 was invoked); the transform explains -how the compiler realized that pair as a single JUMP. +The `return` and `invoke` state the source-level facts +(iteration N returned, iteration N+1 was invoked); the +`transform` explains how the compiler realized that pair as a +single JUMP. + +Reach for [`gather`](/spec/program/context/gather) only when +two contexts would collide on the same key — e.g., two +independent `variables` blocks or two `frame`s from different +pipeline stages. When keys don't collide, the flat form is +preferred. diff --git a/schemas/program/context/transform.schema.yaml b/schemas/program/context/transform.schema.yaml index 313843da4..8951a00ca 100644 --- a/schemas/program/context/transform.schema.yaml +++ b/schemas/program/context/transform.schema.yaml @@ -16,6 +16,11 @@ description: | source-level call stack remains coherent. The transform context tells debuggers **how** the call was realized. + Combine a transform with other discriminator keys (`invoke`, + `return`, `code`, etc.) by placing them side-by-side on the + same context object — `gather` is only needed when two + contexts would collide on the same key. + Consumers that ignore transform contexts still get a sound source-level view from the invoke/return contexts alone. Consumers that understand transform contexts can offer