From 0d9278a51997fbfcc0d4cb0e3e6fe764431bc45c Mon Sep 17 00:00:00 2001 From: Joshua Temple Date: Thu, 4 Jun 2026 23:56:20 -0400 Subject: [PATCH] docs: add pages for telemetry and the runtime modules Signed-off-by: Joshua Temple --- README.md | 11 +- docs/astro.config.mjs | 18 +++ docs/src/content/docs/cluster/overview.md | 103 +++++++++++++ docs/src/content/docs/durable/overview.md | 123 ++++++++++++++++ .../content/docs/start/ingest-drive-emit.md | 136 ++++++++++++++++++ docs/src/content/docs/telemetry/adapters.md | 128 +++++++++++++++++ docs/src/content/docs/telemetry/overview.md | 109 ++++++++++++++ docs/src/content/docs/transport/overview.md | 87 +++++++++++ docs/src/content/docs/wasm/overview.md | 77 ++++++++++ 9 files changed, 790 insertions(+), 2 deletions(-) create mode 100644 docs/src/content/docs/cluster/overview.md create mode 100644 docs/src/content/docs/durable/overview.md create mode 100644 docs/src/content/docs/start/ingest-drive-emit.md create mode 100644 docs/src/content/docs/telemetry/adapters.md create mode 100644 docs/src/content/docs/telemetry/overview.md create mode 100644 docs/src/content/docs/transport/overview.md create mode 100644 docs/src/content/docs/wasm/overview.md diff --git a/README.md b/README.md index dc13ff4..a95e18a 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,10 @@ stability label. | `source/cloudevents` | CloudEvents codec with structured and binary content modes. | experimental | | `source/cdc` | Change-data-capture codec: decode Debezium/OpenCDC change events, drive by key. | experimental | | `source/statemachine` | Bridge: an inbound message drives a transition, ack tied to the durable commit. | experimental | +| `durable` | Durable-execution runtime: record and replay nondeterminism to survive a crash. | experimental | +| `cluster` | Distribution runtime: remote actors, supervision, and live instance migration. | experimental | +| `transport` | gRPC network transport for cluster: remote deliver/spawn and time-travel. | experimental | +| `wasm` | Run state behaviors as WebAssembly: polyglot guards over a JSON ABI via wazero. | experimental | source also ships composable reliability middleware as its own opt-in modules (`source/retry`, `source/dlq`, `source/idempotency`, `source/schema`) and an @@ -77,8 +81,11 @@ hierarchical, parallel, and final states, history, guard combinators, delayed transitions, invoked services, an actor model, snapshots, and JSON (de)serialization, backed by its `analysis`, `evolution`, and `conformance` packages. `telemetry`, `sink`, and `source` (with all their adapters, codecs, and -middleware) are released and documented; `broker` is planned. Treat every API as -experimental until it reaches v1. +middleware) are released and documented, as are the host-side runtimes over the +kernel: `durable` (durable execution), `cluster` (distribution and live +migration), `transport` (the gRPC network transport for cluster), and `wasm` +(polyglot behaviors). `broker` is planned. Treat every API as experimental until +it reaches v1. ## Roadmap diff --git a/docs/astro.config.mjs b/docs/astro.config.mjs index 808db8e..16453fd 100644 --- a/docs/astro.config.mjs +++ b/docs/astro.config.mjs @@ -70,6 +70,24 @@ export default defineConfig({ // external streams and drive statecharts. items: [{ autogenerate: { directory: 'source' } }], }, + { + label: 'Telemetry', + // The vendor-neutral observability seam the IO modules depend on, + // plus its optional backend adapters. + items: [{ autogenerate: { directory: 'telemetry' } }], + }, + { + label: 'Runtimes', + // Host-side runtimes layered additively over the state kernel: + // durable execution, distribution, the gRPC transport, and polyglot + // WASM behaviors. Each leaves the kernel pure and stdlib-only. + items: [ + { label: 'Durable execution', items: [{ autogenerate: { directory: 'durable' } }] }, + { label: 'Cluster', items: [{ autogenerate: { directory: 'cluster' } }] }, + { label: 'Transport', items: [{ autogenerate: { directory: 'transport' } }] }, + { label: 'WASM behaviors', items: [{ autogenerate: { directory: 'wasm' } }] }, + ], + }, { label: 'State machine', items: [ diff --git a/docs/src/content/docs/cluster/overview.md b/docs/src/content/docs/cluster/overview.md new file mode 100644 index 0000000..a732345 --- /dev/null +++ b/docs/src/content/docs/cluster/overview.md @@ -0,0 +1,103 @@ +--- +title: What is crucible/cluster +description: A host-side distribution runtime that spreads a state machine and its actors across nodes with remote delivery, supervision, and live instance migration over a pluggable transport. +sidebar: + order: 1 +--- + + + +`crucible/cluster` is the host-side **distribution runtime** for the +[`state`](/crucible/start/introduction/) kernel: remote actors, supervision, and +live instance migration. `state` runs a machine and its child-machine actors in +one process; cluster spreads that across nodes. A parent on one node addresses +and drives an actor running on another, failures are supervised with +restart/backoff strategies, and a running instance can be migrated to a different +node, all over a pluggable `Transport`, with the kernel left pure and stdlib-only. + +The runtime is **additive** over the kernel. It consumes seams the kernel already +reserves (the opaque `ActorRef` whose `Node` locator names the owning host, the +injectable `ActorSystem`, the `Snapshot`/`Restore` pair, and the typed +`ActorEscalation`/`EscalationHandler`) and needs no kernel change beyond the +additive `ActorRef.Node` locator. The core is itself stdlib-only; transport +dependencies live behind the `Transport` interface, out of the core. + +## Remote actors + +A `System` wraps a node's local `state.ActorSystem` with a node identity and an +optional `Transport`. Operations on a ref this node owns are delegated locally; +operations on a ref another node owns are routed over the transport. + +```go +tr := cluster.NewInMemoryTransport() + +nodeA := cluster.NewSystem("node-a", actorSysA, cluster.WithTransport(tr)) +nodeB := cluster.NewSystem("node-b", actorSysB, cluster.WithTransport(tr)) +tr.Register("node-a", nodeA) +tr.Register("node-b", nodeB) + +// Spawn a worker on node-b from node-a, then drive it through the returned ref. +ref, err := nodeA.Spawn(ctx, "node-b", "worker", "w-1", nil) +_, err = nodeA.Deliver(ctx, ref, "start") // routed to node-b +``` + +The ref is opaque: the holder never learns where the actor runs. A ref this node +owns has an empty `Node`; a remote ref carries the owning node. A `System` with no +`Transport` serves its local actors transparently and reports `ErrNoTransport` for +a remote ref. The in-tree `InMemoryTransport` connects node-scoped systems in one +process; a real network transport (see [`transport`](/crucible/transport/overview/)) +implements the same `Transport` interface. + +## Supervision + +A `Supervisor` turns the kernel's typed `ActorEscalation` into a per-source policy. +Wire it with `ActorSystem.WithEscalationHandler(sup.Handle)`; each failed actor is +routed to a `Decision` by the src it was spawned from: + +| Decision | Behavior | +| --- | --- | +| `Escalate` | forward the failure to a sink up the hierarchy (the default) | +| `Stop` | contain the failure at this level | +| `Restart` | re-spawn through a `Respawner` (the `System`), bounded by a per-src budget; on exhaustion, escalate | +| `Backoff` | defer the re-spawn behind an exponentially growing delay; the host applies due restarts via `Tick` | + +```go +sup := cluster.NewSupervisor( + cluster.WithRestart("worker", 3), // up to 3 immediate restarts + cluster.WithBackoff("flaky", 5, 100*time.Millisecond, time.Minute, 2.0), + cluster.WithEscalationSink(parentHandler), +) +sup.SetRespawner(node) +actorSys.WithEscalationHandler(sup.Handle) +// ... drive backoff restarts from a timer loop: +for range ticker.C { sup.Tick(ctx) } +``` + +Backoff reads time through an injected `state.Clock` (`WithClock`, default the +system clock), so it is deterministic under a `state.FakeClock` in tests. + +## Live migration + +`Capture` snapshots a running instance, its actor tree, and its machine definition +into a wire-shippable `Checkpoint`; `Restore` rebuilds it on another node, resuming +in place. The move is **gated on schema compatibility**. `Restore` diffs the source +and target machine definitions with +[`state/evolution`](/crucible/analysis/evolution/) and refuses a breaking target +with `ErrIncompatibleMigration`, so an instance never resumes against a definition +that would misread its state. + +```go +cp, err := cluster.Capture(inst, actorSys, machine) // on the source node +// ... ship cp (it is all JSON) to the target node ... +inst, sys, err := cluster.Restore(ctx, cp, targetMachine, // on the target node + cluster.WithActorBehaviors(palette)) +// err is ErrIncompatibleMigration if targetMachine is a breaking change. +``` + +## Where it fits + +cluster is feature-complete on the in-memory transport, which is enough to build +and test a distributed topology in one process. To carry deliveries, spawns, and +time-travel queries between real nodes over the network, pair it with +[`transport`](/crucible/transport/overview/), the gRPC implementation of the same +`Transport` seam. diff --git a/docs/src/content/docs/durable/overview.md b/docs/src/content/docs/durable/overview.md new file mode 100644 index 0000000..25947ae --- /dev/null +++ b/docs/src/content/docs/durable/overview.md @@ -0,0 +1,123 @@ +--- +title: What is crucible/durable +description: A host-side durable-execution runtime that records every nondeterministic value a state instance consumes and replays them to rebuild it byte-identically after a crash. +sidebar: + order: 1 +--- + + + +`crucible/durable` is the host-side **durable-execution runtime** for the +[`state`](/crucible/start/introduction/) kernel. `state` is a pure statechart +engine: firing an event is a deterministic function of the instance's recorded +inputs. durable makes a running instance survive a process crash by recording +every nondeterministic value the instance consumes (clock readings, +invoked-service results, actor outcomes) and persisting each step before it is +acknowledged. Recovery **replays** those recorded values back through the same +pure transition function, so a recovered instance reaches exactly the +configuration, context, and history of a run that never crashed, without +re-invoking any external source. + +The runtime is **additive** over the kernel. It consumes seams the kernel already +reserves (`Snapshot.Journal`, the `EffectEnvelope.EffectID` correlation slot, and +the injectable `Clock` / `ServiceRunner` / `ActorSystem` drivers) and requires no +change to the kernel, which stays pure and stdlib-only. Heavy dependencies +(database drivers, cloud SDKs) never enter this module: persistent backends +implement the `Store` interface out of tree. + +## Guarantees + +- **Deterministic replay.** A recovered instance is byte-identical to one that + never crashed, because recovery replays the recorded driving events and + nondeterministic results rather than re-executing their sources. +- **Exactly-once effects.** A domain effect is applied exactly once over the + instance's lifetime (the live run plus any number of recoveries), even though + the replay loop is at-least-once. Each effect carries a deterministic + `EffectID` and is deduplicated through the `Store`'s dispatch set. +- **Durability across restart.** Every `Fire` step is write-ahead appended to the + `Store` before it is acknowledged, so a crash after a successful `Fire` never + loses the step. Periodic checkpoints bound the tail that recovery replays. + +## The shape of it + +Wire a `Runner` around a machine and a `Store`. `Start` creates a fresh durable +instance; `Fire` drives it; `Recover` rebuilds it from the `Store` after a crash. + +```go +runner := durable.NewRunner(machine, durable.NewMemStore()) + +// Start a fresh instance: persists a baseline checkpoint. +h, err := runner.Start(ctx, "order-42", OrderInput{ /* ... */ }) + +// Drive it. Each Fire write-ahead appends a Record before acknowledging. +_, err = h.Fire(ctx, "submit") + +// ...process crashes, comes back up... + +// Recover purely from the Store: load the latest checkpoint, replay the tail. +h, err = durable.Recover(ctx, machine, store, "order-42") +_, err = h.Fire(ctx, "confirm") // continues recording from the live tip +``` + +For a hot path firing many events in sequence, keep the `Handle` from `Start` or +`Recover` and call `Handle.Fire` directly to avoid a `Store` round-trip per step. +For a stateless handler that fires a single event per request, `Runner.Fire` +loads, replays, fires, and re-records in one call. + +## The seams + +Each source of nondeterminism is isolated behind an injectable driver, recorded +the first time, and replayed verbatim on recovery: + +| Seam | Wire with | Recorded as | +| --- | --- | --- | +| **Clock** (timers) | `WithRunnerClock` | `JournalClockRead`, replayed so timers fire at the same recorded instants, wall-clock-independent; armed deadlines survive checkpoint compaction | +| **Invoked services** | `WithServiceRegistry` + `Handle.RunService` | `JournalServiceResult`; the service runs once, then recovery replays its result through the kernel's settle seam | +| **Child-machine actors** | `WithActorPalette` + `Handle.DeliverToActor` | `JournalActorMessage`; the behavior runs once, then recovery re-fires the recorded parent transition | +| **Domain effects** | `WithEffectHandler` | dispatch set, applied exactly once via deterministic `EffectID` dedup | + +Use `WithCheckpointEvery(n)` to tune how often a full snapshot is written: a +shorter interval bounds recovery replay, a longer one cuts checkpoint cost. + +## Stores + +`Store` is the persistence seam. A durable instance is an ordered log of +`Record`s (one per `Fire` step) layered over periodic full-snapshot checkpoints. +Two stdlib-only reference implementations ship in-tree: + +- **`MemStore`** (`NewMemStore`): in-memory, thread-safe, not durable across + restarts. For tests, examples, and single-process development. `WithHistory` + retains the full record history, enabling time-travel below. +- **`FileStore`** (`NewFileStore`): on-disk, a directory of per-instance + subdirectories, each an append-only journal, an atomic checkpoint, an + idempotency ledger, and a dispatched-effect log. Each append flushes to stable + storage; each checkpoint uses write-temp-plus-rename for crash-safe atomicity. + Use it for durability across restarts without a database. + +Persistent database backends (PostgreSQL, DynamoDB, and the like) implement +`Store` out of tree, so their drivers never burden this module's dependency or +vulnerability surface. + +## Time-travel reader + +`StateAt` reconstructs an instance's state as of any recorded step, read-only: +restoring the start baseline and replaying recorded events forward to the target +step, running no service, re-instantiating no actor, reading no wall clock, and +dispatching no effect: + +```go +view, err := durable.StateAt(ctx, machine, store, "order-42", 3) +// view.Snapshot(), view.Instance(), view.Step(): detached and safe to read +``` + +Time-travel needs the full record history through the target step. A `Store` opts +in by implementing `HistoryStore` (the in-tree `MemStore` does so under +`WithHistory`); `StateAt` otherwise falls back to the latest checkpoint plus tail. + +## A note on serialized payloads + +Events, service done-data, actor done-data, and actor messages are recorded as +their JSON form. A parent reducer that type-asserts a non-JSON Go type from +`AssignCtx.Event` observes the JSON-decoded shape on a replayed `onDone`. A +typed-codec option to carry the concrete Go value across the journal boundary is +reserved for a later, additive change. diff --git a/docs/src/content/docs/start/ingest-drive-emit.md b/docs/src/content/docs/start/ingest-drive-emit.md new file mode 100644 index 0000000..330ffa6 --- /dev/null +++ b/docs/src/content/docs/start/ingest-drive-emit.md @@ -0,0 +1,136 @@ +--- +title: Ingest, drive, emit +description: A minimal end-to-end loop where source consumes an event, the event drives a state machine transition, and a sink emits the transition's effect. +sidebar: + order: 4 +--- + + + +The [getting started](/crucible/start/getting-started/) walkthrough fires a +machine by hand. In a real service the events arrive from a stream and the +transition's effects leave for the outside world. This quickstart wires the three +seams into one loop: [`source`](/crucible/source/overview/) consumes a message, +the message drives a [`state`](/crucible/start/introduction/) transition, and a +[`sink`](/crucible/sink/overview/) emits the effect. None of the three cores +imports another; the [`source/statemachine`](/crucible/source/with-state/) bridge +is the only thing that depends on all of them. + +## Install + +```sh +go get github.com/stablekernel/crucible/source +go get github.com/stablekernel/crucible/source/statemachine +go get github.com/stablekernel/crucible/sink +go get github.com/stablekernel/crucible/state +``` + +## The machine + +A toy turnstile that emits an `Opened` effect when it unlocks. The effect is pure +data; the machine performs no IO. + +```go +type Gate string // S +type Signal string // E +type Turnstile struct{ Coins int } // C + +const ( + Locked Gate = "Locked" + Unlocked Gate = "Unlocked" +) + +const Coin Signal = "Coin" + +type Opened struct{ Coins int } + +machine := state.Forge[Gate, Signal, Turnstile]("turnstile"). + // An action returns an effect (pure data) for the transition to emit. + Action("announceOpen", func(a state.ActionCtx[Turnstile]) (state.Effect, error) { + return Opened{Coins: a.Entity.Coins}, nil + }). + Initial(Locked). + Transition(Locked).On(Coin).GoTo(Unlocked).Do("announceOpen"). + Quench() +``` + +## Wire the loop + +`statemachine.Drive` binds the consume loop to the machine. A `Router` resolves +each message to an instance key and the event to fire; the bridge loads the +instance through a `Store`, fires the event, hands the emitted effects to the +configured `Sink`, persists the new state, and only then acks. Here the `Sink` is +a [`sink.Manifold`](/crucible/sink/model/) fanning the effect out to its outlets. + +```go +// Egress: a Manifold that fans each emitted effect out to its outlets. +manifold := sink.NewManifold(sink.WithOutlets(sink.OutletFunc( + func(ctx context.Context, payload any) error { + log.Printf("opened: %+v", payload) // a real outlet writes SQL, Dynamo, a webhook + return nil + }, +))) + +// Durable instance state for the bridge (in-memory for a single process). +store := statemachine.NewMemStore[string, Gate, Signal, Turnstile]() + +// Route a message to its instance key and the event to fire. +router := func(m source.Message) (string, Signal, error) { + return m.Headers().Get("turnstile-id"), Coin, nil +} + +handler := statemachine.Drive(machine, store, router, + statemachine.WithSink(statemachine.SinkFunc( + func(ctx context.Context, effect any) error { + manifold.Sink(ctx, effect) // fire-and-forget fan-out + return nil + }, + )), +) +``` + +## Run it + +Open a subscription on any [inlet](/crucible/source/adapters/) (Kafka, +JetStream, or the in-memory test source) and hand the bridge handler to +`Receive`. The engine runs the consume loop until the context is cancelled. + +```go +sub, err := inlet.Subscribe(ctx, source.SubscribeConfig{Topic: "turnstile"}) +if err != nil { + return err +} +// consume -> route -> Fire(Coin) -> emit Opened -> persist -> ack +return sub.Receive(ctx, handler) +``` + +That is the whole loop: + +```mermaid +flowchart LR + K[(stream)] --> S{{source}} + S --> R[route by key] + R --> F["Fire(Coin)"] + F --> P[persist new state] + P --> A[ack] + F -.Opened effect.-> M{{sink Manifold}} + M --> O1[outlet] + M --> O2[outlet] +``` + +The ack is tied to the durable transition, so an at-least-once stream never +applies an event twice and never acks an event it failed to persist. A redelivery +of an already-applied message is a no-op ack, keyed on the machine's own state +version with no external dedup store. An event that is illegal for the current +state is a guard rejection, classified as poison and routed to the +[DLQ](/crucible/source/reliability/#dlq), distinct from a transient error that +retries. + +## Next + +- [Driving a statechart from a stream](/crucible/source/with-state/): the bridge + in full, including exactly-once consume-process-produce on Kafka. +- [Fire-and-forget fan-out](/crucible/sink/fan-out/): the Manifold's emit + semantics, errors, and batching. +- [Effects and purity](/crucible/concepts/effects-and-purity/): why the machine + emits effects as data instead of performing IO. diff --git a/docs/src/content/docs/telemetry/adapters.md b/docs/src/content/docs/telemetry/adapters.md new file mode 100644 index 0000000..7710e07 --- /dev/null +++ b/docs/src/content/docs/telemetry/adapters.md @@ -0,0 +1,128 @@ +--- +title: Adapters +description: Optional sub-modules that translate the telemetry interface onto slog, OpenTelemetry, or Datadog, each pulling its own vendor SDK so the core stays stdlib-only. +sidebar: + order: 2 +--- + +An adapter translates the [`telemetry`](/crucible/telemetry/overview/) interface +to a concrete backend. Each one ships as a separate, optional sub-module with its +own `go.mod`, so the core never imports a vendor SDK and a deployment compiles in +only the backend it uses. Every adapter implements the same `Tracer` and `Meter` +interfaces and is wired identically through `WithTracer` and `WithMeter`. + +| Adapter | Deps | +| ------------------- | ----------------------------------------------------- | +| `telemetry/slog` | stdlib `log/slog` only; emits spans/metrics as logs | +| `telemetry/otel` | the OpenTelemetry Go SDK | +| `telemetry/datadog` | dd-trace-go and DogStatsD (datadog-go) | + +## slog + +`telemetry/slog` is the reference adapter, built on Go's `log/slog` with **zero +external dependencies**. It emits spans and metric instruments as structured log +records and proves the seam end to end, which makes it the natural choice for +development, tests, and environments where structured logs are the only +observability sink. Because `telemetry.Attr` is already `slog.Attr`, the adapter +is conversion-free: attributes pass straight to the slog handler, so +zero-allocation scalar attributes stay zero-allocation through emission. + +The package name is `slog`, which collides with the standard library, so import +it under an alias: + +```go +import ( + "log/slog" + + "github.com/stablekernel/crucible/telemetry" + crucibleslog "github.com/stablekernel/crucible/telemetry/slog" +) + +logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{ + Level: slog.LevelDebug, // span/metric records emit at DEBUG +})) + +tel := telemetry.Nop().Apply( + telemetry.WithTracer(crucibleslog.NewTracer(crucibleslog.WithLogger(logger))), + telemetry.WithMeter(crucibleslog.NewMeter(crucibleslog.WithLogger(logger))), +) +``` + +Span starts, attribute updates, errors, and ends become `span.start`, +`span.attributes`, `span.error`, and `span.end` records; counters, histograms, +and gauges become `metric` records. `Tracer.Start` carries the current span id in +the returned context, so a nested span logs its parent's id, reproducing span +parentage in the logs. + +## OpenTelemetry + +`telemetry/otel` bridges the interface onto an OpenTelemetry `trace.Tracer` and +`metric.Meter`. The consuming module keeps its dependency on the vendor-neutral +interface; the OpenTelemetry SDK is pulled in only by this adapter's own +`go.mod`. + +```go +import ( + "github.com/stablekernel/crucible/telemetry" + oteladapter "github.com/stablekernel/crucible/telemetry/otel" +) + +// tp/mp are your configured OpenTelemetry TracerProvider / MeterProvider. +tel := telemetry.Nop().Apply( + telemetry.WithTracer(oteladapter.NewTracer(tp.Tracer("crucible"))), + telemetry.WithMeter(oteladapter.NewMeter(mp.Meter("crucible"))), +) +``` + +`Tracer.Start` maps to `trace.Tracer.Start` with the context propagated for +parentage; `Span.SetStatus(OK/Error/Unset)` maps to the matching `codes` value; +counters, histograms, and gauges map to an `Int64Counter`, a `Float64Histogram`, +and a synchronous `Float64Gauge`. Each attribute is converted by reading +`slog.Value.Kind`: scalar kinds map to the matching typed attribute, `Duration` +encodes as integer nanoseconds, `Time` as RFC 3339, and anything else is +stringified. If the SDK returns an error constructing an instrument, the adapter +falls back to a no-op instrument rather than panicking, so a metric never brings +the caller down. + +## Datadog + +`telemetry/datadog` bridges the tracer onto dd-trace-go and the meter onto +DogStatsD (datadog-go). As with the others, the Datadog SDKs are pulled in only by +this adapter's `go.mod`. + +```go +import ( + "github.com/DataDog/datadog-go/v5/statsd" + ddtracer "github.com/DataDog/dd-trace-go/v2/ddtrace/tracer" + + "github.com/stablekernel/crucible/telemetry" + ddadapter "github.com/stablekernel/crucible/telemetry/datadog" +) + +ddtracer.Start() +defer ddtracer.Stop() + +client, _ := statsd.New("127.0.0.1:8125") + +tel := telemetry.Nop().Apply( + telemetry.WithTracer(ddadapter.NewTracer()), + telemetry.WithMeter(ddadapter.NewMeter(client)), +) +``` + +`Tracer.Start` maps to `tracer.StartSpanFromContext` with the context propagated +for parentage; span attributes become tags via `SetTag`; a recorded error is +attached on `Finish`; `SetStatus(Error)` marks the span errored. Counters, +histograms, and gauges map to `statsd.Count`, `statsd.Histogram`, and +`statsd.Gauge`, with metric attributes rendered as DogStatsD `"key:value"` tags. +`NewTracer` uses the active global dd-trace-go tracer by default; inject a span +starter with `WithSpanStarter` to drive it from a test (for example dd-trace-go's +`mocktracer`). + +## Writing your own + +Any backend works the same way: implement `Tracer`/`Span` and +`Meter`/`Counter`/`Histogram`/`Gauge`, convert each attribute with a `switch` over +`attr.Value.Kind()` (reading the typed accessor for each scalar kind and +`Value.Any()` only for the `KindAny` escape hatch), and wire it through +`WithTracer`/`WithMeter`. The `slog` adapter is the smallest worked example. diff --git a/docs/src/content/docs/telemetry/overview.md b/docs/src/content/docs/telemetry/overview.md new file mode 100644 index 0000000..138b9d2 --- /dev/null +++ b/docs/src/content/docs/telemetry/overview.md @@ -0,0 +1,109 @@ +--- +title: What is crucible/telemetry +description: A vendor-neutral tracing and metrics interface for the IO modules; stdlib-only, with a no-op default and opt-in adapters. +sidebar: + order: 1 +--- + + + +`crucible/telemetry` is the suite's **observability seam**: a small, stable set +of interfaces that the IO modules ([`sink`](/crucible/sink/overview/), +[`source`](/crucible/source/overview/), and friends) depend on for tracing and +metrics. + +It imports **only the Go standard library** and forces **no vendor SDK** on any +consumer. That is the whole point. A consumer brings their own tracing/metrics +backend through a thin adapter, and a consumer that brings nothing gets silent, +zero-overhead behavior from the built-in no-op defaults. This is the suite's +"thin seams, no-op defaults, no forced dependencies" rule applied to telemetry: +the core interface forces no dependency, the default does nothing, and vendor +wiring lives in optional, separately-versioned adapter sub-modules. + +## The interface + +Two surfaces, tracing and metrics, plus an attribute type: + +```go +// Tracing +Tracer.Start(ctx, name, attrs...) -> (ctx, Span) +Span.SetAttributes(attrs...) +Span.RecordError(err) +Span.SetStatus(code, msg) // StatusUnset | StatusOK | StatusError +Span.End() + +// Metrics +Meter.Counter(name, opts...) -> Counter.Add(ctx, n int64, attrs...) +Meter.Histogram(name, opts...) -> Histogram.Record(ctx, v float64, attrs...) +Meter.Gauge(name, opts...) -> Gauge.Record(ctx, v float64, attrs...) +``` + +Counters are monotonic `int64` deltas, so every counted thing (records sunk, +failures, drops) stays whole-numbered and adapter mappings stay exact. +Histograms and gauges carry `float64` samples so sub-unit measurements +(fractional milliseconds) are not truncated. Instrument metadata is supplied +with the additive `WithUnit` and `WithDescription` options. + +## Attributes are slog.Attr + +`Attr` is an alias for the standard library's `slog.Attr`, so an attribute value +is a `slog.Value`, the stdlib's allocation-optimized tagged union. Build +attributes with the typed constructors re-exported here: + +```go +telemetry.String("payload.type", "Order") +telemetry.Int64("size", 100) +telemetry.Float64("latency_ms", 3.2) +telemetry.Bool("retried", true) +``` + +The scalar constructors (`String`, `Int64`, `Int`, `Uint64`, `Float64`, `Bool`, +`Duration`, `Time`) are zero-allocation: the value is stored inline, never boxed. +`telemetry.Any` is the documented escape hatch for an arbitrary value; it boxes +into an interface and so allocates, so reach for it only when no typed +constructor fits. + +## Context is the only coupling + +`Tracer.Start` returns a context carrying the new span. Propagate that context +into nested work and a downstream span, in this module or another, parents under +the caller's span automatically. There is no shared global tracer and no +package-level state; context is the only coupling between modules. + +## No-op by default + +`NopTracer` and `NopMeter` record nothing, allocate nothing per call, never +panic, and are safe to call concurrently and after a span has ended. A consuming +module embeds a `Provider`, seeds it with `Nop`, and exposes `WithTracer` and +`WithMeter` options: + +```go +tel := telemetry.Nop().Apply( + telemetry.WithTracer(myTracer), + telemetry.WithMeter(myMeter), +) + +ctx, span := tel.Tracer.Start(ctx, "sink.Sink", + telemetry.String("payload.type", "Order")) +defer span.End() +tel.Meter.Counter("sink.sunk").Add(ctx, 1, telemetry.String("outlet", "dynamo")) +``` + +A `nil` tracer or meter passed to an option is ignored, preserving the no-op +default, so call sites never need nil checks. Recording telemetry carries no +error return on the hot path: it never fails a caller's operation. + +## Naming convention + +Instrument and span names use **dotted lower-snake with a module prefix**, so +metrics and traces line up across the suite: + +| Module | Examples | +| ------- | -------- | +| `sink` | `sink.sunk`, `sink.failed`, `sink.flush_latency_ms`, span `sink.Sink` | +| `state` | `state.transitions`, gauge `state.in_state`, span `state.transition` | + +## Next + +- [Adapters](/crucible/telemetry/adapters/): the slog, OpenTelemetry, and Datadog + backends, each in its own optional sub-module. diff --git a/docs/src/content/docs/transport/overview.md b/docs/src/content/docs/transport/overview.md new file mode 100644 index 0000000..9c5f200 --- /dev/null +++ b/docs/src/content/docs/transport/overview.md @@ -0,0 +1,87 @@ +--- +title: What is crucible/transport +description: A gRPC network transport for the cluster runtime, carrying actor deliver and spawn between nodes and serving read-only durable time-travel queries, all over a JSON codec with no protobuf schema. +sidebar: + order: 1 +--- + + + +`crucible/transport` is the **gRPC network transport** for the +[`cluster`](/crucible/cluster/overview/) runtime. It carries actor deliver and +spawn operations between nodes over real gRPC (HTTP/2) and serves read-only +historical state reconstruction for durable instances. It implements +`cluster.Transport` on the client side and serves a node's `cluster.WireEndpoint` +on the server side. + +It lives in its own module so the gRPC dependency stays out of the cluster core, +which remains stdlib-only: a deployment that uses only the in-memory transport +never compiles gRPC in. Payloads (events and spawn inputs) cross the wire as the +JSON the `WireEndpoint` seam already produces and consumes, through a JSON gRPC +codec. No protobuf schema or codegen is involved; the service descriptors are +hand-written and the messages are encoded as JSON. + +## Serving a node + +`NewServer` builds a gRPC server preconfigured with the JSON codec and a node's +`cluster.WireEndpoint` registered, so deliveries and spawns arriving over the wire +are decoded into the node's concrete event and input types and applied to its +local actor system. The caller serves it on a listener and owns its lifecycle, and +can pass extra `grpc.ServerOption`s (interceptors, credentials, keepalives): + +```go +gs := transport.NewServer(node) // node satisfies cluster.WireEndpoint +go gs.Serve(listener) +defer gs.GracefulStop() +``` + +`RegisterServer` registers the same service onto an existing `grpc.Server` when +you already run one. + +## Reaching other nodes + +`New` returns a client `Transport` with no nodes; register each reachable node's +client connection with `AddNode`. The caller dials the node (`grpc.NewClient`) and +owns the connection's lifecycle. The resulting `*Transport` satisfies +`cluster.Transport`, so hand it to a `cluster.System` and remote refs route over +gRPC transparently: + +```go +tr := transport.New() +tr.AddNode("node-b", connB) // connB is a grpc.ClientConnInterface + +sys := cluster.NewSystem("node-a", actorSys, cluster.WithTransport(tr)) +// A ref owned by node-b now delivers and spawns over the wire. +``` + +`Deliver` JSON-encodes the event and routes it to the owning node, which decodes +it into its concrete event type; `Spawn` asks a node to start an actor and returns +a ref to it. A ref whose node has no registered connection reports +`cluster.ErrNodeUnreachable`. + +## Remote time-travel + +Transport also exposes the [`durable`](/crucible/durable/overview/) time-travel +reader over the wire, so one node can reconstruct the past state of an instance +another node hosts. `DurableTimeTravel` adapts a durable `Store` and machine into a +`TimeTravelEndpoint` by running `durable.StateAt` and marshaling the reconstructed +snapshot. It is read-only: it runs no service, re-instantiates no actor, and +dispatches no effect. + +```go +// On the host node: serve the time-travel endpoint. +tt := transport.NewDurableTimeTravel(machine, store) +gs := transport.NewTimeTravelServer(tt) +go gs.Serve(ttListener) + +// On a remote node: ask for an instance's state as of a recorded step. +snapshot, err := tr.StateAt(ctx, "node-b", "order-42", 3) +inst, err := state.UnmarshalSnapshot[Gate, Signal, Turnstile](snapshot) +``` + +`NewTimeTravelServer` (or `RegisterTimeTravel` onto an existing server) hosts the +endpoint; the client `StateAt` reuses the transport's registered connections and +returns the marshaled snapshot, which the caller decodes with +`state.UnmarshalSnapshot` for its own `(S, E, C)`. For arbitrary steps the host's +`Store` should retain full history (a `durable.HistoryStore`); otherwise +reconstruction is limited to the latest checkpoint and tail. diff --git a/docs/src/content/docs/wasm/overview.md b/docs/src/content/docs/wasm/overview.md new file mode 100644 index 0000000..84063ac --- /dev/null +++ b/docs/src/content/docs/wasm/overview.md @@ -0,0 +1,77 @@ +--- +title: What is crucible/wasm +description: Run state behaviors as WebAssembly; polyglot guards authored in any language and evaluated by the host over a small JSON ABI through a pure-Go, CGo-free runtime. +sidebar: + order: 1 +--- + + + +`crucible/wasm` runs [`state`](/crucible/start/introduction/) **behaviors as +WebAssembly**: polyglot guards authored in any language that compiles to WASM and +evaluated by the host over a serializable JSON ABI. + +A guard is normally a Go func or a CEL expression. wasm adds a third option, a +guard implemented as a WebAssembly module, so behavior logic can be written in any +WASM-targeting language and dropped into a machine by name. The host invokes the +module over a small JSON ABI through **wazero, a pure-Go, CGo-free runtime**, so +adopting it adds no C toolchain and no cross-compilation burden. + +It lives apart from the kernel so the wazero dependency never enters the +stdlib-only core: a deployment that uses only Go or CEL guards never compiles WASM +in. The ABI is **core WebAssembly**, not the Component Model, which would require a +CGo runtime. + +## The shape of it + +Compile a module once and reuse it across calls, then bind it as a guard by name: + +```go +mod, err := wasm.Compile(ctx, moduleBytes) // instantiate once; reuse across calls +defer mod.Close(ctx) + +reg := state.NewRegistry[Order]() +guard := wasm.Guard[string](reg, "approved", mod) // a WASM-backed state.GuardBinding + +def := state.Forge[string, string, Order]("order"). + Guard("approved", func(state.GuardCtx[Order]) bool { return false }). // stub; Provide overwrites + State("pending"). + Transition("pending").On("submit").GoTo("submitted").WhenExpr(guard). + State("submitted"). + Initial("pending"). + Quench() +// ... ToJSON -> LoadFromJSON -> Provide(reg) -> Quench: the guard now evaluates in WASM. +``` + +The guard composes like any other: combine it with `And`/`Or`/`Not`, or reference +it by name from a JSON-authored machine. A broken module is **fail-safe**: an +evaluation error reports `false`, so the guarded transition is blocked rather than +taken on a bad verdict. + +## The JSON ABI + +A guest module exports two functions over its linear memory: + +| Export | Signature | Purpose | +| --- | --- | --- | +| `alloc` | `(size u32) u32` | reserve `size` bytes, return the pointer the host writes the request into | +| `eval` | `(ptr u32, size u32) u64` | read the JSON request at `[ptr, ptr+size)`, evaluate, write the JSON response, return packed `(outPtr<<32 \| outLen)` | + +For a guard the request is `{"context": }` and the response is +`{"ok": }`. Because the payloads are JSON, the same module works for any host +language. A `Module` serializes concurrent `Eval` calls behind a mutex (one linear +memory per instance). + +A guest can be written in any WASM-targeting language; the test suite compiles a +tiny Go `//go:wasmexport` guest with the standard toolchain (`GOOS=wasip1 +GOARCH=wasm`, `-buildmode=c-shared`), with no TinyGo and no committed binary. + +## When to reach for it + +Each `Eval` marshals the request, crosses into the guest, and reads the response +back, so the cost is dominated by the JSON round-trip across the linear-memory +boundary. A WASM guard is therefore heavier than an in-process Go or CEL guard and +is best reserved for genuinely polyglot logic: a rule you want to author once and +share with a non-Go service, or behavior shipped by a team that does not write Go. +The runtime is feature-complete for guards today; services are the next behavior to +land on the same ABI.