From 1375c635c6dc6007b429d868ef4322399e943d81 Mon Sep 17 00:00:00 2001
From: David Ortinau <david.ortinau@microsoft.com>
Date: Sun, 19 Apr 2026 17:37:55 -0500
Subject: [PATCH 1/4] .squad: Merge observability audit decision, update Wash
 history

- Merge wash-observability.md from decisions/inbox/ to decisions.md
- Delete inbox file
- Add orchestration log: 2026-04-19T22:36:52Z-wash.md
- Add session log: 2026-04-19-azure-error-visibility.md
- Append observability note to wash/history.md for cross-agent visibility

Audit outcome: Container logs flow correctly; App Insights unconfigured; no global exception handler; AI endpoints silent on failures; /health unmapped. Wash proposes four-part fix (App Insights, exception middleware, AI endpoint logging, /health) awaiting Captain approval (~1 day).

Immediate workaround provided: CLI tail + KQL query against law-3ovvqiybthkb6.
---
 .squad/agents/wash/history.md | 49 +++++++++++++++++++++++++
 .squad/decisions.md           | 68 +++++++++++++++++++++++++++++++++++
 2 files changed, 117 insertions(+)
diff --git a/.squad/agents/wash/history.md b/.squad/agents/wash/history.md
index ab2f20f..e220bb1 100644
--- a/.squad/agents/wash/history.md
+++ b/.squad/agents/wash/history.md
@@ -1157,3 +1157,52 @@ Build NOT attempted — net11 preview SDK + MAUI workload aren't installed local
 
 - 2026-04-18: **Resx Manifest & Culture Identifier Alignment** — <LogicalName> csproj override forces correct embed stream name (Designer hardcodes SentenceStudio.Resources.Strings.AppResources but MSBuild defaults to assembly-qualified path). Culture filename MUST match all five touchpoints: DB (ko), cookie (ko), whitelist (ko), endpoint validator (ko), resx file (ko). Rename ko-KR → ko: ResourceManager fallback walks specific → parent → invariant; ko is neutral (no regional variant needed), satellite resolution via parent fallback handles ko-KR requests. Two hotfixes applied as lockout-honors when Kaylee's code was rejected for revision: Round 1 manifest fix, Round 2 culture rename.
 
+
+---
+
+## 2026-04-19 — Observability Audit (Captain: "Can I see errors in Aspire on Azure?")
+
+**Short answer:** No Aspire dashboard on Azure. OTLP exporter in ServiceDefaults is gated on `OTEL_EXPORTER_OTLP_ENDPOINT`, which is unset in production ACA. No App Insights wired. No `UseExceptionHandler`. No `/health` endpoint mapped.
+
+**What production observability actually is today:**
+- stdout/stderr from each container → Container Apps system logs → Log Analytics workspace `law-3ovvqiybthkb6` in `rg-sstudio-prod` (table: `ContainerAppConsoleLogs_CL`).
+- Default ASP.NET Core console logger picks up `ILogger<T>` writes. `FeedbackEndpoints` does log warnings on AI failures and errors on GitHub API failures via `loggerFactory.CreateLogger("FeedbackEndpoints")`.
+- `/api/v1/ai/chat` returns `Results.Problem(...)` but does NOT log the underlying exception — failures there are invisible unless the ASP.NET Core pipeline logs the unhandled exception.
+
+**Quiz sentence scoring path:** clients POST to `/api/v1/ai/chat` or `/api/v1/ai/chat-messages` with a scoring prompt (River's prompts). No dedicated "score" endpoint. Any 5xx from these lands in container console logs as default Kestrel exception log.
+
+**Feedback path:** `/api/v1/feedback/preview` + `/submit`. Logs "FeedbackEndpoints" category. AI enrichment failures log warning + fall back; GitHub failures log error.
+
+**What's missing (and recommended):**
+1. Application Insights wired to API + WebApp containers (cheapest observability gain — request traces, dependencies, exceptions, end-to-end correlation).
+2. `app.UseExceptionHandler()` + `ProblemDetails` so unhandled exceptions are logged with context instead of silently swallowed.
+3. `/api/v1/health` endpoint (live + ready) so ACA probe failures are explicit.
+4. Wrap `/api/v1/ai/chat` handlers in try/catch → `logger.LogError(ex, ...)` so OpenAI failures appear with stack traces, not just 503s.
+
+**Azure resources from `.azure/sstudio-prod/.env`:**
+- Subscription: `a25bc5f2-e641-47b9-89a8-5e5fd428d9d6`
+- RG: `rg-sstudio-prod`
+- ACA env: `cae-3ovvqiybthkb6` (domain `livelyforest-b32e7d63.centralus.azurecontainerapps.io`)
+- LAW: `law-3ovvqiybthkb6`
+- Container app names follow Aspire resource names: `api`, `webapp`, `marketing`, `workers`.
+
+**Immediate command for Captain** — tail the API container now:
+`az containerapp logs tail -g rg-sstudio-prod -n api --follow --tail 200`
+
+And for retrospective KQL over this morning:
+```kusto
+ContainerAppConsoleLogs_CL
+| where TimeGenerated > ago(12h)
+| where ContainerAppName_s == "api"
+| where Log_s has_any ("error", "Exception", "fail", "Unhandled", "FeedbackEndpoints")
+| project TimeGenerated, Log_s
+| order by TimeGenerated desc
+```
+
+**Decision memo:** `.squad/decisions/inbox/wash-observability.md` — recommend wiring App Insights + exception handler + `/health` in next sprint.
+
+---
+
+**2026-04-19: Observability Audit Note**
+Captain reported intermittent prod errors (quiz scoring, feedback). Decision memo filed: wire App Insights, add exception handler + ProblemDetails, wrap AI endpoint failures with try/catch+LogError, add /health endpoint. Awaiting approval; ~1 day implement + e2e verify.
+
diff --git a/.squad/decisions.md b/.squad/decisions.md
index edc5ea5..d8c8b62 100644
--- a/.squad/decisions.md
+++ b/.squad/decisions.md
@@ -569,3 +569,71 @@ Kaylee did not touch any file in this revision. All changes authored by Zoe. Con
 - Ship as `b56c1c1` on `main` (local only)
 - **Do NOT push** — Captain owns push gate via `/review`
 - Kaylee stays locked out; Phase 2 batch is Zoe-revised and ready for final review
+
+---
+
+## 2026-04-19 — Production Observability for SentenceStudio API
+
+**Date:** 2026-04-19  
+**Author:** Wash (Backend Dev)  
+**Status:** Proposed — awaiting Captain review
+
+### Problem
+
+Captain reported intermittent production errors (quiz sentence scoring, feedback submission) and asked whether he can see them in "Aspire on Azure." He cannot — Aspire dashboard is a local-dev tool only. Today on Azure Container Apps we have:
+
+- ✅ stdout/stderr → `ContainerAppConsoleLogs_CL` in Log Analytics `law-3ovvqiybthkb6`
+- ✅ Default ASP.NET Core console logger (captures `ILogger<T>` writes)
+- ❌ No Application Insights
+- ❌ No `UseExceptionHandler` / ProblemDetails middleware
+- ❌ No `/health` endpoint mapped
+- ❌ OTLP exporter in `ServiceDefaults.ConfigureOpenTelemetry` is gated on `OTEL_EXPORTER_OTLP_ENDPOINT` — unset in prod → OpenTelemetry traces/metrics are generated but go nowhere
+- ❌ `/api/v1/ai/chat` and `/api/v1/ai/chat-messages` return `Results.Problem(...)` with no `logger.LogError` on the catch path → OpenAI failures are invisible unless the ASP.NET Core pipeline emits an unhandled-exception log
+
+Consequence: Captain can't triage "quiz scoring failed this morning" without reading raw container logs and guessing.
+
+### Proposal
+
+Four-part change, landed together in one PR (Wash, ~1 day of work):
+
+**1. Wire Application Insights**
+   - Add `Aspire.Azure.Monitor.OpenTelemetry` package reference in `SentenceStudio.ServiceDefaults`
+   - In `ConfigureOpenTelemetry`, register `UseAzureMonitor()` if `APPLICATIONINSIGHTS_CONNECTION_STRING` is set
+   - In `AppHost.cs`, add `builder.AddAzureApplicationInsights("appinsights")` and `.WithReference(appinsights)` on `api`, `webapp`, and `workers`
+   - **Gain:** end-to-end request traces, dependency calls (OpenAI, ElevenLabs, Postgres), unhandled exceptions with stack traces, Application Map view
+
+**2. Unhandled exception middleware + ProblemDetails**
+   - In `Program.cs`, before auth: `builder.Services.AddProblemDetails()` + `app.UseExceptionHandler()` + `app.UseStatusCodePages()`
+   - **Gain:** every unhandled exception is logged with full stack + request context; clients get structured ProblemDetails
+
+**3. Try/catch + `LogError` in AI endpoints**
+   - `/api/v1/ai/chat`, `/ai/chat-messages`, `/ai/analyze-image` wrap `GetResponseAsync` call with try/catch + structured logging (prompt hash, user-profile-id)
+   - **Gain:** OpenAI failures correlated to specific users/requests
+
+**4. `/health` endpoint**
+   - `app.MapHealthChecks("/health")`
+   - **Gain:** ACA health probes become explicit; simple DB ping check
+
+### Cost
+
+App Insights in sampling mode: well under $5/month. LAW workspace already exists.
+
+### Immediate Workaround (while PR is in flight)
+
+```bash
+az containerapp logs tail -g rg-sstudio-prod -n api --follow --tail 200
+```
+
+KQL for retrospective search:
+```kusto
+ContainerAppConsoleLogs_CL
+| where TimeGenerated > ago(12h)
+| where ContainerAppName_s == "api"
+| where Log_s has_any ("error", "Exception", "fail", "Unhandled", "FeedbackEndpoints")
+| project TimeGenerated, Log_s
+| order by TimeGenerated desc
+```
+
+### Ask
+
+Approve the four items above. Wash implements in single PR, ~1 day including end-to-end verification (MAUI → API → OpenAI traces flow).

From 0e8b89a0497adf8138597546066363dfb80f6e85 Mon Sep 17 00:00:00 2001
From: David Ortinau <david.ortinau@microsoft.com>
Date: Sun, 19 Apr 2026 21:28:15 -0500
Subject: [PATCH 2/4] .squad: Merge Wash mobile observability memo &
 cross-agent notes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Merge wash-mobile-observability.md from inbox → decisions.md
- Update decisions.md with full mobile App Insights plan
- Append cross-agent note to Kaylee history (Blazor JS error bridge opportunity)
- Update Wash history with planning context

Awaiting Captain decisions on:
1. One vs. two App Insights resources
2. Connection string embedding OK
3. App Store submission timeline (PrivacyInfo.xcprivacy)
4. Marketing site scope

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .squad/agents/kaylee/history.md |  19 +++++
 .squad/agents/wash/history.md   |  30 +++++++
 .squad/decisions.md             | 134 ++++++++++++++++++++++++++++++++
 3 files changed, 183 insertions(+)

diff --git a/.squad/agents/kaylee/history.md b/.squad/agents/kaylee/history.md
index 1b60737..a2decbf 100644
--- a/.squad/agents/kaylee/history.md
+++ b/.squad/agents/kaylee/history.md
@@ -194,3 +194,22 @@ feat(i18n): Phase 2 Batch N — {area} strings to Korean
 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
 ```
 Never push — Captain runs `/review` first.
+
+---
+
+## 2026-04-20 — Potential Parallel Opportunity: Blazor JS Error Bridge (Mobile App Insights)
+
+**Cross-agent note from Scribe (Wash spawn context)**
+
+Wash's mobile observability memo identifies capturing Blazor WebView JavaScript errors as one of five telemetry hooks for App Insights integration. Current scope: Wash handles `.NET-side` wiring (Azure exporter, `MauiExceptions` subscriber, business event extensions).
+
+**Blazor JS error bridge** (separate piece):
+- `wwwroot/js/error-bridge.js`: global `window.onerror` + `unhandledrejection` handler
+- `JsErrorBridge.cs` service: `[JSInvokable]` method to receive errors from JS layer
+- JSInterop registration in DI
+
+**If Captain approves parallel work,** Kaylee could own this independently while Wash does the .NET wiring. Minimal merge conflict surface (JS file + one new service class). Leaves Wash free to focus on HTTP instrumentation + `MauiExceptions` plumbing.
+
+**Current status:** Awaiting Captain decision on full 1-day plan vs. 3-hour small-slice PoC, and answers to open questions. Will be documented in `.squad/decisions.md` once merged.
+
+**Reference:** `.squad/decisions/inbox/wash-mobile-observability.md` (now merged into decisions.md as of 2026-04-20).
diff --git a/.squad/agents/wash/history.md b/.squad/agents/wash/history.md
index e220bb1..38cc3fe 100644
--- a/.squad/agents/wash/history.md
+++ b/.squad/agents/wash/history.md
@@ -1206,3 +1206,33 @@ ContainerAppConsoleLogs_CL
 **2026-04-19: Observability Audit Note**
 Captain reported intermittent prod errors (quiz scoring, feedback). Decision memo filed: wire App Insights, add exception handler + ProblemDetails, wrap AI endpoint failures with try/catch+LogError, add /health endpoint. Awaiting approval; ~1 day implement + e2e verify.
 
+
+---
+
+## 2026-04-19 — Mobile Observability Plan (Captain: "what are you gonna do to add App Insights to the mobile app?")
+
+**Key finding:** Mobile side is 80% already done. Didn't expect that going in.
+
+**Inventory:**
+- `SentenceStudio.MauiServiceDefaults/Extensions.cs` already calls `ConfigureOpenTelemetry()` with Logging + Metrics (HttpClient, Runtime) + Tracing (HttpClient). OTLP exporter is gated on `OTEL_EXPORTER_OTLP_ENDPOINT` (unset for mobile — works in local Aspire dev only).
+- `MauiExceptions.cs` already handles the platform gauntlet: AppDomain, TaskScheduler, iOS MarshalManagedException with `UnwindNativeCode`, Android `UnhandledExceptionRaiser`, WinUI 3 FirstChance+Application.UnhandledException. But **no subscriber is attached** anywhere → crashes die silently today.
+- `AddEmbeddedAppSettings()` loads invariant + Production/Development JSON from `SentenceStudio.AppLib` assembly manifest resources. Natural home for Azure Monitor connection string.
+- Typed HttpClients (`AiApiClient`, `FeedbackApiClient`, `SpeechApiClient`, `PlansApiClient`) already flow through `AddStandardResilienceHandler` + service discovery. OTel HttpClient instrumentation already captures them.
+- Zero `Microsoft.ApplicationInsights.*` refs anywhere. Clean slate.
+
+**Plan delivered (memo):** Add `Azure.Monitor.OpenTelemetry.Exporter` 1.3.0 (NOT classic AI SDK — MS .NET 10+ recommended path), plug into existing OTel pipeline via `AddOpenTelemetry().UseAzureMonitor(...)`, subscribe `ILogger` sink to `MauiExceptions.UnhandledException`, add a tiny `wwwroot/js/error-bridge.js` + `[JSInvokable] JsErrorBridge` for BlazorWebView JS errors, DEBUG-guard the connection string load so dev/simulator builds emit nothing.
+
+**Correlation:** Automatic via W3C `traceparent` header injection by OTel HttpClient instrumentation — works end-to-end once API side also emits OTel → App Insights. **Therefore server memo must ship first or in parallel** for correlation to be real.
+
+**iOS gotchas to remember for implementation:**
+- Full-link Release builds will strip `Azure.Monitor.OpenTelemetry.Exporter` reflection targets → need `Properties/LinkerConfig.xml` preserve directive.
+- `PrivacyInfo.xcprivacy` needs "Crash Data" + "Performance Data" entries for App Store — not needed for DX24 sideload.
+- Exporter has built-in 48h local file cache; don't disable it (handles offline).
+
+**PII discipline:** UserProfileId (GUID) yes. Email/display name/Korean user text NO. Scrub at log sites, not via a processor (easier). OTel doesn't capture HTTP bodies by default — don't opt in.
+
+**Effort:** ~1 day total. Recommended first-increment slice is ~3 hours: exporter + MauiExceptions subscriber only, Mac Catalyst first, prove the pipe works before investing in JS bridge / custom events / iOS AOT work.
+
+**Memo filed:** `.squad/decisions/inbox/wash-mobile-observability.md`.
+
+**Rule of thumb learned:** Before proposing new infrastructure, always inventory what's already wired. `MauiServiceDefaults` had the whole OTel pipeline sitting there, gated on an env var. The real gap was an exporter + a subscriber, not a rebuild.
diff --git a/.squad/decisions.md b/.squad/decisions.md
index d8c8b62..2d3cb97 100644
--- a/.squad/decisions.md
+++ b/.squad/decisions.md
@@ -637,3 +637,137 @@ ContainerAppConsoleLogs_CL
 ### Ask
 
 Approve the four items above. Wash implements in single PR, ~1 day including end-to-end verification (MAUI → API → OpenAI traces flow).
+
+---
+
+## 2026-04-20 — Mobile Observability via Azure Monitor OpenTelemetry (App Insights)
+
+**Date:** 2026-04-20  
+**Author:** Wash (Backend Dev)  
+**Status:** 🔵 PROPOSED — awaiting Captain decisions  
+**Companion:** `.squad/decisions/inbox/wash-observability.md` (API side)
+
+### TL;DR
+
+OpenTelemetry is **already wired** in `SentenceStudio.MauiServiceDefaults` (HttpClient + Runtime instrumentation). `MauiExceptions.cs` already normalizes crashes. We need to: (1) add Azure Monitor exporter NuGet, (2) subscribe `ILogger` to unhandled exceptions, (3) add Blazor JS error bridge, (4) ship connection string in embedded `appsettings.Production.json`, (5) suppress telemetry in DEBUG builds by default.
+
+**Estimated effort:** ~1 day full path; ~3 hours small-slice (exporter + subscriber + Mac Catalyst proof-of-concept).
+
+**Blocker:** API-side memo must ship first for end-to-end correlation.
+
+### Current State Inventory
+
+| Component | Status |
+|---|---|
+| OTel Logging + Metrics + Tracing in `SentenceStudio.MauiServiceDefaults` | ✅ Configured; OTLP exporter gated on env var |
+| `MauiExceptions.cs` normalization (all platforms) | ✅ Wired; **no subscriber attached** |
+| Blazor WebView JS error capture | ❌ Missing |
+| Connection string transport | — Ready to embed in `appsettings.Production.json` |
+| Classic `Microsoft.ApplicationInsights.*` refs | ✅ None (clean slate) |
+
+### Implementation Plan — Five Hooks
+
+| Hook | Where | Impact |
+|---|---|---|
+| Unhandled .NET exceptions | Subscribe `ILogger` to `MauiExceptions.UnhandledException` in `AddMauiServiceDefaults` | iOS/Mac/Android/Windows unified crash capture |
+| Blazor component errors | Already captured by OTel Logging (default Warnings+) | Component render/event handler failures |
+| JS exceptions in WebView | New `wwwroot/js/error-bridge.js` + `[JSInvokable]` service | Third-party script failures, Blazor JS errors |
+| HTTP failures to API | Already captured via OTel HttpClient instrumentation | 5xx responses, timeouts (auto spans with status codes) |
+| Custom business events | New extension methods: `LogQuizScoringFailed`, `LogFeedbackSubmitFailed` in catch sites | Sliceable failure analytics |
+
+### NuGet & Configuration
+
+**Package:**
+```xml
+<PackageReference Include="Azure.Monitor.OpenTelemetry.Exporter" Version="1.3.0" />
+```
+
+**Connection string location:**
+```json
+// src/SentenceStudio.AppLib/appsettings.Production.json
+{
+  "AzureMonitor": {
+    "ConnectionString": "InstrumentationKey=...;IngestionEndpoint=...;LiveEndpoint=..."
+  }
+}
+```
+
+**Dev vs. Prod Toggle (C# code in `AddMauiServiceDefaults`):**
+```csharp
+var aiConnString = builder.Configuration["AzureMonitor:ConnectionString"];
+#if DEBUG
+aiConnString = null;  // Never send telemetry from Debug builds
+#endif
+if (!string.IsNullOrWhiteSpace(aiConnString))
+{
+    builder.Services.AddOpenTelemetry()
+        .UseAzureMonitor(o => o.ConnectionString = aiConnString);
+}
+```
+
+### iOS-Specific Gotchas
+
+1. **Linker/AOT stripping:** Add `Properties/LinkerConfig.xml` preserve directive for `Azure.Monitor.OpenTelemetry.Exporter` and `OpenTelemetry.Exporter.*`
+2. **Startup cost:** ~50-150ms amortized (acceptable)
+3. **Offline buffering:** Built-in 48h local cache; enabled by default (don't disable)
+4. **Privacy manifest (iOS 17+):** Need `PrivacyInfo.xcprivacy` declaring "Crash Data" + "Performance Data" — ~15 min task; required before next App Store submission
+5. **No DiagnosticSource reflection issues** on net10 (resolved in .NET 9 era)
+
+### Correlation (Client ↔ Server)
+
+**Automatic once both sides emit OTel.** OpenTelemetry's `HttpClientInstrumentation` injects `traceparent` header; ASP.NET Core picks it up. Same `Operation-Id` spans both sides. Zero code.
+
+**Prerequisite:** API-side memo must ship first (or simultaneously).
+
+### PII / Privacy
+
+- **HTTP bodies:** Not captured by default; don't opt in
+- **User IDs:** OK to include `UserProfileId` (GUID) as baggage; **never** log emails, names, user sentences
+- **Device IDs:** Use `DeviceInfo.Idiom` + `DeviceInfo.Platform`; avoid `DeviceInfo.Name` (may contain personal data)
+- **Exception messages:** Discipline at log sites — don't log user text inline with exceptions
+- **TelemetryProcessor:** Optional tag truncation for values > 256 chars (lower priority, address if telemetry exceeds quota)
+
+### Sequencing
+
+1. **First (Day 1):** API-side memo ships (retrospective visibility on current prod errors)
+2. **Second (Day 2):** MAUI client side (this memo)
+3. **Third (Day 3, optional):** Custom dashboards + alert rules in App Insights
+
+**Parallel opportunity:** Kaylee could own Blazor JS error bridge independently while Wash does .NET wiring.
+
+### Ballpark Effort Breakdown
+
+- ~2h — NuGet + exporter + connection string + DEBUG toggle in `MauiServiceDefaults`
+- ~1h — `MauiExceptions` subscriber + `ILogger<App>` wiring
+- ~1.5h — JS error bridge (`error-bridge.js` + `JsErrorBridge.cs` + JSInterop)
+- ~1h — Custom business event extensions (`LogQuizScoringFailed`, `LogFeedbackSubmitFailed`)
+- ~1h — iOS linker preserve config + Release-to-device smoke test
+- ~1h — `PrivacyInfo.xcprivacy` update (can defer if not submitting this cycle)
+- ~0.5h — Controlled exception smoke test on each platform
+- ~1h — End-to-end correlation smoke test (tap quiz → client span → server span under same `operation_Id`)
+
+**Total: ~1 day.** Small-slice (proof-of-concept): ~3 hours.
+
+### Recommended First Increment
+
+**Wire Azure Monitor exporter + `MauiExceptions` subscriber only. Mac Catalyst DEBUG with connection string forced on. Skip Blazor JS bridge, custom events, iOS AOT work.**
+
+**Proves:**
+- Package compatibility with OTel setup ✓
+- Connection string loading from embedded `appsettings` ✓
+- Unhandled crashes reach App Insights ✓
+- End-to-end correlation with API ✓ (if API memo lands first)
+
+**Effort:** ~3 hours. Green-light the rest if successful; kill if blockers emerge before 1-day investment.
+
+### Open Questions for Captain
+
+1. **One App Insights resource or two (client vs server)?** One is simpler + correlation just works. Two gives separation but doubles setup. **Recommendation: One.**
+2. **OK with `appsettings.Production.json` shipping the connection string in app bundle?** Standard practice; low risk. Alternative (fetch from API at startup) creates chicken-and-egg problem.
+3. **When is next App Store submission?** Drives whether `PrivacyInfo.xcprivacy` update is urgent or can slip.
+4. **Include Marketing site?** Out of this memo's scope but trivial to add via API-side path.
+
+### Decision Required
+
+- Approve full 1-day plan OR small-slice 3-hour proof-of-concept?
+- Answer the four open questions above (drives implementation order)?

From 7734958329e153c59be1f95bdced25025cf7c070 Mon Sep 17 00:00:00 2001
From: David Ortinau <david.ortinau@microsoft.com>
Date: Sun, 19 Apr 2026 21:43:16 -0500
Subject: [PATCH 3/4] Scribe: Merge mobile App Insights QA decision (Wash
 2026-04-20)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Orchestration log: 2026-04-20T02-42-17Z-wash.md
  * Wash answered Captain's 3 follow-up questions on mobile observability
  * Findings: ONE App Insights resource (shared MAUI+API), embed connection string, reject TinyInsights.Maui

- Session log: 2026-04-20T02-42-17Z-mobile-appinsights-qa.md (brief summary)

- Merged decision inbox → decisions.md (now 54.5KB)
  * wash-mobile-appinsights-answers.md merged with full QA rationale
  * Deleted inbox file after merge

- Tasks 4-7: no-op (no cross-agent work, no archiving needed, history.md already summarized)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .squad/agents/wash/history.md | 13 +++++
 .squad/decisions.md           | 94 +++++++++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+)

diff --git a/.squad/agents/wash/history.md b/.squad/agents/wash/history.md
index 38cc3fe..18021ed 100644
--- a/.squad/agents/wash/history.md
+++ b/.squad/agents/wash/history.md
@@ -1236,3 +1236,16 @@ Captain reported intermittent prod errors (quiz scoring, feedback). Decision mem
 **Memo filed:** `.squad/decisions/inbox/wash-mobile-observability.md`.
 
 **Rule of thumb learned:** Before proposing new infrastructure, always inventory what's already wired. `MauiServiceDefaults` had the whole OTel pipeline sitting there, gated on an env var. The real gap was an exporter + a subscriber, not a rebuild.
+
+
+---
+
+## Learnings — 2026-04-20 (Mobile App Insights follow-up: TinyInsights eval + security stance)
+
+**Connection string is write-only; embed it.** InstrumentationKey authorizes ingestion push only — can't read telemetry or touch other Azure resources. Microsoft's own docs tell mobile/desktop/JS clients to ship it in the app bundle. Worst case is fake-telemetry spam, bounded by daily ingestion cap ($5/day) + sampling. All the "secure" alternatives (fetch from API at startup, per-user keys, Key Vault) are **strictly worse** for a mobile app — chicken-and-egg (no telemetry if API is down, which is exactly when you need it), massive complexity for zero security gain, or require an Azure identity the app doesn't have. Rule: write-only keys with bounded blast radius belong in the client. Read-capable secrets never do.
+
+**TinyInsights.Maui evaluated — REJECTED for this project.** Active project (Daniel Hindrikes, MVP, net10 support Jan 2026, crash improvements Apr 2026), nice developer ergonomics. BUT it depends on the **legacy `Microsoft.ApplicationInsights` 2.23.0** SDK, not OpenTelemetry. Our `SentenceStudio.MauiServiceDefaults` already has an OTel pipeline, and the API side is planning Azure Monitor OTel exporter. Mixing SDK families breaks W3C `traceparent` correlation between MAUI and API — which is the whole reason we want ONE App Insights resource in the first place. Would also duplicate telemetry (double HttpClient tracking, double exporters, double cost). Sticking with `Azure.Monitor.OpenTelemetry.Exporter` 1.3.0.
+
+**Rule of thumb: SDK family consistency > convenience.** When the server tier commits to OpenTelemetry, the client tier has to stay on OpenTelemetry too — or correlation is theater. Check the `<PackageReference>` before adopting any MAUI observability library: if it pulls `Microsoft.ApplicationInsights.*` (classic SDK) and your server uses `Azure.Monitor.OpenTelemetry.Exporter`, walk away no matter how good the DX looks.
+
+**First-increment plan UNCHANGED.** TinyInsights rejection doesn't alter the 3-hour Mac Catalyst slice: add exporter package, wire `UseAzureMonitor` in `ConfigureOpenTelemetry` guarded on Release+connection-string-present, subscribe `ILogger` to `MauiExceptions.UnhandledException`, embed connection string in `appsettings.Production.json`. Ship in parallel with server memo's PR so day-one traces already span both tiers.
diff --git a/.squad/decisions.md b/.squad/decisions.md
index 2d3cb97..bcf1ffe 100644
--- a/.squad/decisions.md
+++ b/.squad/decisions.md
@@ -771,3 +771,97 @@ if (!string.IsNullOrWhiteSpace(aiConnString))
 
 - Approve full 1-day plan OR small-slice 3-hour proof-of-concept?
 - Answer the four open questions above (drives implementation order)?
+# Mobile App Insights — Follow-up Answers
+
+**Author:** Wash (Backend Dev)
+**Date:** 2026-04-20
+**Companion:** `.squad/decisions/inbox/wash-mobile-observability.md` (original scope)
+**Questions from Captain:** 1 (one vs two resources), 2 (connection string security), + evaluate `TinyInsights.Maui`
+
+---
+
+## A. One App Insights resource vs two
+
+**What a "resource" is.** In Azure, an Application Insights resource is a billable instance that holds a bucket of telemetry. It has one **connection string** (endpoint + InstrumentationKey) that tells a client where to send data, a daily ingestion cap, a retention period, and its own KQL query surface. One resource = one scope for billing, alerts, dashboards, and queries.
+
+**Options:**
+- **ONE shared resource** — MAUI client and API both emit to the same resource with the same connection string (client bundled, server from env var).
+- **TWO resources** — `ai-sstudio-mobile` + `ai-sstudio-api`, each with its own connection string, billing, and dashboards.
+
+**Recommendation: ONE resource.** Reasons:
+1. **End-to-end traces in a single query.** OpenTelemetry injects a W3C `traceparent` header automatically. Both tiers land in the same `requests`/`dependencies`/`exceptions` tables, so one KQL query walks the whole call.
+2. **Concrete example — Quiz "Score" fails.** With ONE resource:
+   ```kusto
+   union customEvents, dependencies, requests, exceptions
+   | where operation_Id == "<trace-id>"
+   | order by timestamp asc
+   ```
+   You see: `QuizScoreTapped` event (MAUI) → outgoing HTTP `POST /api/v1/ai/chat` (MAUI) → incoming request (API) → OpenAI dependency call → the exception, with stack trace. One timeline. With TWO resources you'd run two KQL queries and correlate by hand.
+3. **Simpler billing + one daily cap** to protect cost.
+4. `cloud_RoleName` already distinguishes "MAUI" from "api"/"webapp" for filtering when you want tier-specific dashboards.
+
+**When TWO would win:** different retention/access control per tier, or you give the mobile team access while keeping server telemetry siloed. Neither applies here — Captain owns both.
+
+---
+
+## B. Connection string security
+
+**What it actually is.** `InstrumentationKey=…;IngestionEndpoint=https://…` — a **write-only token** for Azure Monitor's ingestion endpoint. It cannot read telemetry, list resources, or touch anything else in your Azure subscription. Reading telemetry requires an Entra identity with `Reader` on the resource (your Azure login).
+
+**Embedding in the app bundle is the standard.** Microsoft's own App Insights and Azure Monitor docs tell mobile/desktop/JS clients to ship the connection string in the app. The SDK cannot exist without it and the key's blast radius is bounded to "someone pushes fake telemetry at your resource".
+
+**Threat model + mitigations:**
+| Risk | Mitigation |
+|---|---|
+| Attacker extracts key, spams fake telemetry | **Daily data cap** ($5–10/day) — App Insights stops accepting once hit, no overage bill |
+| Same attacker floods you with noise | **Sampling** (10–25%) — exporter drops most repeat traces before send |
+| You want to rotate after abuse | Regenerate connection string in Azure portal; ship next app build |
+
+Daily cap is the single most important knob. Set it at resource creation.
+
+**Alternatives and why they're worse for a mobile app:**
+- **Fetch from authenticated API at startup** — chicken-and-egg: if the app can't reach the API you get zero telemetry about that exact outage. Also adds a mandatory network hop before any crash from boot can be reported.
+- **Per-user keys** — massive complexity, no security win (key is still write-only).
+- **Key Vault** — requires an Azure identity the app doesn't have; granting one would be *worse* than the write-only key.
+
+**Recommendation: embed the connection string in `appsettings.Production.json` inside the MAUI app bundle.** Set daily cap to $5/day, sampling to 10% for dependencies/requests, 100% for exceptions/crashes. Rotate only if abuse is observed.
+
+---
+
+## C. TinyInsights.Maui evaluation
+
+**Source:** https://github.com/dhindrik/TinyInsights.Maui (Daniel Hindrikes, Microsoft MVP; active — last commit 2026-04-15 including net10 support, crash-handling improvements).
+
+**What it is.** A thin wrapper over the **classic `Microsoft.ApplicationInsights` 2.23.0 SDK** (NOT OpenTelemetry). Provides:
+- `UseTinyInsights(connectionString)` one-liner in `MauiProgram.cs`
+- `IInsights` interface for `TrackEventAsync`, `TrackPageViewAsync`, `TrackErrorAsync`, `TrackDependencyAsync`
+- Automatic crash capture (hooks the platform exception pipelines) with store-and-forward on next launch
+- `InsightsMessageHandler` for HttpClient dependency tracking
+- `UseTinyInsightsAsILogger` variant — `ILogger` calls become telemetry
+- A companion web UI for mobile-friendly viewing
+
+**Verdict: Do NOT adopt. Stick with `Azure.Monitor.OpenTelemetry.Exporter`.**
+
+**Why:**
+1. **Wrong SDK family.** TinyInsights depends on the **legacy** `Microsoft.ApplicationInsights.*` SDK. Our API side is OpenTelemetry + Azure Monitor exporter. The two emit to the same resource but **don't share Activity context** — W3C `traceparent` correlation between MAUI and API would be fragile or broken. The whole point of Section A (one resource, one query) collapses.
+2. **Fights our existing wiring.** `SentenceStudio.MauiServiceDefaults.ConfigureOpenTelemetry` already builds the OTel pipeline (HttpClient + Runtime instrumentation, logging, metrics, tracing). TinyInsights would run in parallel — double telemetry cost, two exporters, two code paths for the same signals.
+3. **Conveniences we don't need.** Auto page-view tracking is Shell/MAUI XAML-centric; SentenceStudio is Blazor Hybrid (one MAUI page, all navigation inside Blazor). The `InsightsMessageHandler` duplicates what OTel HttpClient instrumentation already does. Crash auto-capture is only ~30 lines of code, and we already have `MauiExceptions` for that.
+4. **Single-maintainer risk for a core dependency.** Active now, but one-person projects stall. OTel + Azure Monitor exporter is Microsoft-maintained and tracks .NET 10/11/12 automatically.
+5. **No net gain: crash store-and-forward.** That's the one nice TinyInsights feature that looks like an advantage over the exporter — but `Azure.Monitor.OpenTelemetry.Exporter` has a built-in 48-hour local file cache for offline ingestion, which covers the same scenario.
+
+**Where TinyInsights WOULD be right:** a greenfield MAUI app with no OTel, no server-side correlation needs, and a dev who wants `insights.TrackEventAsync("ButtonTap")` without reading OTel docs. Not us.
+
+---
+
+## First Increment (unchanged from original memo)
+
+~3 hours on Mac Catalyst:
+1. `<PackageReference Include="Azure.Monitor.OpenTelemetry.Exporter" Version="1.3.0" />` in `SentenceStudio.MauiServiceDefaults`.
+2. In `ConfigureOpenTelemetry`, call `.UseAzureMonitor(o => o.ConnectionString = cfg["ApplicationInsights:ConnectionString"])` only when the value is present AND build is Release.
+3. Subscribe `ILogger<AppCrash>` to `MauiExceptions.UnhandledException` so crashes land as exception telemetry.
+4. Embed connection string in `appsettings.Production.json` shipped inside `SentenceStudio.AppLib`.
+5. Ship in parallel with the server-side App Insights PR so the first trace you query already spans both tiers.
+
+Daily cap $5, sampling 10% (requests/dependencies), 100% (exceptions).
+
+**Ready for approval — no blockers.**

From 35aa030804e1be807f83ccbbbe79690c99d3444e Mon Sep 17 00:00:00 2001
From: David Ortinau <david.ortinau@microsoft.com>
Date: Sun, 19 Apr 2026 22:15:42 -0500
Subject: [PATCH 4/4] Mobile App Insights: small slice (Mac Catalyst)

Wires Azure Monitor OpenTelemetry exporter into the existing MAUI OTel
pipeline and subscribes to MauiExceptions.UnhandledException so unhandled
crashes land in Application Insights.

Changes:
- MauiServiceDefaults: add Azure.Monitor.OpenTelemetry.Exporter 1.7.0,
  bump OpenTelemetry.Extensions.Hosting/Exporter.OTLP/Instrumentation.Http
  to 1.15.x to satisfy the transitive floor. AddAzureMonitor{Log,Metric,
  Trace}Exporter is gated on #if !DEBUG + a non-empty connection string so
  simulator/dev runs stay silent. Sets a stable service name
  SentenceStudio.Mobile.<Platform> so App Insights cloud_RoleName
  identifies the client clearly.
- SentenceStudioAppBuilder.InitializeApp: one subscriber on
  MauiExceptions.UnhandledException that critical-logs the exception and
  best-effort ForceFlushes LoggerProvider/TracerProvider/MeterProvider
  (3s budget) before the process dies.
- appsettings.Production.json: AzureMonitor:ConnectionString for the
  sstudio-mobile-ai App Insights resource in rg-sstudio-prod.
- MacCatalyst/MauiProgram.cs: #if DEBUG guard the DevFlow usings so
  Release builds don't fail on the debug-only package references
  (pre-existing bug, unblocks Release validation).

OUT of scope (deferred to full plan): Blazor WebView JS bridge, Android
linker preserve, iOS linker preserve, PrivacyInfo.xcprivacy, Windows,
custom processors.

Validated on Mac Catalyst Release with a forced InvalidOperationException;
record appeared in App Insights within ~5 minutes with cloud_RoleName=
SentenceStudio.Mobile.MacCatalyst. Server-side companion still pending
so client spans will be orphaned until the API also emits to App Insights.

Refs: .squad/decisions.md wash-mobile-observability, wash-mobile-appinsights-answers

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../Setup/SentenceStudioAppBuilder.cs         | 29 +++++++++++++++++
 .../appsettings.Production.json               |  3 ++
 src/SentenceStudio.MacCatalyst/MauiProgram.cs |  2 ++
 .../Extensions.cs                             | 32 +++++++++++++++++++
 .../SentenceStudio.MauiServiceDefaults.csproj |  9 +++---
 5 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/src/SentenceStudio.AppLib/Setup/SentenceStudioAppBuilder.cs b/src/SentenceStudio.AppLib/Setup/SentenceStudioAppBuilder.cs
index 62fc839..71de641 100644
--- a/src/SentenceStudio.AppLib/Setup/SentenceStudioAppBuilder.cs
+++ b/src/SentenceStudio.AppLib/Setup/SentenceStudioAppBuilder.cs
@@ -5,6 +5,10 @@
 using Microsoft.Extensions.Logging;
 using OpenAI;
 using ElevenLabs;
+using OpenTelemetry;
+using OpenTelemetry.Logs;
+using OpenTelemetry.Metrics;
+using OpenTelemetry.Trace;
 using SentenceStudio.Abstractions;
 
 namespace SentenceStudio;
@@ -83,6 +87,31 @@ public static MauiApp InitializeApp(MauiApp app)
         var logger = app.Services.GetRequiredService<ILoggerFactory>().CreateLogger("MauiProgram");
         logger.LogDebug("✅ MauiApp built successfully");
 
+        // Wire unhandled-exception capture → OTel log pipeline (→ Azure Monitor in Release).
+        // MauiExceptions normalizes iOS/MacCatalyst/Android/Windows/Desktop platform handlers into a single
+        // event; we attach ONE subscriber here. Best-effort ForceFlush on the three OTel providers so
+        // the crash record has a chance to reach the exporter before the process dies.
+        var crashLogger = app.Services.GetRequiredService<ILoggerFactory>().CreateLogger("SentenceStudio.UnhandledException");
+        var loggerProvider = app.Services.GetService<LoggerProvider>();
+        var tracerProvider = app.Services.GetService<TracerProvider>();
+        var meterProvider = app.Services.GetService<MeterProvider>();
+        MauiExceptions.UnhandledException += (sender, args) =>
+        {
+            try
+            {
+                var ex = args.ExceptionObject as Exception;
+                crashLogger.LogCritical(ex, "Unhandled exception (isTerminating={IsTerminating})", args.IsTerminating);
+
+                try { loggerProvider?.ForceFlush(3000); } catch { }
+                try { tracerProvider?.ForceFlush(3000); } catch { }
+                try { meterProvider?.ForceFlush(3000); } catch { }
+            }
+            catch
+            {
+                // Never throw from the last-chance handler.
+            }
+        };
+
         // CRITICAL: Initialize database schema SYNCHRONOUSLY before app starts
         logger.LogDebug("🚀 CHECKPOINT 1: About to get ISyncService");
 
diff --git a/src/SentenceStudio.AppLib/appsettings.Production.json b/src/SentenceStudio.AppLib/appsettings.Production.json
index 0beef1b..5e97f9a 100644
--- a/src/SentenceStudio.AppLib/appsettings.Production.json
+++ b/src/SentenceStudio.AppLib/appsettings.Production.json
@@ -8,5 +8,8 @@
             "https": ["https://webapp.livelyforest-b32e7d63.centralus.azurecontainerapps.io"],
             "http": ["http://webapp.livelyforest-b32e7d63.centralus.azurecontainerapps.io"]
         }
+    },
+    "AzureMonitor": {
+        "ConnectionString": "InstrumentationKey=6fc7a02d-9035-4989-bf8b-c7f023f71990;IngestionEndpoint=https://centralus-2.in.applicationinsights.azure.com/;LiveEndpoint=https://centralus.livediagnostics.monitor.azure.com/;ApplicationId=74e94530-d17f-404a-8726-b7266724b70f"
     }
 }
diff --git a/src/SentenceStudio.MacCatalyst/MauiProgram.cs b/src/SentenceStudio.MacCatalyst/MauiProgram.cs
index 18a4595..c341671 100644
--- a/src/SentenceStudio.MacCatalyst/MauiProgram.cs
+++ b/src/SentenceStudio.MacCatalyst/MauiProgram.cs
@@ -1,7 +1,9 @@
 using CommunityToolkit.Maui;
 using CommunityToolkit.Maui.Storage;
+#if DEBUG
 using Microsoft.Maui.DevFlow.Agent;
 using Microsoft.Maui.DevFlow.Blazor;
+#endif
 using Microsoft.Extensions.Hosting;
 using Microsoft.Extensions.Logging;
 using Plugin.Maui.Audio;
diff --git a/src/SentenceStudio.MauiServiceDefaults/Extensions.cs b/src/SentenceStudio.MauiServiceDefaults/Extensions.cs
index fb83a8b..8237fa1 100644
--- a/src/SentenceStudio.MauiServiceDefaults/Extensions.cs
+++ b/src/SentenceStudio.MauiServiceDefaults/Extensions.cs
@@ -1,4 +1,6 @@
+using Azure.Monitor.OpenTelemetry.Exporter;
 using Microsoft.Extensions.Configuration;
+using Microsoft.Extensions.DependencyInjection;
 using Microsoft.Extensions.DependencyInjection.Extensions;
 using Microsoft.Extensions.Hosting;
 using Microsoft.Extensions.Logging;
@@ -6,6 +8,7 @@
 using OpenTelemetry;
 using OpenTelemetry.Logs;
 using OpenTelemetry.Metrics;
+using OpenTelemetry.Resources;
 using OpenTelemetry.Trace;
 using System.Text.RegularExpressions;
 
@@ -52,6 +55,15 @@ public static TBuilder AddMauiServiceDefaults<TBuilder>(this TBuilder builder) w
 
     public static TBuilder ConfigureOpenTelemetry<TBuilder>(this TBuilder builder) where TBuilder : IHostApplicationBuilder
     {
+        // Tag every signal with a stable service name so App Insights cloud_RoleName
+        // clearly identifies the mobile client (not just "SentenceStudio").
+        var platform = DeviceInfo.Platform != DevicePlatform.Unknown
+            ? DeviceInfo.Platform.ToString()
+            : "Unknown";
+        var serviceName = $"SentenceStudio.Mobile.{platform}";
+        builder.Services.AddOpenTelemetry()
+            .ConfigureResource(resource => resource.AddService(serviceName: serviceName));
+
         builder.Logging.AddOpenTelemetry(logging =>
         {
             logging.IncludeFormattedMessage = true;
@@ -102,6 +114,26 @@ private static TBuilder AddOpenTelemetryExporters<TBuilder>(this TBuilder builde
             builder.Services.AddOpenTelemetry().UseOtlpExporter();
         }
 
+        // Azure Monitor (Application Insights) exporter.
+        // - DEBUG builds: compile-time disabled so simulator/dev runs never ship telemetry to prod.
+        // - Release builds: enabled iff AzureMonitor:ConnectionString is present in embedded appsettings.
+        //   Connection string is write-only (push-only ingestion auth). See
+        //   .squad/agents/wash/history.md "Mobile App Insights follow-up" for the security rationale.
+#if !DEBUG
+        var azureMonitorConnectionString = builder.Configuration["AzureMonitor:ConnectionString"];
+        if (!string.IsNullOrWhiteSpace(azureMonitorConnectionString))
+        {
+            builder.Logging.AddOpenTelemetry(o =>
+                o.AddAzureMonitorLogExporter(options => options.ConnectionString = azureMonitorConnectionString));
+
+            builder.Services.AddOpenTelemetry()
+                .WithMetrics(metrics =>
+                    metrics.AddAzureMonitorMetricExporter(options => options.ConnectionString = azureMonitorConnectionString))
+                .WithTracing(tracing =>
+                    tracing.AddAzureMonitorTraceExporter(options => options.ConnectionString = azureMonitorConnectionString));
+        }
+#endif
+
         return builder;
     }
 }
diff --git a/src/SentenceStudio.MauiServiceDefaults/SentenceStudio.MauiServiceDefaults.csproj b/src/SentenceStudio.MauiServiceDefaults/SentenceStudio.MauiServiceDefaults.csproj
index 015bb25..2ac3851 100644
--- a/src/SentenceStudio.MauiServiceDefaults/SentenceStudio.MauiServiceDefaults.csproj
+++ b/src/SentenceStudio.MauiServiceDefaults/SentenceStudio.MauiServiceDefaults.csproj
@@ -11,9 +11,10 @@
 	<ItemGroup>
 		<PackageReference Include="Microsoft.Extensions.Http.Resilience" Version="9.0.0" />
 		<PackageReference Include="Microsoft.Extensions.ServiceDiscovery" Version="9.0.0" />
-		<PackageReference Include="OpenTelemetry.Exporter.OpenTelemetryProtocol" Version="1.9.0" />
-		<PackageReference Include="OpenTelemetry.Extensions.Hosting" Version="1.9.0" />
-		<PackageReference Include="OpenTelemetry.Instrumentation.Http" Version="1.9.0" />
-		<PackageReference Include="OpenTelemetry.Instrumentation.Runtime" Version="1.9.0" />
+		<PackageReference Include="Azure.Monitor.OpenTelemetry.Exporter" Version="1.7.0" />
+		<PackageReference Include="OpenTelemetry.Exporter.OpenTelemetryProtocol" Version="1.15.1" />
+		<PackageReference Include="OpenTelemetry.Extensions.Hosting" Version="1.15.1" />
+		<PackageReference Include="OpenTelemetry.Instrumentation.Http" Version="1.15.0" />
+		<PackageReference Include="OpenTelemetry.Instrumentation.Runtime" Version="1.12.0" />
 	</ItemGroup>
 </Project>