From bb7f5efbe0976b0406f8f4d547dbe713635aa266 Mon Sep 17 00:00:00 2001 From: Tyler Stapler Date: Fri, 24 Apr 2026 08:47:56 -0700 Subject: [PATCH 1/6] feat(android/voice): on-device STT and LLM via SpeechRecognizer + Gemini Nano MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix Android microphone runtime permission — was silently denied with no prompt; now uses ActivityResultContracts.RequestPermission wired through MainActivity - Add AndroidSpeechRecognizerProvider: wraps android.speech.SpeechRecognizer as a DirectSpeechProvider (combined record+transcribe, no audio upload, works offline) with EXTRA_PREFER_OFFLINE=true and RMS amplitude for waveform animation - Add MlKitLlmFormatterProvider: on-device LLM formatting via ML Kit Prompt API (Gemini Nano through AICore); handles AVAILABLE/DOWNLOADABLE/DOWNLOADING states; returns user-friendly error while model downloads rather than blocking - Add DirectSpeechProvider interface to commonMain to support integrated pipelines that bypass the two-step record→STT path - Both on-device options are configurable via Settings → Voice Capture toggles; toggles only appear on devices that report availability - Pipeline priority: device LLM > Anthropic Claude > OpenAI > no-op - Bump minSdk 24→26 (required by com.google.mlkit:genai-prompt:1.0.0-beta2) - Add genai-prompt dependency to kmp androidMain Co-Authored-By: Claude Sonnet 4.6 --- androidApp/build.gradle.kts | 3 +- .../dev/stapler/stelekit/MainActivity.kt | 48 ++++++- kmp/build.gradle.kts | 3 + .../stelekit/voice/AndroidAudioRecorder.kt | 10 +- .../voice/AndroidSpeechRecognizerProvider.kt | 114 ++++++++++++++++ .../voice/MlKitLlmFormatterProvider.kt | 81 +++++++++++ .../kotlin/dev/stapler/stelekit/ui/App.kt | 14 +- .../ui/components/settings/SettingsDialog.kt | 4 + .../settings/VoiceCaptureSettings.kt | 126 +++++++++++++----- .../stelekit/voice/DirectSpeechProvider.kt | 15 +++ .../stelekit/voice/VoiceCaptureViewModel.kt | 85 
++++++------ .../stelekit/voice/VoicePipelineConfig.kt | 7 +- .../stelekit/voice/VoicePipelineFactory.kt | 16 ++- .../stapler/stelekit/voice/VoiceSettings.kt | 14 ++ 14 files changed, 457 insertions(+), 83 deletions(-) create mode 100644 kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidSpeechRecognizerProvider.kt create mode 100644 kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/MlKitLlmFormatterProvider.kt create mode 100644 kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/DirectSpeechProvider.kt diff --git a/androidApp/build.gradle.kts b/androidApp/build.gradle.kts index ead54dc..9a33d92 100644 --- a/androidApp/build.gradle.kts +++ b/androidApp/build.gradle.kts @@ -13,7 +13,7 @@ android { defaultConfig { applicationId = "dev.stapler.stelekit" - minSdk = 24 + minSdk = 26 targetSdk = 36 versionCode = 1 versionName = (findProperty("appVersion") as? String ?: "0.1.0") @@ -66,4 +66,5 @@ dependencies { implementation(platform("androidx.compose:compose-bom:2024.09.02")) implementation("androidx.compose.ui:ui") implementation("androidx.compose.material3:material3") + } diff --git a/androidApp/src/main/kotlin/dev/stapler/stelekit/MainActivity.kt b/androidApp/src/main/kotlin/dev/stapler/stelekit/MainActivity.kt index babf894..d1e7a60 100644 --- a/androidApp/src/main/kotlin/dev/stapler/stelekit/MainActivity.kt +++ b/androidApp/src/main/kotlin/dev/stapler/stelekit/MainActivity.kt @@ -1,6 +1,8 @@ package dev.stapler.stelekit +import android.Manifest import android.content.Intent +import android.content.pm.PackageManager import android.net.Uri import android.os.Bundle import android.util.Log @@ -8,6 +10,7 @@ import androidx.activity.ComponentActivity import androidx.activity.compose.setContent import androidx.activity.enableEdgeToEdge import androidx.activity.result.contract.ActivityResultContracts +import androidx.core.content.ContextCompat import androidx.compose.runtime.getValue import androidx.compose.runtime.mutableStateOf import 
androidx.compose.runtime.remember @@ -17,7 +20,11 @@ import dev.stapler.stelekit.domain.UrlFetcherAndroid import dev.stapler.stelekit.platform.SteleKitContext import dev.stapler.stelekit.platform.PlatformFileSystem import dev.stapler.stelekit.ui.StelekitApp +import android.speech.SpeechRecognizer +import androidx.compose.runtime.LaunchedEffect import dev.stapler.stelekit.voice.AndroidAudioRecorder +import dev.stapler.stelekit.voice.AndroidSpeechRecognizerProvider +import dev.stapler.stelekit.voice.MlKitLlmFormatterProvider import dev.stapler.stelekit.voice.VoiceSettings import dev.stapler.stelekit.voice.buildVoicePipeline import dev.stapler.stelekit.platform.PlatformSettings @@ -26,6 +33,14 @@ import kotlinx.coroutines.CompletableDeferred class MainActivity : ComponentActivity() { private var pendingFolderPick: CompletableDeferred? = null + private var pendingMicPermission: CompletableDeferred? = null + + private val micPermissionLauncher = registerForActivityResult( + ActivityResultContracts.RequestPermission() + ) { granted -> + pendingMicPermission?.complete(granted) + pendingMicPermission = null + } private val folderPickerLauncher = registerForActivityResult( ActivityResultContracts.OpenDocumentTree() @@ -106,20 +121,47 @@ class MainActivity : ComponentActivity() { } } } - val audioRecorder = remember { AndroidAudioRecorder(this@MainActivity.applicationContext) } + val audioRecorder = remember { AndroidAudioRecorder(this@MainActivity.applicationContext, this@MainActivity::requestMicrophonePermission) } val voiceSettings = remember { VoiceSettings(PlatformSettings()) } - var voicePipeline by remember { mutableStateOf(buildVoicePipeline(audioRecorder, voiceSettings)) } + val deviceSttAvailable = remember { AndroidSpeechRecognizerProvider.isAvailable(this@MainActivity.applicationContext) } + val deviceSttProvider = remember { + if (deviceSttAvailable) AndroidSpeechRecognizerProvider(this@MainActivity.applicationContext) else null + } + val mlKitProvider = 
remember { MlKitLlmFormatterProvider.create() } + var deviceLlmAvailable by remember { mutableStateOf(false) } + LaunchedEffect(Unit) { + deviceLlmAvailable = mlKitProvider?.checkEligible() ?: false + } + fun buildPipeline() = buildVoicePipeline( + audioRecorder, + voiceSettings, + if (deviceSttAvailable && voiceSettings.getUseDeviceStt()) deviceSttProvider else null, + if (deviceLlmAvailable && voiceSettings.getUseDeviceLlm()) mlKitProvider else null, + ) + var voicePipeline by remember { mutableStateOf(buildPipeline()) } StelekitApp( fileSystem = fileSystem, graphPath = fileSystem.getDefaultGraphPath(), urlFetcher = UrlFetcherAndroid(), voicePipeline = voicePipeline, voiceSettings = voiceSettings, - onRebuildVoicePipeline = { voicePipeline = buildVoicePipeline(audioRecorder, voiceSettings) }, + onRebuildVoicePipeline = { voicePipeline = buildPipeline() }, + deviceSttAvailable = deviceSttAvailable, + deviceLlmAvailable = deviceLlmAvailable, ) } } + private suspend fun requestMicrophonePermission(): Boolean { + if (ContextCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO) + == PackageManager.PERMISSION_GRANTED + ) return true + val deferred = CompletableDeferred() + pendingMicPermission = deferred + runOnUiThread { micPermissionLauncher.launch(Manifest.permission.RECORD_AUDIO) } + return deferred.await() + } + companion object { private const val TAG = "MainActivity" /** Authority for AOSP ExternalStorageProvider — the only provider supported in v1. 
*/ diff --git a/kmp/build.gradle.kts b/kmp/build.gradle.kts index e2f57f6..695f466 100644 --- a/kmp/build.gradle.kts +++ b/kmp/build.gradle.kts @@ -175,6 +175,9 @@ kotlin { // Encrypted SharedPreferences for API key storage implementation("androidx.security:security-crypto:1.1.0-alpha06") + + // On-device LLM via Gemini Nano (Pixel 9+ and AICore-enabled OEM flagships) + implementation("com.google.mlkit:genai-prompt:1.0.0-beta2") } } diff --git a/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidAudioRecorder.kt b/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidAudioRecorder.kt index 3c17caf..d82ab22 100644 --- a/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidAudioRecorder.kt +++ b/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidAudioRecorder.kt @@ -25,7 +25,11 @@ import kotlin.math.sqrt private const val TAG = "AndroidAudioRecorder" -class AndroidAudioRecorder(private val context: Context) : AudioRecorder { +class AndroidAudioRecorder( + private val context: Context, + /** Called before recording starts; must return true if RECORD_AUDIO permission is granted. */ + private val requestMicPermission: (suspend () -> Boolean)? 
= null, +) : AudioRecorder { companion object { private const val SAMPLE_RATE = 16_000 @@ -51,6 +55,10 @@ class AndroidAudioRecorder(private val context: Context) : AudioRecorder { @Volatile private var pauseRequested = false override suspend fun startRecording(): PlatformAudioFile = withContext(Dispatchers.IO) { + if (requestMicPermission != null && !requestMicPermission()) { + return@withContext PlatformAudioFile("") + } + stopRequested = false pauseRequested = false diff --git a/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidSpeechRecognizerProvider.kt b/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidSpeechRecognizerProvider.kt new file mode 100644 index 0000000..159a91a --- /dev/null +++ b/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidSpeechRecognizerProvider.kt @@ -0,0 +1,114 @@ +// Copyright (c) 2026 Tyler Stapler +// SPDX-License-Identifier: Elastic-2.0 +package dev.stapler.stelekit.voice + +import android.content.Context +import android.content.Intent +import android.os.Bundle +import android.os.Handler +import android.os.Looper +import android.speech.RecognitionListener +import android.speech.RecognizerIntent +import android.speech.SpeechRecognizer +import android.util.Log +import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.flow.Flow +import kotlinx.coroutines.flow.MutableStateFlow +import kotlinx.coroutines.flow.asStateFlow +import kotlinx.coroutines.suspendCancellableCoroutine +import kotlinx.coroutines.withContext +import kotlin.coroutines.resume + +private const val TAG = "AndroidSpeechRecognizer" + +class AndroidSpeechRecognizerProvider(private val context: Context) : DirectSpeechProvider { + + companion object { + fun isAvailable(context: Context): Boolean = + SpeechRecognizer.isRecognitionAvailable(context) + } + + private val _amplitudeFlow = MutableStateFlow(0f) + override val amplitudeFlow: Flow = _amplitudeFlow.asStateFlow() + + @Volatile private var activeRecognizer: SpeechRecognizer? 
= null + private val mainHandler = Handler(Looper.getMainLooper()) + + override suspend fun listen(): TranscriptResult = suspendCancellableCoroutine { cont -> + mainHandler.post { + val recognizer = SpeechRecognizer.createSpeechRecognizer(context) + activeRecognizer = recognizer + + recognizer.setRecognitionListener(object : RecognitionListener { + override fun onReadyForSpeech(params: Bundle?) {} + override fun onBeginningOfSpeech() {} + override fun onBufferReceived(buffer: ByteArray?) {} + override fun onEndOfSpeech() {} + override fun onEvent(eventType: Int, params: Bundle?) {} + override fun onPartialResults(partialResults: Bundle?) {} + + override fun onRmsChanged(rmsdB: Float) { + // Map roughly -2..10 dB → 0..1 + _amplitudeFlow.value = ((rmsdB + 2f) / 12f).coerceIn(0f, 1f) + } + + override fun onResults(results: Bundle?) { + _amplitudeFlow.value = 0f + activeRecognizer = null + recognizer.destroy() + if (!cont.isActive) return + val text = results + ?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION) + ?.firstOrNull() + Log.d(TAG, "onResults: text=${text?.take(80)}") + if (text.isNullOrBlank()) cont.resume(TranscriptResult.Empty) + else cont.resume(TranscriptResult.Success(text)) + } + + override fun onError(error: Int) { + _amplitudeFlow.value = 0f + activeRecognizer = null + recognizer.destroy() + if (!cont.isActive) return + Log.w(TAG, "onError: code=$error") + cont.resume(mapError(error)) + } + }) + + val intent = Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH).apply { + putExtra(RecognizerIntent.EXTRA_LANGUAGE_MODEL, RecognizerIntent.LANGUAGE_MODEL_FREE_FORM) + putExtra(RecognizerIntent.EXTRA_PREFER_OFFLINE, true) + putExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 1) + putExtra(RecognizerIntent.EXTRA_SPEECH_INPUT_COMPLETE_SILENCE_LENGTH_MILLIS, 3_000L) + putExtra(RecognizerIntent.EXTRA_SPEECH_INPUT_POSSIBLY_COMPLETE_SILENCE_LENGTH_MILLIS, 1_500L) + } + recognizer.startListening(intent) + + cont.invokeOnCancellation { + mainHandler.post { + 
activeRecognizer?.let { + it.cancel() + it.destroy() + activeRecognizer = null + } + _amplitudeFlow.value = 0f + } + } + } + } + + override suspend fun stopListening() { + withContext(Dispatchers.Main) { + activeRecognizer?.stopListening() + } + } + + private fun mapError(error: Int): TranscriptResult = when (error) { + SpeechRecognizer.ERROR_NO_MATCH, + SpeechRecognizer.ERROR_SPEECH_TIMEOUT -> TranscriptResult.Empty + SpeechRecognizer.ERROR_INSUFFICIENT_PERMISSIONS -> TranscriptResult.Failure.PermissionDenied + SpeechRecognizer.ERROR_NETWORK, + SpeechRecognizer.ERROR_NETWORK_TIMEOUT -> TranscriptResult.Failure.NetworkError + else -> TranscriptResult.Failure.ApiError(error, "Speech recognition error (code $error)") + } +} diff --git a/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/MlKitLlmFormatterProvider.kt b/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/MlKitLlmFormatterProvider.kt new file mode 100644 index 0000000..2181931 --- /dev/null +++ b/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/MlKitLlmFormatterProvider.kt @@ -0,0 +1,81 @@ +// Copyright (c) 2026 Tyler Stapler +// SPDX-License-Identifier: Elastic-2.0 +package dev.stapler.stelekit.voice + +import android.util.Log +import com.google.mlkit.genai.common.FeatureStatus +import com.google.mlkit.genai.prompt.Generation +import com.google.mlkit.genai.prompt.GenerativeModel +import kotlinx.coroutines.CancellationException + +private const val TAG = "MlKitLlmFormatter" + +/** + * On-device LLM formatter backed by ML Kit Prompt API (Gemini Nano via AICore). + * + * Supported devices: Pixel 9+ and major OEM flagships with AICore (Samsung S25, etc.). + * Output hard-capped at 256 tokens by the on-device model — suitable for short voice notes. + * API status: beta (com.google.mlkit:genai-prompt:1.0.0-beta2). 
+ */ +class MlKitLlmFormatterProvider private constructor( + private val model: GenerativeModel, +) : LlmFormatterProvider { + + companion object { + /** Creates the provider; returns null if the ML Kit library fails to initialise. */ + fun create(): MlKitLlmFormatterProvider? = runCatching { + MlKitLlmFormatterProvider(Generation.getClient()) + }.getOrElse { e -> + Log.w(TAG, "Failed to create GenerativeModel", e) + null + } + } + + /** Returns true when the device supports on-device inference (model available or will download). */ + suspend fun checkEligible(): Boolean = runCatching { + when (model.checkStatus()) { + FeatureStatus.AVAILABLE, + FeatureStatus.DOWNLOADABLE, + FeatureStatus.DOWNLOADING -> true + else -> false + } + }.getOrElse { e -> + Log.w(TAG, "checkStatus failed", e) + false + } + + override suspend fun format(transcript: String, systemPrompt: String): LlmResult { + return try { + when (model.checkStatus()) { + FeatureStatus.AVAILABLE -> { + Log.d(TAG, "Running on-device inference (${transcript.length} chars input)") + val response = model.generateContent(systemPrompt) + val text = response.candidates.firstOrNull()?.text?.trim() + if (text.isNullOrBlank()) { + LlmResult.Failure.ApiError(-1, "Empty response from on-device model") + } else { + Log.d(TAG, "On-device inference complete (${text.length} chars output)") + LlmResult.Success(text, LlmProviderSupport.detectTruncation(text)) + } + } + FeatureStatus.DOWNLOADABLE, + FeatureStatus.DOWNLOADING -> { + // AICore downloads the model in the background automatically. + // Blocking here would take several minutes — return a friendly retry message. 
+ LlmResult.Failure.ApiError( + -1, + "On-device model is downloading — try again in a few minutes" + ) + } + else -> { + LlmResult.Failure.ApiError(-1, "On-device LLM not supported on this device") + } + } + } catch (e: CancellationException) { + throw e + } catch (e: Exception) { + Log.e(TAG, "On-device inference error", e) + LlmResult.Failure.ApiError(-1, "On-device LLM error: ${e.message}") + } + } +} diff --git a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/App.kt b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/App.kt index c3dfdf4..ce22dba 100644 --- a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/App.kt +++ b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/App.kt @@ -110,6 +110,8 @@ fun StelekitApp( voicePipeline: VoicePipelineConfig = remember { VoicePipelineConfig() }, voiceSettings: VoiceSettings? = null, onRebuildVoicePipeline: (() -> Unit)? = null, + deviceSttAvailable: Boolean = false, + deviceLlmAvailable: Boolean = false, spanRecorder: SpanRecorder = NoOpSpanRecorder, ) { val platformSettings = remember { PlatformSettings() } @@ -229,6 +231,8 @@ fun StelekitApp( voicePipeline = voicePipeline, voiceSettings = voiceSettings, onRebuildVoicePipeline = onRebuildVoicePipeline, + deviceSttAvailable = deviceSttAvailable, + deviceLlmAvailable = deviceLlmAvailable, spanRecorder = spanRecorder, ) } @@ -254,6 +258,8 @@ private fun GraphContent( voicePipeline: VoicePipelineConfig = VoicePipelineConfig(), voiceSettings: VoiceSettings? = null, onRebuildVoicePipeline: (() -> Unit)? 
= null, + deviceSttAvailable: Boolean = false, + deviceLlmAvailable: Boolean = false, spanRecorder: SpanRecorder = NoOpSpanRecorder, ) { CompositionLocalProvider(LocalSpanRecorder provides spanRecorder) { @@ -647,7 +653,7 @@ private fun GraphContent( onTap = { voiceCaptureViewModel.onMicTapped() }, onDismissError = { voiceCaptureViewModel.dismissError() }, onAutoReset = { voiceCaptureViewModel.resetToIdle() }, - amplitudeFlow = voicePipeline.audioRecorder.amplitudeFlow, + amplitudeFlow = voicePipeline.effectiveAmplitudeFlow, ) }, ) @@ -662,6 +668,8 @@ private fun GraphContent( fileSystem = fileSystem, voiceSettings = voiceSettings, onRebuildVoicePipeline = onRebuildVoicePipeline, + deviceSttAvailable = deviceSttAvailable, + deviceLlmAvailable = deviceLlmAvailable, frameMetric = frameMetricState, debugState = debugMenuState, onDebugStateChange = { newState -> @@ -863,6 +871,8 @@ private fun GraphDialogLayer( fileSystem: FileSystem, voiceSettings: VoiceSettings? = null, onRebuildVoicePipeline: (() -> Unit)? 
= null, + deviceSttAvailable: Boolean = false, + deviceLlmAvailable: Boolean = false, frameMetric: kotlinx.coroutines.flow.StateFlow, debugState: DebugMenuState = DebugMenuState(), onDebugStateChange: (DebugMenuState) -> Unit = {}, @@ -900,6 +910,8 @@ private fun GraphDialogLayer( onLeftHandedChange = { viewModel.setLeftHanded(it) }, voiceSettings = voiceSettings, onRebuildVoicePipeline = onRebuildVoicePipeline, + deviceSttAvailable = deviceSttAvailable, + deviceLlmAvailable = deviceLlmAvailable, ) appState.diskConflict?.let { conflict -> diff --git a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/settings/SettingsDialog.kt b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/settings/SettingsDialog.kt index 0639e04..70785ae 100644 --- a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/settings/SettingsDialog.kt +++ b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/settings/SettingsDialog.kt @@ -32,6 +32,8 @@ fun SettingsDialog( onLeftHandedChange: (Boolean) -> Unit = {}, voiceSettings: VoiceSettings? = null, onRebuildVoicePipeline: (() -> Unit)? 
= null, + deviceSttAvailable: Boolean = false, + deviceLlmAvailable: Boolean = false, ) { if (visible) { Dialog( @@ -118,6 +120,8 @@ fun SettingsDialog( VoiceCaptureSettings( voiceSettings = voiceSettings, onRebuildPipeline = onRebuildVoicePipeline, + deviceSttAvailable = deviceSttAvailable, + deviceLlmAvailable = deviceLlmAvailable, ) } } diff --git a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/settings/VoiceCaptureSettings.kt b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/settings/VoiceCaptureSettings.kt index 7d26066..a98b682 100644 --- a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/settings/VoiceCaptureSettings.kt +++ b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/settings/VoiceCaptureSettings.kt @@ -27,34 +27,60 @@ import dev.stapler.stelekit.voice.VoiceSettings fun VoiceCaptureSettings( voiceSettings: VoiceSettings, onRebuildPipeline: () -> Unit, + deviceSttAvailable: Boolean = false, + deviceLlmAvailable: Boolean = false, ) { var whisperKey by remember { mutableStateOf(voiceSettings.getWhisperApiKey() ?: "") } var anthropicKey by remember { mutableStateOf(voiceSettings.getAnthropicKey() ?: "") } var openAiKey by remember { mutableStateOf(voiceSettings.getOpenAiKey() ?: "") } var llmEnabled by remember { mutableStateOf(voiceSettings.getLlmEnabled()) } + var useDeviceStt by remember { mutableStateOf(voiceSettings.getUseDeviceStt()) } + var useDeviceLlm by remember { mutableStateOf(voiceSettings.getUseDeviceLlm()) } var saved by remember { mutableStateOf(false) } SettingsSection("Transcription (Speech-to-Text)") { - Text( - "Whisper API key — used for speech transcription (~\$0.003/min).", - style = MaterialTheme.typography.bodySmall, - color = MaterialTheme.colorScheme.onSurfaceVariant, - modifier = Modifier.padding(bottom = 8.dp), - ) - OutlinedTextField( - value = whisperKey, - onValueChange = { whisperKey = it; saved = false }, - label = { Text("OpenAI / Whisper API key") }, - 
visualTransformation = PasswordVisualTransformation(), - singleLine = true, - modifier = Modifier.fillMaxWidth(), - ) + if (deviceSttAvailable) { + Row( + modifier = Modifier.fillMaxWidth().padding(bottom = 8.dp), + horizontalArrangement = Arrangement.SpaceBetween, + verticalAlignment = Alignment.CenterVertically, + ) { + Text("Use on-device speech recognition", style = MaterialTheme.typography.bodyMedium) + Switch( + checked = useDeviceStt, + onCheckedChange = { useDeviceStt = it; saved = false }, + ) + } + if (useDeviceStt) { + Text( + "Transcription happens on-device — no API key or network required.", + style = MaterialTheme.typography.bodySmall, + color = MaterialTheme.colorScheme.onSurfaceVariant, + modifier = Modifier.padding(bottom = 8.dp), + ) + } + } + if (!deviceSttAvailable || !useDeviceStt) { + Text( + "Whisper API key — used for speech transcription (~\$0.003/min).", + style = MaterialTheme.typography.bodySmall, + color = MaterialTheme.colorScheme.onSurfaceVariant, + modifier = Modifier.padding(bottom = 8.dp), + ) + OutlinedTextField( + value = whisperKey, + onValueChange = { whisperKey = it; saved = false }, + label = { Text("OpenAI / Whisper API key") }, + visualTransformation = PasswordVisualTransformation(), + singleLine = true, + modifier = Modifier.fillMaxWidth(), + ) + } } SettingsSection("LLM Formatting") { Text( - "Formats the raw transcript into Logseq outliner syntax with bullet points and [[wikilinks]]. 
" + - "Provide one key — Anthropic is used if both are set.", + "Formats the raw transcript into Logseq outliner syntax with bullet points and [[wikilinks]].", style = MaterialTheme.typography.bodySmall, color = MaterialTheme.colorScheme.onSurfaceVariant, modifier = Modifier.padding(bottom = 8.dp), @@ -71,24 +97,54 @@ fun VoiceCaptureSettings( ) } if (llmEnabled) { - OutlinedTextField( - value = anthropicKey, - onValueChange = { anthropicKey = it; saved = false }, - label = { Text("Anthropic (Claude) API key") }, - visualTransformation = PasswordVisualTransformation(), - singleLine = true, - modifier = Modifier.fillMaxWidth(), - ) - OutlinedTextField( - value = openAiKey, - onValueChange = { openAiKey = it; saved = false }, - label = { Text("OpenAI / compatible API key") }, - visualTransformation = PasswordVisualTransformation(), - singleLine = true, - modifier = Modifier - .fillMaxWidth() - .padding(top = 8.dp), - ) + if (deviceLlmAvailable) { + Row( + modifier = Modifier.fillMaxWidth().padding(bottom = 8.dp), + horizontalArrangement = Arrangement.SpaceBetween, + verticalAlignment = Alignment.CenterVertically, + ) { + Text("Use on-device LLM (Gemini Nano)", style = MaterialTheme.typography.bodyMedium) + Switch( + checked = useDeviceLlm, + onCheckedChange = { useDeviceLlm = it; saved = false }, + ) + } + if (useDeviceLlm) { + Text( + "Formatting runs on-device — no API key or network required. 
" + + "256-token output limit; longer notes may be truncated.", + style = MaterialTheme.typography.bodySmall, + color = MaterialTheme.colorScheme.onSurfaceVariant, + modifier = Modifier.padding(bottom = 8.dp), + ) + } + } + if (!deviceLlmAvailable || !useDeviceLlm) { + Text( + "Provide one key — Anthropic is used if both are set.", + style = MaterialTheme.typography.bodySmall, + color = MaterialTheme.colorScheme.onSurfaceVariant, + modifier = Modifier.padding(bottom = 8.dp), + ) + OutlinedTextField( + value = anthropicKey, + onValueChange = { anthropicKey = it; saved = false }, + label = { Text("Anthropic (Claude) API key") }, + visualTransformation = PasswordVisualTransformation(), + singleLine = true, + modifier = Modifier.fillMaxWidth(), + ) + OutlinedTextField( + value = openAiKey, + onValueChange = { openAiKey = it; saved = false }, + label = { Text("OpenAI / compatible API key") }, + visualTransformation = PasswordVisualTransformation(), + singleLine = true, + modifier = Modifier + .fillMaxWidth() + .padding(top = 8.dp), + ) + } } } @@ -104,6 +160,8 @@ fun VoiceCaptureSettings( voiceSettings.setAnthropicKey(anthropicKey) voiceSettings.setOpenAiKey(openAiKey) voiceSettings.setLlmEnabled(llmEnabled) + voiceSettings.setUseDeviceStt(useDeviceStt) + voiceSettings.setUseDeviceLlm(useDeviceLlm) saved = true onRebuildPipeline() }, diff --git a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/DirectSpeechProvider.kt b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/DirectSpeechProvider.kt new file mode 100644 index 0000000..8c19dad --- /dev/null +++ b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/DirectSpeechProvider.kt @@ -0,0 +1,15 @@ +// Copyright (c) 2026 Tyler Stapler +// SPDX-License-Identifier: Elastic-2.0 +package dev.stapler.stelekit.voice + +import kotlinx.coroutines.flow.Flow + +/** Combines recording and transcription in a single step (e.g. Android SpeechRecognizer). 
*/ +interface DirectSpeechProvider { + /** Records and transcribes; suspends until the user stops or silence is detected. */ + suspend fun listen(): TranscriptResult + /** Signals an in-progress listen to stop and return results. */ + suspend fun stopListening() {} + /** Optional RMS amplitude stream for animated feedback. */ + val amplitudeFlow: Flow? get() = null +} diff --git a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoiceCaptureViewModel.kt b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoiceCaptureViewModel.kt index 6f9e76d..e8e0ba7 100644 --- a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoiceCaptureViewModel.kt +++ b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoiceCaptureViewModel.kt @@ -35,7 +35,8 @@ class VoiceCaptureViewModel( when (_state.value) { is VoiceCaptureState.Idle -> startPipeline() is VoiceCaptureState.Recording -> scope.launch { - pipeline.audioRecorder.stopRecording() + pipeline.directSpeechProvider?.stopListening() + ?: pipeline.audioRecorder.stopRecording() } else -> Unit } @@ -57,51 +58,55 @@ class VoiceCaptureViewModel( private fun startPipeline() { pipelineJob = scope.launch { - var file: PlatformAudioFile? 
= null - try { - _state.value = VoiceCaptureState.Recording - val result = pipeline.audioRecorder.startRecording() - file = result - - if (result.isEmpty) { + _state.value = VoiceCaptureState.Recording + val transcriptResult = if (pipeline.directSpeechProvider != null) { + pipeline.directSpeechProvider.listen() + } else { + recordAndTranscribe() + } + when (transcriptResult) { + null -> return@launch // error already set inside recordAndTranscribe + TranscriptResult.Empty -> { + _state.value = VoiceCaptureState.Error( + PipelineStage.TRANSCRIBING, "Nothing was captured — try again" + ) + } + is TranscriptResult.Failure.ApiError -> { + _state.value = VoiceCaptureState.Error( + PipelineStage.TRANSCRIBING, transcriptResult.message + ) + } + TranscriptResult.Failure.NetworkError -> { + _state.value = VoiceCaptureState.Error( + PipelineStage.TRANSCRIBING, "Network error — check your connection" + ) + } + TranscriptResult.Failure.PermissionDenied -> { _state.value = VoiceCaptureState.Error( PipelineStage.RECORDING, "Microphone permission denied" ) - return@launch } + is TranscriptResult.Success -> processTranscript(transcriptResult.text.trim()) + } + } + } - _state.value = VoiceCaptureState.Transcribing - val audioData = pipeline.audioRecorder.readBytes(result) - when (val sttResult = pipeline.sttProvider.transcribe(audioData)) { - TranscriptResult.Empty -> { - _state.value = VoiceCaptureState.Error( - PipelineStage.TRANSCRIBING, "Nothing was captured — try again" - ) - return@launch - } - is TranscriptResult.Failure.ApiError -> { - _state.value = VoiceCaptureState.Error( - PipelineStage.TRANSCRIBING, sttResult.message - ) - return@launch - } - TranscriptResult.Failure.NetworkError -> { - _state.value = VoiceCaptureState.Error( - PipelineStage.TRANSCRIBING, "Network error — check your connection" - ) - return@launch - } - TranscriptResult.Failure.PermissionDenied -> { - _state.value = VoiceCaptureState.Error( - PipelineStage.RECORDING, "Microphone permission denied" - ) 
- return@launch - } - is TranscriptResult.Success -> processTranscript(sttResult.text.trim()) - } - } finally { - file?.takeIf { !it.isEmpty }?.let { pipeline.audioRecorder.deleteRecording(it) } + /** Records via [AudioRecorder] then transcribes; returns null and sets error state on failure. */ + private suspend fun recordAndTranscribe(): TranscriptResult? { + var file: PlatformAudioFile? = null + return try { + val result = pipeline.audioRecorder.startRecording() + file = result + if (result.isEmpty) { + _state.value = VoiceCaptureState.Error( + PipelineStage.RECORDING, "Microphone permission denied" + ) + return null } + _state.value = VoiceCaptureState.Transcribing + pipeline.sttProvider.transcribe(pipeline.audioRecorder.readBytes(result)) + } finally { + file?.takeIf { !it.isEmpty }?.let { pipeline.audioRecorder.deleteRecording(it) } } } diff --git a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoicePipelineConfig.kt b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoicePipelineConfig.kt index 961d47e..a485db1 100644 --- a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoicePipelineConfig.kt +++ b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoicePipelineConfig.kt @@ -20,4 +20,9 @@ class VoicePipelineConfig( val llmProvider: LlmFormatterProvider = NoOpLlmFormatterProvider(), val systemPrompt: String = DEFAULT_VOICE_SYSTEM_PROMPT, val minWordCount: Int = 10, -) + /** When set, replaces the (record → STT) two-step path with a single integrated listen. */ + val directSpeechProvider: DirectSpeechProvider? = null, +) { + /** Amplitude flow for waveform animation: prefers directSpeechProvider, falls back to audioRecorder. 
*/ + val effectiveAmplitudeFlow get() = directSpeechProvider?.amplitudeFlow ?: audioRecorder.amplitudeFlow +} diff --git a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoicePipelineFactory.kt b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoicePipelineFactory.kt index 6d98f04..288d45f 100644 --- a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoicePipelineFactory.kt +++ b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoicePipelineFactory.kt @@ -2,7 +2,12 @@ // SPDX-License-Identifier: Elastic-2.0 package dev.stapler.stelekit.voice -fun buildVoicePipeline(audioRecorder: AudioRecorder, settings: VoiceSettings): VoicePipelineConfig { +fun buildVoicePipeline( + audioRecorder: AudioRecorder, + settings: VoiceSettings, + directSpeechProvider: DirectSpeechProvider? = null, + deviceLlmProvider: LlmFormatterProvider? = null, +): VoicePipelineConfig { val sttProvider: SpeechToTextProvider = settings.getWhisperApiKey() ?.let { WhisperSpeechToTextProvider.withDefaults(it) } ?: SpeechToTextProvider { _ -> @@ -13,10 +18,17 @@ fun buildVoicePipeline(audioRecorder: AudioRecorder, settings: VoiceSettings): V } val llmProvider: LlmFormatterProvider = if (!settings.getLlmEnabled()) { NoOpLlmFormatterProvider() + } else if (deviceLlmProvider != null && settings.getUseDeviceLlm()) { + deviceLlmProvider } else { settings.getAnthropicKey()?.let { ClaudeLlmFormatterProvider.withDefaults(it) } ?: settings.getOpenAiKey()?.let { OpenAiLlmFormatterProvider.withDefaults(it) } ?: NoOpLlmFormatterProvider() } - return VoicePipelineConfig(audioRecorder = audioRecorder, sttProvider = sttProvider, llmProvider = llmProvider) + return VoicePipelineConfig( + audioRecorder = audioRecorder, + sttProvider = sttProvider, + llmProvider = llmProvider, + directSpeechProvider = directSpeechProvider, + ) } diff --git a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoiceSettings.kt b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoiceSettings.kt index ada7a42..e127bbd 
100644 --- a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoiceSettings.kt +++ b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoiceSettings.kt @@ -30,10 +30,24 @@ class VoiceSettings(private val platformSettings: PlatformSettings) { fun setLlmEnabled(enabled: Boolean) = platformSettings.putBoolean(KEY_LLM_ENABLED, enabled) + fun getUseDeviceStt(): Boolean = + platformSettings.getBoolean(KEY_USE_DEVICE_STT, false) + + fun setUseDeviceStt(enabled: Boolean) = + platformSettings.putBoolean(KEY_USE_DEVICE_STT, enabled) + + fun getUseDeviceLlm(): Boolean = + platformSettings.getBoolean(KEY_USE_DEVICE_LLM, false) + + fun setUseDeviceLlm(enabled: Boolean) = + platformSettings.putBoolean(KEY_USE_DEVICE_LLM, enabled) + companion object { private const val KEY_WHISPER = "voice.whisper_key" private const val KEY_ANTHROPIC = "voice.anthropic_key" private const val KEY_OPENAI = "voice.openai_key" private const val KEY_LLM_ENABLED = "voice.llm_enabled" + private const val KEY_USE_DEVICE_STT = "voice.use_device_stt" + private const val KEY_USE_DEVICE_LLM = "voice.use_device_llm" } } From b72c42045f7fa86e667c24a9a5393738007e0a9f Mon Sep 17 00:00:00 2001 From: Tyler Stapler Date: Fri, 24 Apr 2026 11:16:18 -0700 Subject: [PATCH 2/6] fix(android): bump kmp minSdk to 26 for genai-prompt library genai-prompt:1.0.0-beta2 declares minSdkVersion 26 in its manifest. The kmp library module had its own android { defaultConfig { minSdk } } block at 24, causing processDebugUnitTestManifest to fail. 
Co-Authored-By: Claude Sonnet 4.6 --- kmp/build.gradle.kts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kmp/build.gradle.kts b/kmp/build.gradle.kts index 695f466..c6c7ab2 100644 --- a/kmp/build.gradle.kts +++ b/kmp/build.gradle.kts @@ -629,7 +629,7 @@ android { namespace = "dev.stapler.stelekit" defaultConfig { - minSdk = 24 + minSdk = 26 } compileOptions { From dc66ebeaed21d53721a2a9231388ab754f6e3bbe Mon Sep 17 00:00:00 2001 From: Tyler Stapler Date: Fri, 24 Apr 2026 11:25:48 -0700 Subject: [PATCH 3/6] fix(ios): replace Dispatchers.IO with PlatformDispatcher.DB in commonMain PerformanceDashboard.kt used Dispatchers.IO directly in commonMain, which is a JVM-only symbol. compileCommonMainKotlinMetadata fails when it encounters it because Dispatchers.IO is absent from the multiplatform metadata API surface. queryPercentiles() is a blocking SQLite call so PlatformDispatcher.DB is the correct dispatcher per the project's own dispatcher matrix. Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/ci-ios.yml | 21 +++++++++---------- .../ui/components/PerformanceDashboard.kt | 4 ++-- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ci-ios.yml b/.github/workflows/ci-ios.yml index c91e685..2f06905 100644 --- a/.github/workflows/ci-ios.yml +++ b/.github/workflows/ci-ios.yml @@ -27,17 +27,16 @@ jobs: ios-framework: name: iOS Framework Link Check runs-on: macos-latest - # Two pre-existing blockers prevent iOS compilation from passing: - # 1. Gradle issue #17559 — classloader mismatch: in a multi-project build where :kmp uses - # kotlin-multiplatform and :androidApp uses AGP, KotlinNativeBundleBuildService is loaded - # by different classloaders. The KGP sets the service on tasks via - # `Property.value(provider)` which Gradle 8.8+ rejects - # when the property and provider types are the same class from different loaders. Fix - # requires JetBrains to annotate with @ServiceReference or use Property upstream. 
- # No Kotlin version (2.1.x, 2.2.x, 2.3.x) contains this fix. Affects all iOS tasks. - # 2. commonMain contains JVM-specific symbols (java.*, System, Dispatchers.IO, OpenTelemetry) - # that fail metadata compilation against the full multiplatform API surface. - # Neither issue is introduced by this PR. Mark the job non-blocking until they are fixed. + # One pre-existing blocker prevents full iOS compilation from passing: + # Gradle issue #17559 — classloader mismatch: in a multi-project build where :kmp uses + # kotlin-multiplatform and :androidApp uses AGP, KotlinNativeBundleBuildService is loaded + # by different classloaders. The KGP sets the service on tasks via + # `Property.value(provider)` which Gradle 8.8+ rejects + # when the property and provider types are the same class from different loaders. Fix + # requires JetBrains to annotate with @ServiceReference or use Property upstream. + # No Kotlin version (2.1.x, 2.2.x, 2.3.x) contains this fix. Affects compileKotlinIos* + # tasks but NOT compileCommonMainKotlinMetadata (which is what we run here). + # Keep non-blocking until we can verify compileCommonMainKotlinMetadata passes consistently. 
continue-on-error: true if: github.event.pull_request.draft == false diff --git a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/PerformanceDashboard.kt b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/PerformanceDashboard.kt index 9a6987a..0cfa3c6 100644 --- a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/PerformanceDashboard.kt +++ b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/PerformanceDashboard.kt @@ -36,7 +36,7 @@ import dev.stapler.stelekit.performance.RingBufferSpanExporter import dev.stapler.stelekit.performance.SerializedSpan import dev.stapler.stelekit.performance.SpanRepository import dev.stapler.stelekit.performance.TraceEvent -import kotlinx.coroutines.Dispatchers +import dev.stapler.stelekit.coroutines.PlatformDispatcher import kotlinx.coroutines.delay import kotlinx.coroutines.launch import kotlinx.coroutines.withContext @@ -82,7 +82,7 @@ private fun HistogramsTab(histogramWriter: HistogramWriter?) { val summaries by produceState>(emptyMap(), histogramWriter) { while (true) { if (histogramWriter != null) { - val result = withContext(Dispatchers.IO) { + val result = withContext(PlatformDispatcher.DB) { operations .mapNotNull { op -> histogramWriter.queryPercentiles(op)?.let { op to it } } .toMap() From fd74bb3a42118a1eab106b78e437d2e530932df3 Mon Sep 17 00:00:00 2001 From: Tyler Stapler Date: Fri, 24 Apr 2026 12:51:50 -0700 Subject: [PATCH 4/6] fix(ci): exclude Kotlin IC state from Gradle home cache in Android job Kotlin incremental compilation stores per-project metadata in ~/.gradle/caches/kotlin-build-*/. When a previous CI run compiled PlatformSettings.kt with `: Settings` (commit dc1b51be9) the IC state was saved in the Gradle home cache restored by setup-gradle. 
Subsequent runs restore that stale metadata even though the source no longer has the supertype, causing a spurious expect/actual mismatch: expect: PlatformSettings : Settings actual: PlatformSettings : Any Excluding caches/kotlin-build-* forces a clean IC state per run, eliminating cross-branch metadata pollution. Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3ba19e8..a4b5300 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -75,6 +75,9 @@ jobs: with: gradle-home-cache-cleanup: true cache-encryption-key: ${{ secrets.GRADLE_ENCRYPTION_KEY }} + # Exclude Kotlin IC state — stale metadata across branches causes spurious + # expect/actual mismatch errors (e.g. PlatformSettings supertype mismatch). + gradle-home-cache-excludes: caches/kotlin-build-* - name: Run Android unit tests run: ./gradlew :kmp:testDebugUnitTest --no-daemon --build-cache From edbf46ce81800367486b452b49aae1c70dba301e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 25 Apr 2026 00:12:33 +0000 Subject: [PATCH 5/6] fix: address all PR review comments - MainActivity: rebuild voice pipeline automatically when deviceLlmAvailable flips via LaunchedEffect(deviceLlmAvailable) - AndroidAudioRecorder: move flag resets before permission check and add stopRequested guard after permission grant to prevent spurious recordings - AndroidSpeechRecognizerProvider: move invokeOnCancellation before mainHandler.post and wrap recognizer creation/start in try/catch so thrown exceptions don't leave the coroutine stuck forever - VoiceCaptureViewModelTest: add 4 tests covering DirectSpeechProvider path (success, PermissionDenied, Empty, cancel) - kmp/build.gradle.kts: minSdk=26 already aligned (no change needed) Agent-Logs-Url: https://github.com/tstapler/stelekit/sessions/c6a0afad-d76c-48da-90b0-3565a04aeb3f 
Co-authored-by: tstapler <3860386+tstapler@users.noreply.github.com> --- .../dev/stapler/stelekit/MainActivity.kt | 3 + .../stelekit/voice/AndroidAudioRecorder.kt | 8 +- .../voice/AndroidSpeechRecognizerProvider.kt | 111 ++++++++++-------- .../voice/VoiceCaptureViewModelTest.kt | 76 ++++++++++++ 4 files changed, 146 insertions(+), 52 deletions(-) diff --git a/androidApp/src/main/kotlin/dev/stapler/stelekit/MainActivity.kt b/androidApp/src/main/kotlin/dev/stapler/stelekit/MainActivity.kt index d1e7a60..52332d2 100644 --- a/androidApp/src/main/kotlin/dev/stapler/stelekit/MainActivity.kt +++ b/androidApp/src/main/kotlin/dev/stapler/stelekit/MainActivity.kt @@ -139,6 +139,9 @@ class MainActivity : ComponentActivity() { if (deviceLlmAvailable && voiceSettings.getUseDeviceLlm()) mlKitProvider else null, ) var voicePipeline by remember { mutableStateOf(buildPipeline()) } + LaunchedEffect(deviceLlmAvailable) { + voicePipeline = buildPipeline() + } StelekitApp( fileSystem = fileSystem, graphPath = fileSystem.getDefaultGraphPath(), diff --git a/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidAudioRecorder.kt b/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidAudioRecorder.kt index d82ab22..a2bd1e9 100644 --- a/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidAudioRecorder.kt +++ b/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidAudioRecorder.kt @@ -55,12 +55,16 @@ class AndroidAudioRecorder( @Volatile private var pauseRequested = false override suspend fun startRecording(): PlatformAudioFile = withContext(Dispatchers.IO) { + stopRequested = false + pauseRequested = false + if (requestMicPermission != null && !requestMicPermission()) { return@withContext PlatformAudioFile("") } - stopRequested = false - pauseRequested = false + if (stopRequested) { + return@withContext PlatformAudioFile("") + } val outputFile = File(context.cacheDir, "voice_${System.currentTimeMillis()}.m4a") val audioManager = 
context.getSystemService(Context.AUDIO_SERVICE) as AudioManager diff --git a/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidSpeechRecognizerProvider.kt b/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidSpeechRecognizerProvider.kt index 159a91a..62e8bf8 100644 --- a/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidSpeechRecognizerProvider.kt +++ b/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidSpeechRecognizerProvider.kt @@ -35,63 +35,74 @@ class AndroidSpeechRecognizerProvider(private val context: Context) : DirectSpee private val mainHandler = Handler(Looper.getMainLooper()) override suspend fun listen(): TranscriptResult = suspendCancellableCoroutine { cont -> - mainHandler.post { - val recognizer = SpeechRecognizer.createSpeechRecognizer(context) - activeRecognizer = recognizer - - recognizer.setRecognitionListener(object : RecognitionListener { - override fun onReadyForSpeech(params: Bundle?) {} - override fun onBeginningOfSpeech() {} - override fun onBufferReceived(buffer: ByteArray?) {} - override fun onEndOfSpeech() {} - override fun onEvent(eventType: Int, params: Bundle?) {} - override fun onPartialResults(partialResults: Bundle?) {} - - override fun onRmsChanged(rmsdB: Float) { - // Map roughly -2..10 dB → 0..1 - _amplitudeFlow.value = ((rmsdB + 2f) / 12f).coerceIn(0f, 1f) - } - - override fun onResults(results: Bundle?) 
{ - _amplitudeFlow.value = 0f + cont.invokeOnCancellation { + mainHandler.post { + activeRecognizer?.let { + it.cancel() + it.destroy() activeRecognizer = null - recognizer.destroy() - if (!cont.isActive) return - val text = results - ?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION) - ?.firstOrNull() - Log.d(TAG, "onResults: text=${text?.take(80)}") - if (text.isNullOrBlank()) cont.resume(TranscriptResult.Empty) - else cont.resume(TranscriptResult.Success(text)) } + _amplitudeFlow.value = 0f + } + } - override fun onError(error: Int) { - _amplitudeFlow.value = 0f - activeRecognizer = null - recognizer.destroy() - if (!cont.isActive) return - Log.w(TAG, "onError: code=$error") - cont.resume(mapError(error)) - } - }) + mainHandler.post { + var recognizer: SpeechRecognizer? = null + try { + recognizer = SpeechRecognizer.createSpeechRecognizer(context) + activeRecognizer = recognizer - val intent = Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH).apply { - putExtra(RecognizerIntent.EXTRA_LANGUAGE_MODEL, RecognizerIntent.LANGUAGE_MODEL_FREE_FORM) - putExtra(RecognizerIntent.EXTRA_PREFER_OFFLINE, true) - putExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 1) - putExtra(RecognizerIntent.EXTRA_SPEECH_INPUT_COMPLETE_SILENCE_LENGTH_MILLIS, 3_000L) - putExtra(RecognizerIntent.EXTRA_SPEECH_INPUT_POSSIBLY_COMPLETE_SILENCE_LENGTH_MILLIS, 1_500L) - } - recognizer.startListening(intent) + recognizer.setRecognitionListener(object : RecognitionListener { + override fun onReadyForSpeech(params: Bundle?) {} + override fun onBeginningOfSpeech() {} + override fun onBufferReceived(buffer: ByteArray?) {} + override fun onEndOfSpeech() {} + override fun onEvent(eventType: Int, params: Bundle?) {} + override fun onPartialResults(partialResults: Bundle?) 
{} - cont.invokeOnCancellation { - mainHandler.post { - activeRecognizer?.let { - it.cancel() - it.destroy() + override fun onRmsChanged(rmsdB: Float) { + // Map roughly -2..10 dB → 0..1 + _amplitudeFlow.value = ((rmsdB + 2f) / 12f).coerceIn(0f, 1f) + } + + override fun onResults(results: Bundle?) { + _amplitudeFlow.value = 0f activeRecognizer = null + recognizer.destroy() + if (!cont.isActive) return + val text = results + ?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION) + ?.firstOrNull() + Log.d(TAG, "onResults: text=${text?.take(80)}") + if (text.isNullOrBlank()) cont.resume(TranscriptResult.Empty) + else cont.resume(TranscriptResult.Success(text)) } - _amplitudeFlow.value = 0f + + override fun onError(error: Int) { + _amplitudeFlow.value = 0f + activeRecognizer = null + recognizer.destroy() + if (!cont.isActive) return + Log.w(TAG, "onError: code=$error") + cont.resume(mapError(error)) + } + }) + + val intent = Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH).apply { + putExtra(RecognizerIntent.EXTRA_LANGUAGE_MODEL, RecognizerIntent.LANGUAGE_MODEL_FREE_FORM) + putExtra(RecognizerIntent.EXTRA_PREFER_OFFLINE, true) + putExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 1) + putExtra(RecognizerIntent.EXTRA_SPEECH_INPUT_COMPLETE_SILENCE_LENGTH_MILLIS, 3_000L) + putExtra(RecognizerIntent.EXTRA_SPEECH_INPUT_POSSIBLY_COMPLETE_SILENCE_LENGTH_MILLIS, 1_500L) + } + recognizer.startListening(intent) + } catch (t: Throwable) { + _amplitudeFlow.value = 0f + activeRecognizer = null + recognizer?.destroy() + Log.w(TAG, "Failed to start speech recognition", t) + if (cont.isActive) { + cont.resume(mapError(SpeechRecognizer.ERROR_CLIENT)) } } } diff --git a/kmp/src/businessTest/kotlin/dev/stapler/stelekit/voice/VoiceCaptureViewModelTest.kt b/kmp/src/businessTest/kotlin/dev/stapler/stelekit/voice/VoiceCaptureViewModelTest.kt index 0803022..60723b4 100644 --- a/kmp/src/businessTest/kotlin/dev/stapler/stelekit/voice/VoiceCaptureViewModelTest.kt +++ 
b/kmp/src/businessTest/kotlin/dev/stapler/stelekit/voice/VoiceCaptureViewModelTest.kt @@ -413,6 +413,82 @@ class VoiceCaptureViewModelTest { assertIs(vm.state.first()) } + // --- DirectSpeechProvider path --- + + @Test + fun `directSpeechProvider success path reaches Done state`() = runTest { + val transcript = "this is a test transcript with more than ten words total here" + val fakeDirectProvider = object : DirectSpeechProvider { + override suspend fun listen(): TranscriptResult = TranscriptResult.Success(transcript) + } + val vm = VoiceCaptureViewModel( + VoicePipelineConfig(directSpeechProvider = fakeDirectProvider), + makeJournalService(), this, + ) + + vm.onMicTapped() + advanceUntilIdle() + + assertIs(vm.state.first()) + } + + @Test + fun `directSpeechProvider PermissionDenied emits Error at RECORDING`() = runTest { + val fakeDirectProvider = object : DirectSpeechProvider { + override suspend fun listen(): TranscriptResult = TranscriptResult.Failure.PermissionDenied + } + val vm = VoiceCaptureViewModel( + VoicePipelineConfig(directSpeechProvider = fakeDirectProvider), + makeJournalService(), this, + ) + + vm.onMicTapped() + advanceUntilIdle() + + val state = vm.state.first() + assertIs(state) + assertEquals(PipelineStage.RECORDING, state.stage) + } + + @Test + fun `directSpeechProvider Empty result emits Error at TRANSCRIBING`() = runTest { + val fakeDirectProvider = object : DirectSpeechProvider { + override suspend fun listen(): TranscriptResult = TranscriptResult.Empty + } + val vm = VoiceCaptureViewModel( + VoicePipelineConfig(directSpeechProvider = fakeDirectProvider), + makeJournalService(), this, + ) + + vm.onMicTapped() + advanceUntilIdle() + + val state = vm.state.first() + assertIs(state) + assertEquals(PipelineStage.TRANSCRIBING, state.stage) + } + + @Test + fun `directSpeechProvider cancel during Recording resets to Idle`() = runTest { + val fakeDirectProvider = object : DirectSpeechProvider { + override suspend fun listen(): TranscriptResult { + 
delay(10_000) + return TranscriptResult.Empty + } + } + val vm = VoiceCaptureViewModel( + VoicePipelineConfig(directSpeechProvider = fakeDirectProvider), + makeJournalService(), this, + ) + + vm.onMicTapped() + delay(1) + assertIs(vm.state.first()) + + vm.cancel() + assertIs(vm.state.first()) + } + @Test fun `transcript over 10000 chars is truncated before LLM`() = runTest { val longTranscript = "word ".repeat(2_500) // 12,500 chars From 9c0825ec56c142afcbd04b11ed5a45fb94517d40 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 25 Apr 2026 00:14:06 +0000 Subject: [PATCH 6/6] fix: guard against early cancellation before SpeechRecognizer is created Add an isActive check inside mainHandler.post after setting activeRecognizer so that if cancellation fires before the post runs (cleanup was a no-op), the recognizer is destroyed immediately rather than starting a zombie session. Agent-Logs-Url: https://github.com/tstapler/stelekit/sessions/c6a0afad-d76c-48da-90b0-3565a04aeb3f Co-authored-by: tstapler <3860386+tstapler@users.noreply.github.com> --- .../stelekit/voice/AndroidSpeechRecognizerProvider.kt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidSpeechRecognizerProvider.kt b/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidSpeechRecognizerProvider.kt index 62e8bf8..84e556c 100644 --- a/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidSpeechRecognizerProvider.kt +++ b/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidSpeechRecognizerProvider.kt @@ -52,6 +52,13 @@ class AndroidSpeechRecognizerProvider(private val context: Context) : DirectSpee recognizer = SpeechRecognizer.createSpeechRecognizer(context) activeRecognizer = recognizer + + // Guard against cancellation that fired before this post ran + if (!cont.isActive) { + recognizer.destroy() + activeRecognizer = null + return@post + } + 
recognizer.setRecognitionListener(object : RecognitionListener { override fun onReadyForSpeech(params: Bundle?) {} override fun onBeginningOfSpeech() {}