diff --git a/.github/workflows/ci-ios.yml b/.github/workflows/ci-ios.yml index c91e685..2f06905 100644 --- a/.github/workflows/ci-ios.yml +++ b/.github/workflows/ci-ios.yml @@ -27,17 +27,16 @@ jobs: ios-framework: name: iOS Framework Link Check runs-on: macos-latest - # Two pre-existing blockers prevent iOS compilation from passing: - # 1. Gradle issue #17559 — classloader mismatch: in a multi-project build where :kmp uses - # kotlin-multiplatform and :androidApp uses AGP, KotlinNativeBundleBuildService is loaded - # by different classloaders. The KGP sets the service on tasks via - # `Property.value(provider)` which Gradle 8.8+ rejects - # when the property and provider types are the same class from different loaders. Fix - # requires JetBrains to annotate with @ServiceReference or use Property upstream. - # No Kotlin version (2.1.x, 2.2.x, 2.3.x) contains this fix. Affects all iOS tasks. - # 2. commonMain contains JVM-specific symbols (java.*, System, Dispatchers.IO, OpenTelemetry) - # that fail metadata compilation against the full multiplatform API surface. - # Neither issue is introduced by this PR. Mark the job non-blocking until they are fixed. + # One pre-existing blocker prevents full iOS compilation from passing: + # Gradle issue #17559 — classloader mismatch: in a multi-project build where :kmp uses + # kotlin-multiplatform and :androidApp uses AGP, KotlinNativeBundleBuildService is loaded + # by different classloaders. The KGP sets the service on tasks via + # `Property.value(provider)` which Gradle 8.8+ rejects + # when the property and provider types are the same class from different loaders. Fix + # requires JetBrains to annotate with @ServiceReference or use Property upstream. + # No Kotlin version (2.1.x, 2.2.x, 2.3.x) contains this fix. Affects compileKotlinIos* + # tasks but NOT compileCommonMainKotlinMetadata (which is what we run here). + # Keep non-blocking until we can verify compileCommonMainKotlinMetadata passes consistently. 
continue-on-error: true if: github.event.pull_request.draft == false diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3ba19e8..a4b5300 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -75,6 +75,9 @@ jobs: with: gradle-home-cache-cleanup: true cache-encryption-key: ${{ secrets.GRADLE_ENCRYPTION_KEY }} + # Exclude Kotlin IC state — stale metadata across branches causes spurious + # expect/actual mismatch errors (e.g. PlatformSettings supertype mismatch). + gradle-home-cache-excludes: caches/kotlin-build-* - name: Run Android unit tests run: ./gradlew :kmp:testDebugUnitTest --no-daemon --build-cache diff --git a/androidApp/build.gradle.kts b/androidApp/build.gradle.kts index ead54dc..9a33d92 100644 --- a/androidApp/build.gradle.kts +++ b/androidApp/build.gradle.kts @@ -13,7 +13,7 @@ android { defaultConfig { applicationId = "dev.stapler.stelekit" - minSdk = 24 + minSdk = 26 targetSdk = 36 versionCode = 1 versionName = (findProperty("appVersion") as? 
String ?: "0.1.0") @@ -66,4 +66,5 @@ dependencies { implementation(platform("androidx.compose:compose-bom:2024.09.02")) implementation("androidx.compose.ui:ui") implementation("androidx.compose.material3:material3") + } diff --git a/androidApp/src/main/kotlin/dev/stapler/stelekit/MainActivity.kt b/androidApp/src/main/kotlin/dev/stapler/stelekit/MainActivity.kt index d708a9b..57e1685 100644 --- a/androidApp/src/main/kotlin/dev/stapler/stelekit/MainActivity.kt +++ b/androidApp/src/main/kotlin/dev/stapler/stelekit/MainActivity.kt @@ -1,6 +1,8 @@ package dev.stapler.stelekit +import android.Manifest import android.content.Intent +import android.content.pm.PackageManager import android.net.Uri import android.os.Bundle import android.util.Log @@ -8,6 +10,7 @@ import androidx.activity.ComponentActivity import androidx.activity.compose.setContent import androidx.activity.enableEdgeToEdge import androidx.activity.result.contract.ActivityResultContracts +import androidx.core.content.ContextCompat import androidx.compose.runtime.getValue import androidx.compose.runtime.mutableStateOf import androidx.compose.runtime.remember @@ -16,7 +19,11 @@ import dev.stapler.stelekit.domain.UrlFetcherAndroid import dev.stapler.stelekit.platform.PlatformFileSystem import dev.stapler.stelekit.platform.PlatformSettings import dev.stapler.stelekit.ui.StelekitApp +import android.speech.SpeechRecognizer +import androidx.compose.runtime.LaunchedEffect import dev.stapler.stelekit.voice.AndroidAudioRecorder +import dev.stapler.stelekit.voice.AndroidSpeechRecognizerProvider +import dev.stapler.stelekit.voice.MlKitLlmFormatterProvider import dev.stapler.stelekit.voice.VoiceSettings import dev.stapler.stelekit.voice.buildVoicePipeline import kotlinx.coroutines.CompletableDeferred @@ -24,6 +31,14 @@ import kotlinx.coroutines.CompletableDeferred class MainActivity : ComponentActivity() { private var pendingFolderPick: CompletableDeferred? = null + private var pendingMicPermission: CompletableDeferred? 
= null + + private val micPermissionLauncher = registerForActivityResult( + ActivityResultContracts.RequestPermission() + ) { granted -> + pendingMicPermission?.complete(granted) + pendingMicPermission = null + } private val folderPickerLauncher = registerForActivityResult( ActivityResultContracts.OpenDocumentTree() @@ -102,9 +117,39 @@ class MainActivity : ComponentActivity() { } setContent { - val audioRecorder = remember { AndroidAudioRecorder(this@MainActivity.applicationContext) } + val fileSystem = remember { + PlatformFileSystem().apply { + init(this@MainActivity) { + val deferred = CompletableDeferred() + pendingFolderPick = deferred + // Pre-fill the picker with the last known folder so "Reconnect" UX is smooth + val hintUri = getStoredTreeUri() + runOnUiThread { folderPickerLauncher.launch(hintUri) } + deferred.await() + } + } + } + val audioRecorder = remember { AndroidAudioRecorder(this@MainActivity.applicationContext, this@MainActivity::requestMicrophonePermission) } val voiceSettings = remember { VoiceSettings(PlatformSettings()) } - var voicePipeline by remember { mutableStateOf(buildVoicePipeline(audioRecorder, voiceSettings)) } + val deviceSttAvailable = remember { AndroidSpeechRecognizerProvider.isAvailable(this@MainActivity.applicationContext) } + val deviceSttProvider = remember { + if (deviceSttAvailable) AndroidSpeechRecognizerProvider(this@MainActivity.applicationContext) else null + } + val mlKitProvider = remember { MlKitLlmFormatterProvider.create() } + var deviceLlmAvailable by remember { mutableStateOf(false) } + LaunchedEffect(Unit) { + deviceLlmAvailable = mlKitProvider?.checkEligible() ?: false + } + fun buildPipeline() = buildVoicePipeline( + audioRecorder, + voiceSettings, + if (deviceSttAvailable && voiceSettings.getUseDeviceStt()) deviceSttProvider else null, + if (deviceLlmAvailable && voiceSettings.getUseDeviceLlm()) mlKitProvider else null, + ) + var voicePipeline by remember { mutableStateOf(buildPipeline()) } + 
LaunchedEffect(deviceLlmAvailable) { + voicePipeline = buildPipeline() + } StelekitApp( fileSystem = fileSystem, graphPath = fileSystem.getDefaultGraphPath(), @@ -112,11 +157,23 @@ class MainActivity : ComponentActivity() { urlFetcher = UrlFetcherAndroid(), voicePipeline = voicePipeline, voiceSettings = voiceSettings, - onRebuildVoicePipeline = { voicePipeline = buildVoicePipeline(audioRecorder, voiceSettings) }, + onRebuildVoicePipeline = { voicePipeline = buildPipeline() }, + deviceSttAvailable = deviceSttAvailable, + deviceLlmAvailable = deviceLlmAvailable, ) } } + private suspend fun requestMicrophonePermission(): Boolean { + if (ContextCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO) + == PackageManager.PERMISSION_GRANTED + ) return true + val deferred = CompletableDeferred() + pendingMicPermission = deferred + runOnUiThread { micPermissionLauncher.launch(Manifest.permission.RECORD_AUDIO) } + return deferred.await() + } + companion object { private const val TAG = "MainActivity" /** Authority for AOSP ExternalStorageProvider — the only provider supported in v1. */ diff --git a/kmp/build.gradle.kts b/kmp/build.gradle.kts index 5835285..fcd9a8d 100644 --- a/kmp/build.gradle.kts +++ b/kmp/build.gradle.kts @@ -176,6 +176,9 @@ kotlin { // Encrypted SharedPreferences for API key storage implementation("androidx.security:security-crypto:1.1.0-alpha06") + // On-device LLM via Gemini Nano (Pixel 9+ and AICore-enabled OEM flagships) + implementation("com.google.mlkit:genai-prompt:1.0.0-beta2") + // Jetpack Glance — Compose-based home screen widget API // Use 1.1.1 (not 1.1.0) to pick up a protobuf security fix. 
implementation("androidx.glance:glance-appwidget:1.1.1") @@ -643,7 +646,7 @@ android { namespace = "dev.stapler.stelekit" defaultConfig { - minSdk = 24 + minSdk = 26 testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner" } diff --git a/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidAudioRecorder.kt b/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidAudioRecorder.kt index 3c17caf..a2bd1e9 100644 --- a/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidAudioRecorder.kt +++ b/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidAudioRecorder.kt @@ -25,7 +25,11 @@ import kotlin.math.sqrt private const val TAG = "AndroidAudioRecorder" -class AndroidAudioRecorder(private val context: Context) : AudioRecorder { +class AndroidAudioRecorder( + private val context: Context, + /** Called before recording starts; must return true if RECORD_AUDIO permission is granted. */ + private val requestMicPermission: (suspend () -> Boolean)? = null, +) : AudioRecorder { companion object { private const val SAMPLE_RATE = 16_000 @@ -54,6 +58,14 @@ class AndroidAudioRecorder(private val context: Context) : AudioRecorder { stopRequested = false pauseRequested = false + if (requestMicPermission != null && !requestMicPermission()) { + return@withContext PlatformAudioFile("") + } + + if (stopRequested) { + return@withContext PlatformAudioFile("") + } + val outputFile = File(context.cacheDir, "voice_${System.currentTimeMillis()}.m4a") val audioManager = context.getSystemService(Context.AUDIO_SERVICE) as AudioManager diff --git a/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidSpeechRecognizerProvider.kt b/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidSpeechRecognizerProvider.kt new file mode 100644 index 0000000..84e556c --- /dev/null +++ b/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/AndroidSpeechRecognizerProvider.kt @@ -0,0 +1,132 @@ +// Copyright (c) 2026 Tyler Stapler +// SPDX-License-Identifier: 
Elastic-2.0 +package dev.stapler.stelekit.voice + +import android.content.Context +import android.content.Intent +import android.os.Bundle +import android.os.Handler +import android.os.Looper +import android.speech.RecognitionListener +import android.speech.RecognizerIntent +import android.speech.SpeechRecognizer +import android.util.Log +import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.flow.Flow +import kotlinx.coroutines.flow.MutableStateFlow +import kotlinx.coroutines.flow.asStateFlow +import kotlinx.coroutines.suspendCancellableCoroutine +import kotlinx.coroutines.withContext +import kotlin.coroutines.resume + +private const val TAG = "AndroidSpeechRecognizer" + +class AndroidSpeechRecognizerProvider(private val context: Context) : DirectSpeechProvider { + + companion object { + fun isAvailable(context: Context): Boolean = + SpeechRecognizer.isRecognitionAvailable(context) + } + + private val _amplitudeFlow = MutableStateFlow(0f) + override val amplitudeFlow: Flow = _amplitudeFlow.asStateFlow() + + @Volatile private var activeRecognizer: SpeechRecognizer? = null + private val mainHandler = Handler(Looper.getMainLooper()) + + override suspend fun listen(): TranscriptResult = suspendCancellableCoroutine { cont -> + cont.invokeOnCancellation { + mainHandler.post { + activeRecognizer?.let { + it.cancel() + it.destroy() + activeRecognizer = null + } + _amplitudeFlow.value = 0f + } + } + + mainHandler.post { + var recognizer: SpeechRecognizer? = null + try { + recognizer = SpeechRecognizer.createSpeechRecognizer(context) + activeRecognizer = recognizer + + // Guard against cancellation that fired before this post ran + if (!cont.isActive) { + recognizer.destroy() + activeRecognizer = null + return@post + } + + recognizer.setRecognitionListener(object : RecognitionListener { + override fun onReadyForSpeech(params: Bundle?) {} + override fun onBeginningOfSpeech() {} + override fun onBufferReceived(buffer: ByteArray?) 
{} + override fun onEndOfSpeech() {} + override fun onEvent(eventType: Int, params: Bundle?) {} + override fun onPartialResults(partialResults: Bundle?) {} + + override fun onRmsChanged(rmsdB: Float) { + // Map roughly -2..10 dB → 0..1 + _amplitudeFlow.value = ((rmsdB + 2f) / 12f).coerceIn(0f, 1f) + } + + override fun onResults(results: Bundle?) { + _amplitudeFlow.value = 0f + activeRecognizer = null + recognizer.destroy() + if (!cont.isActive) return + val text = results + ?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION) + ?.firstOrNull() + Log.d(TAG, "onResults: text=${text?.take(80)}") + if (text.isNullOrBlank()) cont.resume(TranscriptResult.Empty) + else cont.resume(TranscriptResult.Success(text)) + } + + override fun onError(error: Int) { + _amplitudeFlow.value = 0f + activeRecognizer = null + recognizer.destroy() + if (!cont.isActive) return + Log.w(TAG, "onError: code=$error") + cont.resume(mapError(error)) + } + }) + + val intent = Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH).apply { + putExtra(RecognizerIntent.EXTRA_LANGUAGE_MODEL, RecognizerIntent.LANGUAGE_MODEL_FREE_FORM) + putExtra(RecognizerIntent.EXTRA_PREFER_OFFLINE, true) + putExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 1) + putExtra(RecognizerIntent.EXTRA_SPEECH_INPUT_COMPLETE_SILENCE_LENGTH_MILLIS, 3_000L) + putExtra(RecognizerIntent.EXTRA_SPEECH_INPUT_POSSIBLY_COMPLETE_SILENCE_LENGTH_MILLIS, 1_500L) + } + recognizer.startListening(intent) + } catch (t: Throwable) { + _amplitudeFlow.value = 0f + activeRecognizer = null + recognizer?.destroy() + Log.w(TAG, "Failed to start speech recognition", t) + if (cont.isActive) { + cont.resume(mapError(SpeechRecognizer.ERROR_CLIENT)) + } + } + } + } + + override suspend fun stopListening() { + withContext(Dispatchers.Main) { + activeRecognizer?.stopListening() + } + } + + private fun mapError(error: Int): TranscriptResult = when (error) { + SpeechRecognizer.ERROR_NO_MATCH, + SpeechRecognizer.ERROR_SPEECH_TIMEOUT -> TranscriptResult.Empty + 
SpeechRecognizer.ERROR_INSUFFICIENT_PERMISSIONS -> TranscriptResult.Failure.PermissionDenied + SpeechRecognizer.ERROR_NETWORK, + SpeechRecognizer.ERROR_NETWORK_TIMEOUT -> TranscriptResult.Failure.NetworkError + else -> TranscriptResult.Failure.ApiError(error, "Speech recognition error (code $error)") + } +} diff --git a/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/MlKitLlmFormatterProvider.kt b/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/MlKitLlmFormatterProvider.kt new file mode 100644 index 0000000..2181931 --- /dev/null +++ b/kmp/src/androidMain/kotlin/dev/stapler/stelekit/voice/MlKitLlmFormatterProvider.kt @@ -0,0 +1,81 @@ +// Copyright (c) 2026 Tyler Stapler +// SPDX-License-Identifier: Elastic-2.0 +package dev.stapler.stelekit.voice + +import android.util.Log +import com.google.mlkit.genai.common.FeatureStatus +import com.google.mlkit.genai.prompt.Generation +import com.google.mlkit.genai.prompt.GenerativeModel +import kotlinx.coroutines.CancellationException + +private const val TAG = "MlKitLlmFormatter" + +/** + * On-device LLM formatter backed by ML Kit Prompt API (Gemini Nano via AICore). + * + * Supported devices: Pixel 9+ and major OEM flagships with AICore (Samsung S25, etc.). + * Output hard-capped at 256 tokens by the on-device model — suitable for short voice notes. + * API status: beta (com.google.mlkit:genai-prompt:1.0.0-beta2). + */ +class MlKitLlmFormatterProvider private constructor( + private val model: GenerativeModel, +) : LlmFormatterProvider { + + companion object { + /** Creates the provider; returns null if the ML Kit library fails to initialise. */ + fun create(): MlKitLlmFormatterProvider? = runCatching { + MlKitLlmFormatterProvider(Generation.getClient()) + }.getOrElse { e -> + Log.w(TAG, "Failed to create GenerativeModel", e) + null + } + } + + /** Returns true when the device supports on-device inference (model available or will download). 
*/ + suspend fun checkEligible(): Boolean = runCatching { + when (model.checkStatus()) { + FeatureStatus.AVAILABLE, + FeatureStatus.DOWNLOADABLE, + FeatureStatus.DOWNLOADING -> true + else -> false + } + }.getOrElse { e -> + Log.w(TAG, "checkStatus failed", e) + false + } + + override suspend fun format(transcript: String, systemPrompt: String): LlmResult { + return try { + when (model.checkStatus()) { + FeatureStatus.AVAILABLE -> { + Log.d(TAG, "Running on-device inference (${transcript.length} chars input)") + val response = model.generateContent(systemPrompt) + val text = response.candidates.firstOrNull()?.text?.trim() + if (text.isNullOrBlank()) { + LlmResult.Failure.ApiError(-1, "Empty response from on-device model") + } else { + Log.d(TAG, "On-device inference complete (${text.length} chars output)") + LlmResult.Success(text, LlmProviderSupport.detectTruncation(text)) + } + } + FeatureStatus.DOWNLOADABLE, + FeatureStatus.DOWNLOADING -> { + // AICore downloads the model in the background automatically. + // Blocking here would take several minutes — return a friendly retry message. 
+ LlmResult.Failure.ApiError( + -1, + "On-device model is downloading — try again in a few minutes" + ) + } + else -> { + LlmResult.Failure.ApiError(-1, "On-device LLM not supported on this device") + } + } + } catch (e: CancellationException) { + throw e + } catch (e: Exception) { + Log.e(TAG, "On-device inference error", e) + LlmResult.Failure.ApiError(-1, "On-device LLM error: ${e.message}") + } + } +} diff --git a/kmp/src/businessTest/kotlin/dev/stapler/stelekit/voice/VoiceCaptureViewModelTest.kt b/kmp/src/businessTest/kotlin/dev/stapler/stelekit/voice/VoiceCaptureViewModelTest.kt index 0803022..60723b4 100644 --- a/kmp/src/businessTest/kotlin/dev/stapler/stelekit/voice/VoiceCaptureViewModelTest.kt +++ b/kmp/src/businessTest/kotlin/dev/stapler/stelekit/voice/VoiceCaptureViewModelTest.kt @@ -413,6 +413,82 @@ class VoiceCaptureViewModelTest { assertIs(vm.state.first()) } + // --- DirectSpeechProvider path --- + + @Test + fun `directSpeechProvider success path reaches Done state`() = runTest { + val transcript = "this is a test transcript with more than ten words total here" + val fakeDirectProvider = object : DirectSpeechProvider { + override suspend fun listen(): TranscriptResult = TranscriptResult.Success(transcript) + } + val vm = VoiceCaptureViewModel( + VoicePipelineConfig(directSpeechProvider = fakeDirectProvider), + makeJournalService(), this, + ) + + vm.onMicTapped() + advanceUntilIdle() + + assertIs(vm.state.first()) + } + + @Test + fun `directSpeechProvider PermissionDenied emits Error at RECORDING`() = runTest { + val fakeDirectProvider = object : DirectSpeechProvider { + override suspend fun listen(): TranscriptResult = TranscriptResult.Failure.PermissionDenied + } + val vm = VoiceCaptureViewModel( + VoicePipelineConfig(directSpeechProvider = fakeDirectProvider), + makeJournalService(), this, + ) + + vm.onMicTapped() + advanceUntilIdle() + + val state = vm.state.first() + assertIs(state) + assertEquals(PipelineStage.RECORDING, state.stage) + } + + 
@Test + fun `directSpeechProvider Empty result emits Error at TRANSCRIBING`() = runTest { + val fakeDirectProvider = object : DirectSpeechProvider { + override suspend fun listen(): TranscriptResult = TranscriptResult.Empty + } + val vm = VoiceCaptureViewModel( + VoicePipelineConfig(directSpeechProvider = fakeDirectProvider), + makeJournalService(), this, + ) + + vm.onMicTapped() + advanceUntilIdle() + + val state = vm.state.first() + assertIs(state) + assertEquals(PipelineStage.TRANSCRIBING, state.stage) + } + + @Test + fun `directSpeechProvider cancel during Recording resets to Idle`() = runTest { + val fakeDirectProvider = object : DirectSpeechProvider { + override suspend fun listen(): TranscriptResult { + delay(10_000) + return TranscriptResult.Empty + } + } + val vm = VoiceCaptureViewModel( + VoicePipelineConfig(directSpeechProvider = fakeDirectProvider), + makeJournalService(), this, + ) + + vm.onMicTapped() + delay(1) + assertIs(vm.state.first()) + + vm.cancel() + assertIs(vm.state.first()) + } + @Test fun `transcript over 10000 chars is truncated before LLM`() = runTest { val longTranscript = "word ".repeat(2_500) // 12,500 chars diff --git a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/App.kt b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/App.kt index 715f974..ed1357d 100644 --- a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/App.kt +++ b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/App.kt @@ -110,6 +110,8 @@ fun StelekitApp( voicePipeline: VoicePipelineConfig = remember { VoicePipelineConfig() }, voiceSettings: VoiceSettings? = null, onRebuildVoicePipeline: (() -> Unit)? 
= null, + deviceSttAvailable: Boolean = false, + deviceLlmAvailable: Boolean = false, spanRecorder: SpanRecorder = NoOpSpanRecorder, ) { val platformSettings = remember { PlatformSettings() } @@ -241,6 +243,8 @@ fun StelekitApp( voicePipeline = voicePipeline, voiceSettings = voiceSettings, onRebuildVoicePipeline = onRebuildVoicePipeline, + deviceSttAvailable = deviceSttAvailable, + deviceLlmAvailable = deviceLlmAvailable, spanRecorder = spanRecorder, ) } @@ -266,6 +270,8 @@ private fun GraphContent( voicePipeline: VoicePipelineConfig = VoicePipelineConfig(), voiceSettings: VoiceSettings? = null, onRebuildVoicePipeline: (() -> Unit)? = null, + deviceSttAvailable: Boolean = false, + deviceLlmAvailable: Boolean = false, spanRecorder: SpanRecorder = NoOpSpanRecorder, ) { CompositionLocalProvider(LocalSpanRecorder provides spanRecorder) { @@ -659,7 +665,7 @@ private fun GraphContent( onTap = { voiceCaptureViewModel.onMicTapped() }, onDismissError = { voiceCaptureViewModel.dismissError() }, onAutoReset = { voiceCaptureViewModel.resetToIdle() }, - amplitudeFlow = voicePipeline.audioRecorder.amplitudeFlow, + amplitudeFlow = voicePipeline.effectiveAmplitudeFlow, ) }, ) @@ -674,6 +680,8 @@ private fun GraphContent( fileSystem = fileSystem, voiceSettings = voiceSettings, onRebuildVoicePipeline = onRebuildVoicePipeline, + deviceSttAvailable = deviceSttAvailable, + deviceLlmAvailable = deviceLlmAvailable, frameMetric = frameMetricState, debugState = debugMenuState, onDebugStateChange = { newState -> @@ -875,6 +883,8 @@ private fun GraphDialogLayer( fileSystem: FileSystem, voiceSettings: VoiceSettings? = null, onRebuildVoicePipeline: (() -> Unit)? 
= null, + deviceSttAvailable: Boolean = false, + deviceLlmAvailable: Boolean = false, frameMetric: kotlinx.coroutines.flow.StateFlow, debugState: DebugMenuState = DebugMenuState(), onDebugStateChange: (DebugMenuState) -> Unit = {}, @@ -912,6 +922,8 @@ private fun GraphDialogLayer( onLeftHandedChange = { viewModel.setLeftHanded(it) }, voiceSettings = voiceSettings, onRebuildVoicePipeline = onRebuildVoicePipeline, + deviceSttAvailable = deviceSttAvailable, + deviceLlmAvailable = deviceLlmAvailable, ) appState.diskConflict?.let { conflict -> diff --git a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/PerformanceDashboard.kt b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/PerformanceDashboard.kt index c9b8470..4cf6d3b 100644 --- a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/PerformanceDashboard.kt +++ b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/PerformanceDashboard.kt @@ -82,7 +82,7 @@ private fun HistogramsTab(histogramWriter: HistogramWriter?) { val summaries by produceState>(emptyMap(), histogramWriter) { while (true) { if (histogramWriter != null) { - val result = withContext(PlatformDispatcher.IO) { + val result = withContext(PlatformDispatcher.DB) { operations .mapNotNull { op -> histogramWriter.queryPercentiles(op)?.let { op to it } } .toMap() diff --git a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/settings/SettingsDialog.kt b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/settings/SettingsDialog.kt index 3bd5967..83c5751 100644 --- a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/settings/SettingsDialog.kt +++ b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/settings/SettingsDialog.kt @@ -33,6 +33,8 @@ fun SettingsDialog( onLeftHandedChange: (Boolean) -> Unit = {}, voiceSettings: VoiceSettings? = null, onRebuildVoicePipeline: (() -> Unit)? 
= null, + deviceSttAvailable: Boolean = false, + deviceLlmAvailable: Boolean = false, ) { if (visible) { Dialog( @@ -129,6 +131,8 @@ fun SettingsDialog( VoiceCaptureSettings( voiceSettings = voiceSettings, onRebuildPipeline = onRebuildVoicePipeline, + deviceSttAvailable = deviceSttAvailable, + deviceLlmAvailable = deviceLlmAvailable, ) } } diff --git a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/settings/VoiceCaptureSettings.kt b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/settings/VoiceCaptureSettings.kt index 7d26066..a98b682 100644 --- a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/settings/VoiceCaptureSettings.kt +++ b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/ui/components/settings/VoiceCaptureSettings.kt @@ -27,34 +27,60 @@ import dev.stapler.stelekit.voice.VoiceSettings fun VoiceCaptureSettings( voiceSettings: VoiceSettings, onRebuildPipeline: () -> Unit, + deviceSttAvailable: Boolean = false, + deviceLlmAvailable: Boolean = false, ) { var whisperKey by remember { mutableStateOf(voiceSettings.getWhisperApiKey() ?: "") } var anthropicKey by remember { mutableStateOf(voiceSettings.getAnthropicKey() ?: "") } var openAiKey by remember { mutableStateOf(voiceSettings.getOpenAiKey() ?: "") } var llmEnabled by remember { mutableStateOf(voiceSettings.getLlmEnabled()) } + var useDeviceStt by remember { mutableStateOf(voiceSettings.getUseDeviceStt()) } + var useDeviceLlm by remember { mutableStateOf(voiceSettings.getUseDeviceLlm()) } var saved by remember { mutableStateOf(false) } SettingsSection("Transcription (Speech-to-Text)") { - Text( - "Whisper API key — used for speech transcription (~\$0.003/min).", - style = MaterialTheme.typography.bodySmall, - color = MaterialTheme.colorScheme.onSurfaceVariant, - modifier = Modifier.padding(bottom = 8.dp), - ) - OutlinedTextField( - value = whisperKey, - onValueChange = { whisperKey = it; saved = false }, - label = { Text("OpenAI / Whisper API key") }, - 
visualTransformation = PasswordVisualTransformation(), - singleLine = true, - modifier = Modifier.fillMaxWidth(), - ) + if (deviceSttAvailable) { + Row( + modifier = Modifier.fillMaxWidth().padding(bottom = 8.dp), + horizontalArrangement = Arrangement.SpaceBetween, + verticalAlignment = Alignment.CenterVertically, + ) { + Text("Use on-device speech recognition", style = MaterialTheme.typography.bodyMedium) + Switch( + checked = useDeviceStt, + onCheckedChange = { useDeviceStt = it; saved = false }, + ) + } + if (useDeviceStt) { + Text( + "Transcription happens on-device — no API key or network required.", + style = MaterialTheme.typography.bodySmall, + color = MaterialTheme.colorScheme.onSurfaceVariant, + modifier = Modifier.padding(bottom = 8.dp), + ) + } + } + if (!deviceSttAvailable || !useDeviceStt) { + Text( + "Whisper API key — used for speech transcription (~\$0.003/min).", + style = MaterialTheme.typography.bodySmall, + color = MaterialTheme.colorScheme.onSurfaceVariant, + modifier = Modifier.padding(bottom = 8.dp), + ) + OutlinedTextField( + value = whisperKey, + onValueChange = { whisperKey = it; saved = false }, + label = { Text("OpenAI / Whisper API key") }, + visualTransformation = PasswordVisualTransformation(), + singleLine = true, + modifier = Modifier.fillMaxWidth(), + ) + } } SettingsSection("LLM Formatting") { Text( - "Formats the raw transcript into Logseq outliner syntax with bullet points and [[wikilinks]]. 
" + - "Provide one key — Anthropic is used if both are set.", + "Formats the raw transcript into Logseq outliner syntax with bullet points and [[wikilinks]].", style = MaterialTheme.typography.bodySmall, color = MaterialTheme.colorScheme.onSurfaceVariant, modifier = Modifier.padding(bottom = 8.dp), @@ -71,24 +97,54 @@ fun VoiceCaptureSettings( ) } if (llmEnabled) { - OutlinedTextField( - value = anthropicKey, - onValueChange = { anthropicKey = it; saved = false }, - label = { Text("Anthropic (Claude) API key") }, - visualTransformation = PasswordVisualTransformation(), - singleLine = true, - modifier = Modifier.fillMaxWidth(), - ) - OutlinedTextField( - value = openAiKey, - onValueChange = { openAiKey = it; saved = false }, - label = { Text("OpenAI / compatible API key") }, - visualTransformation = PasswordVisualTransformation(), - singleLine = true, - modifier = Modifier - .fillMaxWidth() - .padding(top = 8.dp), - ) + if (deviceLlmAvailable) { + Row( + modifier = Modifier.fillMaxWidth().padding(bottom = 8.dp), + horizontalArrangement = Arrangement.SpaceBetween, + verticalAlignment = Alignment.CenterVertically, + ) { + Text("Use on-device LLM (Gemini Nano)", style = MaterialTheme.typography.bodyMedium) + Switch( + checked = useDeviceLlm, + onCheckedChange = { useDeviceLlm = it; saved = false }, + ) + } + if (useDeviceLlm) { + Text( + "Formatting runs on-device — no API key or network required. 
" + + "256-token output limit; longer notes may be truncated.", + style = MaterialTheme.typography.bodySmall, + color = MaterialTheme.colorScheme.onSurfaceVariant, + modifier = Modifier.padding(bottom = 8.dp), + ) + } + } + if (!deviceLlmAvailable || !useDeviceLlm) { + Text( + "Provide one key — Anthropic is used if both are set.", + style = MaterialTheme.typography.bodySmall, + color = MaterialTheme.colorScheme.onSurfaceVariant, + modifier = Modifier.padding(bottom = 8.dp), + ) + OutlinedTextField( + value = anthropicKey, + onValueChange = { anthropicKey = it; saved = false }, + label = { Text("Anthropic (Claude) API key") }, + visualTransformation = PasswordVisualTransformation(), + singleLine = true, + modifier = Modifier.fillMaxWidth(), + ) + OutlinedTextField( + value = openAiKey, + onValueChange = { openAiKey = it; saved = false }, + label = { Text("OpenAI / compatible API key") }, + visualTransformation = PasswordVisualTransformation(), + singleLine = true, + modifier = Modifier + .fillMaxWidth() + .padding(top = 8.dp), + ) + } } } @@ -104,6 +160,8 @@ fun VoiceCaptureSettings( voiceSettings.setAnthropicKey(anthropicKey) voiceSettings.setOpenAiKey(openAiKey) voiceSettings.setLlmEnabled(llmEnabled) + voiceSettings.setUseDeviceStt(useDeviceStt) + voiceSettings.setUseDeviceLlm(useDeviceLlm) saved = true onRebuildPipeline() }, diff --git a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/DirectSpeechProvider.kt b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/DirectSpeechProvider.kt new file mode 100644 index 0000000..8c19dad --- /dev/null +++ b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/DirectSpeechProvider.kt @@ -0,0 +1,15 @@ +// Copyright (c) 2026 Tyler Stapler +// SPDX-License-Identifier: Elastic-2.0 +package dev.stapler.stelekit.voice + +import kotlinx.coroutines.flow.Flow + +/** Combines recording and transcription in a single step (e.g. Android SpeechRecognizer). 
*/ +interface DirectSpeechProvider { + /** Records and transcribes; suspends until the user stops or silence is detected. */ + suspend fun listen(): TranscriptResult + /** Signals an in-progress listen to stop and return results. */ + suspend fun stopListening() {} + /** Optional RMS amplitude stream for animated feedback. */ + val amplitudeFlow: Flow? get() = null +} diff --git a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoiceCaptureViewModel.kt b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoiceCaptureViewModel.kt index 6f9e76d..e8e0ba7 100644 --- a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoiceCaptureViewModel.kt +++ b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoiceCaptureViewModel.kt @@ -35,7 +35,8 @@ class VoiceCaptureViewModel( when (_state.value) { is VoiceCaptureState.Idle -> startPipeline() is VoiceCaptureState.Recording -> scope.launch { - pipeline.audioRecorder.stopRecording() + pipeline.directSpeechProvider?.stopListening() + ?: pipeline.audioRecorder.stopRecording() } else -> Unit } @@ -57,51 +58,55 @@ class VoiceCaptureViewModel( private fun startPipeline() { pipelineJob = scope.launch { - var file: PlatformAudioFile? 
= null - try { - _state.value = VoiceCaptureState.Recording - val result = pipeline.audioRecorder.startRecording() - file = result - - if (result.isEmpty) { + _state.value = VoiceCaptureState.Recording + val transcriptResult = if (pipeline.directSpeechProvider != null) { + pipeline.directSpeechProvider.listen() + } else { + recordAndTranscribe() + } + when (transcriptResult) { + null -> return@launch // error already set inside recordAndTranscribe + TranscriptResult.Empty -> { + _state.value = VoiceCaptureState.Error( + PipelineStage.TRANSCRIBING, "Nothing was captured — try again" + ) + } + is TranscriptResult.Failure.ApiError -> { + _state.value = VoiceCaptureState.Error( + PipelineStage.TRANSCRIBING, transcriptResult.message + ) + } + TranscriptResult.Failure.NetworkError -> { + _state.value = VoiceCaptureState.Error( + PipelineStage.TRANSCRIBING, "Network error — check your connection" + ) + } + TranscriptResult.Failure.PermissionDenied -> { _state.value = VoiceCaptureState.Error( PipelineStage.RECORDING, "Microphone permission denied" ) - return@launch } + is TranscriptResult.Success -> processTranscript(transcriptResult.text.trim()) + } + } + } - _state.value = VoiceCaptureState.Transcribing - val audioData = pipeline.audioRecorder.readBytes(result) - when (val sttResult = pipeline.sttProvider.transcribe(audioData)) { - TranscriptResult.Empty -> { - _state.value = VoiceCaptureState.Error( - PipelineStage.TRANSCRIBING, "Nothing was captured — try again" - ) - return@launch - } - is TranscriptResult.Failure.ApiError -> { - _state.value = VoiceCaptureState.Error( - PipelineStage.TRANSCRIBING, sttResult.message - ) - return@launch - } - TranscriptResult.Failure.NetworkError -> { - _state.value = VoiceCaptureState.Error( - PipelineStage.TRANSCRIBING, "Network error — check your connection" - ) - return@launch - } - TranscriptResult.Failure.PermissionDenied -> { - _state.value = VoiceCaptureState.Error( - PipelineStage.RECORDING, "Microphone permission denied" - ) 
- return@launch - } - is TranscriptResult.Success -> processTranscript(sttResult.text.trim()) - } - } finally { - file?.takeIf { !it.isEmpty }?.let { pipeline.audioRecorder.deleteRecording(it) } + /** Records via [AudioRecorder] then transcribes; returns null and sets error state on failure. */ + private suspend fun recordAndTranscribe(): TranscriptResult? { + var file: PlatformAudioFile? = null + return try { + val result = pipeline.audioRecorder.startRecording() + file = result + if (result.isEmpty) { + _state.value = VoiceCaptureState.Error( + PipelineStage.RECORDING, "Microphone permission denied" + ) + return null } + _state.value = VoiceCaptureState.Transcribing + pipeline.sttProvider.transcribe(pipeline.audioRecorder.readBytes(result)) + } finally { + file?.takeIf { !it.isEmpty }?.let { pipeline.audioRecorder.deleteRecording(it) } } } diff --git a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoicePipelineConfig.kt b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoicePipelineConfig.kt index 961d47e..a485db1 100644 --- a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoicePipelineConfig.kt +++ b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoicePipelineConfig.kt @@ -20,4 +20,9 @@ class VoicePipelineConfig( val llmProvider: LlmFormatterProvider = NoOpLlmFormatterProvider(), val systemPrompt: String = DEFAULT_VOICE_SYSTEM_PROMPT, val minWordCount: Int = 10, -) + /** When set, replaces the (record → STT) two-step path with a single integrated listen. */ + val directSpeechProvider: DirectSpeechProvider? = null, +) { + /** Amplitude flow for waveform animation: prefers directSpeechProvider, falls back to audioRecorder. 
*/ + val effectiveAmplitudeFlow get() = directSpeechProvider?.amplitudeFlow ?: audioRecorder.amplitudeFlow +} diff --git a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoicePipelineFactory.kt b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoicePipelineFactory.kt index 6d98f04..288d45f 100644 --- a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoicePipelineFactory.kt +++ b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoicePipelineFactory.kt @@ -2,7 +2,12 @@ // SPDX-License-Identifier: Elastic-2.0 package dev.stapler.stelekit.voice -fun buildVoicePipeline(audioRecorder: AudioRecorder, settings: VoiceSettings): VoicePipelineConfig { +fun buildVoicePipeline( + audioRecorder: AudioRecorder, + settings: VoiceSettings, + directSpeechProvider: DirectSpeechProvider? = null, + deviceLlmProvider: LlmFormatterProvider? = null, +): VoicePipelineConfig { val sttProvider: SpeechToTextProvider = settings.getWhisperApiKey() ?.let { WhisperSpeechToTextProvider.withDefaults(it) } ?: SpeechToTextProvider { _ -> @@ -13,10 +18,17 @@ } val llmProvider: LlmFormatterProvider = if (!settings.getLlmEnabled()) { NoOpLlmFormatterProvider() + } else if (deviceLlmProvider != null && settings.getUseDeviceLlm()) { + deviceLlmProvider } else { settings.getAnthropicKey()?.let { ClaudeLlmFormatterProvider.withDefaults(it) } ?: settings.getOpenAiKey()?.let { OpenAiLlmFormatterProvider.withDefaults(it) } ?: NoOpLlmFormatterProvider() } - return VoicePipelineConfig(audioRecorder = audioRecorder, sttProvider = sttProvider, llmProvider = llmProvider) + return VoicePipelineConfig( + audioRecorder = audioRecorder, + sttProvider = sttProvider, + llmProvider = llmProvider, + directSpeechProvider = directSpeechProvider?.takeIf { settings.getUseDeviceStt() }, + ) } diff --git a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoiceSettings.kt b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoiceSettings.kt index 9f6957f..07640c9
100644 --- a/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoiceSettings.kt +++ b/kmp/src/commonMain/kotlin/dev/stapler/stelekit/voice/VoiceSettings.kt @@ -30,10 +30,24 @@ class VoiceSettings(private val platformSettings: Settings) { fun setLlmEnabled(enabled: Boolean) = platformSettings.putBoolean(KEY_LLM_ENABLED, enabled) + fun getUseDeviceStt(): Boolean = + platformSettings.getBoolean(KEY_USE_DEVICE_STT, false) + + fun setUseDeviceStt(enabled: Boolean) = + platformSettings.putBoolean(KEY_USE_DEVICE_STT, enabled) + + fun getUseDeviceLlm(): Boolean = + platformSettings.getBoolean(KEY_USE_DEVICE_LLM, false) + + fun setUseDeviceLlm(enabled: Boolean) = + platformSettings.putBoolean(KEY_USE_DEVICE_LLM, enabled) + companion object { private const val KEY_WHISPER = "voice.whisper_key" private const val KEY_ANTHROPIC = "voice.anthropic_key" private const val KEY_OPENAI = "voice.openai_key" private const val KEY_LLM_ENABLED = "voice.llm_enabled" + private const val KEY_USE_DEVICE_STT = "voice.use_device_stt" + private const val KEY_USE_DEVICE_LLM = "voice.use_device_llm" } }