From b86aeb90015e2159d1b6c9c3e89191187479770e Mon Sep 17 00:00:00 2001 From: Jeremiah K <17190268+jeremiah-k@users.noreply.github.com> Date: Wed, 17 Jun 2026 22:25:01 -0500 Subject: [PATCH] fix(database): harden Room single-connection startup flow Force Room single-connection behavior to avoid KMP BundledSQLiteDriver pool-acquire failures during startup and selected-device database switches. Defer the first real per-device FTS backfill so cold-start reads are not blocked behind a long backfill on the single DB connection. Track and cancel scheduled backfill work when switching databases or closing the manager, skip stale database instances, and swallow non-cancellation backfill failures so closed or stale DB cleanup cannot crash the app. Also serialize the single-connection query coroutine context with limitedParallelism(1) and keep the required coroutine opt-in scoped to the Room builder configuration. --- .../core/database/DatabaseManager.kt | 35 +++++++++++++++++-- .../core/database/MeshtasticDatabase.kt | 29 ++++++++++----- 2 files changed, 54 insertions(+), 10 deletions(-) diff --git a/core/database/src/commonMain/kotlin/org/meshtastic/core/database/DatabaseManager.kt b/core/database/src/commonMain/kotlin/org/meshtastic/core/database/DatabaseManager.kt index ceb2391230..3511c4e598 100644 --- a/core/database/src/commonMain/kotlin/org/meshtastic/core/database/DatabaseManager.kt +++ b/core/database/src/commonMain/kotlin/org/meshtastic/core/database/DatabaseManager.kt @@ -26,9 +26,11 @@ import co.touchlab.kermit.Logger import kotlinx.coroutines.CancellationException import kotlinx.coroutines.CoroutineScope import kotlinx.coroutines.ExperimentalCoroutinesApi +import kotlinx.coroutines.Job import kotlinx.coroutines.NonCancellable import kotlinx.coroutines.SupervisorJob import kotlinx.coroutines.cancel +import kotlinx.coroutines.delay import kotlinx.coroutines.flow.MutableStateFlow import kotlinx.coroutines.flow.SharingStarted import kotlinx.coroutines.flow.StateFlow @@ -44,6 +46,7 @@ import org.koin.core.annotation.Named import org.koin.core.annotation.Single import org.meshtastic.core.common.util.nowMillis import org.meshtastic.core.di.CoroutineDispatchers +import kotlin.concurrent.Volatile import org.meshtastic.core.common.database.DatabaseManager as SharedDatabaseManager /** Manages per-device Room database instances for node data, with LRU eviction. */ @@ -64,6 +67,10 @@ open class DatabaseManager( private fun lastUsedKey(dbName: String) = longPreferencesKey("db_last_used:$dbName") + private var backfillJob: Job? = null + + @Volatile private var hasDelayedFirstDeviceBackfill = false + override val cacheLimit: StateFlow = datastore.data .map { it[cacheLimitKey] ?: DatabaseConstants.DEFAULT_CACHE_LIMIT } @@ -150,8 +157,12 @@ open class DatabaseManager( // One-time cleanup: remove legacy DB if present and not active managerScope.launch(dispatchers.io) { cleanupLegacyDbIfNeeded(activeDbName = dbName) } - // Backfill FTS search index for any text messages missing messageText - managerScope.launch(dispatchers.io) { backfillSearchIndexIfNeeded(db) } + // Backfill FTS search index for any text messages missing messageText. + // On the first real device DB, defer this so it does not starve the single DB connection while + // the UI is collecting startup flows. The default DB should not consume the cold-start delay. + val shouldDelayBackfill = dbName != DatabaseConstants.DEFAULT_DB_NAME && !hasDelayedFirstDeviceBackfill + if (shouldDelayBackfill) hasDelayedFirstDeviceBackfill = true + scheduleSearchIndexBackfill(dbName = dbName, db = db, shouldDelayBackfill = shouldDelayBackfill) Logger.i { "Switched active DB to ${anonymizeDbName(dbName)} for address ${anonymizeAddress(address)}" } } @@ -208,6 +219,7 @@ open class DatabaseManager( } private companion object { + private const val BACKFILL_COLD_START_DELAY_MS = 2_000L val DB_TERMS = listOf("pool", "database", "connection", "sqlite") } @@ -309,6 +321,23 @@ open class DatabaseManager( datastore.edit { it[legacyCleanedKey] = true } } + @Suppress("TooGenericExceptionCaught") + private fun scheduleSearchIndexBackfill(dbName: String, db: MeshtasticDatabase, shouldDelayBackfill: Boolean) { + backfillJob?.cancel() + backfillJob = + managerScope.launch(dispatchers.io) { + try { + if (shouldDelayBackfill) delay(BACKFILL_COLD_START_DELAY_MS) + if (_currentDb.value !== db) return@launch + backfillSearchIndexIfNeeded(db) + } catch (e: CancellationException) { + throw e + } catch (e: Exception) { + Logger.w(e) { "Failed to backfill search index for ${anonymizeDbName(dbName)}" } + } + } + } + /** * Backfills [Packet.messageText] for existing text-message packets that predate the FTS5 schema, then rebuilds the * FTS index so search covers historical messages. The text is decoded in Kotlin from each packet's payload (see @@ -333,6 +362,8 @@ open class DatabaseManager( /** Closes all open databases and cancels background work. */ fun close() { + backfillJob?.cancel() + backfillJob = null managerScope.cancel() dbCache.values.forEach { it.close() } dbCache.clear() diff --git a/core/database/src/commonMain/kotlin/org/meshtastic/core/database/MeshtasticDatabase.kt b/core/database/src/commonMain/kotlin/org/meshtastic/core/database/MeshtasticDatabase.kt index 18013c0b48..1fe418acbf 100644 --- a/core/database/src/commonMain/kotlin/org/meshtastic/core/database/MeshtasticDatabase.kt +++ b/core/database/src/commonMain/kotlin/org/meshtastic/core/database/MeshtasticDatabase.kt @@ -23,6 +23,7 @@ import androidx.room3.DeleteColumn import androidx.room3.DeleteTable import androidx.room3.RoomDatabase import androidx.room3.migration.AutoMigrationSpec +import kotlinx.coroutines.ExperimentalCoroutinesApi import org.meshtastic.core.common.util.ioDispatcher import org.meshtastic.core.database.dao.DeviceHardwareDao import org.meshtastic.core.database.dao.DeviceLinkDao @@ -143,13 +144,14 @@ abstract class MeshtasticDatabase : RoomDatabase() { * Configures a [RoomDatabase.Builder] with standard settings for this project. * * @param multiConnection when `true` (default), opens a multi-reader connection pool (`maxNumOfReaders = 4`, - * `maxNumOfWriters = 1`) so reads can run concurrently. Pass `false` to serialize all reads and writes - * through a single connection (no separate reader pool). + * `maxNumOfWriters = 1`) so reads can run concurrently. Pass `false` to explicitly force + * [setSingleConnectionPool], serializing all reads and writes through one connection. * - * **Android production passes `false`.** Under coroutine cancellation churn (e.g. DB switches via - * `flatMapLatest`), the Room KMP reader-pool permit semaphore can wedge: all reader connections report `Free` - * but `permits=0`, so every read acquisition times out indefinitely. Single-connection mode eliminates the - * separate reader permit pool. See `DatabaseBuilder.kt` (androidMain). + * **Android production passes `false`.** Without the explicit `setSingleConnectionPool()` call, Room defaults + * to a 4-reader pool for named databases regardless of whether `setMultipleConnectionPool` was called. Under + * coroutine cancellation churn (e.g. DB switches via `flatMapLatest`), the reader-pool permit semaphore can + * wedge: all reader connections report `Free` but `permits=0`, so every read acquisition times out + * indefinitely. Forcing single-connection eliminates the separate reader permit pool entirely. * * **In-memory databases MUST pass `false`** for deterministic read-after-write: a pooled reader connection can * serve a snapshot older than the latest write on the writer connection, so a read immediately after a write @@ -159,11 +161,22 @@ abstract class MeshtasticDatabase : RoomDatabase() { * **JVM/iOS production uses `true`** (the default). Revisit if desktop/iOS field logs show similar * pool-exhaustion patterns under cancellation churn. */ + @OptIn(ExperimentalCoroutinesApi::class) fun RoomDatabase.Builder.configureCommon( multiConnection: Boolean = true, ): RoomDatabase.Builder = this.fallbackToDestructiveMigration(dropAllTables = false) - .apply { if (multiConnection) setMultipleConnectionPool(maxNumOfReaders = 4, maxNumOfWriters = 1) } - .setQueryCoroutineContext(ioDispatcher) + .apply { + if (multiConnection) { + setMultipleConnectionPool(maxNumOfReaders = 4, maxNumOfWriters = 1) + } else { + setSingleConnectionPool() + } + } + .setQueryCoroutineContext( + // limitedParallelism(1) has the same throughput ceiling as the single-connection pool + // (already serialized), so this only blocks the cancellation pileup — not real I/O concurrency. + if (multiConnection) ioDispatcher else ioDispatcher.limitedParallelism(1), + ) } }