From 4fdc61c38188a34e6b7907a7a2cbad902b6571a7 Mon Sep 17 00:00:00 2001 From: Benjamin Lee Date: Wed, 15 Apr 2026 12:24:43 -0700 Subject: [PATCH 1/4] Removed unnecessary copies and Mel Spectrogram --- .../Diarizer/LS-EEND/LSEENDDiarizer.swift | 62 ++++++------------- .../LS-EEND/LSEENDModelInference.swift | 7 ++- 2 files changed, 23 insertions(+), 46 deletions(-) diff --git a/Sources/FluidAudio/Diarizer/LS-EEND/LSEENDDiarizer.swift b/Sources/FluidAudio/Diarizer/LS-EEND/LSEENDDiarizer.swift index b74affa83..4b86d7285 100644 --- a/Sources/FluidAudio/Diarizer/LS-EEND/LSEENDDiarizer.swift +++ b/Sources/FluidAudio/Diarizer/LS-EEND/LSEENDDiarizer.swift @@ -71,11 +71,11 @@ public final class LSEENDDiarizer: Diarizer { private var _engine: LSEENDInferenceHelper? private var _session: LSEENDStreamingSession? - private var _melSpectrogram: AudioMelSpectrogram? private var _timeline: DiarizerTimeline private var _numFramesProcessed: Int = 0 private var _timelineConfig: DiarizerTimelineConfig private var _visibleStartFrameOffset: Int = 0 + private var _cachedConverter: AudioConverter? // Audio buffering private var pendingAudio: [Float] = [] @@ -154,12 +154,10 @@ public final class LSEENDDiarizer: Diarizer { /// - Parameter descriptor: Model descriptor specifying variant and file paths public func initialize(descriptor: LSEENDModelDescriptor) throws { let engine = try LSEENDInferenceHelper(descriptor: descriptor, computeUnits: computeUnits) - let melSpectrogram = Self.createMelSpectrogram(featureConfig: engine.featureConfig) lock.withLock { updateTimelineConfig(engine: engine) _engine = engine - _melSpectrogram = melSpectrogram _timeline = DiarizerTimeline(config: _timelineConfig) _session = nil resetBuffersLocked() @@ -175,12 +173,9 @@ public final class LSEENDDiarizer: Diarizer { /// Initialize with a pre-loaded engine. 
public func initialize(engine: LSEENDInferenceHelper) { - let melSpectrogram = Self.createMelSpectrogram(featureConfig: engine.featureConfig) - lock.withLock { updateTimelineConfig(engine: engine) _engine = engine - _melSpectrogram = melSpectrogram _timeline = DiarizerTimeline(config: _timelineConfig) _session = nil resetBuffersLocked() @@ -270,7 +265,7 @@ public final class LSEENDDiarizer: Diarizer { if _session == nil { _session = try engine.createSession( - inputSampleRate: engine.targetSampleRate, melSpectrogram: _melSpectrogram!) + inputSampleRate: engine.targetSampleRate) } guard let session = _session else { return nil @@ -292,9 +287,9 @@ public final class LSEENDDiarizer: Diarizer { let numSpeakers = engine.metadata.realOutputDim let result = DiarizerChunkResult( startFrame: max(0, update.startFrame - _visibleStartFrameOffset), - finalizedPredictions: flattenRowMajor(update.probabilities, numSpeakers: numSpeakers), + finalizedPredictions: update.probabilities.values, finalizedFrameCount: update.probabilities.rows, - tentativePredictions: flattenRowMajor(update.previewProbabilities, numSpeakers: numSpeakers), + tentativePredictions: update.previewProbabilities.values, tentativeFrameCount: update.previewProbabilities.rows ) _numFramesProcessed += result.finalizedFrameCount @@ -409,8 +404,7 @@ public final class LSEENDDiarizer: Diarizer { // Lazily create session on first process call if _session == nil { - _session = try engine.createSession( - inputSampleRate: engine.targetSampleRate, melSpectrogram: _melSpectrogram!) 
+ _session = try engine.createSession(inputSampleRate: engine.targetSampleRate) } guard let session = _session else { return nil } @@ -426,9 +420,9 @@ public final class LSEENDDiarizer: Diarizer { let numSpeakers = engine.metadata.realOutputDim let result = DiarizerChunkResult( startFrame: max(0, update.startFrame - _visibleStartFrameOffset), - finalizedPredictions: flattenRowMajor(update.probabilities, numSpeakers: numSpeakers), + finalizedPredictions: update.probabilities.values, finalizedFrameCount: update.probabilities.rows, - tentativePredictions: flattenRowMajor(update.previewProbabilities, numSpeakers: numSpeakers), + tentativePredictions: update.previewProbabilities.values, tentativeFrameCount: update.previewProbabilities.rows ) @@ -563,7 +557,7 @@ public final class LSEENDDiarizer: Diarizer { retainedSession } else { try engine.createSession( - inputSampleRate: engine.targetSampleRate, melSpectrogram: _melSpectrogram!) + inputSampleRate: engine.targetSampleRate) } let numSpeakers = engine.metadata.realOutputDim @@ -571,9 +565,9 @@ public final class LSEENDDiarizer: Diarizer { if let update = try session.pushAudio(normalized) { let chunk = DiarizerChunkResult( startFrame: max(0, update.startFrame - _visibleStartFrameOffset), - finalizedPredictions: flattenRowMajor(update.probabilities, numSpeakers: numSpeakers), + finalizedPredictions: update.probabilities.values, finalizedFrameCount: update.probabilities.rows, - tentativePredictions: flattenRowMajor(update.previewProbabilities, numSpeakers: numSpeakers), + tentativePredictions: update.previewProbabilities.values, tentativeFrameCount: update.previewProbabilities.rows ) _numFramesProcessed += chunk.finalizedFrameCount @@ -586,7 +580,7 @@ public final class LSEENDDiarizer: Diarizer { if let finalUpdate = try session.finalize() { let chunk = DiarizerChunkResult( startFrame: max(0, finalUpdate.startFrame - _visibleStartFrameOffset), - finalizedPredictions: flattenRowMajor(finalUpdate.probabilities, numSpeakers: 
numSpeakers), + finalizedPredictions: finalUpdate.probabilities.values, finalizedFrameCount: finalUpdate.probabilities.rows, tentativePredictions: [], tentativeFrameCount: 0 @@ -623,7 +617,7 @@ public final class LSEENDDiarizer: Diarizer { lock.withLock { _engine = nil _session = nil - _melSpectrogram = nil + _cachedConverter = nil _timeline.reset() resetBuffersLocked() logger.info("LS-EEND resources cleaned up") @@ -656,7 +650,7 @@ public final class LSEENDDiarizer: Diarizer { if let update = pushedUpdate { let flushedResult = DiarizerChunkResult( startFrame: _numFramesProcessed, - finalizedPredictions: flattenRowMajor(update.probabilities, numSpeakers: numSpeakers), + finalizedPredictions: update.probabilities.values, finalizedFrameCount: update.probabilities.rows, tentativePredictions: [], tentativeFrameCount: 0 @@ -670,7 +664,7 @@ public final class LSEENDDiarizer: Diarizer { if let finalUpdate = try session.finalize() { let finalResult = DiarizerChunkResult( startFrame: _numFramesProcessed, - finalizedPredictions: flattenRowMajor(finalUpdate.probabilities, numSpeakers: numSpeakers), + finalizedPredictions: finalUpdate.probabilities.values, finalizedFrameCount: finalUpdate.probabilities.rows, tentativePredictions: [], tentativeFrameCount: 0 @@ -704,34 +698,14 @@ public final class LSEENDDiarizer: Diarizer { return nil } - return try AudioConverter(sampleRate: Double(engine.targetSampleRate)) - .resample(Array(samples), from: sourceSampleRate) - } - - /// Create a new mel spectrogram instance owned by this diarizer. 
- private static func createMelSpectrogram(featureConfig: LSEENDFeatureConfig) -> AudioMelSpectrogram { - AudioMelSpectrogram( - sampleRate: featureConfig.sampleRate, - nMels: featureConfig.nMels, - nFFT: featureConfig.nFFT, - hopLength: featureConfig.hopLength, - winLength: featureConfig.winLength, - preemph: 0, - padTo: 1, - logFloor: 1e-10, - logFloorMode: .clamped, - windowPeriodic: true - ) + if _cachedConverter == nil { + _cachedConverter = AudioConverter(sampleRate: Double(engine.targetSampleRate)) + } + return try _cachedConverter!.resample(Array(samples), from: sourceSampleRate) } private func updateTimelineConfig(engine: LSEENDInferenceHelper) { self._timelineConfig.numSpeakers = engine.metadata.realOutputDim self._timelineConfig.frameDurationSeconds = Float(1.0 / engine.modelFrameHz) } - - /// Convert an LSEENDMatrix to a flat [Float] in row-major layout. - private func flattenRowMajor(_ matrix: LSEENDMatrix, numSpeakers: Int) -> [Float] { - guard matrix.rows > 0, matrix.columns > 0 else { return [] } - return matrix.values - } } diff --git a/Sources/FluidAudio/Diarizer/LS-EEND/LSEENDModelInference.swift b/Sources/FluidAudio/Diarizer/LS-EEND/LSEENDModelInference.swift index f978b9006..3675cc0a8 100644 --- a/Sources/FluidAudio/Diarizer/LS-EEND/LSEENDModelInference.swift +++ b/Sources/FluidAudio/Diarizer/LS-EEND/LSEENDModelInference.swift @@ -644,9 +644,12 @@ public final class LSEENDStreamingSession { let previewFullLogits: LSEENDMatrix if includePreview { - let previewState = try state.copy() + // flushTail does not mutate the passed-in state — it reassigns a local + // variable on each step, leaving self.state untouched. finalize() relies + // on this same guarantee. Skipping the former state.copy() eliminates + // 6 × cloneAlignedMultiArray per pushAudio call. 
let pending = totalFeatureFrames - emittedFrames - previewFullLogits = try flushTail(from: previewState, pendingFrames: pending) + previewFullLogits = try flushTail(from: state, pendingFrames: pending) } else { previewFullLogits = .empty(columns: engine.decodeMaxSpeakers) } From 56df9aa290c581a4a193a6de90ed4e71a2ff7ee7 Mon Sep 17 00:00:00 2001 From: Benjamin Lee Date: Wed, 15 Apr 2026 12:47:52 -0700 Subject: [PATCH 2/4] Bug fix --- .../Diarizer/LS-EEND/LSEENDDiarizer.swift | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/Sources/FluidAudio/Diarizer/LS-EEND/LSEENDDiarizer.swift b/Sources/FluidAudio/Diarizer/LS-EEND/LSEENDDiarizer.swift index 4b86d7285..eb824a280 100644 --- a/Sources/FluidAudio/Diarizer/LS-EEND/LSEENDDiarizer.swift +++ b/Sources/FluidAudio/Diarizer/LS-EEND/LSEENDDiarizer.swift @@ -158,6 +158,7 @@ public final class LSEENDDiarizer: Diarizer { lock.withLock { updateTimelineConfig(engine: engine) _engine = engine + _cachedConverter = nil _timeline = DiarizerTimeline(config: _timelineConfig) _session = nil resetBuffersLocked() @@ -176,6 +177,7 @@ public final class LSEENDDiarizer: Diarizer { lock.withLock { updateTimelineConfig(engine: engine) _engine = engine + _cachedConverter = nil _timeline = DiarizerTimeline(config: _timelineConfig) _session = nil resetBuffersLocked() @@ -284,7 +286,6 @@ public final class LSEENDDiarizer: Diarizer { } if let update { - let numSpeakers = engine.metadata.realOutputDim let result = DiarizerChunkResult( startFrame: max(0, update.startFrame - _visibleStartFrameOffset), finalizedPredictions: update.probabilities.values, @@ -417,7 +418,6 @@ public final class LSEENDDiarizer: Diarizer { return nil } - let numSpeakers = engine.metadata.realOutputDim let result = DiarizerChunkResult( startFrame: max(0, update.startFrame - _visibleStartFrameOffset), finalizedPredictions: update.probabilities.values, @@ -559,8 +559,6 @@ public final class LSEENDDiarizer: Diarizer { try engine.createSession( 
inputSampleRate: engine.targetSampleRate) } - let numSpeakers = engine.metadata.realOutputDim - // Push all audio at once if let update = try session.pushAudio(normalized) { let chunk = DiarizerChunkResult( @@ -639,7 +637,6 @@ public final class LSEENDDiarizer: Diarizer { defer { lock.unlock() } guard let engine = _engine, let session = _session else { return nil } - let numSpeakers = engine.metadata.realOutputDim var lastResult: DiarizerChunkResult? // Flush pending audio first — clear unconditionally so failed audio isn't retained. @@ -698,10 +695,12 @@ public final class LSEENDDiarizer: Diarizer { return nil } - if _cachedConverter == nil { - _cachedConverter = AudioConverter(sampleRate: Double(engine.targetSampleRate)) - } - return try _cachedConverter!.resample(Array(samples), from: sourceSampleRate) + let converter = _cachedConverter ?? { + let c = AudioConverter(sampleRate: Double(engine.targetSampleRate)) + _cachedConverter = c + return c + }() + return try converter.resample(Array(samples), from: sourceSampleRate) } private func updateTimelineConfig(engine: LSEENDInferenceHelper) { From 3c3c13a335da9db75b5720af5e46ffd0ece9335a Mon Sep 17 00:00:00 2001 From: Benjamin Lee Date: Wed, 15 Apr 2026 12:52:33 -0700 Subject: [PATCH 3/4] Format fix --- .../FluidAudio/Diarizer/LS-EEND/LSEENDDiarizer.swift | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/Sources/FluidAudio/Diarizer/LS-EEND/LSEENDDiarizer.swift b/Sources/FluidAudio/Diarizer/LS-EEND/LSEENDDiarizer.swift index eb824a280..60937afb7 100644 --- a/Sources/FluidAudio/Diarizer/LS-EEND/LSEENDDiarizer.swift +++ b/Sources/FluidAudio/Diarizer/LS-EEND/LSEENDDiarizer.swift @@ -695,11 +695,13 @@ public final class LSEENDDiarizer: Diarizer { return nil } - let converter = _cachedConverter ?? { - let c = AudioConverter(sampleRate: Double(engine.targetSampleRate)) - _cachedConverter = c - return c - }() + let converter = + _cachedConverter + ?? 
{ + let c = AudioConverter(sampleRate: Double(engine.targetSampleRate)) + _cachedConverter = c + return c + }() return try converter.resample(Array(samples), from: sourceSampleRate) } From c9b9017349f47e90198ba4cf9b66c7f3f000af4e Mon Sep 17 00:00:00 2001 From: Benjamin Lee Date: Wed, 15 Apr 2026 14:04:26 -0700 Subject: [PATCH 4/4] Removed unused engine in finalize session --- Sources/FluidAudio/Diarizer/LS-EEND/LSEENDDiarizer.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sources/FluidAudio/Diarizer/LS-EEND/LSEENDDiarizer.swift b/Sources/FluidAudio/Diarizer/LS-EEND/LSEENDDiarizer.swift index 60937afb7..28cc12c4b 100644 --- a/Sources/FluidAudio/Diarizer/LS-EEND/LSEENDDiarizer.swift +++ b/Sources/FluidAudio/Diarizer/LS-EEND/LSEENDDiarizer.swift @@ -636,7 +636,7 @@ public final class LSEENDDiarizer: Diarizer { lock.lock() defer { lock.unlock() } - guard let engine = _engine, let session = _session else { return nil } + guard let session = _session else { return nil } var lastResult: DiarizerChunkResult? // Flush pending audio first — clear unconditionally so failed audio isn't retained.