From 8f7b02f18b9895a6780761c15ddf877a15717192 Mon Sep 17 00:00:00 2001 From: ross <3024454314@qq.com> Date: Sat, 10 Jan 2026 17:54:30 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=E5=8F=82=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../zs/smarthuman/sherpa/VoiceController.kt | 171 +++++++++++------- 1 file changed, 109 insertions(+), 62 deletions(-) diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt index 9821ba2..3fa10b3 100644 --- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt @@ -23,8 +23,23 @@ class VoiceController( private val onTimeoutTip: OnTimeoutTip? = null ) { - private val TAG = "VoiceController" - private val sampleRate = 16000 + companion object { + // 日志标签 + private const val TAG = "VoiceController" + // 采样率 + private const val SAMPLE_RATE = 16000 + // 预缓存大小(2秒) + private const val PRE_BUFFER_SIZE = SAMPLE_RATE * 2 + // 声纹验证阈值 + private const val SPEAKER_VERIFY_THRESHOLD_NORMAL = 0.25f + private const val SPEAKER_VERIFY_THRESHOLD_SHORT = 0.20f + private const val SHORT_AUDIO_THRESHOLD = SAMPLE_RATE * 0.5f // 0.5秒音频长度 + // 防抖时间 + private const val INVALID_RESET_DEBOUNCE_MS = 1500L + // 最小语音时长 + private const val MIN_SPEECH_MS = 800L + private const val MIN_EFFECTIVE_VOICE_DURATION = 400L + } var state: VoiceState = VoiceState.WAIT_WAKEUP private set(value) { @@ -47,7 +62,8 @@ class VoiceController( private var isMultiPersonDialogueDetected = false // 防抖重置标记 private var lastInvalidResetMs = 0L - private val INVALID_RESET_DEBOUNCE_MS = 1500L + // 声纹管理器锁(解决并发问题) + private val speakerManagerLock = ReentrantLock() private val wakeupManager = WakeupManager(assetManager, onWakeup) private val vadManager = VadManager( @@ -57,8 +73,7 @@ class VoiceController( ) private val audioBuffer = mutableListOf() - private val preBuffer = ArrayDeque() - private val PRE_BUFFER_SIZE = sampleRate * 2 + private val preBuffer = ArrayDeque(PRE_BUFFER_SIZE) private var recordingStartMs = 0L private var waitSpeechFailStartMs = 0L @@ -71,7 +86,6 @@ class VoiceController( private var speechEnableAtMs = 0L private val SPEECH_COOLDOWN_MS = 300L - private val MIN_SPEECH_MS = 800L private val idleTimeoutMs = idleTimeoutSeconds * 1000L private val maxRecordingMs = maxRecordingSeconds * 1000L @@ -112,7 +126,6 @@ class VoiceController( private val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f // ========== 核心调整:近距离场景 微弱人声过滤配置(重点优化) ========== - private val MIN_EFFECTIVE_VOICE_DURATION = 400L private val MIN_VOICE_FRAME_RATIO = 0.08f private val MIN_PEAK_ENERGY_RATIO = 1.5f private val NORMAL_VOICE_ENERGY_THRESHOLD = 0.008f @@ -121,19 +134,20 @@ class VoiceController( // ========== 核心新增:MIN_EFFECTIVE_SPEECH_RMS 常量 ========== private val MIN_EFFECTIVE_SPEECH_RMS = 0.0005f - // ========== 核心新增:无效说话标记 + 超时类型 ========== private var hasInvalidSpeech = false private var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT - // ========== 核心配置:声纹验证相关 ========== private val CURRENT_USER_ID = "current_wakeup_user" // 当前唤醒用户唯一标识 private val ENABLE_STRICT_SPEAKER_VERIFY = true // 严格验证开关 - private val SPEAKER_VERIFY_THRESHOLD = 0.5f // 严格验证开关 - init { + // 参数校验 + require(idleTimeoutSeconds > 0) { "idleTimeoutSeconds 必须大于0" } + require(maxRecordingSeconds > 0) { "maxRecordingSeconds 必须大于0" } + require(maxRecordingSeconds >= idleTimeoutSeconds) { "maxRecordingSeconds 必须大于等于 idleTimeoutSeconds" } + // 初始化声纹识别器(适配你提供的API) try { SpeakerRecognition.initExtractor(assetManager) // 对齐原生API @@ -149,10 +163,10 @@ class VoiceController( cachePreBuffer(samples) wakeupManager.acceptAudio(samples) if (wakeupManager.consumeWakeupFlag()) { - handleWakeupEvent() + handleWakeupEvent() // 仅调用一次 // 注册唤醒用户特征(异步执行) CoroutineScope(Dispatchers.IO).launch { - var stream: OnlineStream? = null + var stream: OnlineStream? = null runCatching { val wakeupAudio = preBuffer.toFloatArray() if (wakeupAudio.isEmpty()) { @@ -160,42 +174,40 @@ class VoiceController( return@launch } - // 2. 创建原生Stream(按你提供的API) + // 创建原生Stream stream = SpeakerRecognition.extractor.createStream() - - stream.acceptWaveform(samples = preBuffer.toFloatArray(), sampleRate = sampleRate) + stream.acceptWaveform(samples = wakeupAudio, sampleRate = SAMPLE_RATE) stream.inputFinished() - // 4. 计算特征并注册(仅当前用户) + // 计算特征并注册(仅当前用户) if (SpeakerRecognition.extractor.isReady(stream)) { val embedding = SpeakerRecognition.extractor.compute(stream) - // 清空历史特征,确保当前用户唯一 - SpeakerRecognition.manager.remove(CURRENT_USER_ID) - // 注册当前唤醒用户(按你提供的add API) - val embeddingList: MutableList = mutableListOf() - embeddingList.add(embedding) - val ok = SpeakerRecognition.manager.add( - name = CURRENT_USER_ID, - embedding = embeddingList.toTypedArray() - ) - if (ok) { - LogUtils.d(TAG, "✅ 注册当前唤醒用户特征成功 | 特征长度: ${embedding.size}") - } else { - LogUtils.w(TAG, "❌ 注册当前唤醒用户特征失败(manager.add返回false)") + // 加锁保护 manager 操作 + speakerManagerLock.withLock { + SpeakerRecognition.manager.remove(CURRENT_USER_ID) + // 注册当前唤醒用户 + val embeddingList = mutableListOf(embedding) + val ok = SpeakerRecognition.manager.add( + name = CURRENT_USER_ID, + embedding = embeddingList.toTypedArray() + ) + if (ok) { + LogUtils.d(TAG, "✅ 注册当前唤醒用户特征成功 | 特征长度: ${embedding.size}") + } else { + LogUtils.w(TAG, "❌ 注册当前唤醒用户特征失败(manager.add返回false)") + } } } else { LogUtils.w(TAG, "❌ 唤醒音频Stream未就绪,跳过用户注册") } }.onFailure { - LogUtils.e(TAG, "❌ 唤醒用户特征注册失败", it.message) + LogUtils.e(TAG, "❌ 唤醒用户特征注册失败", it) }.also { - // 释放Stream(原生Stream用完即释放) + // 释放Stream stream?.release() LogUtils.d(TAG, "🔄 唤醒注册Stream已释放") } - } - handleWakeupEvent() return } @@ -410,7 +422,6 @@ class VoiceController( val now = System.currentTimeMillis() val duration = now - recordingStartMs - if (!vadStarted || duration < MIN_SPEECH_MS) { LogUtils.d(TAG, "❌ 语音过短: $duration ms | 基线: $currentEnvBaseline") hasInvalidSpeech = true @@ -563,7 +574,6 @@ class VoiceController( isMultiPersonDialogueDetected = false } - /* ================= 播放/上传/Reset 回调 ================= */ fun onPlayStartPrompt() { LogUtils.d(TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline") @@ -643,6 +653,26 @@ class VoiceController( resetRealtimeStats() hasInvalidSpeech = false currentTimeoutType = TimeoutType.IDLE_TIMEOUT + + // 释放声纹识别器资源 + runCatching { + SpeakerRecognition.extractor.release() + speakerManagerLock.withLock { + SpeakerRecognition.manager.release() + } + LogUtils.d(TAG, "✅ 声纹识别器资源已释放") + }.onFailure { + LogUtils.e(TAG, "❌ 释放声纹识别器资源失败", it) + } + } + + // 兜底释放(防止未调用release) + protected fun finalize() { + runCatching { + release() + }.onFailure { + LogUtils.e(TAG, "❌ finalize 释放资源失败", it) + } } private fun cachePreBuffer(samples: FloatArray) { @@ -667,36 +697,53 @@ class VoiceController( * @return true=是当前用户,false=非当前用户 */ private fun verifySpeaker(audio: FloatArray): Boolean { - var stream: OnlineStream? = null - return try { - stream = SpeakerRecognition.extractor.createStream() - stream.acceptWaveform(samples = audio, sampleRate = sampleRate) - stream.inputFinished() + if (audio.isEmpty()) { + LogUtils.w(TAG, "❌ 待验证音频为空,声纹验证失败") + return false + } - // 4. 计算特征并验证(按你提供的API) - if (!SpeakerRecognition.extractor.isReady(stream)) { - LogUtils.w(TAG, "❌ 验证音频Stream未就绪,验证失败") - return false - } + var stream: OnlineStream? = null + return try { + stream = SpeakerRecognition.extractor.createStream() + stream.acceptWaveform(samples = audio, sampleRate = SAMPLE_RATE) + stream.inputFinished() - val embedding = SpeakerRecognition.extractor.compute(stream) - - val verifyPass = SpeakerRecognition.manager.verify(name = CURRENT_USER_ID, embedding = embedding, threshold = SPEAKER_VERIFY_THRESHOLD) - if (verifyPass) { - LogUtils.d(TAG, "✅ 声纹验证通过") - } else { - LogUtils.w(TAG, "❌ 声纹验证失败") - } - verifyPass - } catch (e: Exception) { - LogUtils.e(TAG, "❌ 声纹验证异常", e) - false - } finally { - // 释放Stream(原生Stream用完即释放) - stream?.release() - LogUtils.d(TAG, "🔄 验证Stream已释放") + if (!SpeakerRecognition.extractor.isReady(stream)) { + LogUtils.w(TAG, "❌ 验证音频Stream未就绪,验证失败") + return false } + val embedding = SpeakerRecognition.extractor.compute(stream) + // 动态选择阈值 + val threshold = if (audio.size < SHORT_AUDIO_THRESHOLD) { + LogUtils.d(TAG, "📢 检测到短速语音,使用放宽阈值: $SPEAKER_VERIFY_THRESHOLD_SHORT") + SPEAKER_VERIFY_THRESHOLD_SHORT + } else { + SPEAKER_VERIFY_THRESHOLD_NORMAL + } + + // 加锁验证 + speakerManagerLock.withLock { + val verifyPass = SpeakerRecognition.manager.verify( + name = CURRENT_USER_ID, + embedding = embedding, + threshold = threshold + ) + if (verifyPass) { + LogUtils.d(TAG, "✅ 声纹验证通过 | 阈值: $threshold") + } else { + LogUtils.w(TAG, "❌ 声纹验证失败 | 阈值: $threshold") + } + return verifyPass + } + } catch (e: Exception) { + LogUtils.e(TAG, "❌ 声纹验证异常", e) + false + } finally { + // 释放Stream + stream?.release() + LogUtils.d(TAG, "🔄 验证Stream已释放") + } } } \ No newline at end of file