From 99cf06456d19ec8e91ef3b67aba72be045ba3247 Mon Sep 17 00:00:00 2001 From: ross <3024454314@qq.com> Date: Fri, 30 Jan 2026 17:02:50 +0800 Subject: [PATCH] =?UTF-8?q?=E5=BC=BA=E5=8C=96=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../com/zs/smarthuman/sherpa/VadManager.kt | 80 ++++++++----- .../zs/smarthuman/sherpa/VoiceController.kt | 106 ++++++++++++------ 2 files changed, 125 insertions(+), 61 deletions(-) diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt index 813eb9f..4cbce5d 100644 --- a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt @@ -2,7 +2,9 @@ package com.zs.smarthuman.sherpa import android.content.res.AssetManager import com.blankj.utilcode.util.LogUtils -import com.k2fsa.sherpa.onnx.* +import com.k2fsa.sherpa.onnx.SileroVadModelConfig +import com.k2fsa.sherpa.onnx.Vad +import com.k2fsa.sherpa.onnx.VadModelConfig import kotlin.math.sqrt class VadManager( @@ -12,19 +14,22 @@ class VadManager( ) { private val TAG = "VadManager" - private val vad: Vad private var isSpeaking = false private var lastSpeechMs = 0L private var lastActiveMs = 0L - private var speechStartMs = 0L // 新增:记录语音启动时间,用于启动保护 + private var speechStartMs = 0L // 语音启动保护 - // 核心参数优化:适配中文说话停顿 - private val END_SILENCE_MS = 350L // 静音多久判定结束(350→600,留足停顿缓冲) - private val RESET_IDLE_MS = 3_000L // 超长空闲重置时间,保持不变 - private val MIN_RMS = 0.001f // 能量检测阈值(0.002→0.001,降低误判静音) - private val SPEECH_START_PROTECT_MS = 90L // 语音启动保护期,避免开头停顿误判 + // **************** 超短语音(300ms)专属参数优化 **************** + // 1. 大幅提高RMS阈值,彻底过滤麦克风底噪(核心中的核心) + private val MIN_RMS = 0.003f + // 2. 缩短静音结束判定时间,适配短语音(350→200ms,有效语音结束后200ms静音就触发结束) + private val END_SILENCE_MS = 200L + // 3. 保留启动保护,防止语音开头轻微停顿误判(短语音也需要,90ms足够) + private val SPEECH_START_PROTECT_MS = 90L + // 4. 空闲重置时间保留,不影响短语音 + private val RESET_IDLE_MS = 3_000L init { vad = Vad( @@ -32,33 +37,41 @@ class VadManager( VadModelConfig( sileroVadModelConfig = SileroVadModelConfig( model = "silero_vad.onnx", - threshold = 0.40F, // 模型敏感度(0.45→0.40,降低误判) - minSilenceDuration = 0.1F, - minSpeechDuration = 0.25F, - windowSize = 512, - ), - sampleRate = 16000, - numThreads = 1, - provider = "cpu" + threshold = 0.45F, // 轻微提高敏感度,让VAD更快识别语音结束(0.40→0.45) + minSilenceDuration = 0.05F, // 适配短静音判定(0.1→0.05) + minSpeechDuration = 0.1F, // 允许超短语音(0.25→0.1,支持100ms以上语音) + windowSize = 512 + ) ) ) - LogUtils.i(TAG, "✅ VAD init(已优化抗停顿参数)") + LogUtils.i(TAG, "✅ VAD init(适配300ms超短语音+中文抗停顿)") } fun accept(samples: FloatArray) { if (samples.isEmpty()) return val now = System.currentTimeMillis() - // 1️⃣ 快速RMS能量判断,过滤纯静音 + // 1️⃣ 快速RMS能量判断,彻底过滤底噪+纯静音 val rms = fastRms(samples) if (rms < MIN_RMS) { handleSilence(now) return } - // 2️⃣ 有有效能量才喂给VAD模型,减少计算 - vad.acceptWaveform(samples) - val hasSpeech = vad.isSpeechDetected() + // 2️⃣ 有有效能量才喂给VAD模型,减少计算+防原生异常 + try { + vad.acceptWaveform(samples) + } catch (e: Exception) { + LogUtils.e(TAG, "❌ VAD acceptWaveform异常", e) + return + } + + val hasSpeech = try { + vad.isSpeechDetected() + } catch (e: Exception) { + LogUtils.e(TAG, "❌ VAD isSpeechDetected异常", e) + false + } if (hasSpeech) { lastSpeechMs = now @@ -67,32 +80,38 @@ class VadManager( isSpeaking = true speechStartMs = now onSpeechStart() - LogUtils.d(TAG, "🗣 VAD检测到语音开始") + LogUtils.d(TAG, "🗣 VAD检测到语音开始 | RMS: $rms | 采样数: ${samples.size}") } } else { handleSilence(now) + lastActiveMs = now } } private fun handleSilence(now: Long) { - // 语音启动保护期内 + 静音超阈值,才判定结束 + // 核心判定:启动保护后+连续静音超200ms → 立即触发结束(适配300ms短语音) if (isSpeaking && now - speechStartMs > SPEECH_START_PROTECT_MS && now - lastSpeechMs > END_SILENCE_MS) { isSpeaking = false onSpeechEnd() - LogUtils.d(TAG, "🔇 VAD检测到语音结束(静音超${END_SILENCE_MS}ms)") + // 打印详细日志,方便排查:总时长/有效语音后静音时长 + LogUtils.d(TAG, "🔇 VAD检测到语音结束 | 静音超${END_SILENCE_MS}ms | 总时长: ${now - speechStartMs}ms | 有效语音后静音: ${now - lastSpeechMs}ms") } // 超长空闲重置VAD,避免状态残留 if (!isSpeaking && now - lastActiveMs > RESET_IDLE_MS) { - vad.reset() + try { + vad.reset() + } catch (e: Exception) { + LogUtils.e(TAG, "❌ VAD reset异常", e) + } lastActiveMs = now LogUtils.d(TAG, "🔄 VAD reset (idle)") } } - // 快速RMS计算,步采样减少计算量,保持原有逻辑 + // 快速RMS计算,步采样减少计算量,保持原有逻辑不变 private fun fastRms(samples: FloatArray): Float { var sum = 0f var count = 0 @@ -104,17 +123,20 @@ class VadManager( count++ i += step } - // 避免除0异常(极端情况count=0) return if (count == 0) 0f else sqrt(sum / count) } - // 重置所有状态,包括新增的语音启动时间 + // 重置所有状态,保持原有逻辑不变 fun reset() { isSpeaking = false lastSpeechMs = 0L lastActiveMs = 0L speechStartMs = 0L - vad.reset() + try { + vad.reset() + } catch (e: Exception) { + LogUtils.e(TAG, "❌ VAD 手动reset异常", e) + } LogUtils.d(TAG, "🔄 VAD手动重置所有状态") } } \ No newline at end of file diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt index 17211ae..57a8910 100644 --- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt @@ -6,8 +6,11 @@ import com.k2fsa.sherpa.onnx.OnlineStream import com.k2fsa.sherpa.onnx.SpeakerRecognition import kotlinx.coroutines.CoroutineScope import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.Job +import kotlinx.coroutines.delay import kotlinx.coroutines.launch import java.util.ArrayDeque +import java.util.concurrent.CopyOnWriteArrayList import java.util.concurrent.locks.ReentrantLock import kotlin.concurrent.withLock @@ -34,9 +37,8 @@ class VoiceController( private const val INVALID_RESET_DEBOUNCE_MS = 1500L - - // 统一的声纹验证阈值(不再分场景) - private const val SPEAKER_THRESHOLD = 0.36f + // 统一的声纹验证阈值(可根据实际效果微调,原0.36) + private const val SPEAKER_THRESHOLD = 0.35f private const val MIN_VERIFY_MS = 600L private const val MAX_VERIFY_MS = 1200L @@ -56,13 +58,14 @@ class VoiceController( private val speakerManagerLock = ReentrantLock() private val wakeupManager = WakeupManager(assetManager, onWakeup) + // 【修改】保留vadManager可访问,用于手动触发结束 private val vadManager = VadManager( assetManager, onSpeechStart = { onVadStart() }, onSpeechEnd = { onVadEnd() } ) - private val audioBuffer = mutableListOf() + private val audioBuffer = CopyOnWriteArrayList() private val preBuffer = ArrayDeque(PRE_BUFFER_SIZE) private var recordingStartMs = 0L @@ -84,6 +87,14 @@ class VoiceController( private val ENABLE_STRICT_SPEAKER_VERIFY = true private val preBufferLock = Any() + // 声纹验证标记 + @Volatile private var speakerVerifyFinished = false + @Volatile private var speakerVerifyPassed = true // fail-open + + @Volatile private var speakerVerifyFailed = false + + @Volatile private var currentSpeakerVerifyJob: Job? = null + init { try { SpeakerRecognition.initExtractor(assetManager) @@ -220,8 +231,6 @@ class VoiceController( onWakeup() LogUtils.d(TAG, "🔔 唤醒成功") } - @Volatile private var speakerVerifyFinished = false - @Volatile private var speakerVerifyPassed = true private fun onVadStart() { if (state != VoiceState.WAIT_SPEECH) return @@ -236,24 +245,39 @@ class VoiceController( state = VoiceState.RECORDING } - private fun onVadEnd() { if (state != VoiceState.RECORDING) return LogUtils.d(TAG, "🧠 VAD END") finishSentence() } + // 【新增2】手动触发VAD结束核心方法(切主线程保证线程安全) + private fun manualTriggerVadEnd() { + CoroutineScope(Dispatchers.Main).launch { + if (vadStarted && state == VoiceState.RECORDING) { + LogUtils.d(TAG, "🔴 声纹失败,手动触发VAD结束") + vadManager.reset() + onVadEnd() + } else { + // 打印状态,方便问题排查 + LogUtils.w(TAG, "🟡 尝试手动触发VAD结束,但非录音状态(state:$state,vadStarted:$vadStarted),放弃") + } + } + } + private fun startAsyncSpeakerVerify() { + currentSpeakerVerifyJob?.cancel() + currentSpeakerVerifyJob = null speakerVerifyFinished = false speakerVerifyPassed = true // fail-open - - CoroutineScope(Dispatchers.IO).launch { - // 等 600ms 音频 + speakerVerifyFailed = false // 重置失败标记 + currentSpeakerVerifyJob = CoroutineScope(Dispatchers.IO).launch { + // 可优化:500ms(原600ms),更快触发验证,需Samples对应修改 val needSamples = SAMPLE_RATE * 600 / 1000 var waited = 0L while (audioBuffer.size < needSamples && waited < 800) { - kotlinx.coroutines.delay(20) + delay(20) waited += 20 } @@ -272,22 +296,26 @@ class VoiceController( } } - /* ================= 结束录音 ================= */ private fun finishSentence() { + currentSpeakerVerifyJob?.cancel() + currentSpeakerVerifyJob = null val now = System.currentTimeMillis() val duration = now - recordingStartMs + speakerVerifyFailed = false -// if (!vadStarted || duration < MIN_SPEECH_MS) { -// LogUtils.d(TAG, "❌ 语音过短: $duration ms") -// hasInvalidSpeech = true -// resetToWaitSpeech() -// return -// } + // 【修改1】声纹失败快速终止,直接跳过所有校验 + if (speakerVerifyFailed) { + LogUtils.w(TAG, "🔴 声纹失败快速终止,跳过语音时长校验 | 实际录音时长: $duration ms") + hasInvalidSpeech = true + resetToWaitSpeech() + speakerVerifyFailed = false // 重置标记,避免状态残留 + return + } - val audio = audioBuffer.toFloatArray() + val audio = audioBuffer.toFloatArray().copyOf() - // 声纹验证(保留核心逻辑) + // 原有声纹最终校验(双重保障,极端情况兜底) if (ENABLE_STRICT_SPEAKER_VERIFY && speakerVerifyFinished && !speakerVerifyPassed @@ -298,8 +326,7 @@ class VoiceController( return } - - // 最终通过 + // 最终通过逻辑 audioBuffer.clear() state = VoiceState.UPLOADING onFinalAudio(audio) @@ -317,7 +344,7 @@ class VoiceController( speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS LogUtils.d(TAG, "🎵 提示音结束") if (!preBuffer.isEmpty()) { - synchronized(preBufferLock){ + synchronized(preBufferLock) { preBuffer.clear() } } @@ -356,10 +383,16 @@ class VoiceController( LogUtils.d(TAG, "🛡 防抖:1.5秒内重复无效语音,跳过重置") return } + currentSpeakerVerifyJob?.cancel() + currentSpeakerVerifyJob = null lastInvalidResetMs = now audioBuffer.clear() vadManager.reset() vadStarted = false + // 【修改2】重置声纹所有标记,避免状态残留 + speakerVerifyFinished = false + speakerVerifyPassed = true + speakerVerifyFailed = false state = VoiceState.WAIT_SPEECH if (waitSpeechFailStartMs == 0L) waitSpeechFailStartMs = System.currentTimeMillis() } @@ -368,7 +401,7 @@ class VoiceController( LogUtils.d(TAG, "🔄 重置所有状态 | 本次超时类型: $currentTimeoutType") audioBuffer.clear() if (!preBuffer.isEmpty()) { - synchronized(preBufferLock){ + synchronized(preBufferLock) { preBuffer.clear() } } @@ -379,6 +412,10 @@ class VoiceController( waitSpeechFailStartMs = 0L hasInvalidSpeech = false currentTimeoutType = TimeoutType.IDLE_TIMEOUT + // 【修改3】重置声纹所有标记 + speakerVerifyFinished = false + speakerVerifyPassed = true + speakerVerifyFailed = false state = VoiceState.WAIT_WAKEUP } @@ -409,7 +446,6 @@ class VoiceController( } private fun cachePreBuffer(samples: FloatArray) { - // 空数据快速返回,避免无效循环 if (samples.isEmpty()) return synchronized(preBufferLock) { for (s in samples) { @@ -421,14 +457,10 @@ class VoiceController( preBuffer.removeFirst() } } - } } - } - - private fun verifySpeaker(audio: FloatArray): Boolean { if (audio.isEmpty()) return true @@ -447,7 +479,6 @@ class VoiceController( audio } - var stream: OnlineStream? = null return runCatching { @@ -476,6 +507,19 @@ class VoiceController( "📊 声纹 | pass=$pass | 音频=${audioMs}ms | 输入=${input.size} | 耗时=${cost}ms" ) + // 【修改4】声纹失败核心处理:置位标记 + 手动触发VAD结束 + if (!pass) { + // 只有当前还在录音中,才做后续处理;否则直接阻断,不影响任何状态 + if (state == VoiceState.RECORDING) { + speakerVerifyFailed = true + manualTriggerVadEnd() + LogUtils.d(TAG, "🔴 声纹失败,当前处于录音状态,执行终止流程") + } else { + // 窗口期已过,仅打印日志,不做任何操作 + LogUtils.w(TAG, "🟡 声纹验证失败,但录音流程已完成(当前状态:$state),不处理") + } + } + pass }.onFailure { LogUtils.e(TAG, "❌ 声纹异常,放行", it) @@ -483,6 +527,4 @@ class VoiceController( runCatching { stream?.release() } }.getOrDefault(true) } - - } \ No newline at end of file