diff --git a/app/src/main/assets/silero_vad.onnx b/app/src/main/assets/silero_vad.onnx index d1b57f7..80c5592 100644 Binary files a/app/src/main/assets/silero_vad.onnx and b/app/src/main/assets/silero_vad.onnx differ diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt index fc83c8b..813eb9f 100644 --- a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt @@ -18,10 +18,13 @@ class VadManager( private var isSpeaking = false private var lastSpeechMs = 0L private var lastActiveMs = 0L + private var speechStartMs = 0L // 新增:记录语音启动时间,用于启动保护 - private val END_SILENCE_MS = 350L - private val RESET_IDLE_MS = 3_000L - private val MIN_RMS = 0.002f + // 核心参数优化:适配中文说话停顿 + private val END_SILENCE_MS = 350L // 静音多久判定结束(350→600,留足停顿缓冲) + private val RESET_IDLE_MS = 3_000L // 超长空闲重置时间,保持不变 + private val MIN_RMS = 0.001f // 能量检测阈值(0.002→0.001,降低误判静音) + private val SPEECH_START_PROTECT_MS = 90L // 语音启动保护期,避免开头停顿误判 init { vad = Vad( @@ -29,7 +32,7 @@ class VadManager( VadModelConfig( sileroVadModelConfig = SileroVadModelConfig( model = "silero_vad.onnx", - threshold = 0.5F, + threshold = 0.40F, // 模型敏感度(0.45→0.40,降低误判) minSilenceDuration = 0.1F, minSpeechDuration = 0.25F, windowSize = 512, @@ -39,20 +42,21 @@ class VadManager( provider = "cpu" ) ) - LogUtils.i(TAG, "✅ VAD init") + LogUtils.i(TAG, "✅ VAD init(已优化抗停顿参数)") } fun accept(samples: FloatArray) { + if (samples.isEmpty()) return val now = System.currentTimeMillis() - // 1️⃣ 先快速 RMS 判断 + // 1️⃣ 快速RMS能量判断,过滤纯静音 val rms = fastRms(samples) if (rms < MIN_RMS) { handleSilence(now) return } - // 2️⃣ 有能量再喂 VAD + // 2️⃣ 有有效能量才喂给VAD模型,减少计算 vad.acceptWaveform(samples) val hasSpeech = vad.isSpeechDetected() @@ -61,7 +65,9 @@ class VadManager( lastActiveMs = now if (!isSpeaking) { isSpeaking = true + speechStartMs = now onSpeechStart() + LogUtils.d(TAG, "🗣 VAD检测到语音开始") } } else { handleSilence(now) @@ -69,12 +75,16 @@ class VadManager( } private fun handleSilence(now: Long) { - if (isSpeaking && now - lastSpeechMs > END_SILENCE_MS) { + // 语音启动保护期内 + 静音超阈值,才判定结束 + if (isSpeaking + && now - speechStartMs > SPEECH_START_PROTECT_MS + && now - lastSpeechMs > END_SILENCE_MS) { isSpeaking = false onSpeechEnd() + LogUtils.d(TAG, "🔇 VAD检测到语音结束(静音超${END_SILENCE_MS}ms)") } - // 超长 idle 才 reset + // 超长空闲重置VAD,避免状态残留 if (!isSpeaking && now - lastActiveMs > RESET_IDLE_MS) { vad.reset() lastActiveMs = now @@ -82,6 +92,7 @@ class VadManager( } } + // 快速RMS计算,步采样减少计算量,保持原有逻辑 private fun fastRms(samples: FloatArray): Float { var sum = 0f var count = 0 @@ -93,14 +104,17 @@ class VadManager( count++ i += step } - return sqrt(sum / count) + // 避免除0异常(极端情况count=0) + return if (count == 0) 0f else sqrt(sum / count) } + // 重置所有状态,包括新增的语音启动时间 fun reset() { isSpeaking = false - lastSpeechMs = 0 - lastActiveMs = 0 + lastSpeechMs = 0L + lastActiveMs = 0L + speechStartMs = 0L vad.reset() + LogUtils.d(TAG, "🔄 VAD手动重置所有状态") } -} - +} \ No newline at end of file diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt index 87a5ef1..17211ae 100644 --- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt @@ -36,7 +36,7 @@ class VoiceController( // 统一的声纹验证阈值(不再分场景) - private const val SPEAKER_THRESHOLD = 0.38f + private const val SPEAKER_THRESHOLD = 0.36f private const val MIN_VERIFY_MS = 600L private const val MAX_VERIFY_MS = 1200L diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/WakeupManager.kt b/app/src/main/java/com/zs/smarthuman/sherpa/WakeupManager.kt index c70311f..0df8bd4 100644 --- a/app/src/main/java/com/zs/smarthuman/sherpa/WakeupManager.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/WakeupManager.kt @@ -41,9 +41,9 @@ class WakeupManager(assetManager: AssetManager, function: () -> Unit) { /** ⭐ 永远喂 KWS */ fun acceptAudio(samples: FloatArray) { val s = stream ?: return -// for (i in samples.indices) { -// samples[i] *= 2.5f -// } + for (i in samples.indices) { + samples[i] *= 2.5f + } s.acceptWaveform(samples, sampleRate) while (kws.isReady(s)) { diff --git a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt index ed2ace4..1ba791d 100644 --- a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt +++ b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt @@ -14,6 +14,7 @@ import android.media.audiofx.NoiseSuppressor import android.os.Bundle import android.os.Environment import android.os.IBinder +import android.os.Looper import android.text.TextUtils import android.util.Base64 import android.util.Log @@ -206,7 +207,9 @@ class MainActivity : BaseViewModelActivity() onWakeup = { cancelSSE() voicePlayer?.onWakeupStop() - UnityPlayerHolder.getInstance().cancelPCM() + if (backPlaying || promptPlaying){ + UnityPlayerHolder.getInstance().cancelPCM() + } UnityPlayerHolder.getInstance() .sendVoiceToUnity(/*"https://static.seerteach.net/aidialogue/userWakeUpAudio/344.mp3"*/UserInfoManager.userInfo?.wakeUpAudioUrl?:"" )