diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt
index 61115c7..fc83c8b 100644
--- a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt
@@ -12,13 +12,15 @@ class VadManager(
 ) {
 
     private val TAG = "VadManager"
 
+    private val vad: Vad
     private var isSpeaking = false
     private var lastSpeechMs = 0L
+    private var lastActiveMs = 0L
 
-    /** End speech more decisively */
     private val END_SILENCE_MS = 350L
+    private val RESET_IDLE_MS = 3_000L
 
     private val MIN_RMS = 0.002f
 
     init {
@@ -43,31 +45,62 @@ class VadManager(
     fun accept(samples: FloatArray) {
         val now = System.currentTimeMillis()
 
+        // 1️⃣ Fast RMS check first
+        val rms = fastRms(samples)
+        if (rms < MIN_RMS) {
+            handleSilence(now)
+            return
+        }
+
+        // 2️⃣ Only feed the VAD when the frame has energy
         vad.acceptWaveform(samples)
         val hasSpeech = vad.isSpeechDetected()
-        val rms = calcRms(samples)
 
-        if (hasSpeech && rms >= MIN_RMS) {
+        if (hasSpeech) {
             lastSpeechMs = now
+            lastActiveMs = now
             if (!isSpeaking) {
                 isSpeaking = true
                 onSpeechStart()
             }
-        } else if (isSpeaking && now - lastSpeechMs > END_SILENCE_MS) {
-            onSpeechEnd()
-            reset()
+        } else {
+            handleSilence(now)
         }
     }
 
+    private fun handleSilence(now: Long) {
+        if (isSpeaking && now - lastSpeechMs > END_SILENCE_MS) {
+            isSpeaking = false
+            onSpeechEnd()
+        }
+
+        // Only reset after a long idle period
+        if (!isSpeaking && now - lastActiveMs > RESET_IDLE_MS) {
+            vad.reset()
+            lastActiveMs = now
+            LogUtils.d(TAG, "🔄 VAD reset (idle)")
+        }
+    }
+
+    private fun fastRms(samples: FloatArray): Float {
+        var sum = 0f
+        var count = 0
+        var i = 0
+        val step = 4
+        while (i < samples.size) {
+            val v = samples[i]
+            sum += v * v
+            count++
+            i += step
+        }
+        return sqrt(sum / count)
+    }
+
     fun reset() {
         isSpeaking = false
         lastSpeechMs = 0
+        lastActiveMs = 0
         vad.reset()
     }
-
-    private fun calcRms(samples: FloatArray): Float {
-        var sum = 0f
-        for (v in samples) sum += v * v
-        return sqrt(sum / samples.size)
-    }
 }
+
diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
index 39b3ec8..87a5ef1 100644
--- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
@@ -34,11 +34,12 @@
 
         private const val INVALID_RESET_DEBOUNCE_MS = 1500L
 
-        // Minimum speech duration
-        private const val MIN_SPEECH_MS = 600L
 
         // Unified speaker-verification threshold (no longer scenario-specific)
-        private const val SPEAKER_THRESHOLD = 0.45f
+        private const val SPEAKER_THRESHOLD = 0.38f
+
+        private const val MIN_VERIFY_MS = 600L
+        private const val MAX_VERIFY_MS = 1200L
     }
 
     var state: VoiceState = VoiceState.WAIT_WAKEUP
@@ -219,23 +220,59 @@
         onWakeup()
         LogUtils.d(TAG, "🔔 Wakeup succeeded")
     }
 
+    @Volatile private var speakerVerifyFinished = false
+    @Volatile private var speakerVerifyPassed = true
 
     private fun onVadStart() {
         if (state != VoiceState.WAIT_SPEECH) return
-        LogUtils.d(TAG, "🎤 REAL VAD START")
+
         vadStarted = true
         recordingStartMs = System.currentTimeMillis()
         audioBuffer.clear()
         audioBuffer.addAll(preBuffer)
+
+        startAsyncSpeakerVerify()
+
         state = VoiceState.RECORDING
     }
+
     private fun onVadEnd() {
         if (state != VoiceState.RECORDING) return
         LogUtils.d(TAG, "🧠 VAD END")
         finishSentence()
     }
 
+    private fun startAsyncSpeakerVerify() {
+        speakerVerifyFinished = false
+        speakerVerifyPassed = true // fail-open
+
+        CoroutineScope(Dispatchers.IO).launch {
+            // Wait for 600 ms of audio
+            val needSamples = SAMPLE_RATE * 600 / 1000
+            var waited = 0L
+
+            while (audioBuffer.size < needSamples && waited < 800) {
+                kotlinx.coroutines.delay(20)
+                waited += 20
+            }
+
+            if (audioBuffer.size < needSamples) {
+                speakerVerifyFinished = true
+                return@launch
+            }
+
+            val input = audioBuffer
+                .takeLast(needSamples)
+                .toFloatArray()
+
+            val pass = verifySpeaker(input)
+            speakerVerifyPassed = pass
+            speakerVerifyFinished = true
+        }
+    }
+
+    /* ================= Finish recording ================= */
     private fun finishSentence() {
         val now = System.currentTimeMillis()
 
@@ -251,17 +288,17 @@
         val audio = audioBuffer.toFloatArray()
 
         // Speaker verification (core logic kept)
-        if (ENABLE_STRICT_SPEAKER_VERIFY) {
-            val isCurrentUser = verifySpeaker(audio)
-            if (!isCurrentUser) {
-                LogUtils.w(TAG, "❌ Not the user who triggered wakeup, rejecting speech | recording duration: $duration ms")
-                hasInvalidSpeech = true
-                resetToWaitSpeech()
-                return
-            }
-            LogUtils.d(TAG, "✅ Current user's speech, continuing | recording duration: $duration ms")
+        if (ENABLE_STRICT_SPEAKER_VERIFY &&
+            speakerVerifyFinished &&
+            !speakerVerifyPassed
+        ) {
+            LogUtils.w(TAG, "❌ Speaker verification failed (finished), rejecting")
+            hasInvalidSpeech = true
+            resetToWaitSpeech()
+            return
         }
 
+        // Passed, continue
        audioBuffer.clear()
        state = VoiceState.UPLOADING
 
@@ -390,65 +427,62 @@
 
     }
 
+
+
     private fun verifySpeaker(audio: FloatArray): Boolean {
-        if (audio.isEmpty()) {
-            LogUtils.w(TAG, "❌ Audio to verify is empty, speaker verification failed")
-            return false
+        if (audio.isEmpty()) return true
+
+        val audioMs = audio.size * 1000L / SAMPLE_RATE
+        if (audioMs < MIN_VERIFY_MS) {
+            LogUtils.d(TAG, "🟡 Short audio $audioMs ms, skipping speaker verification")
+            return true
         }
 
-        // 1. Record the verification start time (key: measure processing cost)
         val verifyStartMs = System.currentTimeMillis()
 
-        // 2. Original audio-trimming logic (kept)
-        val audioDurationMs = (audio.size.toFloat() / SAMPLE_RATE * 1000).toLong()
-        val validAudio = if (audioDurationMs > 0) {
-            val validSampleCount = (audioDurationMs * SAMPLE_RATE / 1000).toInt()
-            if (validSampleCount < audio.size) {
-                audio.copyOfRange(audio.size - validSampleCount, audio.size)
-            } else {
-                audio
-            }
+        val maxSamples = (SAMPLE_RATE * MAX_VERIFY_MS / 1000).toInt()
+        val input = if (audio.size > maxSamples) {
+            audio.copyOfRange(audio.size - maxSamples, audio.size)
         } else {
             audio
         }
+
         var stream: OnlineStream? = null
 
         return runCatching {
             stream = SpeakerRecognition.extractor.createStream()
-            stream.acceptWaveform(samples = validAudio, sampleRate = SAMPLE_RATE)
+            stream.acceptWaveform(input, SAMPLE_RATE)
             stream.inputFinished()
 
             if (!SpeakerRecognition.extractor.isReady(stream)) {
-                LogUtils.w(TAG, "❌ Audio stream not ready, verification failed")
-                return@runCatching false
+                LogUtils.w(TAG, "⚠️ Stream not ready, letting it pass")
+                return@runCatching true
             }
 
             val embedding = SpeakerRecognition.extractor.compute(stream)
 
-            speakerManagerLock.withLock {
-                val verifyPass = SpeakerRecognition.manager.verify(
-                    name = CURRENT_USER_ID,
-                    embedding = embedding,
-                    threshold = SPEAKER_THRESHOLD
-                )
-                // 3. Compute the actual processing cost (end time - start time)
-                val verifyCostMs = System.currentTimeMillis() - verifyStartMs
-                // The log distinguishes audio duration from processing cost
-                LogUtils.d(
-                    TAG,
-                    "📊 Speaker verification | unified threshold: $SPEAKER_THRESHOLD | passed: $verifyPass | audio duration: $audioDurationMs ms | processing cost: $verifyCostMs ms"
+            val pass = speakerManagerLock.withLock {
+                SpeakerRecognition.manager.verify(
+                    CURRENT_USER_ID,
+                    embedding,
+                    SPEAKER_THRESHOLD
                 )
-                verifyPass
             }
-        }.onFailure { e ->
-            LogUtils.e(TAG, "❌ Speaker verification exception, rejecting", e)
+
+            val cost = System.currentTimeMillis() - verifyStartMs
+            LogUtils.d(
+                TAG,
+                "📊 Speaker | pass=$pass | audio=${audioMs}ms | input=${input.size} | cost=${cost}ms"
+            )
+
+            pass
+        }.onFailure {
+            LogUtils.e(TAG, "❌ Speaker verification exception, letting it pass", it)
         }.also {
-            runCatching {
-                stream?.release()
-            }.onFailure { e ->
-                LogUtils.w(TAG, "⚠️ Failed to release the stream", e)
-            }
-        }.getOrDefault(false)
+            runCatching { stream?.release() }
+        }.getOrDefault(true)
     }
 
+
+
 }
\ No newline at end of file
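
Reviewer note: a minimal, standalone sketch (not part of the patch) of the strided-RMS idea behind the new fastRms() above. Summing every 4th sample approximates the full-sample RMS closely enough to gate obvious silence before a frame ever reaches the VAD. The helper names (fullRms, stridedRms), the 440 Hz test tone, and the 100 ms frame below are hypothetical and purely illustrative; only the MIN_RMS = 0.002f threshold and the stride of 4 come from the patch.

    import kotlin.math.abs
    import kotlin.math.sin
    import kotlin.math.sqrt

    // Full RMS over every sample (the shape of the removed calcRms()).
    fun fullRms(samples: FloatArray): Float {
        var sum = 0f
        for (v in samples) sum += v * v
        return sqrt(sum / samples.size)
    }

    // Strided RMS over every 4th sample (the shape of the new fastRms()).
    fun stridedRms(samples: FloatArray, step: Int = 4): Float {
        var sum = 0f
        var count = 0
        var i = 0
        while (i < samples.size) {
            val v = samples[i]
            sum += v * v
            count++
            i += step
        }
        return sqrt(sum / count)
    }

    fun main() {
        // Hypothetical 100 ms frame: a 440 Hz tone at 16 kHz with peak amplitude 0.1.
        val frame = FloatArray(1600) { i ->
            (0.1 * sin(2.0 * Math.PI * 440.0 * i / 16000.0)).toFloat()
        }
        val full = fullRms(frame)
        val fast = stridedRms(frame)
        // Both values land near 0.07, far above the MIN_RMS = 0.002f silence gate,
        // while the strided version touches only a quarter of the samples.
        println("fullRms=$full stridedRms=$fast diff=${abs(full - fast)}")
    }

At 16 kHz input a stride of 4 still samples the energy envelope at an effective 4 kHz, so the silence gate stays reliable while doing roughly a quarter of the multiply-adds per frame.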