diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt
index d8def4e..8e589d6 100644
--- a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt
@@ -16,39 +16,41 @@ class VadManager(
     private var isSpeaking = false
     private var lastSpeechTime = 0L
 
-    // ========== 核心调整:适配人类正常说话停顿 ==========
-    private val END_SILENCE_MS = 1500L // 基础静默阈值(1.5秒)
-    private val MAX_SILENCE_AFTER_SPEECH_MS = 3000L// 兜底静默阈值(3秒)
-    private val SPEECH_ACTIVE_DURATION = 5000L // 语音活跃期(5秒内容忍更长停顿)
+    // ========== Core tuning: separate thresholds for the active phase and the final phase ==========
+    // Active phase (tolerates pauses while the user is still talking)
+    private val ACTIVE_END_SILENCE_MS = 1500L // base silence timeout while active (keeps pause tolerance)
+    private val ACTIVE_CONSECUTIVE_FRAMES = 10 // consecutive silent frames required while active
+    // Final phase (end the utterance quickly)
+    private val FINAL_END_SILENCE_MS = 800L // base silence timeout in the final phase (shortened to 800 ms)
+    private val FINAL_CONSECUTIVE_FRAMES = 5 // consecutive silent frames in the final phase (~100 ms if a frame is 20 ms — TODO confirm)
+    // Final-phase trigger: silence since the last effective-speech frame after which the final phase starts.
+    private val FINAL_PHASE_TRIGGER_MS = 1000L // NOTE(review): >= FINAL_END_SILENCE_MS, so as far as this patch shows an utterance ends at ~1 s of silence and the ACTIVE_*/MAX_* timeouts never decide — confirm intent
 
-    // ========== 核心调整:降低有效语音阈值 ==========
-    private val MIN_EFFECTIVE_SPEECH_RMS = 0.001f // 有效语音最小RMS
-    private val ENV_BASELINE_FACTOR = 1.2f // 环境基线倍数
-    private var envBaselineRms = 0.0005f // 初始环境基线
-    private var lastEffectiveSpeechTime = 0L // 最后一次有效语音时间戳
+    private val MAX_SILENCE_AFTER_SPEECH_MS = 2000L // fallback timeout, lowered from 3 s to 2 s
 
-    // ========== 新增:语音活跃期变量 ==========
-    private var isSpeechActive = false // 语音活跃期标记
-    private var speechActiveStartMs = 0L // 活跃期开始时间
+    // Pre-existing base configuration
+    private val MIN_EFFECTIVE_SPEECH_RMS = 0.001f
+    private val ENV_BASELINE_FACTOR = 1.2f
+    private var envBaselineRms = 0.0005f
+    private var lastEffectiveSpeechTime = 0L
 
-    // ========== 连续静音帧校验(放宽阈值) ==========
     private var consecutiveSilenceFrames = 0
-    private val CONSECUTIVE_SILENCE_FRAME_THRESHOLD = 10 // 连续10帧静音(200ms)
 
-    // 基础统计变量
-    private var activeFrameCount = 0
-    private var activeSpeechFrameCount = 0
+    // New: final-phase flag
+    private var isInFinalPhase = false
+
+    // Statistics
     private var speechEnergySum = 0f
     private var speechFrameCount = 0
     private var peakRms = 0f
-
-    // 连续性检测核心变量
     private var totalFrames = 0
     private var speechFrames = 0
     private var continuousSpeechFrames = 0
     private var lastFrameIsSpeech = false
     private var peakPosition = 0
     private var frameIndex = 0
+    private var activeFrameCount = 0
+    private var activeSpeechFrameCount = 0
 
     init {
         val config = getVadModelConfig(0) ?: throw IllegalStateException("[$TAG] VAD config not found")
@@ -62,42 +64,39 @@ class VadManager(
         val vadHasSpeech = vad.isSpeechDetected()
         val rms = calcRms(samples)
 
-        // 环境基线更新(滑动平均,适配背景噪音)
+        // Update the ambient-noise baseline
         if (!vadHasSpeech || rms < MIN_EFFECTIVE_SPEECH_RMS) {
             envBaselineRms = (envBaselineRms * 0.9f) + (rms * 0.1f)
         }
-        // 有效语音判定:阈值更低,更容易触发
         val effectiveSpeechThreshold = maxOf(MIN_EFFECTIVE_SPEECH_RMS, envBaselineRms * ENV_BASELINE_FACTOR)
         val isEffectiveSpeech = vadHasSpeech && rms >= effectiveSpeechThreshold
 
-        // ========== 核心优化:语音活跃期逻辑 ==========
+        // ========== Core change: decide dynamically whether we are in the final phase ==========
         if (isEffectiveSpeech) {
-            isSpeechActive = true
-            speechActiveStartMs = now // 重置活跃期
-        }
-        // 活跃期内(5秒)用2秒静默阈值,活跃期外用1.5秒
-        val dynamicEndSilenceMs = if (isSpeechActive && (now - speechActiveStartMs) < SPEECH_ACTIVE_DURATION) {
-            2000L
+            lastEffectiveSpeechTime = now
+            isInFinalPhase = false // effective speech seen: leave the final phase
         } else {
-            END_SILENCE_MS
+            // No effective speech for FINAL_PHASE_TRIGGER_MS (1 s) or more: enter the final phase
+            if (now - lastEffectiveSpeechTime >= FINAL_PHASE_TRIGGER_MS) {
+                isInFinalPhase = true
+            }
         }
 
-        // 语音能量统计(仅有效语音)
+        // Speech energy statistics
         if (isEffectiveSpeech) {
             speechEnergySum += rms
             speechFrameCount++
             peakRms = maxOf(peakRms, rms)
-            lastEffectiveSpeechTime = now
             lastSpeechTime = now
-            consecutiveSilenceFrames = 0 // 重置连续静音帧
-            LogUtils.v(TAG, "🔊 有效语音帧 | RMS: $rms | 阈值: $effectiveSpeechThreshold")
+            consecutiveSilenceFrames = 0
+            LogUtils.v(TAG, "🔊 有效语音帧 | RMS: $rms | 阈值: $effectiveSpeechThreshold | 收尾期: $isInFinalPhase")
         } else {
-            consecutiveSilenceFrames++ // 累计连续静音帧
+            consecutiveSilenceFrames++
             LogUtils.v(TAG, if (vadHasSpeech) "⚠ 低能量语音帧 | RMS: $rms | 阈值: $effectiveSpeechThreshold"
-            else "🔇 静音帧 | 连续静音帧: $consecutiveSilenceFrames")
+            else "🔇 静音帧 | 连续静音帧: $consecutiveSilenceFrames | 收尾期: $isInFinalPhase")
         }
 
-        // 帧统计与连续性计算
+        // Frame statistics
         totalFrames++
         frameIndex++
         if (isEffectiveSpeech) {
@@ -109,7 +108,14 @@ class VadManager(
             lastFrameIsSpeech = false
         }
 
-        // VAD核心状态流转(使用动态静默阈值)
+        // ========== Core change: pick the thresholds that belong to the current phase ==========
+        val (endSilenceMs, consecutiveFrames) = if (isInFinalPhase) {
+            Pair(FINAL_END_SILENCE_MS, FINAL_CONSECUTIVE_FRAMES)
+        } else {
+            Pair(ACTIVE_END_SILENCE_MS, ACTIVE_CONSECUTIVE_FRAMES)
+        }
+
+        // VAD state transitions
         if (isEffectiveSpeech) {
             if (!isSpeaking) {
                 isSpeaking = true
@@ -124,32 +130,36 @@ class VadManager(
                 val vadSilenceDuration = now - lastSpeechTime
                 val effectiveSilenceDuration = now - lastEffectiveSpeechTime
 
-                // 触发条件:动态静默时长 + 连续静音帧达标
-                val isSilenceTimeout = (vadSilenceDuration >= dynamicEndSilenceMs ||
+                // End-of-speech condition, using the thresholds of the current phase
+                val isSilenceTimeout = (vadSilenceDuration >= endSilenceMs ||
                         effectiveSilenceDuration >= MAX_SILENCE_AFTER_SPEECH_MS) &&
-                        consecutiveSilenceFrames >= CONSECUTIVE_SILENCE_FRAME_THRESHOLD
+                        consecutiveSilenceFrames >= consecutiveFrames
 
                 if (isSilenceTimeout) {
                     isSpeaking = false
-                    isSpeechActive = false // 结束活跃期
                     val avgEnergy = if (speechFrameCount > 0) speechEnergySum / speechFrameCount else 0f
                     LogUtils.d(TAG, """
                         🛑 语音结束
                         - 有效静默时长: ${effectiveSilenceDuration}ms
                         - 连续静音帧: $consecutiveSilenceFrames
                         - 平均能量: $avgEnergy | 峰值: $peakRms
-                        - 活跃期: ${if (isSpeechActive) "是" else "否"}
+                        - 收尾期: $isInFinalPhase | 所用阈值: $endSilenceMs ms
                     """.trimIndent())
+                    isInFinalPhase = false // reset only after logging, so the log above reports the phase that actually applied
                     onSpeechEnd(avgEnergy, peakRms)
                     resetStats()
                 } else {
-                    LogUtils.v(TAG, "⏳ 静默中(停顿容忍) | 连续静音帧: $consecutiveSilenceFrames | 静默时长: ${effectiveSilenceDuration}ms | 动态阈值: $dynamicEndSilenceMs")
+                    LogUtils.v(TAG, "⏳ 静默中 | 连续静音帧: $consecutiveSilenceFrames | 静默时长: ${effectiveSilenceDuration}ms | 所用阈值: $endSilenceMs ms")
                 }
             }
         }
     }
 
-    // ========== 保留原有方法 ==========
+    // Pre-existing methods follow; isSpeechDetected() is new and returns the underlying VAD's flag unchanged
+    fun isSpeechDetected(): Boolean {
+        return vad.isSpeechDetected()
+    }
+
     fun activeSpeechRatio(): Float {
         val ratio = if (activeFrameCount == 0) 0f else activeSpeechFrameCount.toFloat() / activeFrameCount
         LogUtils.d(TAG, "📊 语音占比: $ratio | 有效语音帧: $activeSpeechFrameCount | 总帧: $activeFrameCount")
@@ -169,10 +179,7 @@ class VadManager(
         lastEffectiveSpeechTime = 0L
         envBaselineRms = 0.0005f
         consecutiveSilenceFrames = 0
-        // 重置活跃期
-        isSpeechActive = false
-        speechActiveStartMs = 0L
-        // 重置统计
+        isInFinalPhase = false // reset the final-phase flag
         resetStats()
         vad.reset()
 
diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
index fefb783..517014c 100644
--- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
@@ -80,11 +80,27 @@ class VoiceController(
     private val SHORT_SPEECH_MAX = 2000L
 
     // ========== 核心修改:多人对话过滤配置(适配2人以上场景) ==========
-    private val MULTI_DIALOGUE_MIN_DURATION = 2500L // 多人对话最小时长(2.5秒,比两人更短也能判定)
-    private val MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO = 2.5f // 多人对话峰均比范围更大(多人音量差异更大)
+    private val MULTI_DIALOGUE_MIN_DURATION = 2500L // minimum duration for multi-speaker dialogue (2.5 s)
+    private val MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO = 2.5f // upper bound of the peak-to-average ratio range
     private val MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO = 0.4f
-    private val MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO = 0.3f // 多人对话连续帧占比更低(轮流说话,断层更多)
-    private val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f // 多人对话有效帧占比要求稍低(避免漏过滤)
+    private val MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO = 0.3f // maximum continuous-frame ratio for multi-speaker dialogue
+    private val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f // minimum effective-frame ratio for multi-speaker dialogue
+
+    // ========== New: statistics maintained in real time while recording ==========
+    // Energy statistics
+    private var realtimeEnergySum = 0f
+    private var realtimeEnergyCount = 0
+    private var realtimePeakRms = 0f
+    // Frame statistics (refreshed from VadManager on every frame)
+    private var realtimeTotalFrames = 0
+    private var realtimeSpeechFrames = 0
+    private var realtimeContinuousSpeechFrames = 0
+    private var realtimeLastFrameIsSpeech = false
+    // Result of the latest real-time multi-speaker check
+    private var isMultiPersonDialogueDetected = false
+    // Debounce
+    private var lastInvalidResetMs = 0L
+    private val INVALID_RESET_DEBOUNCE_MS = 1500L // do not reset again within 1.5 s
 
     // 阈值配置数据类
     private data class ThresholdConfig(
@@ -143,15 +159,78 @@ class VoiceController(
                 audioBuffer.addAll(samples.asList())
                 vadManager.accept(samples)
 
+                // ========== Core change: compute statistics while recording ==========
+                // 1. Recalibrate the ambient baseline (tracks environment changes during recording)
+                calibrateEnvBaseline(samples)
+                // 2. Update the running energy / peak
+                updateRealtimeEnergy(samples)
+                // 3. Refresh the frame statistics
+                updateRealtimeFrameStats()
+                // 4. Check for multi-speaker dialogue and stop at once if detected (timestamp taken here so no `now` local is required)
+                if (checkMultiPersonDialogueRealtime(System.currentTimeMillis())) {
+                    LogUtils.w(TAG, "🚨 录音中识别出多人对话,提前终止")
+                    finishSentence(if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else 0f, realtimePeakRms)
+                    return
+                }
+
+                // Pre-existing max-recording check; the average is guarded because 0f / 0 is NaN when no frame has qualified yet
                 if (System.currentTimeMillis() - recordingStartMs > maxRecordingMs) {
                     LogUtils.w(TAG, "⏱ Max recording reached | 当前环境基线: $currentEnvBaseline")
-                    finishSentence()
+                    finishSentence(if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else 0f, realtimePeakRms)
                 }
             }
         }
     }
 
-    /* ================= 环境基线校准 ================= */
+    /* ================= New: update energy statistics while recording ================= */
+    private fun updateRealtimeEnergy(samples: FloatArray) {
+        val rms = vadManager.calcRms(samples)
+        // Only frames at or above MIN_EFFECTIVE_SPEECH_RMS contribute to the sum, count and peak
+        if (rms >= MIN_EFFECTIVE_SPEECH_RMS) {
+            realtimeEnergySum += rms
+            realtimeEnergyCount++
+            realtimePeakRms = maxOf(realtimePeakRms, rms)
+        }
+    }
+
+    /* ================= New: refresh frame statistics while recording ================= */
+    private fun updateRealtimeFrameStats() {
+        // Mirror VadManager's counters as-is.
+        realtimeTotalFrames = vadManager.getTotalFrames()
+        realtimeSpeechFrames = vadManager.getSpeechFrames()
+        realtimeContinuousSpeechFrames = vadManager.getContinuousSpeechFrames()
+        // Do NOT re-derive realtimeContinuousSpeechFrames from the raw VAD flag here: the earlier
+        // version overwrote the value just read above with the length of the current speech run,
+        // which drops to 0 on every non-speech frame. VAD end is only reported after trailing
+        // silence, so continuousRatio was ~0 whenever it was evaluated and trivially satisfied the
+        // continuousRatio conditions of the multi-speaker and discontinuous-noise filters.
+        // finishSentence() read vadManager.getContinuousSpeechFrames() directly before this patch.
+        // The flag below only records the raw VAD decision for the most recent frame.
+        realtimeLastFrameIsSpeech = vadManager.isSpeechDetected()
+    }
+
+    /* ================= New: real-time multi-speaker check while recording ================= */
+    private fun checkMultiPersonDialogueRealtime(now: Long): Boolean {
+        // Too early to judge: the recording is still shorter than MULTI_DIALOGUE_MIN_DURATION
+        val duration = now - recordingStartMs
+        if (duration < MULTI_DIALOGUE_MIN_DURATION) return false
+
+        // Features computed from the running statistics
+        val avgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else 0f
+        val peakAvgRatio = if (avgEnergy > 0) realtimePeakRms / avgEnergy else 0f
+        val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f
+        val vadRatio = vadManager.activeSpeechRatio()
+
+        // Same rule finishSentence() applied before this patch, now evaluated on every frame
+        isMultiPersonDialogueDetected = duration >= MULTI_DIALOGUE_MIN_DURATION &&
+                peakAvgRatio in MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO..MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO &&
+                continuousRatio <= MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO &&
+                vadRatio >= MULTI_DIALOGUE_MIN_VAD_RATIO
+
+        return isMultiPersonDialogueDetected
+    }
+
+    /* ================= Ambient baseline calibration (kept; now also called while recording) ================= */
     private fun calibrateEnvBaseline(samples: FloatArray) {
         val rms = vadManager.calcRms(samples)
         // 新增:只保留低于基线+阈值的有效值,过滤突发噪音
@@ -160,7 +239,7 @@ class VoiceController(
             if (envNoiseBuffer.size >= BASELINE_WINDOW_SIZE) {
                 envNoiseBuffer.removeFirst()
             }
-            envNoiseBuffer.addLast(rms)
+            envNoiseBuffer.addLast(validRms) // store the filtered value rather than the raw RMS
             currentEnvBaseline = envNoiseBuffer.maxOrNull() ?: 0.001f
         }
     }
@@ -180,6 +259,8 @@ class VoiceController(
             audioBuffer.clear()
             vadManager.reset()
             vadStarted = false
+            // Reset the real-time statistics
+            resetRealtimeStats()
         }
 
         inKwsObserve = true
@@ -195,16 +276,21 @@ class VoiceController(
         recordingStartMs = System.currentTimeMillis()
         audioBuffer.clear()
         audioBuffer.addAll(preBuffer)
+        // Start the recording with clean real-time statistics
+        resetRealtimeStats()
         state = VoiceState.RECORDING
     }
 
     private fun onVadEnd(avgEnergy: Float, peakRms: Float) {
         if (state != VoiceState.RECORDING) return
         LogUtils.d(TAG, "🧠 VAD END | 环境基线: $currentEnvBaseline")
-        finishSentence(avgEnergy, peakRms)
+        // Prefer the real-time statistics; fall back to the values supplied by VadManager when none were collected
+        val realAvgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else avgEnergy
+        val realPeakRms = if (realtimePeakRms > 0) realtimePeakRms else peakRms
+        finishSentence(realAvgEnergy, realPeakRms)
     }
 
-    /* ================= 结束录音(核心:多人对话过滤) ================= */
+    /* ================= Finish recording (core: reuse the real-time results) ================= */
     private fun finishSentence(avgEnergy: Float = 0f, peakRms: Float = 0f) {
         val now = System.currentTimeMillis()
         val duration = now - recordingStartMs
@@ -219,24 +305,13 @@ class VoiceController(
         val vadRatio = vadManager.activeSpeechRatio()
         val peakAvgRatio = if (avgEnergy > 0f) peakRms / avgEnergy else 0f
 
-        // 获取VAD帧统计
-        val totalFrames = vadManager.getTotalFrames()
-        val speechFrames = vadManager.getSpeechFrames()
-        val continuousSpeechFrames = vadManager.getContinuousSpeechFrames()
-        val peakPositionRatio = vadManager.getPeakPositionRatio()
-
+        // Reuse the frame statistics gathered in real time instead of querying VadManager again
         LogUtils.d(TAG, "📊 录音信息 | 时长: $duration ms | 能量: $avgEnergy | 峰均比: $peakAvgRatio | 基线: $currentEnvBaseline")
-        LogUtils.d(TAG, "📊 帧统计 | 总帧: $totalFrames | 语音帧: $speechFrames | 连续语音帧: $continuousSpeechFrames | 峰值位置占比: $peakPositionRatio")
+        LogUtils.d(TAG, "📊 实时帧统计 | 总帧: $realtimeTotalFrames | 语音帧: $realtimeSpeechFrames | 连续语音帧: $realtimeContinuousSpeechFrames")
 
-        // ========== 核心修改:第一步过滤多人对话垃圾语音 ==========
-        val continuousRatio = if (speechFrames > 0) continuousSpeechFrames.toFloat() / speechFrames else 0f
-        val isMultiPersonDialogue = duration >= MULTI_DIALOGUE_MIN_DURATION && // 时长≥2.5秒
-                peakAvgRatio in MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO..MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO && // 峰均比0.4~2.5
-                continuousRatio <= MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO && // 连续帧占比≤0.3(多人轮流说,断层多)
-                vadRatio >= MULTI_DIALOGUE_MIN_VAD_RATIO // 有效帧占比≥0.55(整体语音占比高)
-
-        if (isMultiPersonDialogue) {
-            LogUtils.w(TAG, "❌ 过滤多人对话垃圾语音 | 时长: $duration ms | 连续占比: $continuousRatio | 有效占比: $vadRatio | 峰均比: $peakAvgRatio")
+        // Multi-speaker dialogue was already detected during recording: drop the utterance
+        if (isMultiPersonDialogueDetected) {
+            LogUtils.w(TAG, "❌ 过滤多人对话垃圾语音(实时识别) | 时长: $duration ms")
             resetToWaitSpeech()
             return
         }
@@ -248,6 +323,7 @@ class VoiceController(
             audioBuffer.clear()
             state = VoiceState.UPLOADING
             onFinalAudio(audio)
+            resetRealtimeStats() // reset the real-time statistics
             return
         }
 
@@ -261,8 +337,10 @@ class VoiceController(
         }
 
         // ========== 3. 非连续判定:极度宽松 ==========
+        val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f
+        val peakPositionRatio = vadManager.getPeakPositionRatio()
         val isDiscontinuous = continuousRatio < MIN_CONTINUOUS_FRAME_RATIO &&
-                speechFrames < MIN_EFFECTIVE_SPEECH_FRAMES &&
+                realtimeSpeechFrames < MIN_EFFECTIVE_SPEECH_FRAMES &&
                 peakPositionRatio > MAX_PEAK_POSITION_RATIO
         if (isDiscontinuous) {
             LogUtils.w(TAG, "❌ 非连续杂音过滤 | 连续占比: $continuousRatio < $MIN_CONTINUOUS_FRAME_RATIO")
@@ -327,9 +405,22 @@ class VoiceController(
         audioBuffer.clear()
         state = VoiceState.UPLOADING
         onFinalAudio(audio)
+        resetRealtimeStats() // reset the real-time statistics
         LogUtils.i(TAG, "✅ 低能量语音通过 | 时长: $duration ms | 能量: $avgEnergy | 场景: ${thresholdConfig.scene}")
     }
 
+    /* ================= New: reset the real-time statistics ================= */
+    private fun resetRealtimeStats() {
+        realtimeEnergySum = 0f
+        realtimeEnergyCount = 0
+        realtimePeakRms = 0f
+        realtimeTotalFrames = 0
+        realtimeSpeechFrames = 0
+        realtimeContinuousSpeechFrames = 0
+        realtimeLastFrameIsSpeech = false
+        isMultiPersonDialogueDetected = false
+    }
+
     /* ================= 播放/上传/Reset 回调 ================= */
     fun onPlayStartPrompt() {
         LogUtils.d(TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline")
@@ -362,8 +453,7 @@ class VoiceController(
             VoiceState.WAIT_SPEECH_COOLDOWN
         }
     }
-    private var lastInvalidResetMs = 0L
-    private val INVALID_RESET_DEBOUNCE_MS = 1500L // 1.5秒内不重复重置
+
     private fun resetToWaitSpeech() {
         LogUtils.d(TAG, "🔄 重置到等待说话 | 基线: $currentEnvBaseline")
         val now = System.currentTimeMillis()
@@ -375,6 +465,7 @@ class VoiceController(
         audioBuffer.clear()
         vadManager.reset()
         vadStarted = false
+        resetRealtimeStats() // reset the real-time statistics
         state = VoiceState.WAIT_SPEECH
         if (waitSpeechFailStartMs == 0L) waitSpeechFailStartMs = System.currentTimeMillis()
     }
@@ -390,6 +481,7 @@ class VoiceController(
         waitSpeechFailStartMs = 0L
         envNoiseBuffer.clear()
         currentEnvBaseline = 0.001f
+        resetRealtimeStats() // reset the real-time statistics
         LogUtils.d(TAG, "🔄 环境基线已重置 | 新基线: $currentEnvBaseline")
         state = VoiceState.WAIT_WAKEUP
     }
@@ -399,6 +491,7 @@ class VoiceController(
         wakeupManager.release()
         vadManager.reset()
         envNoiseBuffer.clear()
+        resetRealtimeStats() // reset the real-time statistics
     }
 
     private fun cachePreBuffer(samples: FloatArray) {
@@ -407,4 +500,7 @@ class VoiceController(
             if (preBuffer.size > PRE_BUFFER_SIZE) preBuffer.removeFirst()
         }
     }
+
+    // ========== MIN_EFFECTIVE_SPEECH_RMS (same value as the constant in VadManager) ==========
+    private val MIN_EFFECTIVE_SPEECH_RMS = 0.001f
 }
\ No newline at end of file