From 58e2bc3e063d1742606b684621be573e9c45c0a5 Mon Sep 17 00:00:00 2001 From: ross <3024454314@qq.com> Date: Thu, 8 Jan 2026 15:42:13 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E5=90=88=E9=80=82=E9=98=88?= =?UTF-8?q?=E5=80=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../com/zs/smarthuman/sherpa/VadManager.kt | 157 ++++++++++-------- .../zs/smarthuman/sherpa/VoiceController.kt | 39 ++++- 2 files changed, 123 insertions(+), 73 deletions(-) diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt index f3c150b..d8def4e 100644 --- a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt @@ -15,7 +15,25 @@ class VadManager( private val vad: Vad private var isSpeaking = false private var lastSpeechTime = 0L - private val END_SILENCE_MS = 800L + + // ========== 核心调整:适配人类正常说话停顿 ========== + private val END_SILENCE_MS = 1500L // 基础静默阈值(1.5秒) + private val MAX_SILENCE_AFTER_SPEECH_MS = 3000L// 兜底静默阈值(3秒) + private val SPEECH_ACTIVE_DURATION = 5000L // 语音活跃期(5秒内容忍更长停顿) + + // ========== 核心调整:降低有效语音阈值 ========== + private val MIN_EFFECTIVE_SPEECH_RMS = 0.001f // 有效语音最小RMS + private val ENV_BASELINE_FACTOR = 1.2f // 环境基线倍数 + private var envBaselineRms = 0.0005f // 初始环境基线 + private var lastEffectiveSpeechTime = 0L // 最后一次有效语音时间戳 + + // ========== 新增:语音活跃期变量 ========== + private var isSpeechActive = false // 语音活跃期标记 + private var speechActiveStartMs = 0L // 活跃期开始时间 + + // ========== 连续静音帧校验(放宽阈值) ========== + private var consecutiveSilenceFrames = 0 + private val CONSECUTIVE_SILENCE_FRAME_THRESHOLD = 10 // 连续10帧静音(200ms) // 基础统计变量 private var activeFrameCount = 0 @@ -24,13 +42,13 @@ class VadManager( private var speechFrameCount = 0 private var peakRms = 0f - // ========== 新增:连续性检测核心变量 ========== - private var totalFrames = 0 // 总处理帧数 - private var speechFrames = 0 // 语音帧总数 - private var continuousSpeechFrames = 0 // 连续语音帧数 - private var lastFrameIsSpeech = false // 上一帧是否为语音 - private var peakPosition = 0 // 峰值所在帧位置 - private var frameIndex = 0 // 当前帧索引 + // 连续性检测核心变量 + private var totalFrames = 0 + private var speechFrames = 0 + private var continuousSpeechFrames = 0 + private var lastFrameIsSpeech = false + private var peakPosition = 0 + private var frameIndex = 0 init { val config = getVadModelConfig(0) ?: throw IllegalStateException("[$TAG] VAD config not found") @@ -38,54 +56,64 @@ class VadManager( LogUtils.i(TAG, "✅ VAD 初始化成功") } - /** - * 接收音频数据并进行VAD检测 - * @param samples 音频采样数据(float数组) - */ fun accept(samples: FloatArray) { val now = System.currentTimeMillis() vad.acceptWaveform(samples) - val hasSpeech = vad.isSpeechDetected() + val vadHasSpeech = vad.isSpeechDetected() val rms = calcRms(samples) - // ========== 1. 语音能量统计 ========== - if (hasSpeech) { + // 环境基线更新(滑动平均,适配背景噪音) + if (!vadHasSpeech || rms < MIN_EFFECTIVE_SPEECH_RMS) { + envBaselineRms = (envBaselineRms * 0.9f) + (rms * 0.1f) + } + // 有效语音判定:阈值更低,更容易触发 + val effectiveSpeechThreshold = maxOf(MIN_EFFECTIVE_SPEECH_RMS, envBaselineRms * ENV_BASELINE_FACTOR) + val isEffectiveSpeech = vadHasSpeech && rms >= effectiveSpeechThreshold + + // ========== 核心优化:语音活跃期逻辑 ========== + if (isEffectiveSpeech) { + isSpeechActive = true + speechActiveStartMs = now // 重置活跃期 + } + // 活跃期内(5秒)用2秒静默阈值,活跃期外用1.5秒 + val dynamicEndSilenceMs = if (isSpeechActive && (now - speechActiveStartMs) < SPEECH_ACTIVE_DURATION) { + 2000L + } else { + END_SILENCE_MS + } + + // 语音能量统计(仅有效语音) + if (isEffectiveSpeech) { speechEnergySum += rms speechFrameCount++ peakRms = maxOf(peakRms, rms) - LogUtils.v(TAG, "🔊 检测到语音帧 | RMS: $rms | 累计峰值: $peakRms") + lastEffectiveSpeechTime = now + lastSpeechTime = now + consecutiveSilenceFrames = 0 // 重置连续静音帧 + LogUtils.v(TAG, "🔊 有效语音帧 | RMS: $rms | 阈值: $effectiveSpeechThreshold") } else { - LogUtils.v(TAG, "🔇 检测到静音帧 | RMS: $rms") + consecutiveSilenceFrames++ // 累计连续静音帧 + LogUtils.v(TAG, if (vadHasSpeech) "⚠ 低能量语音帧 | RMS: $rms | 阈值: $effectiveSpeechThreshold" + else "🔇 静音帧 | 连续静音帧: $consecutiveSilenceFrames") } - // ========== 2. 新增:帧统计与连续性计算 ========== + // 帧统计与连续性计算 totalFrames++ frameIndex++ - - if (hasSpeech) { + if (isEffectiveSpeech) { speechFrames++ - // 连续语音帧计数 - continuousSpeechFrames = if (lastFrameIsSpeech) { - continuousSpeechFrames + 1 - } else { - 1 // 重置连续计数 - } + continuousSpeechFrames = if (lastFrameIsSpeech) continuousSpeechFrames + 1 else 1 lastFrameIsSpeech = true - - // 更新峰值位置(仅当当前RMS为新峰值时) - if (rms == peakRms) { - peakPosition = frameIndex - } + if (rms == peakRms) peakPosition = frameIndex } else { lastFrameIsSpeech = false } - // ========== 3. VAD核心状态流转 ========== - if (hasSpeech) { - lastSpeechTime = now + // VAD核心状态流转(使用动态静默阈值) + if (isEffectiveSpeech) { if (!isSpeaking) { isSpeaking = true - LogUtils.d(TAG, "🎤 语音开始") + LogUtils.d(TAG, "🎤 有效语音开始 | 阈值: $effectiveSpeechThreshold") onSpeechStart() } activeFrameCount++ @@ -93,56 +121,61 @@ class VadManager( } else { if (isSpeaking) { activeFrameCount++ - val silenceDuration = now - lastSpeechTime - if (silenceDuration >= END_SILENCE_MS) { + val vadSilenceDuration = now - lastSpeechTime + val effectiveSilenceDuration = now - lastEffectiveSpeechTime + + // 触发条件:动态静默时长 + 连续静音帧达标 + val isSilenceTimeout = (vadSilenceDuration >= dynamicEndSilenceMs || + effectiveSilenceDuration >= MAX_SILENCE_AFTER_SPEECH_MS) && + consecutiveSilenceFrames >= CONSECUTIVE_SILENCE_FRAME_THRESHOLD + + if (isSilenceTimeout) { isSpeaking = false + isSpeechActive = false // 结束活跃期 val avgEnergy = if (speechFrameCount > 0) speechEnergySum / speechFrameCount else 0f - LogUtils.d(TAG, "🛑 语音结束 | 静音时长: ${silenceDuration}ms | 平均能量: $avgEnergy | 峰值: $peakRms") + LogUtils.d(TAG, """ + 🛑 语音结束 + - 有效静默时长: ${effectiveSilenceDuration}ms + - 连续静音帧: $consecutiveSilenceFrames + - 平均能量: $avgEnergy | 峰值: $peakRms + - 活跃期: ${if (isSpeechActive) "是" else "否"} + """.trimIndent()) onSpeechEnd(avgEnergy, peakRms) - resetStats() // 重置基础统计 + resetStats() } else { - LogUtils.v(TAG, "⏳ 静音中,时长: ${silenceDuration}ms (阈值: ${END_SILENCE_MS}ms)") + LogUtils.v(TAG, "⏳ 静默中(停顿容忍) | 连续静音帧: $consecutiveSilenceFrames | 静默时长: ${effectiveSilenceDuration}ms | 动态阈值: $dynamicEndSilenceMs") } } } } - /** - * 计算语音占比(活跃语音帧 / 总活跃帧) - * @return 语音占比(0~1) - */ + // ========== 保留原有方法 ========== fun activeSpeechRatio(): Float { val ratio = if (activeFrameCount == 0) 0f else activeSpeechFrameCount.toFloat() / activeFrameCount - LogUtils.d(TAG, "📊 语音占比: $ratio | 语音帧: $activeSpeechFrameCount | 总帧: $activeFrameCount") + LogUtils.d(TAG, "📊 语音占比: $ratio | 有效语音帧: $activeSpeechFrameCount | 总帧: $activeFrameCount") return ratio } - // ========== 新增:帧统计获取方法(给VoiceController调用) ========== - /** 获取总处理帧数 */ fun getTotalFrames(): Int = totalFrames - - /** 获取语音帧总数 */ fun getSpeechFrames(): Int = speechFrames - - /** 获取连续语音帧数 */ fun getContinuousSpeechFrames(): Int = continuousSpeechFrames - - /** 获取峰值位置占比(峰值帧索引/总帧数) */ fun getPeakPositionRatio(): Float { return if (totalFrames == 0) 0f else peakPosition.toFloat() / totalFrames } - /** - * 重置VAD状态(保留核心对象,清空统计数据) - */ fun reset() { - // 基础状态重置 isSpeaking = false lastSpeechTime = 0L + lastEffectiveSpeechTime = 0L + envBaselineRms = 0.0005f + consecutiveSilenceFrames = 0 + // 重置活跃期 + isSpeechActive = false + speechActiveStartMs = 0L + // 重置统计 resetStats() vad.reset() - // 新增:连续性统计重置 totalFrames = 0 speechFrames = 0 continuousSpeechFrames = 0 @@ -153,9 +186,6 @@ class VadManager( LogUtils.d(TAG, "🔄 VAD 状态已完全重置") } - /** - * 重置统计数据(内部使用) - */ private fun resetStats() { activeFrameCount = 0 activeSpeechFrameCount = 0 @@ -164,11 +194,6 @@ class VadManager( peakRms = 0f } - /** - * 计算音频采样的RMS(均方根)能量 - * @param samples 音频采样数据 - * @return RMS值 - */ fun calcRms(samples: FloatArray): Float { var sum = 0f for (v in samples) sum += v * v diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt index 6b4cf6a..fefb783 100644 --- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt @@ -55,7 +55,7 @@ class VoiceController( private val envNoiseBuffer = ArrayDeque(BASELINE_WINDOW_SIZE) private var currentEnvBaseline = 0.001f - // 强制兜底:正常语音最低门槛(你的0.0809≥0.06直接通过) + // 强制兜底:正常语音最低门槛 private val MIN_NORMAL_VOICE_ENERGY = 0.06f private val MIN_NORMAL_VOICE_VAD_RATIO = 0.3f @@ -79,6 +79,13 @@ class VoiceController( private val SHORT_SPEECH_MIN = 500L private val SHORT_SPEECH_MAX = 2000L + // ========== 核心修改:多人对话过滤配置(适配2人以上场景) ========== + private val MULTI_DIALOGUE_MIN_DURATION = 2500L // 多人对话最小时长(2.5秒,比两人更短也能判定) + private val MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO = 2.5f // 多人对话峰均比范围更大(多人音量差异更大) + private val MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO = 0.4f + private val MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO = 0.3f // 多人对话连续帧占比更低(轮流说话,断层更多) + private val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f // 多人对话有效帧占比要求稍低(避免漏过滤) + // 阈值配置数据类 private data class ThresholdConfig( val energyThreshold: Float, @@ -147,15 +154,14 @@ class VoiceController( /* ================= 环境基线校准 ================= */ private fun calibrateEnvBaseline(samples: FloatArray) { val rms = vadManager.calcRms(samples) + // 新增:只保留低于基线+阈值的有效值,过滤突发噪音 + val validRms = if (rms < currentEnvBaseline + 0.005f) rms else currentEnvBaseline if (rms < 0.03f) { if (envNoiseBuffer.size >= BASELINE_WINDOW_SIZE) { envNoiseBuffer.removeFirst() } envNoiseBuffer.addLast(rms) currentEnvBaseline = envNoiseBuffer.maxOrNull() ?: 0.001f -// LogUtils.d(TAG, "🌡 环境基线校准 | RMS: $rms | 基线: $currentEnvBaseline | 缓存数: ${envNoiseBuffer.size}") - } else { -// LogUtils.v(TAG, "🔊 高能量音频跳过校准 | RMS: $rms | 基线: $currentEnvBaseline") } } @@ -198,7 +204,7 @@ class VoiceController( finishSentence(avgEnergy, peakRms) } - /* ================= 结束录音(分场景系数+强制兜底) ================= */ + /* ================= 结束录音(核心:多人对话过滤) ================= */ private fun finishSentence(avgEnergy: Float = 0f, peakRms: Float = 0f) { val now = System.currentTimeMillis() val duration = now - recordingStartMs @@ -222,6 +228,19 @@ class VoiceController( LogUtils.d(TAG, "📊 录音信息 | 时长: $duration ms | 能量: $avgEnergy | 峰均比: $peakAvgRatio | 基线: $currentEnvBaseline") LogUtils.d(TAG, "📊 帧统计 | 总帧: $totalFrames | 语音帧: $speechFrames | 连续语音帧: $continuousSpeechFrames | 峰值位置占比: $peakPositionRatio") + // ========== 核心修改:第一步过滤多人对话垃圾语音 ========== + val continuousRatio = if (speechFrames > 0) continuousSpeechFrames.toFloat() / speechFrames else 0f + val isMultiPersonDialogue = duration >= MULTI_DIALOGUE_MIN_DURATION && // 时长≥2.5秒 + peakAvgRatio in MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO..MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO && // 峰均比0.4~2.5 + continuousRatio <= MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO && // 连续帧占比≤0.3(多人轮流说,断层多) + vadRatio >= MULTI_DIALOGUE_MIN_VAD_RATIO // 有效帧占比≥0.55(整体语音占比高) + + if (isMultiPersonDialogue) { + LogUtils.w(TAG, "❌ 过滤多人对话垃圾语音 | 时长: $duration ms | 连续占比: $continuousRatio | 有效占比: $vadRatio | 峰均比: $peakAvgRatio") + resetToWaitSpeech() + return + } + // ========== 1. 强制兜底:正常语音直接通过 ========== val isNormalVoice = avgEnergy >= MIN_NORMAL_VOICE_ENERGY && vadRatio >= MIN_NORMAL_VOICE_VAD_RATIO if (isNormalVoice) { @@ -242,7 +261,6 @@ class VoiceController( } // ========== 3. 非连续判定:极度宽松 ========== - val continuousRatio = if (speechFrames > 0) continuousSpeechFrames.toFloat() / speechFrames else 0f val isDiscontinuous = continuousRatio < MIN_CONTINUOUS_FRAME_RATIO && speechFrames < MIN_EFFECTIVE_SPEECH_FRAMES && peakPositionRatio > MAX_PEAK_POSITION_RATIO @@ -344,9 +362,16 @@ class VoiceController( VoiceState.WAIT_SPEECH_COOLDOWN } } - + private var lastInvalidResetMs = 0L + private val INVALID_RESET_DEBOUNCE_MS = 1500L // 1.5秒内不重复重置 private fun resetToWaitSpeech() { LogUtils.d(TAG, "🔄 重置到等待说话 | 基线: $currentEnvBaseline") + val now = System.currentTimeMillis() + if (now - lastInvalidResetMs < INVALID_RESET_DEBOUNCE_MS) { + LogUtils.d(TAG, "🛡 防抖:1.5秒内重复无效语音,跳过重置") + return + } + lastInvalidResetMs = now audioBuffer.clear() vadManager.reset() vadStarted = false