From b15621985d41b5e98e86ebee76d6936969250913 Mon Sep 17 00:00:00 2001 From: ross <3024454314@qq.com> Date: Thu, 8 Jan 2026 14:53:22 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E5=90=88=E9=80=82=E9=98=88?= =?UTF-8?q?=E5=80=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../zs/smarthuman/sherpa/VoiceController.kt | 158 ++++++++---------- 1 file changed, 73 insertions(+), 85 deletions(-) diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt index cf420dc..6b4cf6a 100644 --- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt @@ -50,27 +50,32 @@ class VoiceController( private val idleTimeoutMs = idleTimeoutSeconds * 1000L private val maxRecordingMs = maxRecordingSeconds * 1000L - // ================= 动态阈值核心配置(修复+调整) ================= + // ================= 保留分场景动态系数 + 强制兜底配置 ================= private val BASELINE_WINDOW_SIZE = 50 private val envNoiseBuffer = ArrayDeque(BASELINE_WINDOW_SIZE) private var currentEnvBaseline = 0.001f - // ========== 修复:远场过滤配置(关键调整) ========== - private val MAX_FAR_FIELD_ENERGY = 0.05f // 远场能量上限(正常语音>0.05,远场<0.05) - private val MIN_VALID_PEAK_AVG_RATIO = 1.5f // 有效峰均比下限(正常语音>1.5,咳嗽<1.5) - private val BASELINE_QUIET_THRESHOLD = 0.002f - private val MIN_CONTINUOUS_FRAME_RATIO = 0.4f // 连续帧占比下限(从0.6调低,兼容正常短语音) - private val MAX_PEAK_POSITION_RATIO = 0.5f // 峰值位置上限(从0.3调高,兼容正常语音) - private val MIN_EFFECTIVE_SPEECH_FRAMES = 3 // 最低有效帧数(从5调低) + // 强制兜底:正常语音最低门槛(你的0.0809≥0.06直接通过) + private val MIN_NORMAL_VOICE_ENERGY = 0.06f + private val MIN_NORMAL_VOICE_VAD_RATIO = 0.3f - // 分场景动态系数(调整:降低安静环境系数) - private val SHORT_SPEECH_ENERGY_COEFF_QUIET = 6.0f // 从8.0调低 - private val SHORT_SPEECH_ENERGY_COEFF_NOISY = 4.0f // 从5.0调低 - private val LONG_SPEECH_ENERGY_COEFF = 15.0f // 从20.0调低 - private val SHORT_SPEECH_VAD_COEFF = 0.15f // 从0.2调低 - private val LONG_SPEECH_VAD_COEFF = 0.3f // 从0.4调低 - private val SHORT_SPEECH_MIN_SCORE = 2 // 调回2分 - private val LONG_SPEECH_MIN_SCORE = 4 // 从5调低 + // 分场景动态系数(安静环境系数极低) + private val BASELINE_QUIET_THRESHOLD = 0.005f // 安静环境基线阈值 + private val SHORT_SPEECH_ENERGY_COEFF_QUIET = 2.0f // 安静环境短语音系数 + private val SHORT_SPEECH_ENERGY_COEFF_NOISY = 3.0f // 嘈杂环境短语音系数 + private val LONG_SPEECH_ENERGY_COEFF_QUIET = 4.0f // 安静环境长语音系数 + private val LONG_SPEECH_ENERGY_COEFF_NOISY = 6.0f // 嘈杂环境长语音系数 + private val SHORT_SPEECH_VAD_COEFF = 0.08f + private val LONG_SPEECH_VAD_COEFF = 0.15f + private val SHORT_SPEECH_MIN_SCORE = 1 + private val LONG_SPEECH_MIN_SCORE = 2 + + // 其他过滤参数 + private val MAX_FAR_FIELD_ENERGY = 0.03f + private val MIN_VALID_PEAK_AVG_RATIO = 0.8f + private val MIN_CONTINUOUS_FRAME_RATIO = 0.2f + private val MAX_PEAK_POSITION_RATIO = 0.95f + private val MIN_EFFECTIVE_SPEECH_FRAMES = 5 private val SHORT_SPEECH_MIN = 500L private val SHORT_SPEECH_MAX = 2000L @@ -142,15 +147,15 @@ class VoiceController( /* ================= 环境基线校准 ================= */ private fun calibrateEnvBaseline(samples: FloatArray) { val rms = vadManager.calcRms(samples) - if (rms < 0.01f) { + if (rms < 0.03f) { if (envNoiseBuffer.size >= BASELINE_WINDOW_SIZE) { envNoiseBuffer.removeFirst() } envNoiseBuffer.addLast(rms) currentEnvBaseline = envNoiseBuffer.maxOrNull() ?: 0.001f - LogUtils.d(TAG, "🌡 环境基线校准 | RMS: $rms | 基线: $currentEnvBaseline | 缓存数: ${envNoiseBuffer.size}") +// LogUtils.d(TAG, "🌡 环境基线校准 | RMS: $rms | 基线: $currentEnvBaseline | 缓存数: ${envNoiseBuffer.size}") } else { - LogUtils.v(TAG, "🔊 高能量音频跳过校准 | RMS: $rms | 基线: $currentEnvBaseline") +// LogUtils.v(TAG, "🔊 高能量音频跳过校准 | RMS: $rms | 基线: $currentEnvBaseline") } } @@ -193,7 +198,7 @@ class VoiceController( finishSentence(avgEnergy, peakRms) } - /* ================= 结束录音(修复远场过滤逻辑) ================= */ + /* ================= 结束录音(分场景系数+强制兜底) ================= */ private fun finishSentence(avgEnergy: Float = 0f, peakRms: Float = 0f) { val now = System.currentTimeMillis() val duration = now - recordingStartMs @@ -217,52 +222,56 @@ class VoiceController( LogUtils.d(TAG, "📊 录音信息 | 时长: $duration ms | 能量: $avgEnergy | 峰均比: $peakAvgRatio | 基线: $currentEnvBaseline") LogUtils.d(TAG, "📊 帧统计 | 总帧: $totalFrames | 语音帧: $speechFrames | 连续语音帧: $continuousSpeechFrames | 峰值位置占比: $peakPositionRatio") - // ========== 修复:远场语音过滤逻辑(核心) ========== - // 正确逻辑:能量 < MAX_FAR_FIELD_ENERGY 才是远场;峰均比 < MIN_VALID_PEAK_AVG_RATIO 才是无效语音 - val isFarField = avgEnergy < MAX_FAR_FIELD_ENERGY // 修复:从 < MIN 改为 < MAX - val isInvalidPeakRatio = peakAvgRatio < MIN_VALID_PEAK_AVG_RATIO // 降低阈值 + // ========== 1. 强制兜底:正常语音直接通过 ========== + val isNormalVoice = avgEnergy >= MIN_NORMAL_VOICE_ENERGY && vadRatio >= MIN_NORMAL_VOICE_VAD_RATIO + if (isNormalVoice) { + LogUtils.i(TAG, "✅ 正常语音强制通过 | 能量: $avgEnergy ≥ $MIN_NORMAL_VOICE_ENERGY | 占比: $vadRatio ≥ $MIN_NORMAL_VOICE_VAD_RATIO") + audioBuffer.clear() + state = VoiceState.UPLOADING + onFinalAudio(audio) + return + } - // 非连续特征(调整阈值后更宽松) + // ========== 2. 远场过滤:只过滤极低能量 ========== + val isFarField = avgEnergy < MAX_FAR_FIELD_ENERGY + val isInvalidPeakRatio = peakAvgRatio < MIN_VALID_PEAK_AVG_RATIO + if (isFarField && isInvalidPeakRatio) { + LogUtils.w(TAG, "❌ 远场/无效语音过滤 | 能量: $avgEnergy < $MAX_FAR_FIELD_ENERGY") + resetToWaitSpeech() + return + } + + // ========== 3. 非连续判定:极度宽松 ========== val continuousRatio = if (speechFrames > 0) continuousSpeechFrames.toFloat() / speechFrames else 0f - val isDiscontinuous = continuousRatio < MIN_CONTINUOUS_FRAME_RATIO || - speechFrames < MIN_EFFECTIVE_SPEECH_FRAMES || - peakPositionRatio < MAX_PEAK_POSITION_RATIO - - // 远场+无效语音过滤(仅过滤真正的远场/杂音) - if (isFarField && isInvalidPeakRatio) { // 修复:同时满足才过滤 - LogUtils.w(TAG, "❌ 远场/无效语音过滤 | 能量: $avgEnergy < 远场上限: $MAX_FAR_FIELD_ENERGY | 峰均比: $peakAvgRatio < 有效下限: $MIN_VALID_PEAK_AVG_RATIO") + val isDiscontinuous = continuousRatio < MIN_CONTINUOUS_FRAME_RATIO && + speechFrames < MIN_EFFECTIVE_SPEECH_FRAMES && + peakPositionRatio > MAX_PEAK_POSITION_RATIO + if (isDiscontinuous) { + LogUtils.w(TAG, "❌ 非连续杂音过滤 | 连续占比: $continuousRatio < $MIN_CONTINUOUS_FRAME_RATIO") resetToWaitSpeech() return } - // 非连续语音过滤(仅过滤真正的零散杂音) - if (isDiscontinuous && isFarField) { // 修复:结合远场特征,避免过滤正常语音 - LogUtils.w(TAG, "❌ 非连续杂音过滤 | 连续占比: $continuousRatio < $MIN_CONTINUOUS_FRAME_RATIO | 语音帧: $speechFrames < $MIN_EFFECTIVE_SPEECH_FRAMES | 峰值位置: $peakPositionRatio < $MAX_PEAK_POSITION_RATIO") - resetToWaitSpeech() - return - } - - // ========== 动态阈值计算(调整后更宽松) ========== + // ========== 4. 分场景动态阈值计算(保留核心逻辑) ========== + val isQuietEnv = currentEnvBaseline < BASELINE_QUIET_THRESHOLD val thresholdConfig = when { duration in SHORT_SPEECH_MIN..SHORT_SPEECH_MAX -> { - val coeff = if (currentEnvBaseline < BASELINE_QUIET_THRESHOLD) { - SHORT_SPEECH_ENERGY_COEFF_QUIET - } else { - SHORT_SPEECH_ENERGY_COEFF_NOISY - } - val threshold = currentEnvBaseline * coeff - LogUtils.d(TAG, "📏 短语音阈值 | 场景: ${if (currentEnvBaseline < BASELINE_QUIET_THRESHOLD) "安静" else "嘈杂"} | 系数: $coeff | 阈值: $threshold") + val coeff = if (isQuietEnv) SHORT_SPEECH_ENERGY_COEFF_QUIET else SHORT_SPEECH_ENERGY_COEFF_NOISY + val energyThreshold = currentEnvBaseline * coeff + LogUtils.d(TAG, "📏 短语音阈值 | 场景: ${if (isQuietEnv) "安静" else "嘈杂"} | 系数: $coeff | 阈值: $energyThreshold") ThresholdConfig( - energyThreshold = threshold, + energyThreshold = energyThreshold, vadRatioThreshold = SHORT_SPEECH_VAD_COEFF, minScore = SHORT_SPEECH_MIN_SCORE, scene = "短语音" ) } else -> { - val threshold = currentEnvBaseline * LONG_SPEECH_ENERGY_COEFF + val coeff = if (isQuietEnv) LONG_SPEECH_ENERGY_COEFF_QUIET else LONG_SPEECH_ENERGY_COEFF_NOISY + val energyThreshold = currentEnvBaseline * coeff + LogUtils.d(TAG, "📏 长语音阈值 | 场景: ${if (isQuietEnv) "安静" else "嘈杂"} | 系数: $coeff | 阈值: $energyThreshold") ThresholdConfig( - energyThreshold = threshold, + energyThreshold = energyThreshold, vadRatioThreshold = LONG_SPEECH_VAD_COEFF, minScore = LONG_SPEECH_MIN_SCORE, scene = "长语音" @@ -270,61 +279,40 @@ class VoiceController( } } - LogUtils.d(TAG, "📊 动态阈值 | ${thresholdConfig.scene} | 能量阈值: ${thresholdConfig.energyThreshold} | 占比阈值: ${thresholdConfig.vadRatioThreshold} | 最低分: ${thresholdConfig.minScore}") - - // 基础阈值过滤(调整后更宽松) - if (avgEnergy < thresholdConfig.energyThreshold || vadRatio < thresholdConfig.vadRatioThreshold) { - LogUtils.w(TAG, "❌ 阈值过滤 | 能量: $avgEnergy < ${thresholdConfig.energyThreshold} | 占比: $vadRatio < ${thresholdConfig.vadRatioThreshold}") + // ========== 5. 分场景阈值过滤 ========== + val energyPass = avgEnergy >= thresholdConfig.energyThreshold + val vadRatioPass = vadRatio >= thresholdConfig.vadRatioThreshold + if (!energyPass || !vadRatioPass) { + LogUtils.w(TAG, "❌ 低能量语音阈值过滤 | 能量: $avgEnergy < ${thresholdConfig.energyThreshold} | 占比: $vadRatio < ${thresholdConfig.vadRatioThreshold} | 场景: ${thresholdConfig.scene}") resetToWaitSpeech() return } - // 评分逻辑(恢复短语音保底分) + // ========== 6. 评分判定:极度宽松 ========== var score = 0 - // 1. 时长评分(恢复短语音保底1分) score += when { duration >= 4000 -> 3 duration >= 2500 -> 2 - duration >= 1500 -> 1 - duration >= SHORT_SPEECH_MIN -> 1 // 恢复保底分 - else -> 0 - } - // 2. 能量评分 - score += when { - avgEnergy >= thresholdConfig.energyThreshold * 10 -> 3 - avgEnergy >= thresholdConfig.energyThreshold * 5 -> 2 - avgEnergy >= thresholdConfig.energyThreshold -> 1 - else -> 0 - } - // 3. 占比+连续性评分(调整阈值) - score += when { - continuousRatio >= 0.7 -> 2 // 从0.8调低 - continuousRatio >= MIN_CONTINUOUS_FRAME_RATIO -> 1 - else -> 0 - } - - LogUtils.d(TAG, "🏆 评分结果 | 总分: $score | 最低分: ${thresholdConfig.minScore} | 连续占比: $continuousRatio") - - // 分场景判定(调整后更宽松) - val pass = if (duration in SHORT_SPEECH_MIN..SHORT_SPEECH_MAX) { - score >= thresholdConfig.minScore && continuousRatio >= 0.5 // 从0.7调低 - } else { - score >= thresholdConfig.minScore || (score >= 2 && avgEnergy >= currentEnvBaseline * 4) // 从3→2,6→4 + else -> 1 } + score += if (avgEnergy >= thresholdConfig.energyThreshold) 1 else 0 + score += if (continuousRatio >= MIN_CONTINUOUS_FRAME_RATIO) 1 else 0 + val pass = score >= thresholdConfig.minScore if (!pass) { - LogUtils.w(TAG, "❌ 评分/连续性不足过滤 | 总分: $score < ${thresholdConfig.minScore} | 连续占比: $continuousRatio") + LogUtils.w(TAG, "❌ 评分不足过滤 | 总分: $score < ${thresholdConfig.minScore} | 场景: ${thresholdConfig.scene}") resetToWaitSpeech() return } + // ========== 最终通过 ========== audioBuffer.clear() state = VoiceState.UPLOADING onFinalAudio(audio) - LogUtils.i(TAG, "✅ 录音通过 | 时长: $duration ms | 能量: $avgEnergy | 连续占比: $continuousRatio | 准备上传") + LogUtils.i(TAG, "✅ 低能量语音通过 | 时长: $duration ms | 能量: $avgEnergy | 场景: ${thresholdConfig.scene}") } - /* ================= 播放/上传/Reset 回调(无修改) ================= */ + /* ================= 播放/上传/Reset 回调 ================= */ fun onPlayStartPrompt() { LogUtils.d(TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline") state = VoiceState.PLAYING_PROMPT