From 6e811b99b7c62364200bf016b24202da0657fd03 Mon Sep 17 00:00:00 2001 From: ross <3024454314@qq.com> Date: Sat, 3 Jan 2026 16:47:29 +0800 Subject: [PATCH] =?UTF-8?q?=E5=8E=BB=E9=99=A4=E5=A4=9A=E4=BD=99=E7=9A=84?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../zs/smarthuman/sherpa/VoiceController.kt | 41 +++++++++++-------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt index 60a7fab..108d6a3 100644 --- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt @@ -196,14 +196,31 @@ class VoiceController( val audio = audioBuffer.toFloatArray() val vadRatio = vadManager.activeSpeechRatio() val avgEnergy = if (speechFrameCount > 0) speechEnergySum / speechFrameCount else 0f + val peakRms = calcPeakRms(audio) - // ⭐ 多人说话检测:短时峰值方差 - if (shortTermPeakVariance(audio) > 0.015f) { - Log.d(TAG, "❌ Short-term peak variance too high → likely multi-speaker") + + val peakAvgRatio = if (avgEnergy > 0f) peakRms / avgEnergy else 0f + + if (avgEnergy < MIN_AVG_ENERGY) { + Log.d(TAG, "❌ Avg energy too low: $avgEnergy → rejected") resetToWaitSpeech() return } + if (peakAvgRatio < 1.2f) { + Log.d(TAG, "❌ Peak/Avg ratio too low: $peakAvgRatio → rejected") + resetToWaitSpeech() + return + } + + + if (vadRatio < 0.40f) { + Log.d(TAG, "❌ VAD ratio too low: $vadRatio → rejected") + resetToWaitSpeech() + return + } + + // 原评分逻辑 var score = 0 when { duration >= 4000 -> score += 3 @@ -220,7 +237,7 @@ class VoiceController( vadRatio >= 0.40f -> score += 1 } - Log.d(TAG, "📊 duration=$duration ms, vadRatio=$vadRatio, avgEnergy=$avgEnergy, score=$score") + Log.d(TAG, "📊 duration=$duration ms, vadRatio=$vadRatio, avgEnergy=$avgEnergy, peakRms=$peakRms, score=$score") val pass = when { score >= 6 -> true @@ -234,25 +251,15 @@ class VoiceController( return } + // ✅ 通过 → 上传 waitSpeechFailStartMs = 0L audioBuffer.clear() state = VoiceState.UPLOADING onFinalAudio(audio) } - /** 计算短时峰值方差,用于多人说话检测 */ - private fun shortTermPeakVariance(audio: FloatArray): Float { - val frameSize = 160 // 10ms @16kHz - val peaks = mutableListOf() - var i = 0 - while (i + frameSize <= audio.size) { - val frame = audio.sliceArray(i until i + frameSize) - peaks.add(calcPeakRms(frame)) - i += frameSize - } - val mean = peaks.average().toFloat() - return peaks.map { (it - mean) * (it - mean) }.average().toFloat() - } + + /** 计算音频帧峰值 */ private fun calcPeakRms(audio: FloatArray): Float {