From 6e811b99b7c62364200bf016b24202da0657fd03 Mon Sep 17 00:00:00 2001
From: ross <3024454314@qq.com>
Date: Sat, 3 Jan 2026 16:47:29 +0800
Subject: [PATCH] =?UTF-8?q?=E5=8E=BB=E9=99=A4=E5=A4=9A=E4=BD=99=E7=9A=84?=
 =?UTF-8?q?=E6=A8=A1=E5=9E=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../zs/smarthuman/sherpa/VoiceController.kt   | 41 +++++++++++--------
 1 file changed, 24 insertions(+), 17 deletions(-)
diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
index 60a7fab..108d6a3 100644
--- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
@@ -196,14 +196,31 @@ class VoiceController(
         val audio = audioBuffer.toFloatArray()
         val vadRatio = vadManager.activeSpeechRatio()
         val avgEnergy = if (speechFrameCount > 0) speechEnergySum / speechFrameCount else 0f
+        val peakRms = calcPeakRms(audio)
 
-        // ⭐ 多人说话检测：短时峰值方差
-        if (shortTermPeakVariance(audio) > 0.015f) {
-            Log.d(TAG, "❌ Short-term peak variance too high → likely multi-speaker")
+
+        val peakAvgRatio = if (avgEnergy > 0f) peakRms / avgEnergy else 0f
+
+        if (avgEnergy < MIN_AVG_ENERGY) {
+            Log.d(TAG, "❌ Avg energy too low: $avgEnergy → rejected")
             resetToWaitSpeech()
             return
         }
 
+        if (peakAvgRatio < 1.2f) {
+            Log.d(TAG, "❌ Peak/Avg ratio too low: $peakAvgRatio → rejected")
+            resetToWaitSpeech()
+            return
+        }
+
+
+        if (vadRatio < 0.40f) {
+            Log.d(TAG, "❌ VAD ratio too low: $vadRatio → rejected")
+            resetToWaitSpeech()
+            return
+        }
+
+        // Score-based acceptance; only reached when none of the early-return rejection checks above fired
         var score = 0
         when {
             duration >= 4000 -> score += 3
@@ -220,7 +237,7 @@ class VoiceController(
             vadRatio >= 0.40f -> score += 1
         }
 
-        Log.d(TAG, "📊 duration=$duration ms, vadRatio=$vadRatio, avgEnergy=$avgEnergy, score=$score")
+        Log.d(TAG, "📊 duration=$duration ms, vadRatio=$vadRatio, avgEnergy=$avgEnergy, peakRms=$peakRms, score=$score")
 
         val pass = when {
             score >= 6 -> true
@@ -234,25 +251,15 @@ class VoiceController(
             return
         }
 
+        // ✅ Passed → hand the audio off for upload
         waitSpeechFailStartMs = 0L
         audioBuffer.clear()
         state = VoiceState.UPLOADING
         onFinalAudio(audio)
     }
 
-    /** 计算短时峰值方差，用于多人说话检测 */
-    private fun shortTermPeakVariance(audio: FloatArray): Float {
-        val frameSize = 160 // 10ms @16kHz
-        val peaks = mutableListOf<Float>()
-        var i = 0
-        while (i + frameSize <= audio.size) {
-            val frame = audio.sliceArray(i until i + frameSize)
-            peaks.add(calcPeakRms(frame))
-            i += frameSize
-        }
-        val mean = peaks.average().toFloat()
-        return peaks.map { (it - mean) * (it - mean) }.average().toFloat()
-    }
+
+
 
     /** 计算音频帧峰值 */
     private fun calcPeakRms(audio: FloatArray): Float {