From 761e98f1b6e4bfa24927475caa401c1ed6427a72 Mon Sep 17 00:00:00 2001
From: ross <3024454314@qq.com>
Date: Sun, 4 Jan 2026 15:47:31 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../com/zs/smarthuman/sherpa/VadManager.kt    | 64 +++++++++----------
 .../zs/smarthuman/sherpa/VoiceController.kt   | 12 +---
 2 files changed, 34 insertions(+), 42 deletions(-)

diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt
index 18d22aa..4029b3c 100644
--- a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt
@@ -8,85 +8,83 @@ import kotlin.math.sqrt
 class VadManager(
     assetManager: AssetManager,
     private val onSpeechStart: () -> Unit,
-    private val onSpeechEnd: (Float, Float) -> Unit // avgEnergy, peakRms
+    private val onSpeechEnd: (avgEnergy: Float, peakRms: Float) -> Unit
 ) {
+
     private val vad: Vad
     private var isSpeaking = false
     private var lastSpeechTime = 0L
 
-    /** 有效语音统计 */
+    private val END_SILENCE_MS = 800L
+
     private var activeFrameCount = 0
     private var activeSpeechFrameCount = 0
+
     private var speechEnergySum = 0f
     private var speechFrameCount = 0
     private var peakRms = 0f
 
-    private val END_SILENCE_MS = 800L
-
     init {
-        val config = getVadModelConfig(0)
-            ?: throw IllegalStateException("VAD config not found")
+        val config = getVadModelConfig(0) ?: throw IllegalStateException("VAD config not found")
         vad = Vad(assetManager, config)
     }
 
-    /** 外部调用的音频输入 */
     fun accept(samples: FloatArray) {
         val now = System.currentTimeMillis()
 
         vad.acceptWaveform(samples)
         val hasSpeech = vad.isSpeechDetected()
 
+        // Accumulate RMS energy and peak statistics for frames flagged as speech
         val rms = calcRms(samples)
+        if (hasSpeech) {
+            speechEnergySum += rms
+            speechFrameCount++
+            peakRms = maxOf(peakRms, rms)
+        }
+
+        // VAD logic: detect speech start, then speech end after END_SILENCE_MS of silence
         if (hasSpeech) {
             lastSpeechTime = now
             if (!isSpeaking) {
                 isSpeaking = true
-                resetStats()
                 onSpeechStart()
             }
-
-            // 累计有效语音能量和峰值
-            speechEnergySum += rms
-            speechFrameCount++
-            if (rms > peakRms) peakRms = rms
-
             activeFrameCount++
             activeSpeechFrameCount++
         } else {
-            if (isSpeaking) activeFrameCount++
-            // 检查结束
-            if (isSpeaking && now - lastSpeechTime >= END_SILENCE_MS) {
-                isSpeaking = false
-                val avgEnergy = if (speechFrameCount > 0) speechEnergySum / speechFrameCount else 0f
-                onSpeechEnd(avgEnergy, peakRms)
+            if (isSpeaking) {
+                activeFrameCount++
+                if (now - lastSpeechTime >= END_SILENCE_MS) {
+                    isSpeaking = false
+                    val avgEnergy = if (speechFrameCount > 0) speechEnergySum / speechFrameCount else 0f
+                    val peak = peakRms
+                    onSpeechEnd(avgEnergy, peak)
+                }
             }
         }
     }
 
-    /** 统计有效语音比例，用于 VoiceController */
-    fun activeSpeechRatio(): Float {
-        if (activeFrameCount == 0) return 0f
-        return activeSpeechFrameCount.toFloat() / activeFrameCount
-    }
+    fun activeSpeechRatio(): Float = if (activeFrameCount == 0) 0f else activeSpeechFrameCount.toFloat() / activeFrameCount
 
     fun reset() {
         isSpeaking = false
         lastSpeechTime = 0L
         activeFrameCount = 0
         activeSpeechFrameCount = 0
-        resetStats()
-        vad.reset()
-    }
-
-    private fun resetStats() {
         speechEnergySum = 0f
         speechFrameCount = 0
         peakRms = 0f
+        vad.reset()
     }
 
-    private fun calcRms(audio: FloatArray): Float {
+    private fun calcRms(samples: FloatArray): Float {
         var sum = 0f
-        for (v in audio) sum += v * v
-        return sqrt(sum / audio.size)
+        var peak = 0f
+        for (v in samples) {
+            sum += v * v
+            peak = maxOf(peak, kotlin.math.abs(v))
+        }
+        return sqrt(sum / samples.size)
     }
 }
diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
index c845d26..c0a2eb3 100644
--- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
@@ -107,15 +107,9 @@ class VoiceController(
 
     /* ================= 唤醒 ================= */
     private fun handleWakeupEvent() {
-        when (state) {
-            VoiceState.UPLOADING -> return
-            VoiceState.RECORDING,
-            VoiceState.PLAYING_BACKEND -> {
-                stopBackendAudio?.invoke()
-                enterWakeup(interrupt = true)
-            }
-            else -> enterWakeup(interrupt = false)
-        }
+        if (state == VoiceState.UPLOADING) return
+        stopBackendAudio?.invoke()
+        enterWakeup(interrupt = true)
     }
 
     private fun enterWakeup(interrupt: Boolean) {