优化后的代码

2026-01-03 15:31:49 +08:00 · 2026-01-03 15:31:49 +08:00 · a1c430bd40
commit a1c430bd40
parent 0806376687
1 changed files with 36 additions and 5 deletions
--- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
@ -9,7 +9,7 @@ class VoiceController(
    assetManager: AssetManager,
    private val onWakeup: () -> Unit,
    private val onFinalAudio: (FloatArray) -> Unit,
-    private val idleTimeoutSeconds: Int = 8,
+    private val idleTimeoutSeconds: Int = 10,
    private val maxRecordingSeconds: Int = 10,
    private val onStateChanged: ((VoiceState) -> Unit)? = null,
    private val stopBackendAudio: (() -> Unit)? = null
@ -62,7 +62,6 @@ class VoiceController(
    private val MIN_SPEECH_MS = 1000L
    private val MIN_AVG_ENERGY = 0.02f

-    private val WAIT_SPEECH_TIMEOUT_MS = 8000L

    /* ================= 音频入口 ================= */

@ -93,7 +92,7 @@ class VoiceController(

            VoiceState.WAIT_SPEECH -> {

-                if (waitSpeechStartMs > 0 && now - waitSpeechStartMs >= WAIT_SPEECH_TIMEOUT_MS) {
+                if (waitSpeechStartMs > 0 && now - waitSpeechStartMs >= idleTimeoutSeconds * 1000) {
                    Log.d(TAG, "⏱ Wakeup but no speech → WAIT_WAKEUP")
                    resetAll()
                    return
@ -196,9 +195,17 @@ class VoiceController(
            return
        }

+        val audio = audioBuffer.toFloatArray()
        val vadRatio = vadManager.activeSpeechRatio()
        val avgEnergy = if (speechFrameCount > 0) speechEnergySum / speechFrameCount else 0f

+        // ⭐ 多人说话检测：短时峰值方差
+        if (shortTermPeakVariance(audio) > 0.015f) {
+            Log.d(TAG, "❌ Short-term peak variance too high → likely multi-speaker")
+            resetToWaitSpeech()
+            return
+        }
+
        var score = 0
        when {
            duration >= 4000 -> score += 3
@ -230,12 +237,36 @@ class VoiceController(
        }

        waitSpeechFailStartMs = 0L
-        val finalAudio = audioBuffer.toFloatArray()
        audioBuffer.clear()
        state = VoiceState.UPLOADING
-        onFinalAudio(finalAudio)
+        onFinalAudio(audio)
    }

+    /**
+     * Computes the variance of per-frame peak amplitudes, used as a multi-speaker heuristic.
+     *
+     * [audio] is cut into consecutive, non-overlapping 160-sample frames (10 ms at 16 kHz,
+     * per the frame-size constant); a trailing partial frame is ignored. Each frame's peak
+     * is obtained from [calcPeakRms].
+     *
+     * @param audio samples to analyse.
+     * @return population variance of the frame peaks, or `0f` when [audio] does not contain
+     *   a single complete frame (averaging an empty set would otherwise yield `NaN`).
+     */
+    private fun shortTermPeakVariance(audio: FloatArray): Float {
+        val frameSize = 160 // 10ms @16kHz
+        val frameCount = audio.size / frameSize
+        // No complete frame: report zero variance instead of propagating NaN to callers.
+        if (frameCount == 0) return 0f
+        // A primitive array avoids boxing every peak into a List<Float>.
+        val peaks = FloatArray(frameCount) { frame ->
+            val start = frame * frameSize
+            calcPeakRms(audio.copyOfRange(start, start + frameSize))
+        }
+        val mean = peaks.average().toFloat()
+        // Squared deviations are computed in Float and accumulated in Double.
+        var squaredDeviationSum = 0.0
+        for (peak in peaks) {
+            val deviation = peak - mean
+            squaredDeviationSum += deviation * deviation
+        }
+        return (squaredDeviationSum / frameCount).toFloat()
+    }
+
+    /**
+     * Returns the peak of an audio frame: the largest absolute sample value in [audio].
+     *
+     * Despite the "Rms" in the name, no root-mean-square is computed. An empty array
+     * yields `0f`.
+     */
+    private fun calcPeakRms(audio: FloatArray): Float =
+        audio.fold(0f) { highest, sample ->
+            val magnitude = kotlin.math.abs(sample)
+            // Strict comparison keeps the running peak when the sample is NaN or not larger.
+            if (magnitude > highest) magnitude else highest
+        }
+
+
    /* ================= 播放回调 ================= */

    /** Playback callback: switches [state] to [VoiceState.PLAYING_PROMPT]. */
    fun onPlayStartPrompt() { state = VoiceState.PLAYING_PROMPT }        // ⭐ added for completeness