diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
index e112809..f06dcb7 100644
--- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
@@ -9,7 +9,7 @@ class VoiceController(
     assetManager: AssetManager,
     private val onWakeup: () -> Unit,
     private val onFinalAudio: (FloatArray) -> Unit,
-    private val idleTimeoutSeconds: Int = 8,
+    private val idleTimeoutSeconds: Int = 10,
     private val maxRecordingSeconds: Int = 10,
     private val onStateChanged: ((VoiceState) -> Unit)? = null,
     private val stopBackendAudio: (() -> Unit)? = null
@@ -62,7 +62,6 @@ class VoiceController(
 
     private val MIN_SPEECH_MS = 1000L
     private val MIN_AVG_ENERGY = 0.02f
-    private val WAIT_SPEECH_TIMEOUT_MS = 8000L
 
     /* ================= 音频入口 ================= */
 
@@ -93,7 +92,7 @@ class VoiceController(
 
             VoiceState.WAIT_SPEECH -> {
 
-                if (waitSpeechStartMs > 0 && now - waitSpeechStartMs >= WAIT_SPEECH_TIMEOUT_MS) {
+                if (waitSpeechStartMs > 0 && now - waitSpeechStartMs >= idleTimeoutSeconds * 1000) {
                     Log.d(TAG, "⏱ Wakeup but no speech → WAIT_WAKEUP")
                     resetAll()
                     return
@@ -196,9 +195,17 @@ class VoiceController(
             return
         }
 
+        val audio = audioBuffer.toFloatArray()
         val vadRatio = vadManager.activeSpeechRatio()
         val avgEnergy = if (speechFrameCount > 0) speechEnergySum / speechFrameCount else 0f
 
+        // ⭐ 多人说话检测:短时峰值方差
+        if (shortTermPeakVariance(audio) > 0.015f) {
+            Log.d(TAG, "❌ Short-term peak variance too high → likely multi-speaker")
+            resetToWaitSpeech()
+            return
+        }
+
         var score = 0
         when {
             duration >= 4000 -> score += 3
@@ -230,12 +237,36 @@ class VoiceController(
         }
 
         waitSpeechFailStartMs = 0L
-        val finalAudio = audioBuffer.toFloatArray()
         audioBuffer.clear()
         state = VoiceState.UPLOADING
-        onFinalAudio(finalAudio)
+        onFinalAudio(audio)
     }
 
+    /** 计算短时峰值方差,用于多人说话检测 */
+    private fun shortTermPeakVariance(audio: FloatArray): Float {
+        val frameSize = 160 // 10ms @16kHz
+        val peaks = mutableListOf()
+        var i = 0
+        while (i + frameSize <= audio.size) {
+            val frame = audio.sliceArray(i until i + frameSize)
+            peaks.add(calcPeakRms(frame))
+            i += frameSize
+        }
+        val mean = peaks.average().toFloat()
+        return peaks.map { (it - mean) * (it - mean) }.average().toFloat()
+    }
+
+    /** 计算音频帧峰值 */
+    private fun calcPeakRms(audio: FloatArray): Float {
+        var peak = 0f
+        for (v in audio) {
+            val abs = kotlin.math.abs(v)
+            if (abs > peak) peak = abs
+        }
+        return peak
+    }
+
+
     /* ================= 播放回调 ================= */
 
     fun onPlayStartPrompt() { state = VoiceState.PLAYING_PROMPT } // ⭐ 补全