diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt index 98dfa6a..485d596 100644 --- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt @@ -53,7 +53,8 @@ class VoiceController( private val idleTimeoutMs = idleTimeoutSeconds * 1000L private val maxRecordingMs = maxRecordingSeconds * 1000L - private val vadManager = VadManager(assetManager, + private val vadManager = VadManager( + assetManager, onSpeechStart = { onVadStart() }, onSpeechEnd = { avgEnergy, peakRms -> onVadEnd(avgEnergy, peakRms) } ) @@ -100,7 +101,7 @@ class VoiceController( val result = it.getResult(stream) stream.release() result.text - }?:"" + } ?: "" } catch (e: Exception) { Log.e(TAG, "Error during ASR processing: ${e.message}") @@ -137,7 +138,8 @@ class VoiceController( VoiceState.WAIT_SPEECH -> { if ((waitSpeechStartMs > 0 && now - waitSpeechStartMs >= idleTimeoutMs) || - (waitSpeechFailStartMs > 0 && now - waitSpeechFailStartMs >= idleTimeoutMs)) { + (waitSpeechFailStartMs > 0 && now - waitSpeechFailStartMs >= idleTimeoutMs) + ) { Log.d(TAG, "⏱ WAIT_SPEECH idle timeout → WAIT_WAKEUP") resetAll() return @@ -215,7 +217,10 @@ class VoiceController( val peakAvgRatio = if (avgEnergy > 0f) peakRms / avgEnergy else 0f Log.d(TAG, "📊 Finish Sentence - duration: $duration ms, vadEnded: true") - Log.d(TAG, "📊 vadRatio=$vadRatio, avgEnergy=$avgEnergy, peakRms=$peakRms, peakAvgRatio=$peakAvgRatio") + Log.d( + TAG, + "📊 vadRatio=$vadRatio, avgEnergy=$avgEnergy, peakRms=$peakRms, peakAvgRatio=$peakAvgRatio" + ) if (avgEnergy < 0.02f || peakAvgRatio < 1.2f || vadRatio < 0.4f) { Log.d(TAG, "❌ Sentence rejected") @@ -230,14 +235,15 @@ class VoiceController( return } - val asrText = runSecondPass(audio) - if (asrText.isEmpty()) { - resetToWaitSpeech() - Log.d(TAG, "识别不出来asr") - return - } else { - Log.d(TAG, "识别出来:${asrText}") - } + var asrText = runSecondPass(audio) + + if (asrText.isEmpty()) { + resetToWaitSpeech() + Log.d(TAG, "识别不出来asr") + return + } else { + Log.d(TAG, "识别出来:${asrText}") + } // 评分逻辑 @@ -256,7 +262,10 @@ class VoiceController( vadRatio >= 0.55f -> score += 2 vadRatio >= 0.40f -> score += 1 } - Log.d(TAG, "📊 duration=$duration ms, vadRatio=$vadRatio, avgEnergy=$avgEnergy, score=$score") + Log.d( + TAG, + "📊 duration=$duration ms, vadRatio=$vadRatio, avgEnergy=$avgEnergy, score=$score" + ) val pass = score >= 5 || (score == 3 && avgEnergy >= 0.06f) if (!pass) { @@ -271,13 +280,19 @@ class VoiceController( } /* ================= 播放回调 ================= */ - fun onPlayStartPrompt() { state = VoiceState.PLAYING_PROMPT } + fun onPlayStartPrompt() { + state = VoiceState.PLAYING_PROMPT + } + fun onPlayEndPrompt() { speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS state = VoiceState.WAIT_SPEECH_COOLDOWN } - fun onPlayStartBackend() { state = VoiceState.PLAYING_BACKEND } + fun onPlayStartBackend() { + state = VoiceState.PLAYING_BACKEND + } + fun onPlayEndBackend() { speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS state = VoiceState.WAIT_SPEECH_COOLDOWN