diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt index 6860f54..cbc3c71 100644 --- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt @@ -3,6 +3,7 @@ package com.zs.smarthuman.sherpa import android.content.res.AssetManager import android.util.Log + class VoiceController( assetManager: AssetManager, private val onWakeup: () -> Unit, @@ -10,23 +11,25 @@ class VoiceController( private val idleTimeoutSeconds: Int = 15, private val onStateChanged: ((VoiceState) -> Unit)? = null ) { + private val TAG = "VoiceController" private var state: VoiceState = VoiceState.WAIT_WAKEUP set(value) { field = value onStateChanged?.invoke(value) - Log.d("VoiceController", "当前状态: $value") + Log.d(TAG, "当前状态: $value") } /** ================= 唤醒 ================= */ - private var wakeupLocked = false private var wakeupFlag = false + private var wakeupDiscardEndTime = 0L + private val WAKEUP_DISCARD_BUFFER_MS = 200L // 提示音前后缓冲 private val wakeupManager = WakeupManager(assetManager) { - Log.d("VoiceController", "唤醒触发! wakeupFlag=$wakeupFlag, 当前状态=$state") + Log.d(TAG, "唤醒触发! 当前状态=$state") wakeupFlag = true onWakeup() - startWaitSpeech() + playLocalPrompt() } /** ================= VAD ================= */ @@ -38,63 +41,83 @@ class VoiceController( /** ================= 音频缓存 ================= */ private val audioBuffer = mutableListOf() - private val preBuffer = ArrayDeque() // pre-roll 1 秒 + private val preBuffer = ArrayDeque() // pre-roll 1 秒 private val PRE_BUFFER_SIZE = 16000 private var idleTimer = 0L - var isPlaying = false - private set - /** ================= 尾部静音控制 ================= */ private var vadEndPending = false private var vadEndTime = 0L private val END_SILENCE_MS = 1000L - /* ================= 公共接口 ================= */ + var isPlaying = false + private set + + /** ================= 公共接口 ================= */ fun initWakeup(keywords: String) { wakeupManager.initStream(keywords) - onStateChanged?.invoke(state) } fun acceptAudio(samples: FloatArray) { cachePreBuffer(samples) - - // KWS 永远运行 wakeupManager.acceptAudio(samples) + val now = System.currentTimeMillis() + + // 播放本地提示音或后台音频期间,不喂 VAD + if (state == VoiceState.PLAYING_PROMPT || state == VoiceState.PLAYING_BACKEND) return + + val discardAudio = now < wakeupDiscardEndTime + when (state) { VoiceState.WAIT_WAKEUP -> Unit - - VoiceState.WAIT_SPEECH -> vadManager.accept(samples) - - VoiceState.RECORDING -> { + VoiceState.WAIT_SPEECH -> if (!discardAudio) vadManager.accept(samples) + VoiceState.RECORDING -> if (!discardAudio) { audioBuffer.addAll(samples.asList()) vadManager.accept(samples) - idleTimer = System.currentTimeMillis() - - if (vadEndPending && System.currentTimeMillis() - vadEndTime >= END_SILENCE_MS) { + idleTimer = now + if (vadEndPending && now - vadEndTime >= END_SILENCE_MS) { finishSentence() } } - VoiceState.PLAYING -> Unit + VoiceState.PLAYING_PROMPT -> { + + } + VoiceState.PLAYING_BACKEND -> {} } } - fun onPlayStart() { + fun onPlayStartPrompt() { isPlaying = true - state = VoiceState.PLAYING + state = VoiceState.PLAYING_PROMPT + Log.d(TAG, "播放提示音, 状态变为 PLAYING_PROMPT") } - fun onPlayEnd() { + fun onPlayEndPrompt() { isPlaying = false - if (state == VoiceState.WAIT_SPEECH) return - reset() + state = VoiceState.WAIT_SPEECH + idleTimer = System.currentTimeMillis() + // 设置 discardAudio 时间,丢弃提示音残留音频 + wakeupDiscardEndTime = System.currentTimeMillis() + WAKEUP_DISCARD_BUFFER_MS + Log.d(TAG, "提示音播放结束, 状态变为 WAIT_SPEECH") + } + + fun onPlayStartBackend() { + isPlaying = true + state = VoiceState.PLAYING_BACKEND + Log.d(TAG, "播放后台音频, 状态变为 PLAYING_BACKEND") + } + + fun onPlayEndBackend() { + isPlaying = false + state = VoiceState.WAIT_WAKEUP + idleTimer = System.currentTimeMillis() + Log.d(TAG, "后台音频播放结束, 状态变为 WAIT_WAKEUP") } fun checkIdleTimeout() { if (state != VoiceState.WAIT_SPEECH) return - val now = System.currentTimeMillis() if (now - idleTimer > idleTimeoutSeconds * 1000) { reset() @@ -107,10 +130,11 @@ class VoiceController( preBuffer.clear() vadManager.reset() wakeupManager.reset() - wakeupLocked = false wakeupFlag = false + wakeupDiscardEndTime = 0 idleTimer = 0 vadEndPending = false + Log.d(TAG, "已重置, 状态变为 WAIT_WAKEUP") } fun release() { @@ -121,69 +145,56 @@ class VoiceController( idleTimer = 0 isPlaying = false state = VoiceState.WAIT_WAKEUP - wakeupLocked = false wakeupFlag = false + wakeupDiscardEndTime = 0 vadEndPending = false } /* ================= 内部逻辑 ================= */ - private fun startWaitSpeech() { - state = VoiceState.WAIT_SPEECH - audioBuffer.clear() - idleTimer = System.currentTimeMillis() + private fun playLocalPrompt() { + onPlayStartPrompt() + // 在这里播放 "我在" 音频,播放结束后必须调用 onPlayEndPrompt() } private fun onVadSpeechStart() { vadEndPending = false - - Log.d("VoiceController", "VAD开始, 当前状态=$state, wakeupFlag=$wakeupFlag") - - // 如果是唤醒词音频则丢弃 - if (wakeupFlag) { - wakeupFlag = false - Log.d("VoiceController", "丢弃唤醒词音频") - return - } + val now = System.currentTimeMillis() + Log.d(TAG, "VAD开始, 当前状态=$state, wakeupFlag=$wakeupFlag") if (state != VoiceState.WAIT_SPEECH) return + if (wakeupFlag && now < wakeupDiscardEndTime) { + Log.d(TAG, "丢弃唤醒词残留音频") + return + } state = VoiceState.RECORDING audioBuffer.clear() audioBuffer.addAll(preBuffer) - idleTimer = System.currentTimeMillis() - - Log.d("VoiceController", "开始录音, 状态变为=$state") + idleTimer = now + Log.d(TAG, "开始录音, 状态变为 RECORDING") } private fun onVadSpeechEnd() { if (state != VoiceState.RECORDING) return vadEndPending = true vadEndTime = System.currentTimeMillis() - Log.d("VoiceController", "VAD结束, 等待尾部静音") + Log.d(TAG, "VAD结束, 等待尾部静音") } private fun finishSentence() { vadEndPending = false state = VoiceState.WAIT_WAKEUP - wakeupLocked = false - val finalAudio = audioBuffer.toFloatArray() audioBuffer.clear() - - if (finalAudio.isNotEmpty()) { - onFinalAudio(finalAudio) - } - + if (finalAudio.isNotEmpty()) onFinalAudio(finalAudio) idleTimer = 0 - Log.d("VoiceController", "录音结束, 返回 WAIT_WAKEUP") + Log.d(TAG, "录音结束, 返回 WAIT_WAKEUP") } private fun cachePreBuffer(samples: FloatArray) { for (s in samples) { preBuffer.addLast(s) - if (preBuffer.size > PRE_BUFFER_SIZE) { - preBuffer.removeFirst() - } + if (preBuffer.size > PRE_BUFFER_SIZE) preBuffer.removeFirst() } } } diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceState.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceState.kt index 8eb83d2..74c80e2 100644 --- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceState.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceState.kt @@ -6,8 +6,9 @@ package com.zs.smarthuman.sherpa * @date: 2025/12/17 10:20 */ enum class VoiceState { - WAIT_WAKEUP, - WAIT_SPEECH, - RECORDING, - PLAYING + WAIT_WAKEUP, // 等待唤醒 + PLAYING_PROMPT, // 播放本地提示音 + WAIT_SPEECH, // 等待用户说话 + RECORDING, // 用户正在说话 + PLAYING_BACKEND // 播放后台返回音频 } \ No newline at end of file diff --git a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt index f4f16c6..c835e1e 100644 --- a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt +++ b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt @@ -183,9 +183,11 @@ class MainActivity : BaseViewModelActivity() VoiceState.WAIT_SPEECH -> Log.d("lrs", "当前状态: 唤醒成功,等待说话") VoiceState.RECORDING -> Log.d("lrs", "当前状态: 正在录音") - VoiceState.PLAYING -> Log.d("lrs", "当前状态: Unity 播放中") + VoiceState.PLAYING_PROMPT -> Log.d("lrs", "当前状态: 播放本地音频") + VoiceState.PLAYING_BACKEND -> Log.d("lrs", "当前状态: 播放后台音频") } - } + }, + ) // 初始化唤醒词 voiceController?.initWakeup("x iǎo z ì @小智") @@ -266,10 +268,10 @@ class MainActivity : BaseViewModelActivity() private fun processAudio() { - val interval = 0.1 - val bufferSize = (interval * sampleRateInHz).toInt() // in samples +// val interval = 0.1 +// val bufferSize = (interval * sampleRateInHz).toInt() // in samples // val buffer = ShortArray(bufferSize) -// val bufferSize = 512 // in samples + val bufferSize = 512 // in samples val buffer = ShortArray(bufferSize) while (isRecording) { val ret = audioRecord?.read(buffer, 0, buffer.size) ?: 0 @@ -297,15 +299,26 @@ class MainActivity : BaseViewModelActivity() word: String, audioUrl: String ) { - if (state == 1 && audioUrl == "https://static.seerteach.net/largemodel/smart_read_audio/intensive_reading/689450143596d58606a106e5/689450143596d58606a106e5_1.mp3") { + if (state == 1) { if (!isPlayed) { isPlayed = true - voiceController?.onPlayStart() + if (audioUrl == UserInfoManager.userInfo?.wakeUpAudioUrl){ + voiceController?.onPlayStartPrompt() + }else{ + voiceController?.onPlayStartBackend() + } } } - if (state == 3&& audioUrl == "https://static.seerteach.net/largemodel/smart_read_audio/intensive_reading/689450143596d58606a106e5/689450143596d58606a106e5_1.mp3") { - voiceController?.onPlayEnd() + + if (state == 3){ + if (audioUrl == UserInfoManager.userInfo?.wakeUpAudioUrl){ + voiceController?.onPlayEndPrompt() + + }else{ + voiceController?.onPlayEndBackend() + } + isPlayed = false } }