diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt index d4a24a5..6212628 100644 --- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt @@ -3,6 +3,7 @@ package com.zs.smarthuman.sherpa import android.content.res.AssetManager import android.util.Log import kotlin.math.sqrt +import java.util.ArrayDeque class VoiceController( assetManager: AssetManager, @@ -17,8 +18,6 @@ class VoiceController( private val TAG = "VoiceController" private val sampleRate = 16000 - /* ================= 状态 ================= */ - private var state: VoiceState = VoiceState.WAIT_WAKEUP set(value) { field = value @@ -26,42 +25,29 @@ class VoiceController( onStateChanged?.invoke(value) } - /* ================= KWS ================= */ - private val wakeupManager = WakeupManager(assetManager) { Log.d(TAG, "🔥 WakeWord detected") handleWakeupEvent() } - /* ================= VAD ================= */ - private val vadManager = VadManager( assetManager, onSpeechStart = { onVadStart() }, onSpeechEnd = {} ) - /* ================= Buffer ================= */ - private val audioBuffer = mutableListOf() - - /** 前导音缓存(2 秒) */ private val preBuffer = ArrayDeque() private val PRE_BUFFER_SIZE = sampleRate * 2 - /* ================= 时间 ================= */ - private var recordingStartMs = 0L private var silenceStartMs = 0L private var waitSpeechFailStartMs = 0L - - /* ================= 近讲统计(⭐关键新增) ================= */ + private var waitSpeechStartMs = 0L private var speechEnergySum = 0f private var speechFrameCount = 0 - /* ================= 控制 ================= */ - private var vadStarted = false private var inKwsObserve = false @@ -71,28 +57,17 @@ class VoiceController( private var speechEnableAtMs = 0L private val SPEECH_COOLDOWN_MS = 300L - /* ================= 阈值(⭐已校正) ================= */ - - private val RMS_SILENCE_THRESHOLD = 0.012f // 静音阈值(修正) + private val RMS_SILENCE_THRESHOLD = 0.012f private val SILENCE_END_MS = 1200L - private val MIN_SPEECH_MS = 1000L // 句子级 - private val MIN_AVG_ENERGY = 0.02f // 近讲能量门 + private val MIN_SPEECH_MS = 1000L + private val MIN_AVG_ENERGY = 0.02f - - /** ⭐ 唤醒后等待人声起点 */ - private var waitSpeechStartMs = 0L - - /** ⭐ 唤醒后最大等待时间(没说一句话) */ private val WAIT_SPEECH_TIMEOUT_MS = 8000L - - /* ================= 音频入口 ================= */ fun acceptAudio(samples: FloatArray) { - cachePreBuffer(samples) - wakeupManager.acceptAudio(samples) if (wakeupManager.consumeWakeupFlag()) { handleWakeupEvent() @@ -102,7 +77,6 @@ class VoiceController( val now = System.currentTimeMillis() when (state) { - VoiceState.WAIT_WAKEUP, VoiceState.PLAYING_PROMPT, VoiceState.PLAYING_BACKEND, @@ -110,40 +84,38 @@ class VoiceController( VoiceState.WAIT_SPEECH_COOLDOWN -> { if (now >= speechEnableAtMs) { - waitSpeechFailStartMs = 0L // ⭐ 必须清 + waitSpeechFailStartMs = System.currentTimeMillis() state = VoiceState.WAIT_SPEECH - waitSpeechStartMs = now // ⭐ 关键:开始等人说话 + waitSpeechStartMs = now } return } VoiceState.WAIT_SPEECH -> { - // ⭐ 唤醒后长时间没人说话 → 自动退出 - if (waitSpeechStartMs > 0 && - now - waitSpeechStartMs >= WAIT_SPEECH_TIMEOUT_MS - ) { - Log.d(TAG, "⏱ Wakeup but no speech, exit to WAIT_WAKEUP") + if (waitSpeechStartMs > 0 && now - waitSpeechStartMs >= WAIT_SPEECH_TIMEOUT_MS) { + Log.d(TAG, "⏱ Wakeup but no speech → WAIT_WAKEUP") resetAll() return } - if (inKwsObserve) { - if (now - kwsObserveStartMs < KWS_OBSERVE_MS) return - inKwsObserve = false + if (waitSpeechFailStartMs > 0 && now - waitSpeechFailStartMs >= idleTimeoutSeconds * 1000) { + Log.d(TAG, "⏱ WAIT_SPEECH idle timeout → WAIT_WAKEUP") + resetAll() + return } + if (inKwsObserve && now - kwsObserveStartMs < KWS_OBSERVE_MS) return + inKwsObserve = false + vadManager.accept(samples) } - VoiceState.RECORDING -> { - audioBuffer.addAll(samples.asList()) vadManager.accept(samples) val rms = calcRms(samples) - if (rms > RMS_SILENCE_THRESHOLD) { speechEnergySum += rms speechFrameCount++ @@ -169,22 +141,20 @@ class VoiceController( private fun handleWakeupEvent() { when (state) { - VoiceState.UPLOADING -> return - VoiceState.RECORDING, VoiceState.PLAYING_BACKEND -> { stopBackendAudio?.invoke() enterWakeup(interrupt = true) } - else -> enterWakeup(interrupt = false) } } private fun enterWakeup(interrupt: Boolean) { - waitSpeechFailStartMs = 0L // ⭐ 唤醒即新会话 - waitSpeechStartMs = 0L + waitSpeechFailStartMs = System.currentTimeMillis() + waitSpeechStartMs = System.currentTimeMillis() + if (interrupt) { audioBuffer.clear() vadManager.reset() @@ -200,31 +170,23 @@ class VoiceController( onWakeup() } - /* ================= VAD START ================= */ - private fun onVadStart() { if (state != VoiceState.WAIT_SPEECH) return Log.d(TAG, "🎤 REAL VAD START") - vadStarted = true recordingStartMs = System.currentTimeMillis() silenceStartMs = 0L - waitSpeechFailStartMs = 0L // ⭐ 新一轮有效说话 - waitSpeechStartMs = 0L // ⭐ 清掉“等待说话”超时 resetEnergyStat() audioBuffer.clear() audioBuffer.addAll(preBuffer) - state = VoiceState.RECORDING } - - /* ================= 结束录音(⭐核心) ================= */ + /* ================= 结束录音 ================= */ private fun finishSentence() { - val now = System.currentTimeMillis() val duration = now - recordingStartMs @@ -235,46 +197,28 @@ class VoiceController( } val vadRatio = vadManager.activeSpeechRatio() - val avgEnergy = - if (speechFrameCount > 0) speechEnergySum / speechFrameCount else 0f - - /* ================= 评分制判定 ================= */ + val avgEnergy = if (speechFrameCount > 0) speechEnergySum / speechFrameCount else 0f var score = 0 - - // 1️⃣ 时长评分(最重要) when { duration >= 4000 -> score += 3 duration >= 2500 -> score += 2 duration >= 1500 -> score += 1 } - - // 2️⃣ 能量评分(近讲人声强信号) when { avgEnergy >= 0.10f -> score += 3 avgEnergy >= 0.06f -> score += 2 avgEnergy >= MIN_AVG_ENERGY -> score += 1 } - - // 3️⃣ VAD 评分(只作为辅助) when { vadRatio >= 0.55f -> score += 2 vadRatio >= 0.40f -> score += 1 } - Log.d( - TAG, - "📊 duration=$duration ms, vadRatio=$vadRatio, avgEnergy=$avgEnergy, score=$score" - ) + Log.d(TAG, "📊 duration=$duration ms, vadRatio=$vadRatio, avgEnergy=$avgEnergy, score=$score") - /** - * 评分阈值: - * - >=4 : 必然是真实人声 - * - 3 : 在近讲/长句条件下允许 - * - <3 : 拦截 - */ val pass = when { - score >= 4 -> true + score >= 6 -> true score == 3 && avgEnergy >= 0.06f -> true else -> false } @@ -285,33 +229,22 @@ class VoiceController( return } - /* ================= 通过,进入上传 ================= */ - waitSpeechFailStartMs = 0L - val finalAudio = audioBuffer.toFloatArray() audioBuffer.clear() - state = VoiceState.UPLOADING onFinalAudio(finalAudio) } - /* ================= 播放回调 ================= */ - fun onPlayStartPrompt() { - state = VoiceState.PLAYING_PROMPT - } - + fun onPlayStartPrompt() { state = VoiceState.PLAYING_PROMPT } // ⭐ 补全 fun onPlayEndPrompt() { speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS state = VoiceState.WAIT_SPEECH_COOLDOWN } - fun onPlayStartBackend() { - state = VoiceState.PLAYING_BACKEND - } - + fun onPlayStartBackend() { state = VoiceState.PLAYING_BACKEND } // ⭐ 补全 fun onPlayEndBackend() { speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS state = VoiceState.WAIT_SPEECH_COOLDOWN @@ -321,10 +254,8 @@ class VoiceController( fun onUploadFinished(success: Boolean) { if (state != VoiceState.UPLOADING) return - - state = if (success) { - VoiceState.PLAYING_BACKEND - } else { + state = if (success) VoiceState.PLAYING_BACKEND + else { speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS VoiceState.WAIT_SPEECH_COOLDOWN } @@ -335,13 +266,9 @@ class VoiceController( fun checkIdleTimeout() { if (state != VoiceState.WAIT_SPEECH) return if (waitSpeechFailStartMs == 0L) return - - if (System.currentTimeMillis() - waitSpeechFailStartMs > - idleTimeoutSeconds * 1000 - ) { - Log.d(TAG, "⏱ WAIT_SPEECH timeout") + if (System.currentTimeMillis() - waitSpeechFailStartMs > idleTimeoutSeconds * 1000) { + Log.d(TAG, "⏱ WAIT_SPEECH idle timeout → WAIT_WAKEUP") resetAll() - waitSpeechFailStartMs = 0L } } @@ -355,9 +282,7 @@ class VoiceController( silenceStartMs = 0L state = VoiceState.WAIT_SPEECH - if (waitSpeechFailStartMs == 0L) { - waitSpeechFailStartMs = System.currentTimeMillis() - } + if (waitSpeechFailStartMs == 0L) waitSpeechFailStartMs = System.currentTimeMillis() } private fun resetAll() { @@ -367,11 +292,11 @@ class VoiceController( resetEnergyStat() vadStarted = false silenceStartMs = 0L - waitSpeechStartMs = 0L // ⭐ + waitSpeechStartMs = 0L + waitSpeechFailStartMs = 0L state = VoiceState.WAIT_WAKEUP } - fun release() { wakeupManager.release() vadManager.reset() @@ -387,9 +312,7 @@ class VoiceController( private fun cachePreBuffer(samples: FloatArray) { for (s in samples) { preBuffer.addLast(s) - if (preBuffer.size > PRE_BUFFER_SIZE) { - preBuffer.removeFirst() - } + if (preBuffer.size > PRE_BUFFER_SIZE) preBuffer.removeFirst() } } diff --git a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt index 74183a0..27110f0 100644 --- a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt +++ b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt @@ -161,7 +161,7 @@ class MainActivity : BaseViewModelActivity() voiceInfo = mutableListOf().apply { add( VoiceBeanResp( - audioUrl = UserInfoManager.userInfo?.wakeUpAudioUrl ?: "https://static.seerteach.net/aidialogue/systemVoice/aliyun-nv.mp3" + audioUrl = /*UserInfoManager.userInfo?.wakeUpAudioUrl ?: */"https://static.seerteach.net/aidialogue/systemVoice/aliyun-nv.mp3" ) ) } @@ -169,17 +169,17 @@ class MainActivity : BaseViewModelActivity() }, onFinalAudio = { audio -> Log.d("lrs", "检测到语音,长度=${audio.size}") -// mViewModel?.uploadVoice( -// AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)), -// 1 -// ) - loadLocalJsonAndPlay() - val file = File( - getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(), - "xxx.wav" + mViewModel?.uploadVoice( + AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)), + 1 ) - AudioDebugUtil.saveFloatPcmAsWav(audio, file) - LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}") + loadLocalJsonAndPlay() +// val file = File( +// getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(), +// "xxx.wav" +// ) +// AudioDebugUtil.saveFloatPcmAsWav(audio, file) +// LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}") }, onStateChanged = { state -> @@ -343,7 +343,7 @@ class MainActivity : BaseViewModelActivity() word: String, audioUrl: String ) { - val wakeupUrl = UserInfoManager.userInfo?.wakeUpAudioUrl ?: "https://static.seerteach.net/aidialogue/systemVoice/aliyun-nv.mp3" ?: return + val wakeupUrl = /*UserInfoManager.userInfo?.wakeUpAudioUrl ?: */"https://static.seerteach.net/aidialogue/systemVoice/aliyun-nv.mp3" ?: return if (audioUrl != wakeupUrl) return