diff --git a/app/build.gradle b/app/build.gradle
index 58e31ef..c332a7d 100644
--- a/app/build.gradle
+++ b/app/build.gradle
@@ -180,6 +180,6 @@ dependencies {
     implementation libs.androidautosize
-    implementation files('libs/sherpa19.aar')
+    implementation files('libs/sherpa-onnx-1.12.20.aar')
 }
\ No newline at end of file
diff --git a/app/libs/sherpa19.aar b/app/libs/sherpa-onnx-1.12.20.aar
similarity index 85%
rename from app/libs/sherpa19.aar
rename to app/libs/sherpa-onnx-1.12.20.aar
index 134df6c..134fe69 100644
Binary files a/app/libs/sherpa19.aar and b/app/libs/sherpa-onnx-1.12.20.aar differ
diff --git a/app/src/main/AndroidManifest.xml b/app/src/main/AndroidManifest.xml
index d64e047..d698cc6 100644
--- a/app/src/main/AndroidManifest.xml
+++ b/app/src/main/AndroidManifest.xml
@@ -52,7 +52,7 @@
         tools:targetApi="31">
@@ -66,9 +66,9 @@
-
+
             android:screenOrientation="portrait"/>-->
diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt
index 72f0021..7848ca6 100644
--- a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt
@@ -10,42 +10,63 @@ import com.k2fsa.sherpa.onnx.getVadModelConfig
  * @date: 2025/12/17 10:22
  */
 class VadManager(
-    private val assetManager: AssetManager,
+    assetManager: AssetManager,
     private val onSpeechStart: () -> Unit,
     private val onSpeechEnd: () -> Unit
 ) {
     private val vad: Vad
+
     private var isSpeaking = false
+    private var lastSpeechTime = 0L
+
+    // ⭐ For statistics
+    private var speechFrameCount = 0
+    private var totalFrameCount = 0
+
+    private val END_SILENCE_MS = 600L
 
     init {
-        val config = getVadModelConfig(0)
-        if (config == null) {
-            throw IllegalStateException("VAD config not found")
-        }
-        vad = Vad(assetManager = assetManager, config = config)
+        val config = getVadModelConfig(1)
+            ?: throw IllegalStateException("VAD config not found")
+        vad = Vad(assetManager, config)
     }
 
-    /** Feed one audio frame (16 kHz PCM float) */
     fun accept(samples: FloatArray) {
-        vad.acceptWaveform(samples)
-        val speechDetected = vad.isSpeechDetected()
+        val now = System.currentTimeMillis()
 
-        if (speechDetected && !isSpeaking) {
-            isSpeaking = true
-            onSpeechStart()
-        } else if (!speechDetected && isSpeaking) {
-            isSpeaking = false
-            onSpeechEnd()
-            // ⭐ Clear the VAD only when a sentence ends
-            vad.clear()
+        vad.acceptWaveform(samples)
+        val hasSpeech = vad.isSpeechDetected()
+
+        totalFrameCount++
+
+        if (hasSpeech) {
+            speechFrameCount++
+            lastSpeechTime = now
+
+            if (!isSpeaking) {
+                isSpeaking = true
+                onSpeechStart()
+            }
+        } else {
+            if (isSpeaking && now - lastSpeechTime >= END_SILENCE_MS) {
+                isSpeaking = false
+                onSpeechEnd()
+                vad.clear()
+            }
         }
     }
 
+    /** 👉 Ratio of speech frames (where the VAD result is actually used) */
+    fun speechRatio(): Float {
+        if (totalFrameCount == 0) return 0f
+        return speechFrameCount.toFloat() / totalFrameCount
+    }
 
-    /** Reset internal state */
     fun reset() {
         isSpeaking = false
+        lastSpeechTime = 0
+        speechFrameCount = 0
+        totalFrameCount = 0
         vad.reset()
     }
 }
-
diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
index 8c731f9..a88e03f 100644
--- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
@@ -8,15 +8,17 @@ class VoiceController(
     assetManager: AssetManager,
     private val onWakeup: () -> Unit,
     private val onFinalAudio: (FloatArray) -> Unit,
-    private val idleTimeoutSeconds: Int = 15,
+    private val idleTimeoutSeconds: Int = 5,
     private val maxRecordingSeconds: Int = 10,
     private val onStateChanged: ((VoiceState) -> Unit)? = null,
     private val stopBackendAudio: (() -> Unit)? = null
 ) {
+    private val TAG = "VoiceController"
     private val sampleRate = 16000
 
     /* ================= State ================= */
+
     private var state: VoiceState = VoiceState.WAIT_WAKEUP
         set(value) {
             field = value
@@ -24,160 +26,258 @@ class VoiceController(
             onStateChanged?.invoke(value)
         }
 
-    /* ================= Wakeup ================= */
+    /* ================= KWS ================= */
+
     private val wakeupManager = WakeupManager(assetManager) {
         Log.d(TAG, "🔥 WakeWord detected")
-        stopBackendAudio?.invoke()
-        if (state != VoiceState.UPLOADING) { // do not reset while uploading
-            resetAll()
-            state = VoiceState.PLAYING_PROMPT
-        }
-        onWakeup()
+        handleWakeupEvent()
     }
 
-    /* ================= VAD (START only) ================= */
+    /* ================= VAD ================= */
+
     private val vadManager = VadManager(
         assetManager,
         onSpeechStart = { onVadStart() },
-        onSpeechEnd = { /* no longer used for end-of-speech */ }
+        onSpeechEnd = {}
     )
 
-    /* ================= Audio buffers ================= */
+    /* ================= Buffer ================= */
+
     private val audioBuffer = mutableListOf<Float>()
+
+    /** Pre-roll buffer (2 seconds) */
     private val preBuffer = ArrayDeque<Float>()
-    private val PRE_BUFFER_SIZE = sampleRate // 1-second pre-roll
+    private val PRE_BUFFER_SIZE = sampleRate * 2
 
     /* ================= Timing ================= */
-    private var idleTimer = 0L
-    private var recordingStartTime = 0L
+
+    private var recordingStartMs = 0L
+    private var silenceStartMs = 0L
+
+    /** ⭐ Start of a run of consecutive WAIT_SPEECH failures (key) */
+    private var waitSpeechFailStartMs = 0L
+
+    /* ================= Control ================= */
+
     private var vadStarted = false
 
-    /* ================= RMS silence detection ================= */
-    private var silenceStartMs = 0L
-    private val SILENCE_END_MS = 1200L // how long silence must last to end a sentence
-    private val RMS_SILENCE_THRESHOLD = 0.005f // more sensitive
-    private val MIN_SPEECH_DURATION_MS = 300L // minimum valid speech duration
-    private val MIN_SPEECH_RATIO = 0.15f // at least 15% of frames must be valid
+    /** Post-wakeup observation window */
+    private var inKwsObserve = false
+    private var kwsObserveStartMs = 0L
+    private val KWS_OBSERVE_MS = 500L
+
+    /** Playback cooldown */
+    private var speechEnableAtMs = 0L
+    private val SPEECH_COOLDOWN_MS = 300L
+
+    /* ================= Thresholds ================= */
+
+    private val RMS_SILENCE_THRESHOLD = 0.005f
+    private val SILENCE_END_MS = 1200L
+    private val MIN_SPEECH_MS = 300L
 
     /* ================= Audio entry point ================= */
+
     fun acceptAudio(samples: FloatArray) {
-        // wakeup is handled independently, always feed it
+
+        cachePreBuffer(samples)
+
         wakeupManager.acceptAudio(samples)
-
-        if (state == VoiceState.UPLOADING ||
-            state == VoiceState.PLAYING_PROMPT ||
-            state == VoiceState.PLAYING_BACKEND
-        ) return
-
-        if (state == VoiceState.WAIT_SPEECH) {
-            cachePreBuffer(samples)
-            vadManager.accept(samples)
+        if (wakeupManager.consumeWakeupFlag()) {
+            handleWakeupEvent()
             return
         }
 
-        if (state != VoiceState.RECORDING) return
-
-        // ===== RECORDING =====
-        audioBuffer.addAll(samples.asList())
-        vadManager.accept(samples)
-
         val now = System.currentTimeMillis()
 
-        // 1️⃣ Max-recording safeguard
-        if (now - recordingStartTime >= maxRecordingSeconds * 1000) {
-            Log.w(TAG, "⏱ Max recording reached")
-            finishSentence()
-            return
-        }
+        when (state) {
 
-        // 2️⃣ RMS end-of-silence check
-        val rms = calcRms(samples)
-        if (rms < RMS_SILENCE_THRESHOLD) {
-            if (silenceStartMs == 0L) silenceStartMs = now
-            else if (now - silenceStartMs >= SILENCE_END_MS) {
-                Log.d(TAG, "🔇 RMS silence end")
-                finishSentence()
+            VoiceState.WAIT_WAKEUP,
+            VoiceState.PLAYING_PROMPT,
+            VoiceState.PLAYING_BACKEND,
+            VoiceState.UPLOADING -> return
+
+            VoiceState.WAIT_SPEECH_COOLDOWN -> {
+                if (now >= speechEnableAtMs) {
+                    state = VoiceState.WAIT_SPEECH
+                }
+                return
             }
-        } else {
+
+            VoiceState.WAIT_SPEECH -> {
+
+                if (inKwsObserve) {
+                    if (now - kwsObserveStartMs < KWS_OBSERVE_MS) return
+                    inKwsObserve = false
+                }
+
+                vadManager.accept(samples)
+            }
+
+            VoiceState.RECORDING -> {
+
+                audioBuffer.addAll(samples.asList())
+                vadManager.accept(samples)
+
+                if (now - recordingStartMs > maxRecordingSeconds * 1000) {
+                    Log.w(TAG, "⏱ Max recording reached")
+                    finishSentence()
+                    return
+                }
+
+                val rms = calcRms(samples)
+                if (rms < RMS_SILENCE_THRESHOLD) {
+                    if (silenceStartMs == 0L) silenceStartMs = now
+                    else if (now - silenceStartMs >= SILENCE_END_MS) {
+                        Log.d(TAG, "🔇 RMS silence end")
+                        finishSentence()
+                    }
+                } else {
+                    silenceStartMs = 0L
+                }
+            }
+        }
+    }
+
+    /* ================= Wakeup ================= */
+
+    private fun handleWakeupEvent() {
+        when (state) {
+
+            VoiceState.UPLOADING -> return
+
+            VoiceState.RECORDING,
+            VoiceState.PLAYING_BACKEND -> {
+                stopBackendAudio?.invoke()
+                enterWakeup(interrupt = true)
+            }
+
+            else -> enterWakeup(interrupt = false)
+        }
+    }
+
+    private fun enterWakeup(interrupt: Boolean) {
+
+        if (interrupt) {
+            audioBuffer.clear()
+            vadManager.reset()
+            vadStarted = false
             silenceStartMs = 0L
         }
+
+        inKwsObserve = true
+        kwsObserveStartMs = System.currentTimeMillis()
+
+        state = VoiceState.PLAYING_PROMPT
+        onWakeup()
     }
 
     /* ================= VAD START ================= */
+
     private fun onVadStart() {
         if (state != VoiceState.WAIT_SPEECH) return
-        Log.d(TAG, "🎤 VAD START")
+        Log.d(TAG, "🎤 REAL VAD START")
+
         vadStarted = true
-        state = VoiceState.RECORDING
-        recordingStartTime = System.currentTimeMillis()
+        recordingStartMs = System.currentTimeMillis()
         silenceStartMs = 0L
+
         audioBuffer.clear()
         audioBuffer.addAll(preBuffer)
-        preBuffer.clear()
+
+        state = VoiceState.RECORDING
     }
 
     /* ================= Finish recording ================= */
+
     private fun finishSentence() {
-        val speakTime = System.currentTimeMillis() - recordingStartTime
-        if (!vadStarted || speakTime < MIN_SPEECH_DURATION_MS) {
-            Log.d(TAG, "⛔ Speech too short, ignore")
-            resetToWaitSpeech(refreshIdle = false)
+        val duration = System.currentTimeMillis() - recordingStartMs
+        if (!vadStarted || duration < MIN_SPEECH_MS) {
+            resetToWaitSpeech()
             return
         }
 
-        val rmsFrames = calcRmsFrames(audioBuffer.toFloatArray(), frameSize = 320)
-        val validFrames = rmsFrames.count { it >= RMS_SILENCE_THRESHOLD }
-        val ratio = if (rmsFrames.isEmpty()) 0f else validFrames.toFloat() / rmsFrames.size
-        Log.d(TAG, "RMS ratio=$ratio")
-        if (ratio < MIN_SPEECH_RATIO) {
-            Log.d(TAG, "❌ Not enough human voice (ratio=$ratio)")
-            resetToWaitSpeech(refreshIdle = false)
+        val vadRatio = vadManager.speechRatio()
+        Log.d(TAG, "🎙 VAD speech ratio=$vadRatio")
+
+        if (vadRatio < 0.25f) {
+            Log.d(TAG, "❌ VAD says NOT human speech")
+            resetToWaitSpeech()
             return
         }
 
+        // ✅ One success, clear the failure timer
+        waitSpeechFailStartMs = 0L
+
         val finalAudio = audioBuffer.toFloatArray()
         audioBuffer.clear()
         state = VoiceState.UPLOADING
-        Log.d(TAG, "⬆ Upload audio len=${finalAudio.size}")
         onFinalAudio(finalAudio)
     }
 
     /* ================= Playback callbacks ================= */
-    fun onPlayStartPrompt() { state = VoiceState.PLAYING_PROMPT }
-    fun onPlayEndPrompt() { state = VoiceState.WAIT_SPEECH; idleTimer = System.currentTimeMillis() }
-    fun onPlayStartBackend() { state = VoiceState.PLAYING_BACKEND }
-    fun onPlayEndBackend() { state = VoiceState.WAIT_SPEECH; idleTimer = System.currentTimeMillis() }
-    /* ================= Upload callback ================= */
-    fun onUploadFinished(success: Boolean) {
-        if (state != VoiceState.UPLOADING) return
-        state = if (success) VoiceState.PLAYING_BACKEND else VoiceState.WAIT_SPEECH
-        idleTimer = System.currentTimeMillis()
+
+    fun onPlayStartPrompt() {
+        state = VoiceState.PLAYING_PROMPT
     }
-    /* ================= Idle ================= */
+
+    fun onPlayEndPrompt() {
+        speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
+        state = VoiceState.WAIT_SPEECH_COOLDOWN
+    }
+
+    fun onPlayStartBackend() {
+        state = VoiceState.PLAYING_BACKEND
+    }
+
+    fun onPlayEndBackend() {
+        speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
+        state = VoiceState.WAIT_SPEECH_COOLDOWN
+    }
+
+    /* ================= Upload callback (kept public) ================= */
+
+    fun onUploadFinished(success: Boolean) {
+        if (state != VoiceState.UPLOADING) return
+
+        state = if (success) {
+            VoiceState.PLAYING_BACKEND
+        } else {
+            speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
+            VoiceState.WAIT_SPEECH_COOLDOWN
+        }
+    }
+
+    /* ================= Idle timeout (key fix) ================= */
+
     fun checkIdleTimeout() {
-        // do not count while uploading
         if (state != VoiceState.WAIT_SPEECH) return
+        if (waitSpeechFailStartMs == 0L) return
+
         val now = System.currentTimeMillis()
-        if (now - idleTimer > idleTimeoutSeconds * 1000) {
-            Log.d(TAG, "⏱ Idle timeout reached, resetAll")
+        if (now - waitSpeechFailStartMs > idleTimeoutSeconds * 1000) {
+            Log.d(TAG, "⏱ WAIT_SPEECH continuous fail timeout")
            resetAll()
+            waitSpeechFailStartMs = 0L
         }
     }
 
     /* ================= Reset ================= */
-    private fun resetToWaitSpeech(refreshIdle: Boolean = true) {
+
+    private fun resetToWaitSpeech() {
         audioBuffer.clear()
-        preBuffer.clear()
         vadManager.reset()
         vadStarted = false
         silenceStartMs = 0L
         state = VoiceState.WAIT_SPEECH
-        if (refreshIdle) idleTimer = System.currentTimeMillis()
+
+        // ⭐ Record only on the first failure
+        if (waitSpeechFailStartMs == 0L) {
+            waitSpeechFailStartMs = System.currentTimeMillis()
+        }
     }
 
     private fun resetAll() {
@@ -190,36 +290,24 @@ class VoiceController(
     }
 
     fun release() {
-        vadManager.reset()
         wakeupManager.release()
+        vadManager.reset()
     }
 
-    /* ================= Utilities ================= */
+    /* ================= Utils ================= */
+
     private fun cachePreBuffer(samples: FloatArray) {
         for (s in samples) {
             preBuffer.addLast(s)
-            if (preBuffer.size > PRE_BUFFER_SIZE) preBuffer.removeFirst()
+            if (preBuffer.size > PRE_BUFFER_SIZE) {
+                preBuffer.removeFirst()
+            }
         }
     }
 
     private fun calcRms(audio: FloatArray): Float {
-        if (audio.isEmpty()) return 0f
         var sum = 0f
         for (v in audio) sum += v * v
         return sqrt(sum / audio.size)
     }
-
-    private fun calcRmsFrames(audio: FloatArray, frameSize: Int = 320): FloatArray {
-        val rmsList = mutableListOf<Float>()
-        var i = 0
-        while (i < audio.size) {
-            val end = minOf(i + frameSize, audio.size)
-            val frame = audio.sliceArray(i until end)
-            var sum = 0f
-            for (v in frame) sum += v * v
-            rmsList.add(sqrt(sum / frame.size))
-            i += frameSize
-        }
-        return rmsList.toFloatArray()
-    }
 }
diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceState.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceState.kt
index 004035d..cd8c829 100644
--- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceState.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceState.kt
@@ -11,5 +11,6 @@ enum class VoiceState {
     WAIT_SPEECH, // waiting for the user to speak
     RECORDING, // the user is speaking
     UPLOADING, // audio upload in progress
+    WAIT_SPEECH_COOLDOWN, // ⭐ cooldown after wakeup
     PLAYING_BACKEND // playing the audio returned by the backend
 }
\ No newline at end of file
diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/WakeupManager.kt b/app/src/main/java/com/zs/smarthuman/sherpa/WakeupManager.kt
index c7fcbce..0c7a72c 100644
--- a/app/src/main/java/com/zs/smarthuman/sherpa/WakeupManager.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/WakeupManager.kt
@@ -1,18 +1,18 @@
 package com.zs.smarthuman.sherpa
 
 import android.content.res.AssetManager
+import android.util.Log
 import com.k2fsa.sherpa.onnx.*
 
-class WakeupManager(
-    assetManager: AssetManager,
-    private val onWakeup: () -> Unit
-) {
+class WakeupManager(assetManager: AssetManager, function: () -> Unit) {
+    private val TAG = "WakeupManager"
     private val sampleRate = 16000
+
     private val kws: KeywordSpotter
     private var stream: OnlineStream? = null
 
-    /** ⭐ Just-woke-up flag, used to discard the wake-word audio */
+    /** ⭐ Wakeup flag (can only be consumed once) */
     private var justWokeUp = false
 
     init {
@@ -29,15 +29,16 @@
         )
         kws = KeywordSpotter(assetManager, config)
+        Log.d(TAG, "✅ KeywordSpotter initialized")
+
         stream = kws.createStream()
-            ?: error("Failed to create KWS stream")
+        require(stream != null) { "Failed to create KWS stream" }
+        Log.d(TAG, "✅ KWS stream created")
     }
 
-
-    /** ⭐ Xiao AI-style strategy: always feed, whether playing or recording */
+    /** ⭐ Always feed the KWS */
     fun acceptAudio(samples: FloatArray) {
         val s = stream ?: return
-        // ⭐ Far-field / playback gain compensation (critical)
         for (i in samples.indices) {
             samples[i] *= 2.5f
         }
@@ -47,15 +48,15 @@
             kws.decode(s)
             val keyword = kws.getResult(s).keyword
             if (keyword.isNotBlank()) {
+                Log.d(TAG, "🔥 KWS hit: $keyword")
                 justWokeUp = true
-                onWakeup()
-                kws.reset(s) // reset immediately to start a new round
+                kws.reset(s)
                 break
             }
         }
     }
 
-    /** Consumed once by the VAD */
+    /** ⭐ The only wakeup exit */
     fun consumeWakeupFlag(): Boolean {
         val r = justWokeUp
         justWokeUp = false
diff --git a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt
index 3c3bb02..23d3316 100644
--- a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt
+++ b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt
@@ -77,7 +77,7 @@ class MainActivity : BaseViewModelActivity()
     private var voiceController: VoiceController? = null
     private var audioRecord: AudioRecord? = null
     private var isRecording = false
-    private val audioSource = MediaRecorder.AudioSource.VOICE_RECOGNITION
+    private val audioSource = MediaRecorder.AudioSource.VOICE_COMMUNICATION
     private val sampleRateInHz = 16000
     private val channelConfig = AudioFormat.CHANNEL_IN_MONO
     private val audioFormat = AudioFormat.ENCODING_PCM_16BIT
@@ -169,17 +169,17 @@
             },
             onFinalAudio = { audio ->
                 Log.d("lrs", "Speech detected, length=${audio.size}")
-                mViewModel?.uploadVoice(
-                    AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)),
-                    1
-                )
-//                loadLocalJsonAndPlay()
-//                val file = File(
-//                    getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
-//                    "xxx.wav"
+//                mViewModel?.uploadVoice(
+//                    AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)),
+//                    1
 //                )
-//                AudioDebugUtil.saveFloatPcmAsWav(audio, file)
-//                LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
+                loadLocalJsonAndPlay()
+                val file = File(
+                    getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
+                    "xxx.wav"
+                )
+                AudioDebugUtil.saveFloatPcmAsWav(audio, file)
+                LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
             },
             onStateChanged = { state ->
@@ -261,7 +261,7 @@
         if (audioRecord?.state != AudioRecord.STATE_INITIALIZED) {
             Log.e("VoiceService", "Failed to initialize AudioRecord")
         }
-        enableSystemAec(audioRecord!!)
+//        enableSystemAec(audioRecord!!)
     }
 
     private var aec: AcousticEchoCanceler? = null