diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt index 8ce92f1..18d22aa 100644 --- a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt @@ -3,26 +3,25 @@ package com.zs.smarthuman.sherpa import android.content.res.AssetManager import com.k2fsa.sherpa.onnx.Vad import com.k2fsa.sherpa.onnx.getVadModelConfig +import kotlin.math.sqrt class VadManager( assetManager: AssetManager, private val onSpeechStart: () -> Unit, - private val onSpeechEnd: () -> Unit + private val onSpeechEnd: (Float, Float) -> Unit // avgEnergy, peakRms ) { private val vad: Vad - private var isSpeaking = false private var lastSpeechTime = 0L - /** ⭐ 仅统计“有效语音段” */ + /** 有效语音统计 */ private var activeFrameCount = 0 private var activeSpeechFrameCount = 0 + private var speechEnergySum = 0f + private var speechFrameCount = 0 + private var peakRms = 0f - /** ⭐ 用于调试(可选) */ - private var rawFrameCount = 0 - private var rawSpeechFrameCount = 0 - - private val END_SILENCE_MS = 600L + private val END_SILENCE_MS = 800L init { val config = getVadModelConfig(0) @@ -30,64 +29,64 @@ class VadManager( vad = Vad(assetManager, config) } + /** 外部调用的音频输入 */ fun accept(samples: FloatArray) { val now = System.currentTimeMillis() vad.acceptWaveform(samples) val hasSpeech = vad.isSpeechDetected() - /* ===== raw 统计(仅日志) ===== */ - rawFrameCount++ - if (hasSpeech) rawSpeechFrameCount++ - + val rms = calcRms(samples) if (hasSpeech) { lastSpeechTime = now - if (!isSpeaking) { isSpeaking = true + resetStats() onSpeechStart() } + // 累计有效语音能量和峰值 + speechEnergySum += rms + speechFrameCount++ + if (rms > peakRms) peakRms = rms + activeFrameCount++ activeSpeechFrameCount++ } else { - if (isSpeaking) { - activeFrameCount++ - - if (now - lastSpeechTime >= END_SILENCE_MS) { - isSpeaking = false - onSpeechEnd() - } + if (isSpeaking) activeFrameCount++ + // 检查结束 + if (isSpeaking && now - lastSpeechTime >= END_SILENCE_MS) { + isSpeaking = false + val avgEnergy = if (speechFrameCount > 0) speechEnergySum / speechFrameCount else 0f + onSpeechEnd(avgEnergy, peakRms) } } } - /** - * ✅ 真正用于判断「是不是有效人声」 - * 只统计 VAD 激活期间 - */ + /** 统计有效语音比例,用于 VoiceController */ fun activeSpeechRatio(): Float { if (activeFrameCount == 0) return 0f return activeSpeechFrameCount.toFloat() / activeFrameCount } - /** - * ⚠️ 仅用于调参观察 - */ - fun rawSpeechRatio(): Float { - if (rawFrameCount == 0) return 0f - return rawSpeechFrameCount.toFloat() / rawFrameCount - } - fun reset() { isSpeaking = false lastSpeechTime = 0L - activeFrameCount = 0 activeSpeechFrameCount = 0 - rawFrameCount = 0 - rawSpeechFrameCount = 0 - + resetStats() vad.reset() } + + private fun resetStats() { + speechEnergySum = 0f + speechFrameCount = 0 + peakRms = 0f + } + + private fun calcRms(audio: FloatArray): Float { + var sum = 0f + for (v in audio) sum += v * v + return sqrt(sum / audio.size) + } } diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt index 108d6a3..c845d26 100644 --- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt @@ -2,7 +2,6 @@ package com.zs.smarthuman.sherpa import android.content.res.AssetManager import android.util.Log -import kotlin.math.sqrt import java.util.ArrayDeque class VoiceController( @@ -18,8 +17,8 @@ class VoiceController( private val TAG = "VoiceController" private val sampleRate = 16000 - private var state: VoiceState = VoiceState.WAIT_WAKEUP - set(value) { + var state: VoiceState = VoiceState.WAIT_WAKEUP + private set(value) { field = value Log.d(TAG, "➡ State = $value") onStateChanged?.invoke(value) @@ -30,41 +29,31 @@ class VoiceController( handleWakeupEvent() } - private val vadManager = VadManager( - assetManager, - onSpeechStart = { onVadStart() }, - onSpeechEnd = {} - ) - private val audioBuffer = mutableListOf() private val preBuffer = ArrayDeque() private val PRE_BUFFER_SIZE = sampleRate * 2 private var recordingStartMs = 0L - private var silenceStartMs = 0L private var waitSpeechFailStartMs = 0L private var waitSpeechStartMs = 0L - private var speechEnergySum = 0f - private var speechFrameCount = 0 - private var vadStarted = false - private var inKwsObserve = false private var kwsObserveStartMs = 0L private val KWS_OBSERVE_MS = 500L - private var speechEnableAtMs = 0L private val SPEECH_COOLDOWN_MS = 300L - private val RMS_SILENCE_THRESHOLD = 0.012f - private val SILENCE_END_MS = 1200L - private val MIN_SPEECH_MS = 1000L - private val MIN_AVG_ENERGY = 0.02f + private val MIN_SPEECH_MS = 800L + private val idleTimeoutMs = idleTimeoutSeconds * 1000L + private val maxRecordingMs = maxRecordingSeconds * 1000L + private val vadManager = VadManager(assetManager, + onSpeechStart = { onVadStart() }, + onSpeechEnd = { avgEnergy, peakRms -> onVadEnd(avgEnergy, peakRms) } + ) /* ================= 音频入口 ================= */ - fun acceptAudio(samples: FloatArray) { cachePreBuffer(samples) wakeupManager.acceptAudio(samples) @@ -83,7 +72,7 @@ class VoiceController( VoiceState.WAIT_SPEECH_COOLDOWN -> { if (now >= speechEnableAtMs) { - waitSpeechFailStartMs = System.currentTimeMillis() + waitSpeechFailStartMs = now state = VoiceState.WAIT_SPEECH waitSpeechStartMs = now } @@ -91,14 +80,8 @@ class VoiceController( } VoiceState.WAIT_SPEECH -> { - - if (waitSpeechStartMs > 0 && now - waitSpeechStartMs >= idleTimeoutSeconds * 1000) { - Log.d(TAG, "⏱ Wakeup but no speech → WAIT_WAKEUP") - resetAll() - return - } - - if (waitSpeechFailStartMs > 0 && now - waitSpeechFailStartMs >= idleTimeoutSeconds * 1000) { + if ((waitSpeechStartMs > 0 && now - waitSpeechStartMs >= idleTimeoutMs) || + (waitSpeechFailStartMs > 0 && now - waitSpeechFailStartMs >= idleTimeoutMs)) { Log.d(TAG, "⏱ WAIT_SPEECH idle timeout → WAIT_WAKEUP") resetAll() return @@ -114,30 +97,15 @@ class VoiceController( audioBuffer.addAll(samples.asList()) vadManager.accept(samples) - val rms = calcRms(samples) - if (rms > RMS_SILENCE_THRESHOLD) { - speechEnergySum += rms - speechFrameCount++ - silenceStartMs = 0L - } else { - if (silenceStartMs == 0L) silenceStartMs = now - else if (now - silenceStartMs >= SILENCE_END_MS) { - Log.d(TAG, "🔇 Silence end") - finishSentence() - return - } - } - - if (now - recordingStartMs > maxRecordingSeconds * 1000) { + if (System.currentTimeMillis() - recordingStartMs > maxRecordingMs) { Log.w(TAG, "⏱ Max recording reached") - finishSentence() + finishSentence() // 超时也触发 finish } } } } /* ================= 唤醒 ================= */ - private fun handleWakeupEvent() { when (state) { VoiceState.UPLOADING -> return @@ -157,9 +125,7 @@ class VoiceController( if (interrupt) { audioBuffer.clear() vadManager.reset() - resetEnergyStat() vadStarted = false - silenceStartMs = 0L } inKwsObserve = true @@ -169,58 +135,45 @@ class VoiceController( private fun onVadStart() { if (state != VoiceState.WAIT_SPEECH) return - Log.d(TAG, "🎤 REAL VAD START") vadStarted = true recordingStartMs = System.currentTimeMillis() - silenceStartMs = 0L - resetEnergyStat() - audioBuffer.clear() audioBuffer.addAll(preBuffer) state = VoiceState.RECORDING } - /* ================= 结束录音 ================= */ + private fun onVadEnd(avgEnergy: Float, peakRms: Float) { + if (state != VoiceState.RECORDING) return + Log.d(TAG, "🧠 VAD END") + finishSentence(avgEnergy, peakRms) + } - private fun finishSentence() { + /* ================= 结束录音 ================= */ + private fun finishSentence(avgEnergy: Float = 0f, peakRms: Float = 0f) { val now = System.currentTimeMillis() val duration = now - recordingStartMs if (!vadStarted || duration < MIN_SPEECH_MS) { - Log.d(TAG, "❌ Too short or no VAD start: ${duration}ms") + Log.d(TAG, "❌ Too short or no VAD start: $duration ms") resetToWaitSpeech() return } val audio = audioBuffer.toFloatArray() val vadRatio = vadManager.activeSpeechRatio() - val avgEnergy = if (speechFrameCount > 0) speechEnergySum / speechFrameCount else 0f - val peakRms = calcPeakRms(audio) - - val peakAvgRatio = if (avgEnergy > 0f) peakRms / avgEnergy else 0f - if (avgEnergy < MIN_AVG_ENERGY) { - Log.d(TAG, "❌ Avg energy too low: $avgEnergy → rejected") + Log.d(TAG, "📊 Finish Sentence - duration: $duration ms, vadEnded: true") + Log.d(TAG, "📊 vadRatio=$vadRatio, avgEnergy=$avgEnergy, peakRms=$peakRms, peakAvgRatio=$peakAvgRatio") + + if (avgEnergy < 0.02f || peakAvgRatio < 1.2f || vadRatio < 0.4f) { + Log.d(TAG, "❌ Sentence rejected") resetToWaitSpeech() return } - if (peakAvgRatio < 1.2f) { - Log.d(TAG, "❌ Peak/Avg ratio too low: $peakAvgRatio → rejected") - resetToWaitSpeech() - return - } - - - if (vadRatio < 0.40f) { - Log.d(TAG, "❌ VAD ratio too low: $vadRatio → rejected") - resetToWaitSpeech() - return - } - - // 原评分逻辑 + // 评分逻辑 var score = 0 when { duration >= 4000 -> score += 3 @@ -230,64 +183,40 @@ class VoiceController( when { avgEnergy >= 0.10f -> score += 3 avgEnergy >= 0.06f -> score += 2 - avgEnergy >= MIN_AVG_ENERGY -> score += 1 + avgEnergy >= 0.02f -> score += 1 } when { vadRatio >= 0.55f -> score += 2 vadRatio >= 0.40f -> score += 1 } + Log.d(TAG, "📊 duration=$duration ms, vadRatio=$vadRatio, avgEnergy=$avgEnergy, score=$score") - Log.d(TAG, "📊 duration=$duration ms, vadRatio=$vadRatio, avgEnergy=$avgEnergy, peakRms=$peakRms, score=$score") - - val pass = when { - score >= 6 -> true - score == 3 && avgEnergy >= 0.06f -> true - else -> false - } - + val pass = score >= 5 || (score == 3 && avgEnergy >= 0.06f) if (!pass) { Log.d(TAG, "❌ Sentence rejected (score=$score)") resetToWaitSpeech() return } - // ✅ 通过 → 上传 - waitSpeechFailStartMs = 0L audioBuffer.clear() state = VoiceState.UPLOADING onFinalAudio(audio) } - - - - /** 计算音频帧峰值 */ - private fun calcPeakRms(audio: FloatArray): Float { - var peak = 0f - for (v in audio) { - val abs = kotlin.math.abs(v) - if (abs > peak) peak = abs - } - return peak - } - - /* ================= 播放回调 ================= */ - - fun onPlayStartPrompt() { state = VoiceState.PLAYING_PROMPT } // ⭐ 补全 + fun onPlayStartPrompt() { state = VoiceState.PLAYING_PROMPT } fun onPlayEndPrompt() { speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS state = VoiceState.WAIT_SPEECH_COOLDOWN } - fun onPlayStartBackend() { state = VoiceState.PLAYING_BACKEND } // ⭐ 补全 + fun onPlayStartBackend() { state = VoiceState.PLAYING_BACKEND } fun onPlayEndBackend() { speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS state = VoiceState.WAIT_SPEECH_COOLDOWN } /* ================= 上传回调 ================= */ - fun onUploadFinished(success: Boolean) { if (state != VoiceState.UPLOADING) return state = if (success) VoiceState.PLAYING_BACKEND @@ -297,17 +226,12 @@ class VoiceController( } } - /* ================= Reset ================= */ - private fun resetToWaitSpeech() { audioBuffer.clear() vadManager.reset() - resetEnergyStat() vadStarted = false - silenceStartMs = 0L state = VoiceState.WAIT_SPEECH - if (waitSpeechFailStartMs == 0L) waitSpeechFailStartMs = System.currentTimeMillis() } @@ -315,9 +239,7 @@ class VoiceController( audioBuffer.clear() preBuffer.clear() vadManager.reset() - resetEnergyStat() vadStarted = false - silenceStartMs = 0L waitSpeechStartMs = 0L waitSpeechFailStartMs = 0L state = VoiceState.WAIT_WAKEUP @@ -329,22 +251,10 @@ class VoiceController( } /* ================= Utils ================= */ - - private fun resetEnergyStat() { - speechEnergySum = 0f - speechFrameCount = 0 - } - private fun cachePreBuffer(samples: FloatArray) { for (s in samples) { preBuffer.addLast(s) if (preBuffer.size > PRE_BUFFER_SIZE) preBuffer.removeFirst() } } - - private fun calcRms(audio: FloatArray): Float { - var sum = 0f - for (v in audio) sum += v * v - return sqrt(sum / audio.size) - } } diff --git a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt index 565ad58..5be2903 100644 --- a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt +++ b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt @@ -136,12 +136,12 @@ class MainActivity : BaseViewModelActivity() mViewModel?.uploadVoiceLiveData?.observe(this) { when (it) { is ApiResult.Error -> { - Toaster.showShort("上传失败") +// Toaster.showShort("上传失败") voiceController?.onUploadFinished(false) } is ApiResult.Success<*> -> { - Toaster.showShort("上传成功") +// Toaster.showShort("上传成功") voiceController?.onUploadFinished(true) } } @@ -185,7 +185,6 @@ class MainActivity : BaseViewModelActivity() onStateChanged = { state -> when (state) { VoiceState.WAIT_WAKEUP -> { - Log.d("lrs", "当前状态: 等待唤醒") lifecycleScope.launch(Dispatchers.Main) { mVerticalAnimator?.hide() UnityPlayerHolder.getInstance() @@ -201,10 +200,18 @@ class MainActivity : BaseViewModelActivity() } } - VoiceState.WAIT_SPEECH -> Log.d("lrs", "当前状态: 唤醒成功,等待说话") - VoiceState.RECORDING -> Log.d("lrs", "当前状态: 正在录音") - VoiceState.PLAYING_PROMPT -> Log.d("lrs", "当前状态: 播放本地音频") - VoiceState.PLAYING_BACKEND -> Log.d("lrs", "当前状态: 播放后台音频") + VoiceState.WAIT_SPEECH -> { + + } + VoiceState.RECORDING -> { + startRecording() + } + VoiceState.PLAYING_PROMPT ->{} + VoiceState.PLAYING_BACKEND ->{} + VoiceState.UPLOADING -> {} + + VoiceState.WAIT_SPEECH_COOLDOWN -> {} + else -> {} } }, @@ -248,15 +255,18 @@ class MainActivity : BaseViewModelActivity() override fun onPause() { super.onPause() + stopRecording() UnityPlayerHolder.getInstance().pause() } @SuppressLint("MissingPermission") - private fun initMicrophone() { - val numBytes = AudioRecord.getMinBufferSize(sampleRateInHz, channelConfig, audioFormat) - if (numBytes == AudioRecord.ERROR || numBytes == AudioRecord.ERROR_BAD_VALUE) { - Log.e("VoiceService", "Failed to initialize microphone: Invalid buffer size") - return + private fun initMicrophone(): Boolean { + val minBuf = AudioRecord.getMinBufferSize( + sampleRateInHz, channelConfig, audioFormat + ) + if (minBuf <= 0) { + Log.e("VoiceService", "Invalid min buffer size") + return false } audioRecord = AudioRecord( @@ -264,15 +274,20 @@ class MainActivity : BaseViewModelActivity() sampleRateInHz, channelConfig, audioFormat, - numBytes * 2 // 设置更大的缓冲区以防止丢失数据 + minBuf * 2 ) if (audioRecord?.state != AudioRecord.STATE_INITIALIZED) { - Log.e("VoiceService", "Failed to initialize AudioRecord") + Log.e("VoiceService", "AudioRecord init failed") + audioRecord?.release() + audioRecord = null + return false } -// enableSystemAec(audioRecord!!) + enableSystemAec(audioRecord!!) + return true } + private var aec: AcousticEchoCanceler? = null private fun enableSystemAec(record: AudioRecord) { @@ -290,23 +305,50 @@ class MainActivity : BaseViewModelActivity() //开始录音 fun startRecording() { + if (isRecording) return + + if (audioRecord == null) { + if (!initMicrophone()) { + Log.e("VoiceService", "startRecording: init failed") + return + } + } + + try { + audioRecord?.startRecording() + } catch (e: IllegalStateException) { + Log.e("VoiceService", "startRecording failed, recreate", e) + recreateAudioRecord() + return + } + isRecording = true - audioRecord?.startRecording() lifecycleScope.launch(Dispatchers.IO) { val buf = ShortArray(512) while (isRecording) { - val n = audioRecord?.read(buf, 0, buf.size) ?: 0 + val n = audioRecord?.read(buf, 0, buf.size) ?: break if (n > 0) { val raw = FloatArray(n) { buf[it] / 32768f } - - voiceController?.acceptAudio(raw) } } } } + + private fun recreateAudioRecord() { + stopRecording() + try { + audioRecord?.release() + } catch (_: Exception) { + } + + audioRecord = null + initMicrophone() + } + + //停止录音 fun stopRecording() { isRecording = false @@ -342,7 +384,8 @@ class MainActivity : BaseViewModelActivity() word: String, audioUrl: String ) { - val wakeupUrl = /*UserInfoManager.userInfo?.wakeUpAudioUrl ?: */"https://static.seerteach.net/aidialogue/systemVoice/aliyun-nv.mp3" ?: return + val wakeupUrl = /*UserInfoManager.userInfo?.wakeUpAudioUrl ?: */ + "https://static.seerteach.net/aidialogue/systemVoice/aliyun-nv.mp3" ?: return if (audioUrl != wakeupUrl) return diff --git a/app/src/main/java/com/zs/smarthuman/utils/PcmAudioWithAecManager.kt b/app/src/main/java/com/zs/smarthuman/utils/PcmAudioWithAecManager.kt index 14714ed..55e96ec 100644 --- a/app/src/main/java/com/zs/smarthuman/utils/PcmAudioWithAecManager.kt +++ b/app/src/main/java/com/zs/smarthuman/utils/PcmAudioWithAecManager.kt @@ -90,7 +90,7 @@ class PcmAudioWithAecManager( // 录音数据处理 processCapture(buffer.copyOf(read)) } - voiceController.checkIdleTimeout() +// voiceController.checkIdleTimeout() } } }