diff --git a/app/src/main/AndroidManifest.xml b/app/src/main/AndroidManifest.xml index adea95b..d64e047 100644 --- a/app/src/main/AndroidManifest.xml +++ b/app/src/main/AndroidManifest.xml @@ -52,7 +52,7 @@ tools:targetApi="31"> @@ -66,10 +66,9 @@ - diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt index 504ea65..72f0021 100644 --- a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt @@ -27,11 +27,6 @@ class VadManager( /** 喂入音频帧 (16kHz PCM float) */ fun accept(samples: FloatArray) { - // 放大音量,提高灵敏度 - for (i in samples.indices) { - samples[i] *= 2.5f - } - vad.acceptWaveform(samples) val speechDetected = vad.isSpeechDetected() diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt index 1ae5caf..8c731f9 100644 --- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt @@ -13,7 +13,6 @@ class VoiceController( private val onStateChanged: ((VoiceState) -> Unit)? = null, private val stopBackendAudio: (() -> Unit)? 
= null ) { - private val TAG = "VoiceController" private val sampleRate = 16000 @@ -29,8 +28,10 @@ class VoiceController( private val wakeupManager = WakeupManager(assetManager) { Log.d(TAG, "🔥 WakeWord detected") stopBackendAudio?.invoke() - resetAll() - state = VoiceState.PLAYING_PROMPT + if (state != VoiceState.UPLOADING) { // 上传中不重置 + resetAll() + state = VoiceState.PLAYING_PROMPT + } onWakeup() } @@ -38,27 +39,29 @@ class VoiceController( private val vadManager = VadManager( assetManager, onSpeechStart = { onVadStart() }, - onSpeechEnd = { /* ❌ 不再用于结束 */ } + onSpeechEnd = { /* 不再用于结束 */ } ) /* ================= 音频缓存 ================= */ private val audioBuffer = mutableListOf() private val preBuffer = ArrayDeque() - private val PRE_BUFFER_SIZE = sampleRate / 2 // 500ms + private val PRE_BUFFER_SIZE = sampleRate // 1 秒预缓冲 /* ================= 时间 ================= */ private var idleTimer = 0L private var recordingStartTime = 0L private var vadStarted = false - /* ================= RMS 结束判定 ================= */ + /* ================= RMS 静音判定 ================= */ private var silenceStartMs = 0L private val SILENCE_END_MS = 1200L // 静音多久算一句结束 - private val RMS_SILENCE_THRESHOLD = 0.01f // 静音能量阈值 - private val MIN_SPEECH_DURATION_MS = 800L // 最短有效语音 + private val RMS_SILENCE_THRESHOLD = 0.005f // 更灵敏 + private val MIN_SPEECH_DURATION_MS = 300L // 最短有效语音 + private val MIN_SPEECH_RATIO = 0.15f // 有效帧占比至少 15% /* ================= 音频入口 ================= */ fun acceptAudio(samples: FloatArray) { + // 唤醒独立处理,始终喂 wakeupManager.acceptAudio(samples) if (state == VoiceState.UPLOADING || @@ -87,14 +90,11 @@ class VoiceController( return } - // 2️⃣ RMS 静音结束(核心) + // 2️⃣ RMS 静音结束判定 val rms = calcRms(samples) -// Log.d(TAG, "RMS_DEBUG", "rms=${"%.4f".format(rms)}") - if (rms < RMS_SILENCE_THRESHOLD) { - if (silenceStartMs == 0L) { - silenceStartMs = now - } else if (now - silenceStartMs >= SILENCE_END_MS) { + if (silenceStartMs == 0L) silenceStartMs = now + else if (now - 
silenceStartMs >= SILENCE_END_MS) { Log.d(TAG, "🔇 RMS silence end") finishSentence() } @@ -123,7 +123,17 @@ class VoiceController( if (!vadStarted || speakTime < MIN_SPEECH_DURATION_MS) { Log.d(TAG, "⛔ Speech too short, ignore") - resetToWaitSpeech() + resetToWaitSpeech(refreshIdle = false) + return + } + + val rmsFrames = calcRmsFrames(audioBuffer.toFloatArray(), frameSize = 320) + val validFrames = rmsFrames.count { it >= RMS_SILENCE_THRESHOLD } + val ratio = if (rmsFrames.isEmpty()) 0f else validFrames.toFloat() / rmsFrames.size + Log.d(TAG, "RMS ratio=$ratio") + if (ratio < MIN_SPEECH_RATIO) { + Log.d(TAG, "❌ Not enough human voice (ratio=$ratio)") + resetToWaitSpeech(refreshIdle = false) return } @@ -136,47 +146,38 @@ class VoiceController( } /* ================= 播放回调 ================= */ - fun onPlayStartPrompt() { - state = VoiceState.PLAYING_PROMPT - } - - fun onPlayEndPrompt() { - state = VoiceState.WAIT_SPEECH - idleTimer = System.currentTimeMillis() - } - - fun onPlayStartBackend() { - state = VoiceState.PLAYING_BACKEND - } - - fun onPlayEndBackend() { - state = VoiceState.WAIT_SPEECH - idleTimer = System.currentTimeMillis() - } + fun onPlayStartPrompt() { state = VoiceState.PLAYING_PROMPT } + fun onPlayEndPrompt() { state = VoiceState.WAIT_SPEECH; idleTimer = System.currentTimeMillis() } + fun onPlayStartBackend() { state = VoiceState.PLAYING_BACKEND } + fun onPlayEndBackend() { state = VoiceState.WAIT_SPEECH; idleTimer = System.currentTimeMillis() } /* ================= 上传回调 ================= */ fun onUploadFinished(success: Boolean) { if (state != VoiceState.UPLOADING) return state = if (success) VoiceState.PLAYING_BACKEND else VoiceState.WAIT_SPEECH + idleTimer = System.currentTimeMillis() } /* ================= Idle ================= */ fun checkIdleTimeout() { + // 上传中不计时 if (state != VoiceState.WAIT_SPEECH) return - if (System.currentTimeMillis() - idleTimer > idleTimeoutSeconds * 1000) { + val now = System.currentTimeMillis() + if (now - 
idleTimer > idleTimeoutSeconds * 1000) { + Log.d(TAG, "⏱ Idle timeout reached, resetAll") resetAll() } } /* ================= Reset ================= */ - private fun resetToWaitSpeech() { + private fun resetToWaitSpeech(refreshIdle: Boolean = true) { audioBuffer.clear() preBuffer.clear() vadManager.reset() vadStarted = false silenceStartMs = 0L state = VoiceState.WAIT_SPEECH - idleTimer = System.currentTimeMillis() + if (refreshIdle) idleTimer = System.currentTimeMillis() } private fun resetAll() { @@ -197,17 +198,28 @@ class VoiceController( private fun cachePreBuffer(samples: FloatArray) { for (s in samples) { preBuffer.addLast(s) - if (preBuffer.size > PRE_BUFFER_SIZE) { - preBuffer.removeFirst() - } + if (preBuffer.size > PRE_BUFFER_SIZE) preBuffer.removeFirst() } } private fun calcRms(audio: FloatArray): Float { + if (audio.isEmpty()) return 0f var sum = 0f - for (v in audio) { - sum += v * v - } + for (v in audio) sum += v * v return sqrt(sum / audio.size) } + + private fun calcRmsFrames(audio: FloatArray, frameSize: Int = 320): FloatArray { + val rmsList = mutableListOf() + var i = 0 + while (i < audio.size) { + val end = minOf(i + frameSize, audio.size) + val frame = audio.sliceArray(i until end) + var sum = 0f + for (v in frame) sum += v * v + rmsList.add(sqrt(sum / frame.size)) + i += frameSize + } + return rmsList.toFloatArray() + } } diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/WakeupManager.kt b/app/src/main/java/com/zs/smarthuman/sherpa/WakeupManager.kt index 7454059..c7fcbce 100644 --- a/app/src/main/java/com/zs/smarthuman/sherpa/WakeupManager.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/WakeupManager.kt @@ -38,9 +38,9 @@ class WakeupManager( fun acceptAudio(samples: FloatArray) { val s = stream ?: return // ⭐ 远讲 / 播放补偿(非常关键) -// for (i in samples.indices) { -// samples[i] *= 2.5f -// } + for (i in samples.indices) { + samples[i] *= 2.5f + } s.acceptWaveform(samples, sampleRate) while (kws.isReady(s)) { diff --git 
a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt index 18d2a39..3c3bb02 100644 --- a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt +++ b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt @@ -52,6 +52,7 @@ import com.zs.smarthuman.utils.ViewSlideAnimator import com.zs.smarthuman.viewmodel.MainViewModel import kotlinx.coroutines.CoroutineScope import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.Job import kotlinx.coroutines.SupervisorJob import kotlinx.coroutines.delay import kotlinx.coroutines.launch @@ -136,10 +137,12 @@ class MainActivity : BaseViewModelActivity() when (it) { is ApiResult.Error -> { Toaster.showShort("上传失败") + voiceController?.onUploadFinished(false) } is ApiResult.Success<*> -> { Toaster.showShort("上传成功") + voiceController?.onUploadFinished(true) } } } @@ -166,17 +169,17 @@ class MainActivity : BaseViewModelActivity() }, onFinalAudio = { audio -> Log.d("lrs", "检测到语音,长度=${audio.size}") -// mViewModel?.uploadVoice( -// AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)), -// 1 -// ) - loadLocalJsonAndPlay() - val file = File( - getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(), - "xxx.wav" + mViewModel?.uploadVoice( + AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)), + 1 ) - AudioDebugUtil.saveFloatPcmAsWav(audio, file) - LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}") +// loadLocalJsonAndPlay() +// val file = File( +// getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(), +// "xxx.wav" +// ) +// AudioDebugUtil.saveFloatPcmAsWav(audio, file) +// LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}") }, onStateChanged = { state -> @@ -204,8 +207,9 @@ class MainActivity : BaseViewModelActivity() when (msg.msgContentType) { MessageContentType.RECEIVE_VOICE_STREAM.msgContentType -> { lifecycleScope.launch(Dispatchers.IO) { - 
UnityPlayerHolder.getInstance() - .startTalking(msg.content) +// UnityPlayerHolder.getInstance() +// .startTalking(msg.content) + loadLocalJsonAndPlay() } } } @@ -315,6 +319,14 @@ class MainActivity : BaseViewModelActivity() private var promptPlaying = false private var backPlaying = false + private var promptTimeoutJob: Job? = null + private val PROMPT_PLAY_TIMEOUT_MS = 3000L // 3 seconds + + + private var backTimeoutJob: Job? = null + private val BACK_PLAY_TIMEOUT_MS = 3000L // 3 seconds + + fun onAudioProgressUpdated( // Unity 调用此方法传递音频进度 progress: Float, state: Int,//0stop 2pause 1play 3complete 4loading 5error @@ -331,6 +343,13 @@ class MainActivity : BaseViewModelActivity() if (!promptPlaying) { promptPlaying = true voiceController?.onPlayStartPrompt() + + promptTimeoutJob = lifecycleScope.launch { + delay(PROMPT_PLAY_TIMEOUT_MS) + promptPlaying = false + voiceController?.onPlayEndPrompt() + promptTimeoutJob?.cancel() + } } } @@ -338,6 +357,7 @@ class MainActivity : BaseViewModelActivity() if (promptPlaying) { promptPlaying = false voiceController?.onPlayEndPrompt() + promptTimeoutJob?.cancel() } } } @@ -360,7 +380,6 @@ class MainActivity : BaseViewModelActivity() 3 -> { // complete if (backPlaying) { - Toaster.showShort("借宿了") backPlaying = false voiceController?.onPlayEndBackend() }