diff --git a/app/build.gradle b/app/build.gradle
index 27f3a8a..c07dfcf 100644
--- a/app/build.gradle
+++ b/app/build.gradle
@@ -181,5 +181,6 @@ dependencies {
     implementation libs.androidautosize
     implementation files('libs/sherpa-onnx-1.12.20.aar')
+    implementation 'com.github.yyued:SVGAPlayer-Android:2.6.1'
 }
\ No newline at end of file
diff --git a/app/src/main/AndroidManifest.xml b/app/src/main/AndroidManifest.xml
index d64e047..560b309 100644
--- a/app/src/main/AndroidManifest.xml
+++ b/app/src/main/AndroidManifest.xml
@@ -39,7 +39,7 @@
     Unit,
     private val onSpeechEnd: () -> Unit
 ) {
+    private val TAG = "VadManager"
     private val vad: Vad
     private var isSpeaking = false
-    private var lastSpeechTime = 0L
+    private var lastSpeechMs = 0L

-    private val ACTIVE_END_SILENCE_MS = 1500L
-    private val ACTIVE_CONSECUTIVE_FRAMES = 10
-    private val FINAL_END_SILENCE_MS = 800L
-    private val FINAL_CONSECUTIVE_FRAMES = 5
-    private val FINAL_PHASE_TRIGGER_MS = 1000L
-    private val MAX_SILENCE_AFTER_SPEECH_MS = 2000L
-
-    private val MIN_EFFECTIVE_SPEECH_RMS = 0.001f
-
-    private var consecutiveSilenceFrames = 0
-    private var isInFinalPhase = false
-    private var lastEffectiveSpeechTime = 0L
+    /** End speech more decisively */
+    private val END_SILENCE_MS = 350L
+    private val MIN_RMS = 0.002f

     init {
-        val config = getVadModelConfig(0)
-            ?: throw IllegalStateException("[$TAG] VAD config not found")
-        vad = Vad(assetManager, VadModelConfig(sileroVadModelConfig = SileroVadModelConfig(model = "silero_vad.onnx", threshold = 0.2f)))
-        LogUtils.i(TAG, "✅ VAD initialized successfully")
+        vad = Vad(
+            assetManager,
+            VadModelConfig(
+                sileroVadModelConfig = SileroVadModelConfig(
+                    model = "silero_vad.onnx",
+                    threshold = 0.5F,
+                    minSilenceDuration = 0.25F,
+                    minSpeechDuration = 0.25F,
+                    windowSize = 512,
+                ),
+                sampleRate = 16000,
+                numThreads = 1,
+                provider = "cpu"
+            )
+        )
+        LogUtils.i(TAG, "✅ VAD init")
     }

     fun accept(samples: FloatArray) {
         val now = System.currentTimeMillis()
         vad.acceptWaveform(samples)
-        val vadHasSpeech = vad.isSpeechDetected()
+        val hasSpeech = vad.isSpeechDetected()
         val rms = calcRms(samples)

-        val isEffectiveSpeech = vadHasSpeech && rms >= MIN_EFFECTIVE_SPEECH_RMS
-
-        if (isEffectiveSpeech) {
-            lastEffectiveSpeechTime = now
-            isInFinalPhase = false
-            lastSpeechTime = now
-            consecutiveSilenceFrames = 0
-        } else {
-            consecutiveSilenceFrames++
-            if (now - lastEffectiveSpeechTime >= FINAL_PHASE_TRIGGER_MS) {
-                isInFinalPhase = true
-            }
-        }
-
-        val (endSilenceMs, endFrames) =
-            if (isInFinalPhase)
-                FINAL_END_SILENCE_MS to FINAL_CONSECUTIVE_FRAMES
-            else
-                ACTIVE_END_SILENCE_MS to ACTIVE_CONSECUTIVE_FRAMES
-
-        if (isEffectiveSpeech) {
+        if (hasSpeech && rms >= MIN_RMS) {
+            lastSpeechMs = now
             if (!isSpeaking) {
                 isSpeaking = true
                 onSpeechStart()
             }
-        } else if (isSpeaking) {
-            val silenceMs = now - lastSpeechTime
-            val effectiveSilenceMs = now - lastEffectiveSpeechTime
-
-            val shouldEnd =
-                (silenceMs >= endSilenceMs ||
-                    effectiveSilenceMs >= MAX_SILENCE_AFTER_SPEECH_MS) &&
-                consecutiveSilenceFrames >= endFrames
-
-            if (shouldEnd) {
-                onSpeechEnd()
-                reset()
-                isSpeaking = false
-                isInFinalPhase = false
-            }
+        } else if (isSpeaking && now - lastSpeechMs > END_SILENCE_MS) {
+            onSpeechEnd()
+            reset()
         }
     }

     fun reset() {
         isSpeaking = false
-        lastSpeechTime = 0L
-        lastEffectiveSpeechTime = 0L
-        consecutiveSilenceFrames = 0
-        isInFinalPhase = false
+        lastSpeechMs = 0
         vad.reset()
     }

-    fun calcRms(samples: FloatArray): Float {
+    private fun calcRms(samples: FloatArray): Float {
         var sum = 0f
         for (v in samples) sum += v * v
         return sqrt(sum / samples.size)
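The endpointing rule the rewritten VadManager converges on is small enough to sketch in isolation: a frame counts as speech only when the VAD flag and an RMS floor agree, and the utterance ends once no such frame has arrived for END_SILENCE_MS. The sketch below is a minimal, hypothetical stand-alone version of that state machine (the class name SimpleEndpointer and the boolean VAD input are illustrative; it does not call sherpa-onnx), with the constants mirroring the diff:

```kotlin
import kotlin.math.sqrt

// Hypothetical sketch of the RMS-gated endpointing above; not the project's VadManager.
class SimpleEndpointer(
    private val endSilenceMs: Long = 350L,   // mirrors END_SILENCE_MS
    private val minRms: Float = 0.002f,      // mirrors MIN_RMS
    private val onSpeechStart: () -> Unit,
    private val onSpeechEnd: () -> Unit
) {
    private var isSpeaking = false
    private var lastSpeechMs = 0L

    /** Feed one audio frame plus the VAD decision for that frame. */
    fun accept(samples: FloatArray, vadSaysSpeech: Boolean, nowMs: Long = System.currentTimeMillis()) {
        if (samples.isEmpty()) return
        val rms = sqrt(samples.fold(0f) { acc, v -> acc + v * v } / samples.size)
        if (vadSaysSpeech && rms >= minRms) {
            // Effective speech: remember the time and fire onSpeechStart once per utterance.
            lastSpeechMs = nowMs
            if (!isSpeaking) {
                isSpeaking = true
                onSpeechStart()
            }
        } else if (isSpeaking && nowMs - lastSpeechMs > endSilenceMs) {
            // Silence (or low-energy noise) has lasted long enough: close the utterance.
            isSpeaking = false
            onSpeechEnd()
        }
    }
}
```

With the 512-sample window configured above (32 ms at 16 kHz), roughly eleven consecutive sub-threshold frames are enough to end an utterance, which is what makes the new 350 ms cut-off feel "more decisive" than the old multi-phase logic.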
diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
index 88c4072..5b3ea1f 100644
--- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
@@ -30,11 +30,9 @@ class VoiceController(
         // Pre-buffer size (2 seconds)
         private const val PRE_BUFFER_SIZE = SAMPLE_RATE * 2

-        // Short-audio threshold
-        private const val SHORT_AUDIO_DURATION_MS = 1000L
         private const val INVALID_RESET_DEBOUNCE_MS = 1500L
         // Minimum speech duration
-        private const val MIN_SPEECH_MS = 800L
+        private const val MIN_SPEECH_MS = 600L

         // Unified speaker-verification threshold (no longer per-scenario)
         private const val SPEAKER_THRESHOLD = 0.45f
@@ -275,6 +273,9 @@ class VoiceController(
     fun onPlayEndPrompt() {
         speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
         LogUtils.d(TAG, "🎵 Prompt tone finished")
+        if (!preBuffer.isEmpty()) {
+            preBuffer.clear()
+        }
         state = VoiceState.WAIT_SPEECH_COOLDOWN
     }

@@ -321,7 +322,9 @@ class VoiceController(
     private fun resetAll() {
         LogUtils.d(TAG, "🔄 Resetting all state | timeout type this round: $currentTimeoutType")
         audioBuffer.clear()
-        preBuffer.clear()
+        if (!preBuffer.isEmpty()) {
+            preBuffer.clear()
+        }
         vadManager.reset()
         wakeupManager.reset()
         vadStarted = false
@@ -359,9 +362,15 @@ class VoiceController(
     }

     private fun cachePreBuffer(samples: FloatArray) {
+        // Return early for empty input to avoid a pointless loop
+        if (samples.isEmpty()) return
+
         for (s in samples) {
             preBuffer.addLast(s)
-            if (preBuffer.size > PRE_BUFFER_SIZE) preBuffer.removeFirst()
+            // Key fix: check the deque is non-empty before removing
+            if (preBuffer.size > PRE_BUFFER_SIZE && !preBuffer.isEmpty()) {
+                preBuffer.removeFirst()
+            }
         }
     }

@@ -371,7 +380,10 @@ class VoiceController(
             return false
         }

-        // 1. Trim the audio: keep only the valid part of this recording
+        // 1. Record the verification start time (key: measure processing cost)
+        val verifyStartMs = System.currentTimeMillis()
+
+        // 2. Original audio-trimming logic (kept)
         val audioDurationMs = (audio.size.toFloat() / SAMPLE_RATE * 1000).toLong()
         val validAudio = if (audioDurationMs > 0) {
             val validSampleCount = (audioDurationMs * SAMPLE_RATE / 1000).toInt()
@@ -386,21 +398,16 @@
         var stream: OnlineStream? = null

-        // Handle exceptions uniformly with runCatching
         return runCatching {
             stream = SpeakerRecognition.extractor.createStream()
+            stream.acceptWaveform(samples = validAudio, sampleRate = SAMPLE_RATE)
+            stream.inputFinished()

-            // Process the audio data
-            stream?.acceptWaveform(samples = validAudio, sampleRate = SAMPLE_RATE)
-            stream?.inputFinished()
-
-            // Check whether the stream is ready
-            if (stream == null || !SpeakerRecognition.extractor.isReady(stream)) {
+            if (!SpeakerRecognition.extractor.isReady(stream)) {
                 LogUtils.w(TAG, "❌ Audio stream not ready, verification failed")
                 return@runCatching false
             }

-            // Compute the embedding and verify
             val embedding = SpeakerRecognition.extractor.compute(stream)
             speakerManagerLock.withLock {
                 val verifyPass = SpeakerRecognition.manager.verify(
@@ -409,19 +416,20 @@
                     threshold = SPEAKER_THRESHOLD
                 )

-                LogUtils.d(TAG, "📊 Speaker verification | unified threshold: $SPEAKER_THRESHOLD | passed: $verifyPass | duration: ${(validAudio.size.toFloat()/SAMPLE_RATE*1000).toLong()}ms")
+                // 3. Compute the real processing cost (end time - start time)
+                val verifyCostMs = System.currentTimeMillis() - verifyStartMs
+                // Log distinguishes audio duration vs. processing cost
+                LogUtils.d(TAG, "📊 Speaker verification | unified threshold: $SPEAKER_THRESHOLD | passed: $verifyPass | audio duration: $audioDurationMs ms | processing cost: $verifyCostMs ms")
                 verifyPass
             }
         }.onFailure { e ->
-            // Handle all exception cases
             LogUtils.e(TAG, "❌ Speaker verification error, rejecting", e)
         }.also {
-            // Make sure the stream resource is released
             runCatching { stream?.release() }.onFailure { e ->
                 LogUtils.w(TAG, "⚠️ Failed to release stream resource", e)
             }
-        }.getOrDefault(false) // Default to false on exception
+        }.getOrDefault(false)
     }
 }
\ No newline at end of file
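The cachePreBuffer change is purely defensive: preBuffer is a FIFO capped at PRE_BUFFER_SIZE (two seconds at 16 kHz), so each appended sample may evict the oldest one. A minimal stand-alone version of that ring behaviour, with hypothetical names (PreBuffer, append, drain) rather than the project's actual class, looks like this:

```kotlin
// Hypothetical sketch of the capped pre-buffer; not the project's VoiceController.
class PreBuffer(private val capacity: Int = 16_000 * 2) {  // ~2 s of 16 kHz mono
    private val deque = ArrayDeque<Float>(capacity)

    /** Append samples, evicting the oldest ones once the capacity is exceeded. */
    fun append(samples: FloatArray) {
        if (samples.isEmpty()) return
        for (s in samples) {
            deque.addLast(s)
            if (deque.size > capacity) deque.removeFirst()
        }
    }

    /** Return the buffered samples (oldest first) and clear the buffer. */
    fun drain(): FloatArray {
        val out = FloatArray(deque.size) { deque[it] }
        deque.clear()
        return out
    }
}
```

Because addLast runs before the size check, the deque can never be empty at the removeFirst call; the extra isEmpty() guard added in the diff is therefore harmless but not strictly required.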
diff --git a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt
index 7168ef6..63053cc 100644
--- a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt
+++ b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt
@@ -102,7 +102,7 @@ class MainActivity : BaseViewModelActivity()
     private var startPlayTimeoutJob: Job? = null // Single timeout Job for all playback scenarios
     private var mEventSources: EventSource? = null
-    private var isManualCancel = false
+
     override fun getViewBinding(): ActivityMainBinding = ActivityMainBinding.inflate(layoutInflater)

     override fun initView() {
@@ -219,7 +219,7 @@
                 }
             },
             onFinalAudio = { audio ->
-                sendRecordVoiceToServer(AudioPcmUtil.floatToPcm16Base64(audio))
+                sendRecordVoiceToServer(AudioPcmUtil.floatToPcm16(audio))
 //                mViewModel?.uploadVoice(
 //                    // AudioPcmUtil.floatToPcm16Base64(audio),
@@ -231,7 +231,7 @@
                     "xxx.wav"
                 )
                 AudioDebugUtil.saveFloatPcmAsWav(audio, file)
-                LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
+//                LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
 //                lifecycleScope.launch(Dispatchers.Main) {
 //                    // mVerticalAnimator?.show()
@@ -560,16 +560,13 @@
     }

-    private fun sendRecordVoiceToServer(audio: String) {
+    private fun sendRecordVoiceToServer(audio: ByteArray) {
         cancelSSE()
-        val request: Request? = RxHttp.postJson(ApiService.UPLOAD_RECORD_VOICE_URL)
-            .add("audio", audio)
+        val request: Request? = RxHttp.postBody(ApiService.UPLOAD_RECORD_VOICE_URL)
+            .setBody(audio)
             .buildRequest()

         request?.let {
-            // Reset the manual-cancel flag
-            isManualCancel = false
-
             mEventSources = createFactory(RxHttpPlugins.getOkHttpClient())
                 .newEventSource(it, object : EventSourceListener() {
                     override fun onOpen(eventSource: EventSource, response: Response) {
@@ -601,22 +598,19 @@
                         response: Response?
                     ) {
                         super.onFailure(eventSource, t, response)
-                        // Key fix 2: ignore failures caused by manual cancellation
-                        if (isManualCancel) {
-                            LogUtils.eTag("lrsxxx", "SSE cancelled manually, ignoring failure callback")
-                            return
-                        }
-
-                        // Normal failure path
                         val errorMsg = t?.message ?: response?.message ?: "Unknown error"
-                        voiceController?.onUploadFinished(false)
+                        LogUtils.eTag("lrsxxx", "Streaming request failed: ${errorMsg}")
+                        if (backPlaying) {
+                            voiceController?.onPlayEndBackend()
+                            backPlaying = false
+                        } else {
+                            voiceController?.onUploadFinished(false)
+                        }
                     }

                     override fun onClosed(eventSource: EventSource) {
                         super.onClosed(eventSource)
-                        // Key fix 3: distinguish manual cancellation from normal close
-                        val isSuccess = !isManualCancel
-
-                        // Key fix 4: null out the reference after close to avoid memory leaks
                         mEventSources = null
                     }
                 })
@@ -625,7 +619,6 @@

     private fun cancelSSE() {
-        isManualCancel = true
         mEventSources?.cancel()
         mEventSources = null
     }
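The upload path now posts raw 16-bit PCM bytes as the request body instead of a Base64 string in JSON, which avoids the ~33% Base64 size overhead. AudioPcmUtil.floatToPcm16 itself is not shown in this diff; assuming it does the usual float-to-16-bit little-endian conversion, a minimal equivalent could be:

```kotlin
// Hypothetical equivalent of AudioPcmUtil.floatToPcm16: float samples in [-1, 1]
// to 16-bit little-endian PCM bytes (the raw body now sent to the server).
fun floatToPcm16(samples: FloatArray): ByteArray {
    val out = ByteArray(samples.size * 2)
    for (i in samples.indices) {
        // Clamp, scale to the signed 16-bit range, then write little-endian.
        val v = (samples[i].coerceIn(-1f, 1f) * 32767f).toInt()
        out[i * 2] = (v and 0xFF).toByte()
        out[i * 2 + 1] = ((v shr 8) and 0xFF).toByte()
    }
    return out
}
```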
diff --git a/app/src/main/res/layout/custom_words_panel.xml b/app/src/main/res/layout/custom_words_panel.xml
index 842aee1..e4d30c2 100644
--- a/app/src/main/res/layout/custom_words_panel.xml
+++ b/app/src/main/res/layout/custom_words_panel.xml
@@ -26,6 +26,7 @@
         android:layout_width="match_parent"
         android:layout_height="wrap_content"
         android:textColor="@color/white"
+        android:padding="20dp"
         android:textSize="14sp" />