Improve detection speed
parent c450d8d620
commit 0dfdccc75b
@@ -181,5 +181,6 @@ dependencies {
     implementation libs.androidautosize

     implementation files('libs/sherpa-onnx-1.12.20.aar')
+    implementation 'com.github.yyued:SVGAPlayer-Android:2.6.1'

 }
@@ -39,7 +39,7 @@

     <application
         android:name=".App"
-        android:allowBackup="false"
+        android:allowBackup="true"
        android:dataExtractionRules="@xml/data_extraction_rules"
        android:fullBackupContent="@xml/backup_rules"
        android:icon="@mipmap/ic_launcher"
Binary file not shown.
@@ -2,10 +2,7 @@ package com.zs.smarthuman.sherpa

 import android.content.res.AssetManager
 import com.blankj.utilcode.util.LogUtils
-import com.k2fsa.sherpa.onnx.SileroVadModelConfig
-import com.k2fsa.sherpa.onnx.Vad
-import com.k2fsa.sherpa.onnx.VadModelConfig
-import com.k2fsa.sherpa.onnx.getVadModelConfig
+import com.k2fsa.sherpa.onnx.*
 import kotlin.math.sqrt

 class VadManager(
@@ -13,92 +10,62 @@ class VadManager(
     private val onSpeechStart: () -> Unit,
     private val onSpeechEnd: () -> Unit
 ) {

     private val TAG = "VadManager"
     private val vad: Vad

     private var isSpeaking = false
-    private var lastSpeechTime = 0L
+    private var lastSpeechMs = 0L

-    private val ACTIVE_END_SILENCE_MS = 1500L
-    private val ACTIVE_CONSECUTIVE_FRAMES = 10
-    private val FINAL_END_SILENCE_MS = 800L
-    private val FINAL_CONSECUTIVE_FRAMES = 5
-    private val FINAL_PHASE_TRIGGER_MS = 1000L
-    private val MAX_SILENCE_AFTER_SPEECH_MS = 2000L
-
-    private val MIN_EFFECTIVE_SPEECH_RMS = 0.001f
-
-    private var consecutiveSilenceFrames = 0
-    private var isInFinalPhase = false
-    private var lastEffectiveSpeechTime = 0L
+    /** End speech more decisively */
+    private val END_SILENCE_MS = 350L
+    private val MIN_RMS = 0.002f

     init {
-        val config = getVadModelConfig(0)
-            ?: throw IllegalStateException("[$TAG] VAD config not found")
-        vad = Vad(assetManager, VadModelConfig(sileroVadModelConfig = SileroVadModelConfig(model = "silero_vad.onnx", threshold = 0.2f)))
-        LogUtils.i(TAG, "✅ VAD 初始化成功")
+        vad = Vad(
+            assetManager,
+            VadModelConfig(
+                sileroVadModelConfig = SileroVadModelConfig(
+                    model = "silero_vad.onnx",
+                    threshold = 0.5F,
+                    minSilenceDuration = 0.25F,
+                    minSpeechDuration = 0.25F,
+                    windowSize = 512,
+                ),
+                sampleRate = 16000,
+                numThreads = 1,
+                provider = "cpu"
+            )
+        )
+        LogUtils.i(TAG, "✅ VAD init")
     }

     fun accept(samples: FloatArray) {
         val now = System.currentTimeMillis()

         vad.acceptWaveform(samples)
-        val vadHasSpeech = vad.isSpeechDetected()
+        val hasSpeech = vad.isSpeechDetected()
         val rms = calcRms(samples)

-        val isEffectiveSpeech = vadHasSpeech && rms >= MIN_EFFECTIVE_SPEECH_RMS
-        if (isEffectiveSpeech) {
-            lastEffectiveSpeechTime = now
-            isInFinalPhase = false
-            lastSpeechTime = now
-            consecutiveSilenceFrames = 0
-        } else {
-            consecutiveSilenceFrames++
-            if (now - lastEffectiveSpeechTime >= FINAL_PHASE_TRIGGER_MS) {
-                isInFinalPhase = true
-            }
-        }
-
-        val (endSilenceMs, endFrames) =
-            if (isInFinalPhase)
-                FINAL_END_SILENCE_MS to FINAL_CONSECUTIVE_FRAMES
-            else
-                ACTIVE_END_SILENCE_MS to ACTIVE_CONSECUTIVE_FRAMES
-
-        if (isEffectiveSpeech) {
+        if (hasSpeech && rms >= MIN_RMS) {
+            lastSpeechMs = now
             if (!isSpeaking) {
                 isSpeaking = true
                 onSpeechStart()
             }
-        } else if (isSpeaking) {
-            val silenceMs = now - lastSpeechTime
-            val effectiveSilenceMs = now - lastEffectiveSpeechTime
-
-            val shouldEnd =
-                (silenceMs >= endSilenceMs ||
-                    effectiveSilenceMs >= MAX_SILENCE_AFTER_SPEECH_MS) &&
-                consecutiveSilenceFrames >= endFrames
-
-            if (shouldEnd) {
-                onSpeechEnd()
-                reset()
-                isSpeaking = false
-                isInFinalPhase = false
-            }
+        } else if (isSpeaking && now - lastSpeechMs > END_SILENCE_MS) {
+            onSpeechEnd()
+            reset()
         }
     }

     fun reset() {
         isSpeaking = false
-        lastSpeechTime = 0L
-        lastEffectiveSpeechTime = 0L
-        consecutiveSilenceFrames = 0
-        isInFinalPhase = false
+        lastSpeechMs = 0
         vad.reset()
     }

-    fun calcRms(samples: FloatArray): Float {
+    private fun calcRms(samples: FloatArray): Float {
         var sum = 0f
         for (v in samples) sum += v * v
         return sqrt(sum / samples.size)
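For context, the simplified VadManager.accept() above expects fixed-size frames matching the configured windowSize (512 samples at 16 kHz, roughly 32 ms per frame). A minimal caller sketch follows; the frame source and wiring are illustrative assumptions, not part of this commit:

// Hypothetical driver loop: pull 512-sample frames from some 16 kHz float
// source and push them into the manager; the frame size matches windowSize above.
fun pumpFrames(manager: VadManager, nextFrame: () -> FloatArray?) {
    while (true) {
        val frame = nextFrame() ?: break // 512 samples / 16 kHz ≈ 32 ms of audio
        manager.accept(frame)            // fires onSpeechStart / onSpeechEnd callbacks
    }
}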
@@ -30,11 +30,9 @@ class VoiceController(
         // Pre-buffer size (2 seconds)
         private const val PRE_BUFFER_SIZE = SAMPLE_RATE * 2

-        // Short-audio duration threshold
-        private const val SHORT_AUDIO_DURATION_MS = 1000L
         private const val INVALID_RESET_DEBOUNCE_MS = 1500L
         // Minimum speech duration
-        private const val MIN_SPEECH_MS = 800L
+        private const val MIN_SPEECH_MS = 600L

         // Unified speaker-verification threshold (no longer per scenario)
         private const val SPEAKER_THRESHOLD = 0.45f
@@ -275,6 +273,9 @@ class VoiceController(
     fun onPlayEndPrompt() {
         speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
         LogUtils.d(TAG, "🎵 提示音结束")
+        if (!preBuffer.isEmpty()) {
+            preBuffer.clear()
+        }
         state = VoiceState.WAIT_SPEECH_COOLDOWN
     }

@@ -321,7 +322,9 @@ class VoiceController(
     private fun resetAll() {
         LogUtils.d(TAG, "🔄 重置所有状态 | 本次超时类型: $currentTimeoutType")
         audioBuffer.clear()
-        preBuffer.clear()
+        if (!preBuffer.isEmpty()) {
+            preBuffer.clear()
+        }
         vadManager.reset()
         wakeupManager.reset()
         vadStarted = false
@@ -359,9 +362,15 @@ class VoiceController(
     }

     private fun cachePreBuffer(samples: FloatArray) {
+        // Return early on empty input to avoid a useless loop
+        if (samples.isEmpty()) return
+
         for (s in samples) {
             preBuffer.addLast(s)
-            if (preBuffer.size > PRE_BUFFER_SIZE) preBuffer.removeFirst()
+            // Key fix: check the queue is non-empty before removing
+            if (preBuffer.size > PRE_BUFFER_SIZE && !preBuffer.isEmpty()) {
+                preBuffer.removeFirst()
+            }
         }
     }

@@ -371,7 +380,10 @@ class VoiceController(
             return false
         }

-        // 1. Trim the audio: keep only the valid part of this recording
+        // 1. Record the verification start time (to measure processing cost)
+        val verifyStartMs = System.currentTimeMillis()
+
+        // 2. Original audio-trimming logic (kept)
         val audioDurationMs = (audio.size.toFloat() / SAMPLE_RATE * 1000).toLong()
         val validAudio = if (audioDurationMs > 0) {
             val validSampleCount = (audioDurationMs * SAMPLE_RATE / 1000).toInt()
@@ -386,21 +398,16 @@ class VoiceController(

         var stream: OnlineStream? = null

-        // Use runCatching to handle exceptions uniformly
         return runCatching {
             stream = SpeakerRecognition.extractor.createStream()
+            stream.acceptWaveform(samples = validAudio, sampleRate = SAMPLE_RATE)
+            stream.inputFinished()

-            // Process the audio data
-            stream?.acceptWaveform(samples = validAudio, sampleRate = SAMPLE_RATE)
-            stream?.inputFinished()
-
-            // Check whether the stream is ready
-            if (stream == null || !SpeakerRecognition.extractor.isReady(stream)) {
+            if (!SpeakerRecognition.extractor.isReady(stream)) {
                 LogUtils.w(TAG, "❌ 音频Stream未就绪,验证失败")
                 return@runCatching false
             }

-            // Compute the embedding and verify it
             val embedding = SpeakerRecognition.extractor.compute(stream)
             speakerManagerLock.withLock {
                 val verifyPass = SpeakerRecognition.manager.verify(
@@ -409,19 +416,20 @@ class VoiceController(
                     threshold = SPEAKER_THRESHOLD
                 )

-                LogUtils.d(TAG, "📊 声纹验证 | 统一阈值: $SPEAKER_THRESHOLD | 通过: $verifyPass | 验证时长: ${(validAudio.size.toFloat()/SAMPLE_RATE*1000).toLong()}ms")
+                // 3. Compute the real processing cost (end time minus start time)
+                val verifyCostMs = System.currentTimeMillis() - verifyStartMs
+                // Log audio duration and processing cost separately
+                LogUtils.d(TAG, "📊 声纹验证 | 统一阈值: $SPEAKER_THRESHOLD | 通过: $verifyPass | 音频时长: $audioDurationMs ms | 处理耗时: $verifyCostMs ms")
                 verifyPass
             }
         }.onFailure { e ->
-            // Handle all exception cases
             LogUtils.e(TAG, "❌ 声纹验证异常,拒绝", e)
         }.also {
-            // Make sure the stream resource is released
             runCatching {
                 stream?.release()
             }.onFailure { e ->
                 LogUtils.w(TAG, "⚠️ 释放 Stream 资源失败", e)
             }
-        }.getOrDefault(false) // Return false by default on exception
+        }.getOrDefault(false)
     }
 }
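The verification change above times the processing cost by hand with two System.currentTimeMillis() calls. An equivalent pattern using the Kotlin standard library is sketched below; the pipeline body is a placeholder, not project code:

import kotlin.system.measureTimeMillis

// Sketch: report wall-clock processing cost separately from audio duration.
fun verifyWithTiming(audio: FloatArray, sampleRate: Int = 16000): Boolean {
    var pass = false
    val costMs = measureTimeMillis {
        // Placeholder for the real extract-embedding + verify pipeline.
        pass = audio.isNotEmpty()
    }
    val audioMs = audio.size * 1000L / sampleRate
    println("audio: ${audioMs}ms, processing: ${costMs}ms, pass=$pass")
    return pass
}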
@@ -102,7 +102,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
     private var startPlayTimeoutJob: Job? = null // Single Job managing the timeout for every playback scenario

     private var mEventSources: EventSource? = null
-    private var isManualCancel = false
+

     override fun getViewBinding(): ActivityMainBinding = ActivityMainBinding.inflate(layoutInflater)
     override fun initView() {
@@ -219,7 +219,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
                 }
             },
             onFinalAudio = { audio ->
-                sendRecordVoiceToServer(AudioPcmUtil.floatToPcm16Base64(audio))
+                sendRecordVoiceToServer(AudioPcmUtil.floatToPcm16(audio))
                 // mViewModel?.uploadVoice(
                 //
                 //     AudioPcmUtil.floatToPcm16Base64(audio),
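The call above now passes raw bytes (AudioPcmUtil.floatToPcm16) instead of a Base64 string. That utility is not part of this diff; a typical float-to-PCM16 conversion would look roughly like the sketch below, an assumption about its behavior rather than the project's actual implementation:

import java.nio.ByteBuffer
import java.nio.ByteOrder

// Hypothetical equivalent of AudioPcmUtil.floatToPcm16: normalized floats in
// [-1, 1] converted to 16-bit little-endian PCM bytes for a raw request body.
fun floatToPcm16(samples: FloatArray): ByteArray {
    val buffer = ByteBuffer.allocate(samples.size * 2).order(ByteOrder.LITTLE_ENDIAN)
    for (s in samples) {
        val clamped = s.coerceIn(-1f, 1f)
        buffer.putShort((clamped * 32767f).toInt().toShort())
    }
    return buffer.array()
}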
@@ -231,7 +231,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
                     "xxx.wav"
                 )
                 AudioDebugUtil.saveFloatPcmAsWav(audio, file)
-                LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
+                // LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
                 // lifecycleScope.launch(Dispatchers.Main) {
                 //
                 //     mVerticalAnimator?.show()
@@ -560,16 +560,13 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
     }


-    private fun sendRecordVoiceToServer(audio: String) {
+    private fun sendRecordVoiceToServer(audio: ByteArray) {
         cancelSSE()
-        val request: Request? = RxHttp.postJson(ApiService.UPLOAD_RECORD_VOICE_URL)
-            .add("audio",audio)
+        val request: Request? = RxHttp.postBody(ApiService.UPLOAD_RECORD_VOICE_URL)
+            .setBody(audio)
             .buildRequest()

         request?.let {
-            // Reset the manual-cancel flag
-            isManualCancel = false
-
             mEventSources = createFactory(RxHttpPlugins.getOkHttpClient())
                 .newEventSource(it, object : EventSourceListener() {
                     override fun onOpen(eventSource: EventSource, response: Response) {
@@ -601,22 +598,19 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
                         response: Response?
                     ) {
                         super.onFailure(eventSource, t, response)
-                        // Key fix 2: ignore failures caused by manual cancellation
-                        if (isManualCancel) {
-                            LogUtils.eTag("lrsxxx", "SSE手动取消,忽略失败回调")
-                            return
-                        }
-
                         // Normal failure path
                         val errorMsg = t?.message ?: response?.message ?: "未知错误"
-                        voiceController?.onUploadFinished(false)
+                        LogUtils.eTag("lrsxxx", "流式请求失败:${errorMsg}")
+                        if (backPlaying){
+                            voiceController?.onPlayEndBackend()
+                            backPlaying = false
+                        }else{
+                            voiceController?.onUploadFinished(false)
+                        }
                     }

                     override fun onClosed(eventSource: EventSource) {
                         super.onClosed(eventSource)
-                        // Key fix 3: distinguish manual cancel from normal close
-                        val isSuccess = !isManualCancel
-                        // Key fix 4: null the reference after close to avoid a memory leak
                         mEventSources = null
                     }
                 })
@@ -625,7 +619,6 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()


     private fun cancelSSE() {
-        isManualCancel = true
         mEventSources?.cancel()
         mEventSources = null
     }
@@ -26,6 +26,7 @@
         android:layout_width="match_parent"
         android:layout_height="wrap_content"
         android:textColor="@color/white"
+        android:padding="20dp"
         android:textSize="14sp" />

 </LinearLayout>