提升检测速度

This commit is contained in:
林若思 2026-01-19 11:06:26 +08:00
parent c450d8d620
commit 0dfdccc75b
7 changed files with 72 additions and 102 deletions

View File

@ -181,5 +181,6 @@ dependencies {
implementation libs.androidautosize
implementation files('libs/sherpa-onnx-1.12.20.aar')
implementation 'com.github.yyued:SVGAPlayer-Android:2.6.1'
}

View File

@ -39,7 +39,7 @@
<application
android:name=".App"
android:allowBackup="false"
android:allowBackup="true"
android:dataExtractionRules="@xml/data_extraction_rules"
android:fullBackupContent="@xml/backup_rules"
android:icon="@mipmap/ic_launcher"

View File

@ -2,10 +2,7 @@ package com.zs.smarthuman.sherpa
import android.content.res.AssetManager
import com.blankj.utilcode.util.LogUtils
import com.k2fsa.sherpa.onnx.SileroVadModelConfig
import com.k2fsa.sherpa.onnx.Vad
import com.k2fsa.sherpa.onnx.VadModelConfig
import com.k2fsa.sherpa.onnx.getVadModelConfig
import com.k2fsa.sherpa.onnx.*
import kotlin.math.sqrt
class VadManager(
@ -13,92 +10,62 @@ class VadManager(
private val onSpeechStart: () -> Unit,
private val onSpeechEnd: () -> Unit
) {
private val TAG = "VadManager"
private val vad: Vad
private var isSpeaking = false
private var lastSpeechTime = 0L
private var lastSpeechMs = 0L
private val ACTIVE_END_SILENCE_MS = 1500L
private val ACTIVE_CONSECUTIVE_FRAMES = 10
private val FINAL_END_SILENCE_MS = 800L
private val FINAL_CONSECUTIVE_FRAMES = 5
private val FINAL_PHASE_TRIGGER_MS = 1000L
private val MAX_SILENCE_AFTER_SPEECH_MS = 2000L
private val MIN_EFFECTIVE_SPEECH_RMS = 0.001f
private var consecutiveSilenceFrames = 0
private var isInFinalPhase = false
private var lastEffectiveSpeechTime = 0L
/** 更果断结束 */
private val END_SILENCE_MS = 350L
private val MIN_RMS = 0.002f
init {
val config = getVadModelConfig(0)
?: throw IllegalStateException("[$TAG] VAD config not found")
vad = Vad(assetManager, VadModelConfig(sileroVadModelConfig = SileroVadModelConfig(model = "silero_vad.onnx", threshold = 0.2f)))
LogUtils.i(TAG, "✅ VAD 初始化成功")
vad = Vad(
assetManager,
VadModelConfig(
sileroVadModelConfig = SileroVadModelConfig(
model = "silero_vad.onnx",
threshold = 0.5F,
minSilenceDuration = 0.25F,
minSpeechDuration = 0.25F,
windowSize = 512,
),
sampleRate = 16000,
numThreads = 1,
provider = "cpu"
)
)
LogUtils.i(TAG, "✅ VAD init")
}
fun accept(samples: FloatArray) {
val now = System.currentTimeMillis()
vad.acceptWaveform(samples)
val vadHasSpeech = vad.isSpeechDetected()
val hasSpeech = vad.isSpeechDetected()
val rms = calcRms(samples)
val isEffectiveSpeech = vadHasSpeech && rms >= MIN_EFFECTIVE_SPEECH_RMS
if (isEffectiveSpeech) {
lastEffectiveSpeechTime = now
isInFinalPhase = false
lastSpeechTime = now
consecutiveSilenceFrames = 0
} else {
consecutiveSilenceFrames++
if (now - lastEffectiveSpeechTime >= FINAL_PHASE_TRIGGER_MS) {
isInFinalPhase = true
}
}
val (endSilenceMs, endFrames) =
if (isInFinalPhase)
FINAL_END_SILENCE_MS to FINAL_CONSECUTIVE_FRAMES
else
ACTIVE_END_SILENCE_MS to ACTIVE_CONSECUTIVE_FRAMES
if (isEffectiveSpeech) {
if (hasSpeech && rms >= MIN_RMS) {
lastSpeechMs = now
if (!isSpeaking) {
isSpeaking = true
onSpeechStart()
}
} else if (isSpeaking) {
val silenceMs = now - lastSpeechTime
val effectiveSilenceMs = now - lastEffectiveSpeechTime
val shouldEnd =
(silenceMs >= endSilenceMs ||
effectiveSilenceMs >= MAX_SILENCE_AFTER_SPEECH_MS) &&
consecutiveSilenceFrames >= endFrames
if (shouldEnd) {
} else if (isSpeaking && now - lastSpeechMs > END_SILENCE_MS) {
onSpeechEnd()
reset()
isSpeaking = false
isInFinalPhase = false
}
}
}
fun reset() {
isSpeaking = false
lastSpeechTime = 0L
lastEffectiveSpeechTime = 0L
consecutiveSilenceFrames = 0
isInFinalPhase = false
lastSpeechMs = 0
vad.reset()
}
fun calcRms(samples: FloatArray): Float {
private fun calcRms(samples: FloatArray): Float {
var sum = 0f
for (v in samples) sum += v * v
return sqrt(sum / samples.size)

View File

@ -30,11 +30,9 @@ class VoiceController(
// 预缓存大小:2秒
private const val PRE_BUFFER_SIZE = SAMPLE_RATE * 2
// 短语音判定阈值
private const val SHORT_AUDIO_DURATION_MS = 1000L
private const val INVALID_RESET_DEBOUNCE_MS = 1500L
// 最小语音时长
private const val MIN_SPEECH_MS = 800L
private const val MIN_SPEECH_MS = 600L
// 统一的声纹验证阈值(不再分场景)
private const val SPEAKER_THRESHOLD = 0.45f
@ -275,6 +273,9 @@ class VoiceController(
fun onPlayEndPrompt() {
speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
LogUtils.d(TAG, "🎵 提示音结束")
if (!preBuffer.isEmpty()) {
preBuffer.clear()
}
state = VoiceState.WAIT_SPEECH_COOLDOWN
}
@ -321,7 +322,9 @@ class VoiceController(
private fun resetAll() {
LogUtils.d(TAG, "🔄 重置所有状态 | 本次超时类型: $currentTimeoutType")
audioBuffer.clear()
if (!preBuffer.isEmpty()) {
preBuffer.clear()
}
vadManager.reset()
wakeupManager.reset()
vadStarted = false
@ -359,9 +362,15 @@ class VoiceController(
}
private fun cachePreBuffer(samples: FloatArray) {
// 空数据快速返回,避免无效循环
if (samples.isEmpty()) return
for (s in samples) {
preBuffer.addLast(s)
if (preBuffer.size > PRE_BUFFER_SIZE) preBuffer.removeFirst()
// 关键修复:移除前先检查队列是否非空
if (preBuffer.size > PRE_BUFFER_SIZE && !preBuffer.isEmpty()) {
preBuffer.removeFirst()
}
}
}
@ -371,7 +380,10 @@ class VoiceController(
return false
}
// 1. 裁剪音频:只保留本次录音的有效部分
// 1. 记录验证开始时间(关键:统计处理耗时)
val verifyStartMs = System.currentTimeMillis()
// 2. 原有音频裁剪逻辑(保留)
val audioDurationMs = (audio.size.toFloat() / SAMPLE_RATE * 1000).toLong()
val validAudio = if (audioDurationMs > 0) {
val validSampleCount = (audioDurationMs * SAMPLE_RATE / 1000).toInt()
@ -386,21 +398,16 @@ class VoiceController(
var stream: OnlineStream? = null
// 使用 runCatching 统一处理异常
return runCatching {
stream = SpeakerRecognition.extractor.createStream()
stream.acceptWaveform(samples = validAudio, sampleRate = SAMPLE_RATE)
stream.inputFinished()
// 处理音频数据
stream?.acceptWaveform(samples = validAudio, sampleRate = SAMPLE_RATE)
stream?.inputFinished()
// 检查 stream 是否就绪
if (stream == null || !SpeakerRecognition.extractor.isReady(stream)) {
if (!SpeakerRecognition.extractor.isReady(stream)) {
LogUtils.w(TAG, "❌ 音频Stream未就绪验证失败")
return@runCatching false
}
// 计算特征并验证
val embedding = SpeakerRecognition.extractor.compute(stream)
speakerManagerLock.withLock {
val verifyPass = SpeakerRecognition.manager.verify(
@ -409,19 +416,20 @@ class VoiceController(
threshold = SPEAKER_THRESHOLD
)
LogUtils.d(TAG, "📊 声纹验证 | 统一阈值: $SPEAKER_THRESHOLD | 通过: $verifyPass | 验证时长: ${(validAudio.size.toFloat()/SAMPLE_RATE*1000).toLong()}ms")
// 3. 计算真实处理耗时(结束时间 - 开始时间)
val verifyCostMs = System.currentTimeMillis() - verifyStartMs
// 日志区分:音频时长 vs 处理耗时
LogUtils.d(TAG, "📊 声纹验证 | 统一阈值: $SPEAKER_THRESHOLD | 通过: $verifyPass | 音频时长: $audioDurationMs ms | 处理耗时: $verifyCostMs ms")
verifyPass
}
}.onFailure { e ->
// 处理所有异常情况
LogUtils.e(TAG, "❌ 声纹验证异常,拒绝", e)
}.also {
// 确保 stream 资源释放
runCatching {
stream?.release()
}.onFailure { e ->
LogUtils.w(TAG, "⚠️ 释放 Stream 资源失败", e)
}
}.getOrDefault(false) // 异常时默认返回 false
}.getOrDefault(false)
}
}

View File

@ -102,7 +102,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
private var startPlayTimeoutJob: Job? = null // 统一管理所有播放场景的超时Job
private var mEventSources: EventSource? = null
private var isManualCancel = false
override fun getViewBinding(): ActivityMainBinding = ActivityMainBinding.inflate(layoutInflater)
override fun initView() {
@ -219,7 +219,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
}
},
onFinalAudio = { audio ->
sendRecordVoiceToServer(AudioPcmUtil.floatToPcm16Base64(audio))
sendRecordVoiceToServer(AudioPcmUtil.floatToPcm16(audio))
// mViewModel?.uploadVoice(
//
// AudioPcmUtil.floatToPcm16Base64(audio),
@ -231,7 +231,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
"xxx.wav"
)
AudioDebugUtil.saveFloatPcmAsWav(audio, file)
LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
// LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
// lifecycleScope.launch(Dispatchers.Main) {
//
// mVerticalAnimator?.show()
@ -560,16 +560,13 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
}
private fun sendRecordVoiceToServer(audio: String) {
private fun sendRecordVoiceToServer(audio: ByteArray) {
cancelSSE()
val request: Request? = RxHttp.postJson(ApiService.UPLOAD_RECORD_VOICE_URL)
.add("audio",audio)
val request: Request? = RxHttp.postBody(ApiService.UPLOAD_RECORD_VOICE_URL)
.setBody(audio)
.buildRequest()
request?.let {
// 重置手动取消标记
isManualCancel = false
mEventSources = createFactory(RxHttpPlugins.getOkHttpClient())
.newEventSource(it, object : EventSourceListener() {
override fun onOpen(eventSource: EventSource, response: Response) {
@ -601,22 +598,19 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
response: Response?
) {
super.onFailure(eventSource, t, response)
// 关键修复2:忽略手动取消导致的异常
if (isManualCancel) {
LogUtils.eTag("lrsxxx", "SSE手动取消忽略失败回调")
return
}
// 正常失败逻辑
val errorMsg = t?.message ?: response?.message ?: "未知错误"
LogUtils.eTag("lrsxxx", "流式请求失败:${errorMsg}")
if (backPlaying){
voiceController?.onPlayEndBackend()
backPlaying = false
}else{
voiceController?.onUploadFinished(false)
}
}
override fun onClosed(eventSource: EventSource) {
super.onClosed(eventSource)
// 关键修复3:区分手动取消和正常关闭
val isSuccess = !isManualCancel
// 关键修复4:关闭后置空引用,避免内存泄漏
mEventSources = null
}
})
@ -625,7 +619,6 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
private fun cancelSSE() {
isManualCancel = true
mEventSources?.cancel()
mEventSources = null
}

View File

@ -26,6 +26,7 @@
android:layout_width="match_parent"
android:layout_height="wrap_content"
android:textColor="@color/white"
android:padding="20dp"
android:textSize="14sp" />
</LinearLayout>