优化后的代码

2025-12-31 14:43:10 +08:00 · 2025-12-31 14:43:10 +08:00 · f8812b6a48
commit f8812b6a48
parent 997bfe0539
8 changed files with 258 additions and 147 deletions
--- a/app/build.gradle
+++ b/app/build.gradle
@ -180,6 +180,6 @@ dependencies {

    implementation libs.androidautosize

-    implementation files('libs/sherpa19.aar')
+    implementation files('libs/sherpa-onnx-1.12.20.aar')

 }
--- a/app/libs/sherpa-onnx-1.12.20.aar
+++ b/app/libs/sherpa-onnx-1.12.20.aar
--- a/app/src/main/AndroidManifest.xml
+++ b/app/src/main/AndroidManifest.xml
@ -52,7 +52,7 @@
        tools:targetApi="31">

        <activity
-            android:name=".ui.SplashActivity"
+            android:name=".ui.MainActivity"
            android:exported="true"
            android:theme="@style/Theme.Splash"
            android:screenOrientation="portrait">
@ -66,9 +66,9 @@
            </intent-filter>
        </activity>

-        <activity
+        <!--<activity
            android:name="com.zs.smarthuman.ui.MainActivity"
-            android:screenOrientation="portrait"/>
+            android:screenOrientation="portrait"/>-->
        <activity
            android:name="com.zs.smarthuman.ui.ActivateActivity"
            android:screenOrientation="portrait"/>
--- a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt
@ -10,42 +10,63 @@ import com.k2fsa.sherpa.onnx.getVadModelConfig
 * @date: 2025/12/17 10:22
 */
 class VadManager(
-    private val assetManager: AssetManager,
+    assetManager: AssetManager,
    private val onSpeechStart: () -> Unit,
    private val onSpeechEnd: () -> Unit
 ) {
    private val vad: Vad
+
    private var isSpeaking = false
+    private var lastSpeechTime = 0L
+
+    // ⭐ 统计用
+    private var speechFrameCount = 0
+    private var totalFrameCount = 0
+
+    private val END_SILENCE_MS = 600L

    init {
-        val config = getVadModelConfig(0)
-        if (config == null) {
-            throw IllegalStateException("VAD config not found")
-        }
-        vad = Vad(assetManager = assetManager, config = config)
+        val config = getVadModelConfig(1)
+            ?: throw IllegalStateException("VAD config not found")
+        vad = Vad(assetManager, config)
    }

-    /** 喂入音频帧 (16kHz PCM float) */
    fun accept(samples: FloatArray) {
-        vad.acceptWaveform(samples)
-        val speechDetected = vad.isSpeechDetected()
+        val now = System.currentTimeMillis()

-        if (speechDetected && !isSpeaking) {
+        vad.acceptWaveform(samples)
+        val hasSpeech = vad.isSpeechDetected()
+
+        totalFrameCount++
+
+        if (hasSpeech) {
+            speechFrameCount++
+            lastSpeechTime = now
+
+            if (!isSpeaking) {
                isSpeaking = true
                onSpeechStart()
-        } else if (!speechDetected && isSpeaking) {
+            }
+        } else {
+            if (isSpeaking && now - lastSpeechTime >= END_SILENCE_MS) {
                isSpeaking = false
                onSpeechEnd()
-            // ⭐ 只在句子结束时清空 VAD
                vad.clear()
            }
        }
+    }

+    /** 👉 人声占比（真正用到 VAD 的地方） */
+    fun speechRatio(): Float {
+        if (totalFrameCount == 0) return 0f
+        return speechFrameCount.toFloat() / totalFrameCount
+    }

-    /** 重置内部状态 */
    fun reset() {
        isSpeaking = false
+        lastSpeechTime = 0
+        speechFrameCount = 0
+        totalFrameCount = 0
        vad.reset()
    }
 }
-
--- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
@ -8,15 +8,17 @@ class VoiceController(
    assetManager: AssetManager,
    private val onWakeup: () -> Unit,
    private val onFinalAudio: (FloatArray) -> Unit,
-    private val idleTimeoutSeconds: Int = 15,
+    private val idleTimeoutSeconds: Int = 5,
    private val maxRecordingSeconds: Int = 10,
    private val onStateChanged: ((VoiceState) -> Unit)? = null,
    private val stopBackendAudio: (() -> Unit)? = null
 ) {
+
    private val TAG = "VoiceController"
    private val sampleRate = 16000

    /* ================= 状态 ================= */
+
    private var state: VoiceState = VoiceState.WAIT_WAKEUP
        set(value) {
            field = value
@ -24,73 +26,105 @@ class VoiceController(
            onStateChanged?.invoke(value)
        }

-    /* ================= 唤醒 ================= */
+    /* ================= KWS ================= */
+
    private val wakeupManager = WakeupManager(assetManager) {
        Log.d(TAG, "🔥 WakeWord detected")
-        stopBackendAudio?.invoke()
-        if (state != VoiceState.UPLOADING) { // 上传中不重置
-            resetAll()
-            state = VoiceState.PLAYING_PROMPT
-        }
-        onWakeup()
+        handleWakeupEvent()
    }

-    /* ================= VAD（只负责 START） ================= */
+    /* ================= VAD ================= */
+
    private val vadManager = VadManager(
        assetManager,
        onSpeechStart = { onVadStart() },
-        onSpeechEnd = { /* 不再用于结束 */ }
+        onSpeechEnd = {}
    )

-    /* ================= 音频缓存 ================= */
+    /* ================= Buffer ================= */
+
    private val audioBuffer = mutableListOf<Float>()
+
+    /** 前导音缓存（2 秒） */
    private val preBuffer = ArrayDeque<Float>()
-    private val PRE_BUFFER_SIZE = sampleRate // 1 秒预缓冲
+    private val PRE_BUFFER_SIZE = sampleRate * 2

    /* ================= 时间 ================= */
-    private var idleTimer = 0L
-    private var recordingStartTime = 0L
+
+    private var recordingStartMs = 0L
+    private var silenceStartMs = 0L
+
+    /** ⭐ WAIT_SPEECH 连续失败起点（关键） */
+    private var waitSpeechFailStartMs = 0L
+
+    /* ================= 控制 ================= */
+
    private var vadStarted = false

-    /* ================= RMS 静音判定 ================= */
-    private var silenceStartMs = 0L
-    private val SILENCE_END_MS = 1200L          // 静音多久算一句结束
-    private val RMS_SILENCE_THRESHOLD = 0.005f // 更灵敏
-    private val MIN_SPEECH_DURATION_MS = 300L  // 最短有效语音
-    private val MIN_SPEECH_RATIO = 0.15f       // 有效帧占比至少 15%
+    /** 唤醒观察期 */
+    private var inKwsObserve = false
+    private var kwsObserveStartMs = 0L
+    private val KWS_OBSERVE_MS = 500L
+
+    /** 播放冷却 */
+    private var speechEnableAtMs = 0L
+    private val SPEECH_COOLDOWN_MS = 300L
+
+    /* ================= 阈值 ================= */
+
+    private val RMS_SILENCE_THRESHOLD = 0.005f
+    private val SILENCE_END_MS = 1200L
+    private val MIN_SPEECH_MS = 300L

    /* ================= 音频入口 ================= */
+
    fun acceptAudio(samples: FloatArray) {
-        // 唤醒独立处理，始终喂
-        wakeupManager.acceptAudio(samples)

-        if (state == VoiceState.UPLOADING ||
-            state == VoiceState.PLAYING_PROMPT ||
-            state == VoiceState.PLAYING_BACKEND
-        ) return
-
-        if (state == VoiceState.WAIT_SPEECH) {
        cachePreBuffer(samples)
-            vadManager.accept(samples)
+
+        wakeupManager.acceptAudio(samples)
+        if (wakeupManager.consumeWakeupFlag()) {
+            handleWakeupEvent()
            return
        }

-        if (state != VoiceState.RECORDING) return
+        val now = System.currentTimeMillis()
+
+        when (state) {
+
+            VoiceState.WAIT_WAKEUP,
+            VoiceState.PLAYING_PROMPT,
+            VoiceState.PLAYING_BACKEND,
+            VoiceState.UPLOADING -> return
+
+            VoiceState.WAIT_SPEECH_COOLDOWN -> {
+                if (now >= speechEnableAtMs) {
+                    state = VoiceState.WAIT_SPEECH
+                }
+                return
+            }
+
+            VoiceState.WAIT_SPEECH -> {
+
+                if (inKwsObserve) {
+                    if (now - kwsObserveStartMs < KWS_OBSERVE_MS) return
+                    inKwsObserve = false
+                }
+
+                vadManager.accept(samples)
+            }
+
+            VoiceState.RECORDING -> {

-        // ===== RECORDING =====
                audioBuffer.addAll(samples.asList())
                vadManager.accept(samples)

-        val now = System.currentTimeMillis()
-
-        // 1️⃣ 最大录音兜底
-        if (now - recordingStartTime >= maxRecordingSeconds * 1000) {
+                if (now - recordingStartMs > maxRecordingSeconds * 1000) {
                    Log.w(TAG, "⏱ Max recording reached")
                    finishSentence()
                    return
                }

-        // 2️⃣ RMS 静音结束判定
                val rms = calcRms(samples)
                if (rms < RMS_SILENCE_THRESHOLD) {
                    if (silenceStartMs == 0L) silenceStartMs = now
@ -102,82 +136,148 @@ class VoiceController(
                    silenceStartMs = 0L
                }
            }
+        }
+    }
+
+    /* ================= 唤醒 ================= */
+
+    private fun handleWakeupEvent() {
+        when (state) {
+
+            VoiceState.UPLOADING -> return
+
+            VoiceState.RECORDING,
+            VoiceState.PLAYING_BACKEND -> {
+                stopBackendAudio?.invoke()
+                enterWakeup(interrupt = true)
+            }
+
+            else -> enterWakeup(interrupt = false)
+        }
+    }
+
+    private fun enterWakeup(interrupt: Boolean) {
+
+        if (interrupt) {
+            audioBuffer.clear()
+            vadManager.reset()
+            vadStarted = false
+            silenceStartMs = 0L
+        }
+
+        inKwsObserve = true
+        kwsObserveStartMs = System.currentTimeMillis()
+
+        state = VoiceState.PLAYING_PROMPT
+        onWakeup()
+    }

    /* ================= VAD START ================= */
+
    private fun onVadStart() {
        if (state != VoiceState.WAIT_SPEECH) return

-        Log.d(TAG, "🎤 VAD START")
+        Log.d(TAG, "🎤 REAL VAD START")
+
        vadStarted = true
-        state = VoiceState.RECORDING
-        recordingStartTime = System.currentTimeMillis()
+        recordingStartMs = System.currentTimeMillis()
        silenceStartMs = 0L

+        audioBuffer.clear()
        audioBuffer.addAll(preBuffer)
-        preBuffer.clear()
+
+        state = VoiceState.RECORDING
    }

    /* ================= 结束录音 ================= */
+
    private fun finishSentence() {
-        val speakTime = System.currentTimeMillis() - recordingStartTime

-        if (!vadStarted || speakTime < MIN_SPEECH_DURATION_MS) {
-            Log.d(TAG, "⛔ Speech too short, ignore")
-            resetToWaitSpeech(refreshIdle = false)
+        val duration = System.currentTimeMillis() - recordingStartMs
+        if (!vadStarted || duration < MIN_SPEECH_MS) {
+            resetToWaitSpeech()
            return
        }

-        val rmsFrames = calcRmsFrames(audioBuffer.toFloatArray(), frameSize = 320)
-        val validFrames = rmsFrames.count { it >= RMS_SILENCE_THRESHOLD }
-        val ratio = if (rmsFrames.isEmpty()) 0f else validFrames.toFloat() / rmsFrames.size
-        Log.d(TAG, "RMS ratio=$ratio")
-        if (ratio < MIN_SPEECH_RATIO) {
-            Log.d(TAG, "❌ Not enough human voice (ratio=$ratio)")
-            resetToWaitSpeech(refreshIdle = false)
+        val vadRatio = vadManager.speechRatio()
+        Log.d(TAG, "🎙 VAD speech ratio=$vadRatio")
+
+        if (vadRatio < 0.25f) {
+            Log.d(TAG, "❌ VAD says NOT human speech")
+            resetToWaitSpeech()
            return
        }

+        // ✅ 成功一次，清空失败计时
+        waitSpeechFailStartMs = 0L
+
        val finalAudio = audioBuffer.toFloatArray()
        audioBuffer.clear()

        state = VoiceState.UPLOADING
-        Log.d(TAG, "⬆ Upload audio len=${finalAudio.size}")
        onFinalAudio(finalAudio)
    }

    /* ================= 播放回调 ================= */
-    fun onPlayStartPrompt() { state = VoiceState.PLAYING_PROMPT }
-    fun onPlayEndPrompt() { state = VoiceState.WAIT_SPEECH; idleTimer = System.currentTimeMillis() }
-    fun onPlayStartBackend() { state = VoiceState.PLAYING_BACKEND }
-    fun onPlayEndBackend() { state = VoiceState.WAIT_SPEECH; idleTimer = System.currentTimeMillis() }

-    /* ================= 上传回调 ================= */
-    fun onUploadFinished(success: Boolean) {
-        if (state != VoiceState.UPLOADING) return
-        state = if (success) VoiceState.PLAYING_BACKEND else VoiceState.WAIT_SPEECH
-        idleTimer = System.currentTimeMillis()
+    fun onPlayStartPrompt() {
+        state = VoiceState.PLAYING_PROMPT
    }

-    /* ================= Idle ================= */
+    fun onPlayEndPrompt() {
+        speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
+        state = VoiceState.WAIT_SPEECH_COOLDOWN
+    }
+
+    fun onPlayStartBackend() {
+        state = VoiceState.PLAYING_BACKEND
+    }
+
+    fun onPlayEndBackend() {
+        speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
+        state = VoiceState.WAIT_SPEECH_COOLDOWN
+    }
+
+    /* ================= 上传回调（保留 public） ================= */
+
+    fun onUploadFinished(success: Boolean) {
+        if (state != VoiceState.UPLOADING) return
+
+        state = if (success) {
+            VoiceState.PLAYING_BACKEND
+        } else {
+            speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
+            VoiceState.WAIT_SPEECH_COOLDOWN
+        }
+    }
+
+    /* ================= Idle 超时（关键修复） ================= */
+
    fun checkIdleTimeout() {
-        // 上传中不计时
        if (state != VoiceState.WAIT_SPEECH) return
+        if (waitSpeechFailStartMs == 0L) return
+
        val now = System.currentTimeMillis()
-        if (now - idleTimer > idleTimeoutSeconds * 1000) {
-            Log.d(TAG, "⏱ Idle timeout reached, resetAll")
+        if (now - waitSpeechFailStartMs > idleTimeoutSeconds * 1000) {
+            Log.d(TAG, "⏱ WAIT_SPEECH continuous fail timeout")
            resetAll()
+            waitSpeechFailStartMs = 0L
        }
    }

    /* ================= Reset ================= */
-    private fun resetToWaitSpeech(refreshIdle: Boolean = true) {
+
+    private fun resetToWaitSpeech() {
        audioBuffer.clear()
-        preBuffer.clear()
        vadManager.reset()
        vadStarted = false
        silenceStartMs = 0L
        state = VoiceState.WAIT_SPEECH
-        if (refreshIdle) idleTimer = System.currentTimeMillis()
+
+        // ⭐ 只在第一次失败时记录
+        if (waitSpeechFailStartMs == 0L) {
+            waitSpeechFailStartMs = System.currentTimeMillis()
+        }
    }

    private fun resetAll() {
@ -190,36 +290,24 @@ class VoiceController(
    }

    fun release() {
-        vadManager.reset()
        wakeupManager.release()
+        vadManager.reset()
    }

-    /* ================= 工具 ================= */
+    /* ================= Utils ================= */
+
    private fun cachePreBuffer(samples: FloatArray) {
        for (s in samples) {
            preBuffer.addLast(s)
-            if (preBuffer.size > PRE_BUFFER_SIZE) preBuffer.removeFirst()
+            if (preBuffer.size > PRE_BUFFER_SIZE) {
+                preBuffer.removeFirst()
+            }
        }
    }

    private fun calcRms(audio: FloatArray): Float {
-        if (audio.isEmpty()) return 0f
        var sum = 0f
        for (v in audio) sum += v * v
        return sqrt(sum / audio.size)
    }
-
-    private fun calcRmsFrames(audio: FloatArray, frameSize: Int = 320): FloatArray {
-        val rmsList = mutableListOf<Float>()
-        var i = 0
-        while (i < audio.size) {
-            val end = minOf(i + frameSize, audio.size)
-            val frame = audio.sliceArray(i until end)
-            var sum = 0f
-            for (v in frame) sum += v * v
-            rmsList.add(sqrt(sum / frame.size))
-            i += frameSize
-        }
-        return rmsList.toFloatArray()
-    }
 }
--- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceState.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceState.kt
@ -11,5 +11,6 @@ enum class VoiceState {
    WAIT_SPEECH,       // 等待用户说话
    RECORDING,         // 用户正在说话
    UPLOADING, //音频上传中
+    WAIT_SPEECH_COOLDOWN, // ⭐ 唤醒后冷却
    PLAYING_BACKEND    // 播放后台返回音频
 }
--- a/app/src/main/java/com/zs/smarthuman/sherpa/WakeupManager.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/WakeupManager.kt
@ -1,18 +1,18 @@
 package com.zs.smarthuman.sherpa

 import android.content.res.AssetManager
+import android.util.Log
 import com.k2fsa.sherpa.onnx.*

-class WakeupManager(
-    assetManager: AssetManager,
-    private val onWakeup: () -> Unit
-) {
+class WakeupManager(assetManager: AssetManager, function: () -> Unit) {

+    private val TAG = "WakeupManager"
    private val sampleRate = 16000
+
    private val kws: KeywordSpotter
    private var stream: OnlineStream? = null

-    /** ⭐ 刚唤醒标记，用来丢弃唤醒词音频 */
+    /** ⭐ 唤醒标记（只能消费一次） */
    private var justWokeUp = false

    init {
@ -29,15 +29,16 @@ class WakeupManager(
        )

        kws = KeywordSpotter(assetManager, config)
+        Log.d(TAG, "✅ KeywordSpotter initialized")
+
        stream = kws.createStream()
-            ?: error("Failed to create KWS stream")
+        require(stream != null) { "Failed to create KWS stream" }
+        Log.d(TAG, "✅ KWS stream created")
    }

-
-    /** ⭐ 小爱同学策略：不管播放还是录音，永远喂 */
+    /** ⭐ 永远喂 KWS */
    fun acceptAudio(samples: FloatArray) {
        val s = stream ?: return
-        // ⭐ 远讲 / 播放补偿（非常关键）
        for (i in samples.indices) {
            samples[i] *= 2.5f
        }
@ -47,15 +48,15 @@ class WakeupManager(
            kws.decode(s)
            val keyword = kws.getResult(s).keyword
            if (keyword.isNotBlank()) {
+                Log.d(TAG, "🔥 KWS hit: $keyword")
                justWokeUp = true
-                onWakeup()
-                kws.reset(s)   // 立刻 reset，进入新一轮
+                kws.reset(s)
                break
            }
        }
    }

-    /** 被 VAD 消费一次 */
+    /** ⭐ 唯一唤醒出口 */
    fun consumeWakeupFlag(): Boolean {
        val r = justWokeUp
        justWokeUp = false
--- a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt
+++ b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt
@ -77,7 +77,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
    private var voiceController: VoiceController? = null
    private var audioRecord: AudioRecord? = null
    private var isRecording = false
-    private val audioSource = MediaRecorder.AudioSource.VOICE_RECOGNITION
+    private val audioSource = MediaRecorder.AudioSource.VOICE_COMMUNICATION
    private val sampleRateInHz = 16000
    private val channelConfig = AudioFormat.CHANNEL_IN_MONO
    private val audioFormat = AudioFormat.ENCODING_PCM_16BIT
@ -169,17 +169,17 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
            },
            onFinalAudio = { audio ->
                Log.d("lrs", "检测到语音，长度=${audio.size}")
-                mViewModel?.uploadVoice(
-                    AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)),
-                    1
-                )
-//                loadLocalJsonAndPlay()
-//                val file = File(
-//                    getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
-//                    "xxx.wav"
+//                mViewModel?.uploadVoice(
+//                    AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)),
+//                    1
 //                )
-//                AudioDebugUtil.saveFloatPcmAsWav(audio, file)
-//                LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
+                loadLocalJsonAndPlay()
+                val file = File(
+                    getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
+                    "xxx.wav"
+                )
+                AudioDebugUtil.saveFloatPcmAsWav(audio, file)
+                LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")

            },
            onStateChanged = { state ->
@ -261,7 +261,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
        if (audioRecord?.state != AudioRecord.STATE_INITIALIZED) {
            Log.e("VoiceService", "Failed to initialize AudioRecord")
        }
-        enableSystemAec(audioRecord!!)
+//        enableSystemAec(audioRecord!!)
    }

    private var aec: AcousticEchoCanceler? = null