添加动态阈值

This commit is contained in:
林若思 2026-01-08 11:37:26 +08:00
parent 8dd5e68ded
commit e21283b73b
3 changed files with 292 additions and 75 deletions

View File

@ -1,6 +1,7 @@
package com.zs.smarthuman.sherpa
import android.content.res.AssetManager
import com.blankj.utilcode.util.LogUtils
import com.k2fsa.sherpa.onnx.Vad
import com.k2fsa.sherpa.onnx.getVadModelConfig
import kotlin.math.sqrt
@ -10,44 +11,81 @@ class VadManager(
private val onSpeechStart: () -> Unit,
private val onSpeechEnd: (avgEnergy: Float, peakRms: Float) -> Unit
) {
private val TAG = "SmartHuman-VadManager"
private val vad: Vad
// True while an utterance is in progress (speech started, not yet ended).
private var isSpeaking = false
// Timestamp (ms) of the most recent frame classified as speech.
private var lastSpeechTime = 0L
// Silence duration (ms) that terminates an utterance.
private val END_SILENCE_MS = 800L
// Basic per-utterance statistics.
private var activeFrameCount = 0
private var activeSpeechFrameCount = 0
private var speechEnergySum = 0f
private var speechFrameCount = 0
private var peakRms = 0f
// ========== Continuity-detection state ==========
private var totalFrames = 0              // total processed frames
private var speechFrames = 0             // total speech frames
private var continuousSpeechFrames = 0   // current run of consecutive speech frames
private var lastFrameIsSpeech = false    // whether the previous frame was speech
private var peakPosition = 0             // frame index where the RMS peak occurred
private var frameIndex = 0               // current frame index
init {
    // Load VAD model config index 0; fail fast when the asset is missing.
    // (The diff residue had two `val config = …` lines — a redeclaration; keep one.)
    val config = getVadModelConfig(0) ?: throw IllegalStateException("[$TAG] VAD config not found")
    vad = Vad(assetManager, config)
    LogUtils.i(TAG, "✅ VAD 初始化成功")
}
/**
* 接收音频数据并进行VAD检测
* @param samples 音频采样数据float数组
*/
fun accept(samples: FloatArray) {
val now = System.currentTimeMillis()
vad.acceptWaveform(samples)
val hasSpeech = vad.isSpeechDetected()
// RMS & peak 统计
val rms = calcRms(samples)
// ========== 1. 语音能量统计 ==========
if (hasSpeech) {
speechEnergySum += rms
speechFrameCount++
peakRms = maxOf(peakRms, rms)
LogUtils.v(TAG, "🔊 检测到语音帧 | RMS: $rms | 累计峰值: $peakRms")
} else {
LogUtils.v(TAG, "🔇 检测到静音帧 | RMS: $rms")
}
// VAD逻辑
// ========== 2. 新增:帧统计与连续性计算 ==========
totalFrames++
frameIndex++
if (hasSpeech) {
speechFrames++
// 连续语音帧计数
continuousSpeechFrames = if (lastFrameIsSpeech) {
continuousSpeechFrames + 1
} else {
1 // 重置连续计数
}
lastFrameIsSpeech = true
// 更新峰值位置仅当当前RMS为新峰值时
if (rms == peakRms) {
peakPosition = frameIndex
}
} else {
lastFrameIsSpeech = false
}
// ========== 3. VAD核心状态流转 ==========
if (hasSpeech) {
lastSpeechTime = now
if (!isSpeaking) {
isSpeaking = true
LogUtils.d(TAG, "🎤 语音开始")
onSpeechStart()
}
activeFrameCount++
@ -55,36 +93,85 @@ class VadManager(
} else {
if (isSpeaking) {
activeFrameCount++
if (now - lastSpeechTime >= END_SILENCE_MS) {
val silenceDuration = now - lastSpeechTime
if (silenceDuration >= END_SILENCE_MS) {
isSpeaking = false
val avgEnergy = if (speechFrameCount > 0) speechEnergySum / speechFrameCount else 0f
val peak = peakRms
onSpeechEnd(avgEnergy, peak)
LogUtils.d(TAG, "🛑 语音结束 | 静音时长: ${silenceDuration}ms | 平均能量: $avgEnergy | 峰值: $peakRms")
onSpeechEnd(avgEnergy, peakRms)
resetStats() // 重置基础统计
} else {
LogUtils.v(TAG, "⏳ 静音中,时长: ${silenceDuration}ms (阈值: ${END_SILENCE_MS}ms)")
}
}
}
}
/**
 * Speech ratio: active speech frames / total active frames.
 *
 * The diff residue kept both the old one-line version and the new logging
 * version — a duplicate declaration; only the logging version is retained.
 *
 * @return ratio in [0, 1]; 0 when no active frames have been counted yet.
 */
fun activeSpeechRatio(): Float {
    val ratio = if (activeFrameCount == 0) 0f else activeSpeechFrameCount.toFloat() / activeFrameCount
    LogUtils.d(TAG, "📊 语音占比: $ratio | 语音帧: $activeSpeechFrameCount | 总帧: $activeFrameCount")
    return ratio
}
// ========== Frame-statistics accessors (consumed by VoiceController) ==========
/** Total number of processed frames. */
fun getTotalFrames(): Int = totalFrames
/** Total number of frames classified as speech. */
fun getSpeechFrames(): Int = speechFrames
/** Length of the current run of consecutive speech frames. */
fun getContinuousSpeechFrames(): Int = continuousSpeechFrames
/** Peak-position ratio: peak frame index / total frames; 0 before any frame. */
fun getPeakPositionRatio(): Float {
    return if (totalFrames == 0) 0f else peakPosition.toFloat() / totalFrames
}
/**
 * Fully reset the VAD state: the underlying [Vad] object is kept alive,
 * while every flag and statistic is cleared back to its initial value.
 */
fun reset() {
    // Utterance state.
    isSpeaking = false
    lastSpeechTime = 0L
    resetStats()
    vad.reset()
    // Continuity / frame statistics.
    frameIndex = 0
    totalFrames = 0
    speechFrames = 0
    continuousSpeechFrames = 0
    peakPosition = 0
    lastFrameIsSpeech = false
    LogUtils.d(TAG, "🔄 VAD 状态已完全重置")
}
/** Clear per-utterance counters and reset the underlying VAD (internal use). */
private fun resetStats() {
    speechEnergySum = 0f
    peakRms = 0f
    speechFrameCount = 0
    activeSpeechFrameCount = 0
    activeFrameCount = 0
    vad.reset()
}
/**
 * Root-mean-square (RMS) energy of an audio buffer.
 *
 * Fixes in this block: the diff residue left two function signatures and two
 * accumulation loops (which would double-count `sum`) plus a dead `peak`
 * local; a single pass remains. An empty buffer now returns 0 instead of
 * NaN from sqrt(0 / 0).
 *
 * @param samples audio samples (typically normalized to [-1, 1])
 * @return the RMS value, or 0f for an empty buffer
 */
fun calcRms(samples: FloatArray): Float {
    if (samples.isEmpty()) return 0f
    var sum = 0f
    for (v in samples) sum += v * v
    return sqrt(sum / samples.size)
}
}

View File

@ -1,13 +1,13 @@
package com.zs.smarthuman.sherpa
import android.content.res.AssetManager
import android.util.Log
import com.blankj.utilcode.util.LogUtils
import java.util.ArrayDeque
class VoiceController(
assetManager: AssetManager,
private val onWakeup: () -> Unit,
private val onFinalAudio: (FloatArray) -> Unit, // 修改:传回识别结果文本
private val onFinalAudio: (FloatArray) -> Unit,
private val idleTimeoutSeconds: Int = 10,
private val maxRecordingSeconds: Int = 10,
private val onStateChanged: ((VoiceState) -> Unit)? = null,
@ -20,14 +20,16 @@ class VoiceController(
var state: VoiceState = VoiceState.WAIT_WAKEUP
private set(value) {
field = value
Log.d(TAG, "➡ State = $value")
LogUtils.d(TAG, "➡ State = $value")
onStateChanged?.invoke(value)
}
private val wakeupManager = WakeupManager(assetManager) {
Log.d(TAG, "🔥 WakeWord detected")
handleWakeupEvent()
}
private val wakeupManager = WakeupManager(assetManager, onWakeup)
private val vadManager = VadManager(
assetManager,
onSpeechStart = { onVadStart() },
onSpeechEnd = { avgEnergy, peakRms -> onVadEnd(avgEnergy, peakRms) }
)
private val audioBuffer = mutableListOf<Float>()
private val preBuffer = ArrayDeque<Float>()
@ -48,10 +50,36 @@ class VoiceController(
private val idleTimeoutMs = idleTimeoutSeconds * 1000L
private val maxRecordingMs = maxRecordingSeconds * 1000L
private val vadManager = VadManager(
assetManager,
onSpeechStart = { onVadStart() },
onSpeechEnd = { avgEnergy, peakRms -> onVadEnd(avgEnergy, peakRms) }
// ================= Dynamic-threshold core configuration =================
// Sliding window of low-energy RMS samples used to estimate ambient noise.
private val BASELINE_WINDOW_SIZE = 50
private val envNoiseBuffer = ArrayDeque<Float>(BASELINE_WINDOW_SIZE)
private var currentEnvBaseline = 0.001f
// ========== Far-field filtering configuration ==========
private val MAX_FAR_FIELD_ENERGY = 0.05f        // far-field energy ceiling (normal speech > 0.05, far-field < 0.05)
private val MIN_VALID_PEAK_AVG_RATIO = 1.5f     // valid peak/average ratio floor (normal speech > 1.5, coughs < 1.5)
private val BASELINE_QUIET_THRESHOLD = 0.002f   // baseline below this ⇒ treated as a quiet environment
private val MIN_CONTINUOUS_FRAME_RATIO = 0.4f   // continuous-frame ratio floor (lowered from 0.6 to allow short normal speech)
private val MAX_PEAK_POSITION_RATIO = 0.5f      // peak-position ceiling (raised from 0.3 to allow normal speech)
private val MIN_EFFECTIVE_SPEECH_FRAMES = 3     // minimum effective speech frames (lowered from 5)
// Scene-dependent dynamic coefficients (quiet-scene values lowered).
private val SHORT_SPEECH_ENERGY_COEFF_QUIET = 6.0f  // lowered from 8.0
private val SHORT_SPEECH_ENERGY_COEFF_NOISY = 4.0f  // lowered from 5.0
private val LONG_SPEECH_ENERGY_COEFF = 15.0f        // lowered from 20.0
private val SHORT_SPEECH_VAD_COEFF = 0.15f          // lowered from 0.2
private val LONG_SPEECH_VAD_COEFF = 0.3f            // lowered from 0.4
private val SHORT_SPEECH_MIN_SCORE = 2              // restored to 2
private val LONG_SPEECH_MIN_SCORE = 4               // lowered from 5
// Duration bounds (ms) that classify an utterance as "short speech".
private val SHORT_SPEECH_MIN = 500L
private val SHORT_SPEECH_MAX = 2000L
// Per-scene threshold bundle built when an utterance finishes.
private data class ThresholdConfig(
    val energyThreshold: Float,
    val vadRatioThreshold: Float,
    val minScore: Int,
    val scene: String
)
/* ================= 音频入口 ================= */
@ -65,6 +93,10 @@ class VoiceController(
val now = System.currentTimeMillis()
if (state == VoiceState.WAIT_WAKEUP) {
calibrateEnvBaseline(samples)
}
when (state) {
VoiceState.WAIT_WAKEUP,
VoiceState.PLAYING_PROMPT,
@ -84,7 +116,7 @@ class VoiceController(
if ((waitSpeechStartMs > 0 && now - waitSpeechStartMs >= idleTimeoutMs) ||
(waitSpeechFailStartMs > 0 && now - waitSpeechFailStartMs >= idleTimeoutMs)
) {
Log.d(TAG, "⏱ WAIT_SPEECH idle timeout → WAIT_WAKEUP")
LogUtils.d(TAG, "⏱ WAIT_SPEECH idle timeout → WAIT_WAKEUP")
resetAll()
return
}
@ -100,13 +132,28 @@ class VoiceController(
vadManager.accept(samples)
if (System.currentTimeMillis() - recordingStartMs > maxRecordingMs) {
Log.w(TAG, "⏱ Max recording reached")
finishSentence() // 超时也触发 finish
LogUtils.w(TAG, "⏱ Max recording reached | 当前环境基线: $currentEnvBaseline")
finishSentence()
}
}
}
}
/* ================= Environment-baseline calibration ================= */
/**
 * Feed a low-energy frame into the noise window and refresh the baseline.
 * Frames with RMS >= 0.01 are assumed to contain real signal and are skipped.
 */
private fun calibrateEnvBaseline(samples: FloatArray) {
    val rms = vadManager.calcRms(samples)
    if (rms >= 0.01f) {
        // Too energetic to be ambient noise — leave the baseline untouched.
        LogUtils.v(TAG, "🔊 高能量音频跳过校准 | RMS: $rms | 基线: $currentEnvBaseline")
        return
    }
    // Bounded window: evict the oldest sample before appending.
    if (envNoiseBuffer.size >= BASELINE_WINDOW_SIZE) envNoiseBuffer.removeFirst()
    envNoiseBuffer.addLast(rms)
    currentEnvBaseline = envNoiseBuffer.maxOrNull() ?: 0.001f
    LogUtils.d(TAG, "🌡 环境基线校准 | RMS: $rms | 基线: $currentEnvBaseline | 缓存数: ${envNoiseBuffer.size}")
}
/* ================= 唤醒 ================= */
private fun handleWakeupEvent() {
if (state == VoiceState.UPLOADING) return
@ -127,11 +174,12 @@ class VoiceController(
inKwsObserve = true
kwsObserveStartMs = System.currentTimeMillis()
onWakeup()
LogUtils.d(TAG, "🔔 唤醒成功 | 环境基线: $currentEnvBaseline")
}
private fun onVadStart() {
if (state != VoiceState.WAIT_SPEECH) return
Log.d(TAG, "🎤 REAL VAD START")
LogUtils.d(TAG, "🎤 REAL VAD START | 环境基线: $currentEnvBaseline")
vadStarted = true
recordingStartMs = System.currentTimeMillis()
audioBuffer.clear()
@ -141,17 +189,17 @@ class VoiceController(
/**
 * VAD end-of-speech callback: finalizes the current utterance.
 * Ignored unless we are actively recording. The diff residue duplicated the
 * log line (old android.util.Log + new LogUtils); only LogUtils remains.
 */
private fun onVadEnd(avgEnergy: Float, peakRms: Float) {
    if (state != VoiceState.RECORDING) return
    LogUtils.d(TAG, "🧠 VAD END | 环境基线: $currentEnvBaseline")
    finishSentence(avgEnergy, peakRms)
}
/* ================= 结束录音 ================= */
/* ================= 结束录音(修复远场过滤逻辑) ================= */
private fun finishSentence(avgEnergy: Float = 0f, peakRms: Float = 0f) {
val now = System.currentTimeMillis()
val duration = now - recordingStartMs
if (!vadStarted || duration < MIN_SPEECH_MS) {
Log.d(TAG, "Too short or no VAD start: $duration ms")
LogUtils.d(TAG, "语音过短: $duration ms | 基线: $currentEnvBaseline")
resetToWaitSpeech()
return
}
@ -160,73 +208,148 @@ class VoiceController(
val vadRatio = vadManager.activeSpeechRatio()
val peakAvgRatio = if (avgEnergy > 0f) peakRms / avgEnergy else 0f
Log.d(TAG, "📊 Finish Sentence - duration: $duration ms, vadEnded: true")
Log.d(
TAG,
"📊 vadRatio=$vadRatio, avgEnergy=$avgEnergy, peakRms=$peakRms, peakAvgRatio=$peakAvgRatio"
)
// 获取VAD帧统计
val totalFrames = vadManager.getTotalFrames()
val speechFrames = vadManager.getSpeechFrames()
val continuousSpeechFrames = vadManager.getContinuousSpeechFrames()
val peakPositionRatio = vadManager.getPeakPositionRatio()
if (avgEnergy < 0.02f || peakAvgRatio < 1.2f || vadRatio < 0.4f) {
Log.d(TAG, "❌ Sentence rejected")
LogUtils.d(TAG, "📊 录音信息 | 时长: $duration ms | 能量: $avgEnergy | 峰均比: $peakAvgRatio | 基线: $currentEnvBaseline")
LogUtils.d(TAG, "📊 帧统计 | 总帧: $totalFrames | 语音帧: $speechFrames | 连续语音帧: $continuousSpeechFrames | 峰值位置占比: $peakPositionRatio")
// ========== 修复:远场语音过滤逻辑(核心) ==========
// 正确逻辑:能量 < MAX_FAR_FIELD_ENERGY 才是远场;峰均比 < MIN_VALID_PEAK_AVG_RATIO 才是无效语音
val isFarField = avgEnergy < MAX_FAR_FIELD_ENERGY // 修复:从 < MIN 改为 < MAX
val isInvalidPeakRatio = peakAvgRatio < MIN_VALID_PEAK_AVG_RATIO // 降低阈值
// 非连续特征(调整阈值后更宽松)
val continuousRatio = if (speechFrames > 0) continuousSpeechFrames.toFloat() / speechFrames else 0f
val isDiscontinuous = continuousRatio < MIN_CONTINUOUS_FRAME_RATIO ||
speechFrames < MIN_EFFECTIVE_SPEECH_FRAMES ||
peakPositionRatio < MAX_PEAK_POSITION_RATIO
// 远场+无效语音过滤(仅过滤真正的远场/杂音)
if (isFarField && isInvalidPeakRatio) { // 修复:同时满足才过滤
LogUtils.w(TAG, "❌ 远场/无效语音过滤 | 能量: $avgEnergy < 远场上限: $MAX_FAR_FIELD_ENERGY | 峰均比: $peakAvgRatio < 有效下限: $MIN_VALID_PEAK_AVG_RATIO")
resetToWaitSpeech()
return
}
// 评分逻辑
var score = 0
when {
duration >= 4000 -> score += 3
duration >= 2500 -> score += 2
duration >= 1500 -> score += 1
// 非连续语音过滤(仅过滤真正的零散杂音)
if (isDiscontinuous && isFarField) { // 修复:结合远场特征,避免过滤正常语音
LogUtils.w(TAG, "❌ 非连续杂音过滤 | 连续占比: $continuousRatio < $MIN_CONTINUOUS_FRAME_RATIO | 语音帧: $speechFrames < $MIN_EFFECTIVE_SPEECH_FRAMES | 峰值位置: $peakPositionRatio < $MAX_PEAK_POSITION_RATIO")
resetToWaitSpeech()
return
}
// ========== 动态阈值计算(调整后更宽松) ==========
val thresholdConfig = when {
duration in SHORT_SPEECH_MIN..SHORT_SPEECH_MAX -> {
val coeff = if (currentEnvBaseline < BASELINE_QUIET_THRESHOLD) {
SHORT_SPEECH_ENERGY_COEFF_QUIET
} else {
SHORT_SPEECH_ENERGY_COEFF_NOISY
}
val threshold = currentEnvBaseline * coeff
LogUtils.d(TAG, "📏 短语音阈值 | 场景: ${if (currentEnvBaseline < BASELINE_QUIET_THRESHOLD) "安静" else "嘈杂"} | 系数: $coeff | 阈值: $threshold")
ThresholdConfig(
energyThreshold = threshold,
vadRatioThreshold = SHORT_SPEECH_VAD_COEFF,
minScore = SHORT_SPEECH_MIN_SCORE,
scene = "短语音"
)
}
else -> {
val threshold = currentEnvBaseline * LONG_SPEECH_ENERGY_COEFF
ThresholdConfig(
energyThreshold = threshold,
vadRatioThreshold = LONG_SPEECH_VAD_COEFF,
minScore = LONG_SPEECH_MIN_SCORE,
scene = "长语音"
)
}
}
LogUtils.d(TAG, "📊 动态阈值 | ${thresholdConfig.scene} | 能量阈值: ${thresholdConfig.energyThreshold} | 占比阈值: ${thresholdConfig.vadRatioThreshold} | 最低分: ${thresholdConfig.minScore}")
// 基础阈值过滤(调整后更宽松)
if (avgEnergy < thresholdConfig.energyThreshold || vadRatio < thresholdConfig.vadRatioThreshold) {
LogUtils.w(TAG, "❌ 阈值过滤 | 能量: $avgEnergy < ${thresholdConfig.energyThreshold} | 占比: $vadRatio < ${thresholdConfig.vadRatioThreshold}")
resetToWaitSpeech()
return
}
// 评分逻辑(恢复短语音保底分)
var score = 0
// 1. 时长评分恢复短语音保底1分
score += when {
duration >= 4000 -> 3
duration >= 2500 -> 2
duration >= 1500 -> 1
duration >= SHORT_SPEECH_MIN -> 1 // 恢复保底分
else -> 0
}
// 2. 能量评分
score += when {
avgEnergy >= thresholdConfig.energyThreshold * 10 -> 3
avgEnergy >= thresholdConfig.energyThreshold * 5 -> 2
avgEnergy >= thresholdConfig.energyThreshold -> 1
else -> 0
}
// 3. 占比+连续性评分(调整阈值)
score += when {
continuousRatio >= 0.7 -> 2 // 从0.8调低
continuousRatio >= MIN_CONTINUOUS_FRAME_RATIO -> 1
else -> 0
}
LogUtils.d(TAG, "🏆 评分结果 | 总分: $score | 最低分: ${thresholdConfig.minScore} | 连续占比: $continuousRatio")
// 分场景判定(调整后更宽松)
val pass = if (duration in SHORT_SPEECH_MIN..SHORT_SPEECH_MAX) {
score >= thresholdConfig.minScore && continuousRatio >= 0.5 // 从0.7调低
} else {
score >= thresholdConfig.minScore || (score >= 2 && avgEnergy >= currentEnvBaseline * 4) // 从3→26→4
}
when {
avgEnergy >= 0.10f -> score += 3
avgEnergy >= 0.06f -> score += 2
avgEnergy >= 0.02f -> score += 1
}
when {
vadRatio >= 0.55f -> score += 2
vadRatio >= 0.40f -> score += 1
}
Log.d(
TAG,
"📊 duration=$duration ms, vadRatio=$vadRatio, avgEnergy=$avgEnergy, score=$score"
)
val pass = score >= 5 || (score == 3 && avgEnergy >= 0.06f)
if (!pass) {
Log.d(TAG, "❌ Sentence rejected (score=$score)")
LogUtils.w(TAG, "❌ 评分/连续性不足过滤 | 总分: $score < ${thresholdConfig.minScore} | 连续占比: $continuousRatio")
resetToWaitSpeech()
return
}
audioBuffer.clear()
state = VoiceState.UPLOADING
onFinalAudio(audio) // 传递音频和识别文本
onFinalAudio(audio)
LogUtils.i(TAG, "✅ 录音通过 | 时长: $duration ms | 能量: $avgEnergy | 连续占比: $continuousRatio | 准备上传")
}
/* ================= Playback callbacks ================= */
/** Prompt audio started: suspend speech capture while the prompt plays. */
fun onPlayStartPrompt() {
    LogUtils.d(TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline")
    state = VoiceState.PLAYING_PROMPT
}
/** Prompt audio finished: apply a cooldown before accepting speech again. */
fun onPlayEndPrompt() {
    speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
    LogUtils.d(TAG, "🎵 提示音结束 | 基线: $currentEnvBaseline")
    state = VoiceState.WAIT_SPEECH_COOLDOWN
}
/** Backend (reply) audio started playing. */
fun onPlayStartBackend() {
    LogUtils.d(TAG, "🎶 播放后台音频 | 基线: $currentEnvBaseline")
    state = VoiceState.PLAYING_BACKEND
}
/** Backend audio finished: cooldown, then wait for the next utterance. */
fun onPlayEndBackend() {
    speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
    LogUtils.d(TAG, "🎶 后台音频结束 | 基线: $currentEnvBaseline")
    state = VoiceState.WAIT_SPEECH_COOLDOWN
}
/* ================= 上传回调 ================= */
fun onUploadFinished(success: Boolean) {
if (state != VoiceState.UPLOADING) return
LogUtils.d(TAG, "📤 上传完成 | 成功: $success | 基线: $currentEnvBaseline")
state = if (success) VoiceState.PLAYING_BACKEND
else {
speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
@ -234,8 +357,8 @@ class VoiceController(
}
}
/* ================= Reset ================= */
private fun resetToWaitSpeech() {
LogUtils.d(TAG, "🔄 重置到等待说话 | 基线: $currentEnvBaseline")
audioBuffer.clear()
vadManager.reset()
vadStarted = false
@ -244,21 +367,27 @@ class VoiceController(
}
// Full reset back to wake-word listening: buffers, engines, timers, baseline.
private fun resetAll() {
    LogUtils.d(TAG, "🔄 重置所有状态 | 基线: $currentEnvBaseline")
    // Drop any captured/pre-roll audio.
    audioBuffer.clear()
    preBuffer.clear()
    // Reset both recognition engines.
    vadManager.reset()
    wakeupManager.reset()
    vadStarted = false
    waitSpeechStartMs = 0L
    waitSpeechFailStartMs = 0L
    // Environment-noise baseline restarts from its initial floor.
    envNoiseBuffer.clear()
    currentEnvBaseline = 0.001f
    LogUtils.d(TAG, "🔄 环境基线已重置 | 新基线: $currentEnvBaseline")
    // Assign state last: the setter notifies the onStateChanged observer.
    state = VoiceState.WAIT_WAKEUP
}
/** Release engine resources; call once when this controller is discarded. */
fun release() {
    LogUtils.d(TAG, "🔌 释放资源 | 最终基线: $currentEnvBaseline")
    // Teardown of independent components.
    envNoiseBuffer.clear()
    vadManager.reset()
    wakeupManager.release()
}
/* ================= Utils ================= */
private fun cachePreBuffer(samples: FloatArray) {
for (s in samples) {
preBuffer.addLast(s)

View File

@ -2,6 +2,7 @@ package com.zs.smarthuman.sherpa
import android.content.res.AssetManager
import android.util.Log
import com.blankj.utilcode.util.LogUtils
import com.k2fsa.sherpa.onnx.*
class WakeupManager(assetManager: AssetManager, function: () -> Unit) {
@ -29,11 +30,11 @@ class WakeupManager(assetManager: AssetManager, function: () -> Unit) {
)
kws = KeywordSpotter(assetManager, config)
Log.d(TAG, "✅ KeywordSpotter initialized")
LogUtils.d(TAG, "✅ KeywordSpotter initialized")
stream = kws.createStream()
require(stream != null) { "Failed to create KWS stream" }
Log.d(TAG, "✅ KWS stream created")
LogUtils.d(TAG, "✅ KWS stream created")
}
/** ⭐ 永远喂 KWS */
@ -48,7 +49,7 @@ class WakeupManager(assetManager: AssetManager, function: () -> Unit) {
kws.decode(s)
val keyword = kws.getResult(s).keyword
if (keyword.isNotBlank()) {
Log.d(TAG, "🔥 KWS hit: $keyword")
LogUtils.d(TAG, "🔥 KWS hit: $keyword")
justWokeUp = true
kws.reset(s)
break