From 22fc80dd07bf244f4a80a693a8cd65fe1d23410b Mon Sep 17 00:00:00 2001 From: ross <3024454314@qq.com> Date: Mon, 12 Jan 2026 18:42:58 +0800 Subject: [PATCH] =?UTF-8?q?=E7=A8=B3=E5=AE=9A=E7=89=88=E5=8F=82=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../com/zs/smarthuman/sherpa/VoiceConfig.kt | 162 ++++ .../zs/smarthuman/sherpa/VoiceController.kt | 755 ++++++------------ .../zs/smarthuman/sherpa/VoiceStateManager.kt | 211 +++++ .../com/zs/smarthuman/sherpa/VoiceUtils.kt | 355 ++++++++ 4 files changed, 956 insertions(+), 527 deletions(-) create mode 100644 app/src/main/java/com/zs/smarthuman/sherpa/VoiceConfig.kt create mode 100644 app/src/main/java/com/zs/smarthuman/sherpa/VoiceStateManager.kt create mode 100644 app/src/main/java/com/zs/smarthuman/sherpa/VoiceUtils.kt diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceConfig.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceConfig.kt new file mode 100644 index 0000000..ac7f471 --- /dev/null +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceConfig.kt @@ -0,0 +1,162 @@ +package com.zs.smarthuman.sherpa + +/** + * @description: 语音控制器配置常量类(所有硬编码参数集中管理,无需修改业务逻辑即可适配场景) + * @author: lrs + * @date: 2026/1/12 17:33 + * @note: 所有参数适配 16kHz 单通道语音场景,调整后需在目标场景(安静/嘈杂/远场)测试验证 + */ +object VoiceConfig { + // ===================== 基础配置(通用核心,禁止随意修改) ===================== + /** 日志打印统一标签,便于筛选语音相关日志 */ + const val TAG = "VoiceController" + + /** 语音采样率(固定16kHz),与VAD/声纹模型强绑定,禁止修改 */ + const val SAMPLE_RATE = 16000 + + /** 预缓存音频大小(2秒),用于唤醒后补全唤醒前的语音片段,防止开头缺失 */ + const val PRE_BUFFER_SIZE = SAMPLE_RATE * 2 + + /** 声纹验证的当前唤醒用户ID,业务层可动态替换 */ + const val CURRENT_USER_ID = "current_wakeup_user" + + /** 是否开启严格的声纹验证:测试阶段设为false,生产环境建议设为true */ + const val ENABLE_STRICT_SPEAKER_VERIFY = true + + // ===================== 时间阈值(ms,时序控制核心) ===================== + /** 空闲超时默认秒数(实际超时=200*1000=200秒),建议调整为5-30秒 */ + const val IDLE_TIMEOUT_DEFAULT_SECONDS = 200 + + /** 
最大录音时长(10秒),防止录音过长占用内存:短指令3-5秒,长语音10-20秒 */ + const val MAX_RECORDING_DEFAULT_SECONDS = 10 + + /** 短语音判定阈值(1秒),声纹验证分场景的临界值,无需调整 */ + const val SHORT_AUDIO_DURATION_MS = 1000L + + /** 无效语音重置防抖时间(1.5秒),避免1.5秒内重复重置状态,建议1-2秒 */ + const val INVALID_RESET_DEBOUNCE_MS = 1500L + + /** 有效语音最小时长(800ms),过滤极短的杂音/按键音,建议600-1000ms */ + const val MIN_SPEECH_MS = 800L + + /** 微弱人声过滤的基础时长阈值(400ms),建议为MIN_SPEECH_MS的一半 */ + const val MIN_EFFECTIVE_VOICE_DURATION = 400L + + /** 唤醒后观察期(500ms),期间不处理VAD防止误触发,建议300-800ms */ + const val KWS_OBSERVE_MS = 500L + + /** 说话冷却期(300ms),提示音/后台音频结束后延迟进入等待说话,建议200-400ms */ + const val SPEECH_COOLDOWN_MS = 300L + + /** 短语音时长范围(0.5-2秒),分场景阈值的判定依据,无需调整 */ + const val SHORT_SPEECH_MIN = 500L + const val SHORT_SPEECH_MAX = 2000L + + /** 多人对话检测的最小时长(2.5秒),太短易误判,太长漏判,建议2-3秒 */ + const val MULTI_DIALOGUE_MIN_DURATION = 2500L + + // ===================== 环境/噪音阈值(能量类,核心过滤参数) ===================== + /** 嘈杂环境判定阈值:环境基线≥0.01f则为嘈杂环境,安静场景0.008f,嘈杂场景0.015f */ + const val NOISE_BASELINE_THRESHOLD = 0.01f + + /** 环境基线校准的滑动窗口大小(50帧),太小基线波动大,太大校准滞后,建议30-80 */ + const val BASELINE_WINDOW_SIZE = 50 + + /** 安静环境判定阈值:环境基线<0.005f则为安静环境,建议为NOISE_BASELINE_THRESHOLD的一半 */ + const val BASELINE_QUIET_THRESHOLD = 0.005f + + /** 有效语音的最小RMS能量(0.0005f),太低统计背景噪,太高漏统计弱人声,建议0.0003-0.0008f */ + const val MIN_EFFECTIVE_SPEECH_RMS = 0.0005f + + /** 正常语音能量阈值(0.008f),联合过滤的临界值,嘈杂环境0.01f,安静环境0.006f */ + const val NORMAL_VOICE_ENERGY_THRESHOLD = 0.008f + + // ===================== 声纹验证阈值(分场景适配,核心体验参数) ===================== + /** 安静环境声纹验证阈值(0.50f),越高越严格:准确率优先0.55f,通过率优先0.45f */ + const val SPEAKER_THRESHOLD_QUIET = 0.50f + + /** 嘈杂环境声纹验证阈值(0.43f),放宽以提高通过率,比安静环境低0.05-0.1f */ + const val SPEAKER_THRESHOLD_NOISY = 0.43f + + /** 短语音声纹验证阈值(0.40f),进一步放宽,比嘈杂环境低0.03-0.05f */ + const val SPEAKER_THRESHOLD_SHORT = 0.40f + + // ===================== 能量/占比阈值(过滤核心,分场景适配) ===================== + /** 正常语音最低能量(0.03f),短语音适配0.01f,长语音保持0.03f */ + const val MIN_NORMAL_VOICE_ENERGY = 0.03f + + /** 
正常语音VAD占比阈值(0.2f),短语音0.1f,嘈杂环境0.15f */ + const val MIN_NORMAL_VOICE_VAD_RATIO = 0.2f + + /** 远场语音最大能量阈值(0.015f),近场0.01f,远场0.02f */ + const val MAX_FAR_FIELD_ENERGY = 0.015f + + /** 有效语音最小峰均比(0.5f),过滤扁平背景噪,建议0.4-0.6f */ + const val MIN_VALID_PEAK_AVG_RATIO = 0.5f + + /** 有效语音最小连续帧占比(0.1f),非连续杂音过滤,建议0.08-0.12f */ + const val MIN_CONTINUOUS_FRAME_RATIO = 0.1f + + /** 语音峰值位置阈值(0.95f),过滤末尾突发杂音,建议0.9-0.98f */ + const val MAX_PEAK_POSITION_RATIO = 0.95f + + /** 有效语音最小帧数(3帧),过滤零星语音帧,建议2-5帧 */ + const val MIN_EFFECTIVE_SPEECH_FRAMES = 3 + + // ===================== 多人对话过滤(多维度判定) ===================== + /** 多人对话最大峰均比(2.5f),峰均比过高判定为多人对话,建议2.0-3.0f */ + const val MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO = 2.5f + + /** 多人对话最小峰均比(0.4f),峰均比过低判定为多人对话,建议0.3-0.5f */ + const val MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO = 0.4f + + /** 多人对话最大连续帧占比(0.3f),连续帧低判定为多人对话,建议0.2-0.4f */ + const val MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO = 0.3f + + /** 多人对话最小VAD占比(0.55f),VAD占比高判定为多人对话,建议0.5-0.6f */ + const val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f + + // ===================== 分场景动态系数(阈值计算) ===================== + /** 短语音能量动态系数:安静环境1.5f,嘈杂环境2.0f(嘈杂环境系数更高) */ + const val SHORT_SPEECH_ENERGY_COEFF_QUIET = 1.5f + const val SHORT_SPEECH_ENERGY_COEFF_NOISY = 2.0f + + /** 长语音能量动态系数:安静环境2.5f,嘈杂环境3.5f(长语音系数更高) */ + const val LONG_SPEECH_ENERGY_COEFF_QUIET = 2.5f + const val LONG_SPEECH_ENERGY_COEFF_NOISY = 3.5f + + /** 短/长语音VAD占比动态系数:短语音0.05f,长语音0.10f(长语音要求更高) */ + const val SHORT_SPEECH_VAD_COEFF = 0.05f + const val LONG_SPEECH_VAD_COEFF = 0.10f + + /** 短/长语音最低评分:默认1分(宽松),严格场景可设为2分 */ + const val SHORT_SPEECH_MIN_SCORE = 1 + const val LONG_SPEECH_MIN_SCORE = 1 + + // ===================== 微弱人声过滤专用阈值(补充) ===================== + /** 短/长语音临界时长(2000ms),filterWeakVoice中判定短语音的依据 */ + const val SHORT_LONG_SPEECH_CUTOFF_MS = 2000L + + /** 短语音动态能量阈值(0.01f),filterWeakVoice中短语音的能量判定值 */ + const val SHORT_SPEECH_ENERGY_THRESHOLD = 0.01f + + /** 短语音VAD占比阈值(0.10f),filterWeakVoice中短语音的VAD判定值 */ + const val SHORT_SPEECH_VAD_RATIO = 
0.10f + + /** 嘈杂环境VAD占比阈值(0.15f),filterWeakVoice中嘈杂环境的VAD判定值 */ + const val NOISY_ENV_VAD_RATIO = 0.15f + + /** 纯底噪过滤的能量阈值(0.005f),filterWeakVoice中底噪判定的能量值 */ + const val PURE_NOISE_ENERGY_THRESHOLD = 0.005f + + /** 纯底噪过滤的能量基线比(1.2f),filterWeakVoice中底噪判定的比值 */ + const val PURE_NOISE_BASELINE_RATIO = 1.2f + + // ===================== 语音评分专用阈值(补充) ===================== + /** 长语音评分临界时长(4000ms),calculateSpeechScore中评3分的依据 */ + const val LONG_SPEECH_SCORE_CUTOFF_MS = 4000L + + /** 中语音评分临界时长(2500ms),calculateSpeechScore中评2分的依据 */ + const val MID_SPEECH_SCORE_CUTOFF_MS = 2500L + +} \ No newline at end of file diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt index 7ed71f3..56e630c 100644 --- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt @@ -2,7 +2,8 @@ package com.zs.smarthuman.sherpa import android.content.res.AssetManager import com.blankj.utilcode.util.LogUtils -import com.k2fsa.sherpa.onnx.OnlineStream +import com.k2fsa.sherpa.onnx.SpeakerEmbeddingExtractor +import com.k2fsa.sherpa.onnx.SpeakerEmbeddingManager import com.k2fsa.sherpa.onnx.SpeakerRecognition import kotlinx.coroutines.CoroutineScope import kotlinx.coroutines.Dispatchers @@ -15,45 +16,32 @@ class VoiceController( assetManager: AssetManager, private val onWakeup: () -> Unit, private val onFinalAudio: (FloatArray) -> Unit, - idleTimeoutSeconds: Int = 200, - maxRecordingSeconds: Int = 10, + idleTimeoutSeconds: Int = VoiceConfig.IDLE_TIMEOUT_DEFAULT_SECONDS, + maxRecordingSeconds: Int = VoiceConfig.MAX_RECORDING_DEFAULT_SECONDS, private val onStateChanged: ((VoiceState) -> Unit)? = null, private val stopBackendAudio: (() -> Unit)? = null, private val onTimeoutTip: OnTimeoutTip? 
= null ) { + // 依赖组件 + private val wakeupManager = WakeupManager(assetManager, onWakeup) + private val vadManager = VadManager( + assetManager, + onSpeechStart = ::onVadStart, + onSpeechEnd = { avgEnergy, peakRms -> onVadEnd(avgEnergy, peakRms) } + ) + private val stateManager = VoiceStateManager( + idleTimeoutSeconds = idleTimeoutSeconds, + maxRecordingSeconds = maxRecordingSeconds, + onStateChanged = onStateChanged, + onTimeoutTip = onTimeoutTip + ) - companion object { - // 日志标签 - private const val TAG = "VoiceController" - // 采样率 - private const val SAMPLE_RATE = 16000 - // 预缓存大小(2秒) - private const val PRE_BUFFER_SIZE = SAMPLE_RATE * 2 + // 音频缓存 + private val audioBuffer = mutableListOf() + private val preBuffer = ArrayDeque(VoiceConfig.PRE_BUFFER_SIZE) + private val envNoiseBuffer = ArrayDeque(VoiceConfig.BASELINE_WINDOW_SIZE) - // ========== 核心:分场景声纹阈值(极简版) ========== - private const val SPEAKER_THRESHOLD_QUIET = 0.50f // 安静环境 - private const val SPEAKER_THRESHOLD_NOISY = 0.43f // 嘈杂环境(匹配你的真实相似度) - private const val SPEAKER_THRESHOLD_SHORT = 0.40f // 短语音(<1秒) - - // 短语音判定阈值 - private const val SHORT_AUDIO_DURATION_MS = 1000L - private const val INVALID_RESET_DEBOUNCE_MS = 1500L - // 最小语音时长 - private const val MIN_SPEECH_MS = 800L - private const val MIN_EFFECTIVE_VOICE_DURATION = 400L - - // 噪音场景判定阈值 - private const val NOISE_BASELINE_THRESHOLD = 0.01f - } - - var state: VoiceState = VoiceState.WAIT_WAKEUP - private set(value) { - field = value - LogUtils.d(TAG, "➡ State = $value") - onStateChanged?.invoke(value) - } - - // 实时能量与帧统计变量 + // 实时统计 private var realtimeEnergySum = 0f private var realtimeEnergyCount = 0 private var realtimePeakRms = 0f @@ -62,361 +50,222 @@ class VoiceController( private var realtimeContinuousSpeechFrames = 0 private var realtimeLastFrameIsSpeech = false private var isMultiPersonDialogueDetected = false - private var lastInvalidResetMs = 0L + + // 声纹识别相关 private val speakerManagerLock = ReentrantLock() - - // 环境噪音状态标记 - private 
var isNoisyEnvironment = false - - private val wakeupManager = WakeupManager(assetManager, onWakeup) - private val vadManager = VadManager( - assetManager, - onSpeechStart = { onVadStart() }, - onSpeechEnd = { avgEnergy, peakRms -> onVadEnd(avgEnergy, peakRms) } - ) - - private val audioBuffer = mutableListOf() - private val preBuffer = ArrayDeque(PRE_BUFFER_SIZE) - - private var recordingStartMs = 0L - private var waitSpeechFailStartMs = 0L - private var waitSpeechStartMs = 0L - - private var vadStarted = false - private var inKwsObserve = false - private var kwsObserveStartMs = 0L - private val KWS_OBSERVE_MS = 500L - private var speechEnableAtMs = 0L - private val SPEECH_COOLDOWN_MS = 300L - - private val idleTimeoutMs = idleTimeoutSeconds * 1000L - private val maxRecordingMs = maxRecordingSeconds * 1000L - - // 分场景动态系数(保留原有逻辑) - private val BASELINE_WINDOW_SIZE = 50 - private val envNoiseBuffer = ArrayDeque(BASELINE_WINDOW_SIZE) - private var currentEnvBaseline = 0.001f - - // 强制兜底:正常语音最低门槛 - private val MIN_NORMAL_VOICE_ENERGY = 0.03f - private val MIN_NORMAL_VOICE_VAD_RATIO = 0.2f - - // 分场景动态系数 - private val BASELINE_QUIET_THRESHOLD = 0.005f - private val SHORT_SPEECH_ENERGY_COEFF_QUIET = 1.5f - private val SHORT_SPEECH_ENERGY_COEFF_NOISY = 2.0f - private val LONG_SPEECH_ENERGY_COEFF_QUIET = 2.5f - private val LONG_SPEECH_ENERGY_COEFF_NOISY = 3.5f - private val SHORT_SPEECH_VAD_COEFF = 0.05f - private val LONG_SPEECH_VAD_COEFF = 0.10f - private val SHORT_SPEECH_MIN_SCORE = 1 - private val LONG_SPEECH_MIN_SCORE = 1 - - // 其他过滤参数 - private val MAX_FAR_FIELD_ENERGY = 0.015f - private val MIN_VALID_PEAK_AVG_RATIO = 0.5f - private val MIN_CONTINUOUS_FRAME_RATIO = 0.1f - private val MAX_PEAK_POSITION_RATIO = 0.95f - private val MIN_EFFECTIVE_SPEECH_FRAMES = 3 - private val SHORT_SPEECH_MIN = 500L - private val SHORT_SPEECH_MAX = 2000L - - // 多人对话过滤配置 - private val MULTI_DIALOGUE_MIN_DURATION = 2500L - private val MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO = 2.5f - private 
val MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO = 0.4f - private val MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO = 0.3f - private val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f - - // 微弱人声过滤配置 - private val MIN_VOICE_FRAME_RATIO = 0.08f - private val MIN_PEAK_ENERGY_RATIO = 1.5f - private val NORMAL_VOICE_ENERGY_THRESHOLD = 0.008f - private val MIN_CONTINUOUS_VOICE_FRAMES = 1 - private val MIN_EFFECTIVE_SPEECH_RMS = 0.0005f - - // 无效说话标记 + 超时类型 - private var hasInvalidSpeech = false - private var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT - - // 声纹验证相关 - private val CURRENT_USER_ID = "current_wakeup_user" - private val ENABLE_STRICT_SPEAKER_VERIFY = true + private lateinit var speakerExtractor: SpeakerEmbeddingExtractor + private lateinit var speakerManager: SpeakerEmbeddingManager init { + initSpeakerRecognition(assetManager) + } + + /** + * 初始化声纹识别器 + */ + private fun initSpeakerRecognition(assetManager: AssetManager) { try { SpeakerRecognition.initExtractor(assetManager) - LogUtils.d(TAG, "✅ 声纹识别器初始化成功") + speakerExtractor = SpeakerRecognition.extractor + speakerManager = SpeakerRecognition.manager + LogUtils.d(VoiceConfig.TAG, "✅ 声纹识别器初始化成功") } catch (e: Exception) { - LogUtils.e(TAG, "❌ 声纹识别器初始化失败", e) + LogUtils.e(VoiceConfig.TAG, "❌ 声纹识别器初始化失败", e) throw RuntimeException("声纹识别初始化失败", e) } } - /* ================= 音频入口 ================= */ + /** + * 音频入口(对外API不变) + */ fun acceptAudio(samples: FloatArray) { - cachePreBuffer(samples) + // 缓存预缓冲 + VoiceUtils.cachePreBuffer(samples, preBuffer) + + // 唤醒检测 wakeupManager.acceptAudio(samples) if (wakeupManager.consumeWakeupFlag()) { handleWakeupEvent() // 注册唤醒用户特征 CoroutineScope(Dispatchers.IO).launch { - var stream: OnlineStream? 
= null - runCatching { - val wakeupAudio = preBuffer.toFloatArray() - if (wakeupAudio.isEmpty()) { - LogUtils.w(TAG, "❌ 唤醒音频缓存为空,无法注册用户特征") - return@launch - } - - stream = SpeakerRecognition.extractor.createStream() - stream?.acceptWaveform(samples = wakeupAudio, sampleRate = SAMPLE_RATE) - stream?.inputFinished() - - if (stream != null && SpeakerRecognition.extractor.isReady(stream)) { - val embedding = SpeakerRecognition.extractor.compute(stream) - speakerManagerLock.withLock { - SpeakerRecognition.manager.remove(CURRENT_USER_ID) - val embeddingList = mutableListOf(embedding) - val ok = SpeakerRecognition.manager.add( - name = CURRENT_USER_ID, - embedding = embeddingList.toTypedArray() - ) - if (ok) { - LogUtils.d(TAG, "✅ 注册当前唤醒用户特征成功 | 特征长度: ${embedding.size}") - } else { - LogUtils.w(TAG, "❌ 注册当前唤醒用户特征失败") - } - } - } else { - LogUtils.w(TAG, "❌ 唤醒音频Stream未就绪,跳过用户注册") - } - }.onFailure { - LogUtils.e(TAG, "❌ 唤醒用户特征注册失败", it) - }.also { - stream?.release() - } + VoiceUtils.registerWakeupUser( + preBuffer = preBuffer, + extractor = speakerExtractor, + manager = speakerManager + ) } return } val now = System.currentTimeMillis() - if (state == VoiceState.WAIT_WAKEUP) { - calibrateEnvBaseline(samples) - isNoisyEnvironment = currentEnvBaseline >= NOISE_BASELINE_THRESHOLD - LogUtils.d(TAG, "📊 环境状态 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") + // 环境基线校准(仅等待唤醒状态) + if (stateManager.state == VoiceState.WAIT_WAKEUP) { + stateManager.currentEnvBaseline = VoiceUtils.calibrateEnvBaseline( + samples = samples, + vadManager = vadManager, + envNoiseBuffer = envNoiseBuffer, + currentEnvBaseline = stateManager.currentEnvBaseline + ) + stateManager.isNoisyEnvironment = stateManager.currentEnvBaseline >= VoiceConfig.NOISE_BASELINE_THRESHOLD + LogUtils.d(VoiceConfig.TAG, "📊 环境状态 | 基线: ${stateManager.currentEnvBaseline} | 嘈杂环境: ${stateManager.isNoisyEnvironment}") } - when (state) { + // 状态分发 + when (stateManager.state) { VoiceState.WAIT_WAKEUP, 
VoiceState.PLAYING_PROMPT, VoiceState.PLAYING_BACKEND, VoiceState.UPLOADING -> return VoiceState.WAIT_SPEECH_COOLDOWN -> { - if (now >= speechEnableAtMs) { - waitSpeechFailStartMs = now - state = VoiceState.WAIT_SPEECH - waitSpeechStartMs = now - } + stateManager.handleWaitSpeechCooldown(now) return } VoiceState.WAIT_SPEECH -> { - if ((waitSpeechStartMs > 0 && now - waitSpeechStartMs >= idleTimeoutMs) || - (waitSpeechFailStartMs > 0 && now - waitSpeechFailStartMs >= idleTimeoutMs) - ) { - currentTimeoutType = if (hasInvalidSpeech) { - TimeoutType.INVALID_SPEECH_TIMEOUT - } else { - TimeoutType.IDLE_TIMEOUT - } - LogUtils.d(TAG, "⏱ WAIT_SPEECH timeout → WAIT_WAKEUP | 超时类型: $currentTimeoutType") - onTimeoutTip?.invoke(currentTimeoutType) - resetAll() + // 检查超时(修复点:超时后主动调用 resetAll() 并传参) + if (stateManager.checkWaitSpeechTimeout(now)) { + stateManager.resetAll( + resetRealtimeStats = ::resetRealtimeStats, + audioBuffer = audioBuffer, + preBuffer = preBuffer, + vadManager = vadManager, + wakeupManager = wakeupManager, + envNoiseBuffer = envNoiseBuffer + ) return } - if (inKwsObserve && now - kwsObserveStartMs < KWS_OBSERVE_MS) return - inKwsObserve = false + // 唤醒观察期 + if (stateManager.inKwsObserve && now - stateManager.kwsObserveStartMs < VoiceConfig.KWS_OBSERVE_MS) return + stateManager.inKwsObserve = false + // VAD检测 vadManager.accept(samples) } VoiceState.RECORDING -> { + // 音频缓存 audioBuffer.addAll(samples.asList()) vadManager.accept(samples) - calibrateEnvBaseline(samples) - updateRealtimeEnergy(samples) - updateRealtimeFrameStats() - isNoisyEnvironment = currentEnvBaseline >= NOISE_BASELINE_THRESHOLD + // 环境校准 + stateManager.currentEnvBaseline = VoiceUtils.calibrateEnvBaseline( + samples = samples, + vadManager = vadManager, + envNoiseBuffer = envNoiseBuffer, + currentEnvBaseline = stateManager.currentEnvBaseline + ) + stateManager.isNoisyEnvironment = stateManager.currentEnvBaseline >= VoiceConfig.NOISE_BASELINE_THRESHOLD - if 
(checkMultiPersonDialogueRealtime(now)) { - LogUtils.w(TAG, "🚨 录音中识别出多人对话,提前终止") + // 更新实时统计 + val energyStats = VoiceUtils.updateRealtimeEnergy( + samples = samples, + vadManager = vadManager, + isNoisyEnvironment = stateManager.isNoisyEnvironment, + currentEnvBaseline = stateManager.currentEnvBaseline, + realtimeEnergySum = realtimeEnergySum, + realtimeEnergyCount = realtimeEnergyCount, + realtimePeakRms = realtimePeakRms + ) + realtimeEnergySum = energyStats.first + realtimeEnergyCount = energyStats.second + realtimePeakRms = energyStats.third + + val frameStats = VoiceUtils.updateRealtimeFrameStats(vadManager) + realtimeTotalFrames = frameStats.totalFrames + realtimeSpeechFrames = frameStats.speechFrames + realtimeContinuousSpeechFrames = frameStats.continuousSpeechFrames + realtimeLastFrameIsSpeech = frameStats.lastFrameIsSpeech + + // 多人对话检测 + isMultiPersonDialogueDetected = VoiceUtils.checkMultiPersonDialogue( + now = now, + recordingStartMs = stateManager.recordingStartMs, + realtimeEnergySum = realtimeEnergySum, + realtimeEnergyCount = realtimeEnergyCount, + realtimePeakRms = realtimePeakRms, + realtimeSpeechFrames = realtimeSpeechFrames, + realtimeContinuousSpeechFrames = realtimeContinuousSpeechFrames, + vadManager = vadManager + ) + + if (isMultiPersonDialogueDetected) { + LogUtils.w(VoiceConfig.TAG, "🚨 录音中识别出多人对话,提前终止") finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms) return } - if (System.currentTimeMillis() - recordingStartMs > maxRecordingMs) { - LogUtils.w(TAG, "⏱ Max recording reached | 当前环境基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") + // 最大录音时长检测 + if (System.currentTimeMillis() - stateManager.recordingStartMs > stateManager.maxRecordingMs) { + LogUtils.w(VoiceConfig.TAG, "⏱ Max recording reached | 当前环境基线: ${stateManager.currentEnvBaseline} | 嘈杂环境: ${stateManager.isNoisyEnvironment}") finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms) } } } } - /* ================= 实时能量更新 ================= 
*/ - private fun updateRealtimeEnergy(samples: FloatArray) { - val rms = vadManager.calcRms(samples) - val effectiveThreshold = if (isNoisyEnvironment) currentEnvBaseline * 1.8f else MIN_EFFECTIVE_SPEECH_RMS - if (rms >= effectiveThreshold) { - realtimeEnergySum += rms - realtimeEnergyCount++ - realtimePeakRms = maxOf(realtimePeakRms, rms) - } - } - - /* ================= 实时帧统计 ================= */ - private fun updateRealtimeFrameStats() { - realtimeTotalFrames = vadManager.getTotalFrames() - realtimeSpeechFrames = vadManager.getSpeechFrames() - realtimeContinuousSpeechFrames = vadManager.getContinuousSpeechFrames() - val currentFrameIsSpeech = vadManager.isSpeechDetected() - if (currentFrameIsSpeech) { - realtimeContinuousSpeechFrames = if (realtimeLastFrameIsSpeech) realtimeContinuousSpeechFrames + 1 else 1 - } else { - realtimeContinuousSpeechFrames = 0 - } - realtimeLastFrameIsSpeech = currentFrameIsSpeech - } - - /* ================= 多人对话检测 ================= */ - private fun checkMultiPersonDialogueRealtime(now: Long): Boolean { - val duration = now - recordingStartMs - if (duration < MULTI_DIALOGUE_MIN_DURATION) return false - - val avgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else 0f - val peakAvgRatio = if (avgEnergy > 0) realtimePeakRms / avgEnergy else 0f - val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f - val vadRatio = vadManager.activeSpeechRatio() - - isMultiPersonDialogueDetected = duration >= MULTI_DIALOGUE_MIN_DURATION && - peakAvgRatio in MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO..MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO && - continuousRatio <= MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO && - vadRatio >= MULTI_DIALOGUE_MIN_VAD_RATIO - - return isMultiPersonDialogueDetected - } - - /* ================= 环境基线校准 ================= */ - private fun calibrateEnvBaseline(samples: FloatArray) { - val rms = vadManager.calcRms(samples) - val validRms = if (rms < 
currentEnvBaseline + 0.002f) rms else currentEnvBaseline - if (rms < 0.015f) { - if (envNoiseBuffer.size >= BASELINE_WINDOW_SIZE) { - envNoiseBuffer.removeFirst() - } - envNoiseBuffer.addLast(validRms) - currentEnvBaseline = envNoiseBuffer.maxOrNull() ?: 0.001f - } - } - - /* ================= 唤醒处理 ================= */ + /** + * 处理唤醒事件 + */ private fun handleWakeupEvent() { - if (state == VoiceState.UPLOADING) return + if (stateManager.state == VoiceState.UPLOADING) return stopBackendAudio?.invoke() - enterWakeup(interrupt = true) - } - - private fun enterWakeup(interrupt: Boolean) { - waitSpeechFailStartMs = System.currentTimeMillis() - waitSpeechStartMs = System.currentTimeMillis() - - hasInvalidSpeech = false - currentTimeoutType = TimeoutType.IDLE_TIMEOUT - - if (interrupt) { - audioBuffer.clear() - vadManager.reset() - vadStarted = false - resetRealtimeStats() - } - - inKwsObserve = true - kwsObserveStartMs = System.currentTimeMillis() + stateManager.enterWakeup(interrupt = true, resetRealtimeStats = ::resetRealtimeStats) onWakeup() - LogUtils.d(TAG, "🔔 唤醒成功 | 环境基线: $currentEnvBaseline") } + /** + * VAD开始回调 + */ private fun onVadStart() { - if (state != VoiceState.WAIT_SPEECH) return - LogUtils.d(TAG, "🎤 REAL VAD START | 环境基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") - vadStarted = true - recordingStartMs = System.currentTimeMillis() - audioBuffer.clear() - audioBuffer.addAll(preBuffer) - resetRealtimeStats() - state = VoiceState.RECORDING + stateManager.onVadStart( + audioBuffer = audioBuffer, + preBuffer = preBuffer, + resetRealtimeStats = ::resetRealtimeStats + ) } + /** + * VAD结束回调 + */ private fun onVadEnd(avgEnergy: Float, peakRms: Float) { - if (state != VoiceState.RECORDING) return - LogUtils.d(TAG, "🧠 VAD END | 环境基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") + if (stateManager.state != VoiceState.RECORDING) return + LogUtils.d(VoiceConfig.TAG, "🧠 VAD END | 环境基线: ${stateManager.currentEnvBaseline} | 嘈杂环境: 
${stateManager.isNoisyEnvironment}") val realAvgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else avgEnergy val realPeakRms = if (realtimePeakRms > 0) realtimePeakRms else peakRms finishSentence(realAvgEnergy, realPeakRms) } - /* ================= 微弱人声过滤 ================= */ - private fun filterWeakVoice(duration: Long, avgEnergy: Float, peakRms: Float): Boolean { - if (duration < MIN_EFFECTIVE_VOICE_DURATION) { - LogUtils.w(TAG, "❌ 微弱人声过滤:时长${duration}ms < ${MIN_EFFECTIVE_VOICE_DURATION}ms") - return true - } - - val voiceFrameRatio = if (realtimeTotalFrames > 0) realtimeSpeechFrames.toFloat() / realtimeTotalFrames else 0f - if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && voiceFrameRatio < MIN_VOICE_FRAME_RATIO) { - LogUtils.w(TAG, "❌ 微弱人声过滤:帧占比${voiceFrameRatio} < ${MIN_VOICE_FRAME_RATIO}") - return true - } - - val peakBaselineRatio = peakRms / currentEnvBaseline - if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && peakBaselineRatio < MIN_PEAK_ENERGY_RATIO) { - LogUtils.w(TAG, "❌ 微弱人声过滤:峰值/基线${peakBaselineRatio} < ${MIN_PEAK_ENERGY_RATIO}") - return true - } - - if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && realtimeContinuousSpeechFrames < MIN_CONTINUOUS_VOICE_FRAMES) { - LogUtils.w(TAG, "❌ 微弱人声过滤:连续帧${realtimeContinuousSpeechFrames} < ${MIN_CONTINUOUS_VOICE_FRAMES}") - return true - } - - val energyBaselineRatio = avgEnergy / currentEnvBaseline - if (avgEnergy < 0.005f && energyBaselineRatio < 1.2f) { - LogUtils.w(TAG, "❌ 微弱人声过滤:能量/基线${energyBaselineRatio} < 1.2") - return true - } - - return false - } - - /* ================= 结束录音 ================= */ + /** + * 结束录音处理 + */ private fun finishSentence(avgEnergy: Float = 0f, peakRms: Float = 0f) { val now = System.currentTimeMillis() - val duration = now - recordingStartMs + val duration = now - stateManager.recordingStartMs - if (!vadStarted || duration < MIN_SPEECH_MS) { - LogUtils.d(TAG, "❌ 语音过短: $duration ms | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") - 
hasInvalidSpeech = true - resetToWaitSpeech() + // 基础过滤:语音过短 + if (!stateManager.vadStarted || duration < VoiceConfig.MIN_SPEECH_MS) { + LogUtils.d(VoiceConfig.TAG, "❌ 语音过短: $duration ms | 基线: ${stateManager.currentEnvBaseline} | 嘈杂环境: ${stateManager.isNoisyEnvironment}") + stateManager.hasInvalidSpeech = true + stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager) return } - if (filterWeakVoice(duration, avgEnergy, peakRms)) { - hasInvalidSpeech = true - resetToWaitSpeech() + // 微弱人声过滤 + if (VoiceUtils.filterWeakVoice( + duration = duration, + avgEnergy = avgEnergy, + peakRms = peakRms, + currentEnvBaseline = stateManager.currentEnvBaseline, + realtimeTotalFrames = realtimeTotalFrames, + realtimeSpeechFrames = realtimeSpeechFrames, + realtimeContinuousSpeechFrames = realtimeContinuousSpeechFrames + ) + ) { + stateManager.hasInvalidSpeech = true + stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager) return } @@ -424,104 +273,95 @@ class VoiceController( val vadRatio = vadManager.activeSpeechRatio() val peakAvgRatio = if (avgEnergy > 0f) peakRms / avgEnergy else 0f - LogUtils.d(TAG, "📊 录音信息 | 时长: $duration ms | 能量: $avgEnergy | 峰均比: $peakAvgRatio | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") - LogUtils.d(TAG, "📊 实时帧统计 | 总帧: $realtimeTotalFrames | 语音帧: $realtimeSpeechFrames | 连续语音帧: $realtimeContinuousSpeechFrames") + LogUtils.d(VoiceConfig.TAG, "📊 录音信息 | 时长: $duration ms | 能量: $avgEnergy | 峰均比: $peakAvgRatio | 基线: ${stateManager.currentEnvBaseline} | 嘈杂环境: ${stateManager.isNoisyEnvironment}") + LogUtils.d(VoiceConfig.TAG, "📊 实时帧统计 | 总帧: $realtimeTotalFrames | 语音帧: $realtimeSpeechFrames | 连续语音帧: $realtimeContinuousSpeechFrames") + // 多人对话过滤 if (isMultiPersonDialogueDetected) { - LogUtils.w(TAG, "❌ 过滤多人对话垃圾语音 | 时长: $duration ms") - hasInvalidSpeech = true - resetToWaitSpeech() + LogUtils.w(VoiceConfig.TAG, "❌ 过滤多人对话垃圾语音 | 时长: $duration ms") + stateManager.hasInvalidSpeech = true + 
stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager) return } - // 声纹验证(核心极简版) - if (ENABLE_STRICT_SPEAKER_VERIFY) { - val isCurrentUser = verifySpeaker(audio) + // 声纹验证 + if (VoiceConfig.ENABLE_STRICT_SPEAKER_VERIFY) { + val isCurrentUser = VoiceUtils.verifySpeaker( + audio = audio, + isNoisyEnvironment = stateManager.isNoisyEnvironment, + extractor = speakerExtractor, + manager = speakerManager + ) if (!isCurrentUser) { - LogUtils.w(TAG, "❌ 非当前唤醒用户,拒绝语音 | 录音时长: $duration ms | 嘈杂环境: $isNoisyEnvironment") - hasInvalidSpeech = true - resetToWaitSpeech() + LogUtils.w(VoiceConfig.TAG, "❌ 非当前唤醒用户,拒绝语音 | 录音时长: $duration ms | 嘈杂环境: ${stateManager.isNoisyEnvironment}") + stateManager.hasInvalidSpeech = true + stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager) return } - LogUtils.d(TAG, "✅ 当前用户语音,继续处理 | 录音时长: $duration ms | 嘈杂环境: $isNoisyEnvironment") + LogUtils.d(VoiceConfig.TAG, "✅ 当前用户语音,继续处理 | 录音时长: $duration ms | 嘈杂环境: ${stateManager.isNoisyEnvironment}") } - // 远场过滤 - val isFarField = avgEnergy < MAX_FAR_FIELD_ENERGY - val isInvalidPeakRatio = peakAvgRatio < MIN_VALID_PEAK_AVG_RATIO + val isFarField = avgEnergy < VoiceConfig.MAX_FAR_FIELD_ENERGY + val isInvalidPeakRatio = peakAvgRatio < VoiceConfig.MIN_VALID_PEAK_AVG_RATIO if (isFarField && isInvalidPeakRatio) { - LogUtils.w(TAG, "❌ 远场/无效语音过滤 | 能量: $avgEnergy < $MAX_FAR_FIELD_ENERGY") - hasInvalidSpeech = true - resetToWaitSpeech() + LogUtils.w(VoiceConfig.TAG, "❌ 远场/无效语音过滤 | 能量: $avgEnergy < ${VoiceConfig.MAX_FAR_FIELD_ENERGY}") + stateManager.hasInvalidSpeech = true + stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager) return } // 非连续判定 val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f val peakPositionRatio = vadManager.getPeakPositionRatio() - val isDiscontinuous = continuousRatio < MIN_CONTINUOUS_FRAME_RATIO && - realtimeSpeechFrames < MIN_EFFECTIVE_SPEECH_FRAMES 
&& - peakPositionRatio > MAX_PEAK_POSITION_RATIO + val isDiscontinuous = continuousRatio < VoiceConfig.MIN_CONTINUOUS_FRAME_RATIO && + realtimeSpeechFrames < VoiceConfig.MIN_EFFECTIVE_SPEECH_FRAMES && + peakPositionRatio > VoiceConfig.MAX_PEAK_POSITION_RATIO if (isDiscontinuous) { - LogUtils.w(TAG, "❌ 非连续杂音过滤 | 连续占比: $continuousRatio < $MIN_CONTINUOUS_FRAME_RATIO") - hasInvalidSpeech = true - resetToWaitSpeech() + LogUtils.w(VoiceConfig.TAG, "❌ 非连续杂音过滤 | 连续占比: $continuousRatio < ${VoiceConfig.MIN_CONTINUOUS_FRAME_RATIO}") + stateManager.hasInvalidSpeech = true + stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager) return } // 分场景阈值过滤 - val isQuietEnv = currentEnvBaseline < BASELINE_QUIET_THRESHOLD - val thresholdConfig = when { - duration in SHORT_SPEECH_MIN..SHORT_SPEECH_MAX -> { - val coeff = if (isQuietEnv) SHORT_SPEECH_ENERGY_COEFF_QUIET else SHORT_SPEECH_ENERGY_COEFF_NOISY - val energyThreshold = currentEnvBaseline * coeff - ThresholdConfig(energyThreshold, SHORT_SPEECH_VAD_COEFF, SHORT_SPEECH_MIN_SCORE, "短语音") - } - else -> { - val coeff = if (isQuietEnv) LONG_SPEECH_ENERGY_COEFF_QUIET else LONG_SPEECH_ENERGY_COEFF_NOISY - val energyThreshold = currentEnvBaseline * coeff - ThresholdConfig(energyThreshold, LONG_SPEECH_VAD_COEFF, LONG_SPEECH_MIN_SCORE, "长语音") - } - } - + val thresholdConfig = VoiceUtils.getThresholdConfig(duration, stateManager.currentEnvBaseline) val energyPass = avgEnergy >= thresholdConfig.energyThreshold val vadRatioPass = vadRatio >= thresholdConfig.vadRatioThreshold if (!energyPass || !vadRatioPass) { - LogUtils.w(TAG, "❌ 低能量语音阈值过滤 | 能量: $avgEnergy < ${thresholdConfig.energyThreshold} | 占比: $vadRatio < ${thresholdConfig.vadRatioThreshold} | 场景: ${thresholdConfig.scene}") - hasInvalidSpeech = true - resetToWaitSpeech() + LogUtils.w(VoiceConfig.TAG, "❌ 低能量语音阈值过滤 | 能量: $avgEnergy < ${thresholdConfig.energyThreshold} | 占比: $vadRatio < ${thresholdConfig.vadRatioThreshold} | 场景: ${thresholdConfig.scene}") + 
stateManager.hasInvalidSpeech = true + stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager) return } // 评分判定 - var score = 0 - score += when { - duration >= 4000 -> 3 - duration >= 2500 -> 2 - else -> 1 - } - score += if (avgEnergy >= thresholdConfig.energyThreshold) 1 else 0 - score += if (continuousRatio >= MIN_CONTINUOUS_FRAME_RATIO) 1 else 0 - + val score = VoiceUtils.calculateSpeechScore( + duration = duration, + avgEnergy = avgEnergy, + continuousRatio = continuousRatio, + thresholdConfig = thresholdConfig + ) val pass = score >= thresholdConfig.minScore if (!pass) { - LogUtils.w(TAG, "❌ 评分不足过滤 | 总分: $score < ${thresholdConfig.minScore} | 场景: ${thresholdConfig.scene}") - hasInvalidSpeech = true - resetToWaitSpeech() + LogUtils.w(VoiceConfig.TAG, "❌ 评分不足过滤 | 总分: $score < ${thresholdConfig.minScore} | 场景: ${thresholdConfig.scene}") + stateManager.hasInvalidSpeech = true + stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager) return } // 最终通过 audioBuffer.clear() - state = VoiceState.UPLOADING + stateManager.state = VoiceState.UPLOADING onFinalAudio(audio) resetRealtimeStats() - hasInvalidSpeech = false - LogUtils.i(TAG, "✅ 语音通过 | 时长: $duration ms | 能量: $avgEnergy | 场景: ${thresholdConfig.scene} | 嘈杂环境: $isNoisyEnvironment") + stateManager.hasInvalidSpeech = false + LogUtils.i(VoiceConfig.TAG, "✅ 语音通过 | 时长: $duration ms | 能量: $avgEnergy | 场景: ${thresholdConfig.scene} | 嘈杂环境: ${stateManager.isNoisyEnvironment}") } - /* ================= 重置实时统计 ================= */ + /** + * 重置实时统计 + */ private fun resetRealtimeStats() { realtimeEnergySum = 0f realtimeEnergyCount = 0 @@ -533,95 +373,34 @@ class VoiceController( isMultiPersonDialogueDetected = false } - /* ================= 播放/上传回调 ================= */ - fun onPlayStartPrompt() { - LogUtils.d(TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") - state = VoiceState.PLAYING_PROMPT - } - - fun onPlayEndPrompt() { - speechEnableAtMs = 
System.currentTimeMillis() + SPEECH_COOLDOWN_MS - LogUtils.d(TAG, "🎵 提示音结束 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") - state = VoiceState.WAIT_SPEECH_COOLDOWN - } - - fun onPlayStartBackend() { - if (state != VoiceState.UPLOADING) { - LogUtils.w(TAG, "🎶 非上传完成状态,禁止切换到 PLAYING_BACKEND | 当前状态: $state") - return - } - LogUtils.d(TAG, "🎶 开始播放后台音频 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") - state = VoiceState.PLAYING_BACKEND - } - - fun onPlayEndBackend() { - speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS - LogUtils.d(TAG, "🎶 后台音频结束 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") - state = VoiceState.WAIT_SPEECH_COOLDOWN - } - - fun onUploadFinished(success: Boolean) { - if (state != VoiceState.UPLOADING) return - LogUtils.d(TAG, "📤 上传完成 | 成功: $success | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") - - if (!success) { - speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS - state = VoiceState.WAIT_SPEECH_COOLDOWN - } - } - - private fun resetToWaitSpeech() { - LogUtils.d(TAG, "🔄 重置到等待说话 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment | 已标记无效说话: $hasInvalidSpeech") - val now = System.currentTimeMillis() - if (now - lastInvalidResetMs < INVALID_RESET_DEBOUNCE_MS) { - LogUtils.d(TAG, "🛡 防抖:1.5秒内重复无效语音,跳过重置") - return - } - lastInvalidResetMs = now - audioBuffer.clear() - vadManager.reset() - vadStarted = false - resetRealtimeStats() - state = VoiceState.WAIT_SPEECH - if (waitSpeechFailStartMs == 0L) waitSpeechFailStartMs = System.currentTimeMillis() - } - - private fun resetAll() { - LogUtils.d(TAG, "🔄 重置所有状态 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment | 本次超时类型: $currentTimeoutType") - audioBuffer.clear() - preBuffer.clear() - vadManager.reset() - wakeupManager.reset() - vadStarted = false - waitSpeechStartMs = 0L - waitSpeechFailStartMs = 0L - envNoiseBuffer.clear() - currentEnvBaseline = 0.001f - isNoisyEnvironment = false - resetRealtimeStats() - hasInvalidSpeech = false - 
currentTimeoutType = TimeoutType.IDLE_TIMEOUT - state = VoiceState.WAIT_WAKEUP - } + // ================= 对外API(完全不变) ================= + fun onPlayStartPrompt() = stateManager.onPlayStartPrompt() + fun onPlayEndPrompt() = stateManager.onPlayEndPrompt() + fun onPlayStartBackend() = stateManager.onPlayStartBackend() + fun onPlayEndBackend() = stateManager.onPlayEndBackend() + fun onUploadFinished(success: Boolean) = stateManager.onUploadFinished(success) + /** + * 资源释放 + */ fun release() { - LogUtils.d(TAG, "🔌 释放资源 | 最终基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") + LogUtils.d(VoiceConfig.TAG, "🔌 释放资源 | 最终基线: ${stateManager.currentEnvBaseline} | 嘈杂环境: ${stateManager.isNoisyEnvironment}") wakeupManager.release() vadManager.reset() envNoiseBuffer.clear() resetRealtimeStats() - hasInvalidSpeech = false - currentTimeoutType = TimeoutType.IDLE_TIMEOUT - isNoisyEnvironment = false + stateManager.hasInvalidSpeech = false + stateManager.currentTimeoutType = TimeoutType.IDLE_TIMEOUT + stateManager.isNoisyEnvironment = false runCatching { - SpeakerRecognition.extractor.release() + speakerExtractor.release() speakerManagerLock.withLock { - SpeakerRecognition.manager.release() + speakerManager.release() } - LogUtils.d(TAG, "✅ 声纹识别器资源已释放") + LogUtils.d(VoiceConfig.TAG, "✅ 声纹识别器资源已释放") }.onFailure { - LogUtils.e(TAG, "❌ 释放声纹识别器资源失败", it) + LogUtils.e(VoiceConfig.TAG, "❌ 释放声纹识别器资源失败", it) } } @@ -629,85 +408,7 @@ class VoiceController( runCatching { release() }.onFailure { - LogUtils.e(TAG, "❌ finalize 释放资源失败", it) - } - } - - private fun cachePreBuffer(samples: FloatArray) { - for (s in samples) { - preBuffer.addLast(s) - if (preBuffer.size > PRE_BUFFER_SIZE) preBuffer.removeFirst() - } - } - - // 阈值配置数据类 - private data class ThresholdConfig( - val energyThreshold: Float, - val vadRatioThreshold: Float, - val minScore: Int, - val scene: String - ) - - /* ================= 核心:极简版声纹验证 ================= */ - private fun verifySpeaker(audio: FloatArray): Boolean { - if 
(audio.isEmpty()) { - LogUtils.w(TAG, "❌ 待验证音频为空,声纹验证失败") - return false - } - - // 1. 裁剪音频:只保留本次录音的有效部分(解决时长不匹配问题) - val audioDurationMs = (audio.size.toFloat() / SAMPLE_RATE * 1000).toLong() - // 只保留最后 N 毫秒的音频(N = 实际录音时长),避免缓存旧音频 - val validAudio = if (audioDurationMs > 0) { - val validSampleCount = (audioDurationMs * SAMPLE_RATE / 1000).toInt() - if (validSampleCount < audio.size) { - audio.copyOfRange(audio.size - validSampleCount, audio.size) - } else { - audio - } - } else { - audio - } - - // 2. 分场景选阈值(无容错,只调阈值) - val finalThreshold = when { - audioDurationMs < SHORT_AUDIO_DURATION_MS -> SPEAKER_THRESHOLD_SHORT - isNoisyEnvironment -> SPEAKER_THRESHOLD_NOISY - else -> SPEAKER_THRESHOLD_QUIET - } - - var stream: OnlineStream? = null - return try { - stream = SpeakerRecognition.extractor.createStream() - stream.acceptWaveform(samples = validAudio, sampleRate = SAMPLE_RATE) // 用裁剪后的音频验证 - stream.inputFinished() - - if (!SpeakerRecognition.extractor.isReady(stream)) { - LogUtils.w(TAG, "❌ 音频Stream未就绪,验证失败") - return false - } - - val embedding = SpeakerRecognition.extractor.compute(stream) - - // 3. 
纯验证逻辑:过就过,不过就拒绝 - speakerManagerLock.withLock { - val verifyPass = SpeakerRecognition.manager.verify( - name = CURRENT_USER_ID, - embedding = embedding, - threshold = finalThreshold - ) - - // 打印关键信息(补充裁剪后时长) - LogUtils.d(TAG, "📊 声纹验证 | 阈值: $finalThreshold | 通过: $verifyPass | 嘈杂环境: $isNoisyEnvironment | 原始时长: ${audioDurationMs}ms | 验证时长: ${(validAudio.size.toFloat()/SAMPLE_RATE*1000).toLong()}ms") - - // 无任何容错:验证结果就是最终结果 - return verifyPass - } - } catch (e: Exception) { - LogUtils.e(TAG, "❌ 声纹验证异常,拒绝", e) - return false - } finally { - stream?.release() + LogUtils.e(VoiceConfig.TAG, "❌ finalize 释放资源失败", it) } } } \ No newline at end of file diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceStateManager.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceStateManager.kt new file mode 100644 index 0000000..c1b5d90 --- /dev/null +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceStateManager.kt @@ -0,0 +1,211 @@
package com.zs.smarthuman.sherpa

import com.blankj.utilcode.util.LogUtils
import java.util.ArrayDeque

/**
 * State holder for the voice controller.
 *
 * Keeps the current [VoiceState], the timestamps used for timeouts and cool-downs, and the
 * environment/recording flags. Audio buffers and engine wrappers are not stored here; the
 * caller passes them into each method that needs them.
 *
 * @param idleTimeoutSeconds idle timeout in seconds; exposed in milliseconds as [idleTimeoutMs].
 * @param maxRecordingSeconds maximum recording length in seconds; exposed as [maxRecordingMs].
 * @param onStateChanged invoked with the new value on every assignment to [state].
 * @param onTimeoutTip invoked with the timeout type when [checkWaitSpeechTimeout] detects a timeout.
 */
class VoiceStateManager(
    idleTimeoutSeconds: Int,
    maxRecordingSeconds: Int,
    private val onStateChanged: ((VoiceState) -> Unit)?,
    private val onTimeoutTip: OnTimeoutTip?
) {
    /** Current pipeline state. Every assignment is logged and forwarded to [onStateChanged]. */
    var state: VoiceState = VoiceState.WAIT_WAKEUP
        set(value) {
            field = value
            LogUtils.d(VoiceConfig.TAG, "➡ State = $value")
            onStateChanged?.invoke(value)
        }

    // ---- Timeout bookkeeping (all timestamps are epoch milliseconds; 0 means "not set") ----
    val idleTimeoutMs = idleTimeoutSeconds * 1000L
    val maxRecordingMs = maxRecordingSeconds * 1000L
    var waitSpeechFailStartMs = 0L
    var waitSpeechStartMs = 0L
    var speechEnableAtMs = 0L
    var lastInvalidResetMs = 0L

    // ---- Invalid-speech flag and the type reported on the next timeout ----
    var hasInvalidSpeech = false
    var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT

    // ---- Post-wake-up observation window ----
    var inKwsObserve = false
    var kwsObserveStartMs = 0L

    // ---- Environment ----
    var isNoisyEnvironment = false
    var currentEnvBaseline = 0.001f

    // ---- Recording ----
    var recordingStartMs = 0L
    var vadStarted = false

    /**
     * Checks whether waiting for speech has exceeded [idleTimeoutMs].
     *
     * On timeout this records the timeout type (invalid-speech vs. idle), notifies
     * [onTimeoutTip] and returns `true`. It does not reset anything itself; the caller is
     * expected to call [resetAll].
     *
     * @param now current time in epoch milliseconds.
     * @return `true` if either wait timer has expired.
     */
    fun checkWaitSpeechTimeout(now: Long): Boolean {
        val isTimeout = (waitSpeechStartMs > 0 && now - waitSpeechStartMs >= idleTimeoutMs) ||
            (waitSpeechFailStartMs > 0 && now - waitSpeechFailStartMs >= idleTimeoutMs)
        if (!isTimeout) return false

        currentTimeoutType = if (hasInvalidSpeech) {
            TimeoutType.INVALID_SPEECH_TIMEOUT
        } else {
            TimeoutType.IDLE_TIMEOUT
        }
        LogUtils.d(VoiceConfig.TAG, "⏱ WAIT_SPEECH timeout → WAIT_WAKEUP | 超时类型: $currentTimeoutType")
        onTimeoutTip?.invoke(currentTimeoutType)
        return true
    }

    /**
     * Leaves the cool-down once [speechEnableAtMs] has been reached.
     *
     * @param now current time in epoch milliseconds.
     * @return `true` if the state was switched to [VoiceState.WAIT_SPEECH].
     */
    fun handleWaitSpeechCooldown(now: Long): Boolean {
        if (now < speechEnableAtMs) return false
        // Statement order is kept as-is: the state setter fires the callback between the two writes.
        waitSpeechFailStartMs = now
        state = VoiceState.WAIT_SPEECH
        waitSpeechStartMs = now
        return true
    }

    /**
     * Records a wake-up: restarts both wait timers, clears the invalid-speech flag and opens
     * the keyword-spotting observation window.
     *
     * @param interrupt when `true`, realtime statistics are reset and [vadStarted] is cleared.
     * @param resetRealtimeStats callback that clears the caller's realtime statistics.
     */
    fun enterWakeup(interrupt: Boolean, resetRealtimeStats: () -> Unit) {
        val now = System.currentTimeMillis()
        waitSpeechFailStartMs = now
        waitSpeechStartMs = now
        hasInvalidSpeech = false
        currentTimeoutType = TimeoutType.IDLE_TIMEOUT

        if (interrupt) {
            resetRealtimeStats()
            vadStarted = false
        }

        inKwsObserve = true
        kwsObserveStartMs = now
    }

    /**
     * Drops the current utterance and returns to [VoiceState.WAIT_SPEECH].
     *
     * Calls arriving less than [VoiceConfig.INVALID_RESET_DEBOUNCE_MS] after the previous
     * accepted reset are ignored.
     *
     * NOTE(review): a debounced call returns before the buffer, VAD and state are touched, so
     * whatever state the caller was in is kept. Confirm that this is intended.
     *
     * @param resetRealtimeStats callback that clears the caller's realtime statistics.
     * @param audioBuffer recording buffer to clear.
     * @param vadManager VAD wrapper to reset.
     */
    fun resetToWaitSpeech(resetRealtimeStats: () -> Unit, audioBuffer: MutableList<Float>, vadManager: VadManager) {
        LogUtils.d(VoiceConfig.TAG, "🔄 重置到等待说话 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment | 已标记无效说话: $hasInvalidSpeech")
        val now = System.currentTimeMillis()
        if (now - lastInvalidResetMs < VoiceConfig.INVALID_RESET_DEBOUNCE_MS) {
            LogUtils.d(VoiceConfig.TAG, "🛡 防抖:1.5秒内重复无效语音,跳过重置")
            return
        }
        lastInvalidResetMs = now
        audioBuffer.clear()
        vadManager.reset()
        vadStarted = false
        resetRealtimeStats()
        state = VoiceState.WAIT_SPEECH
        // Start the failure timer only if it is not already running; reuse the timestamp taken above.
        if (waitSpeechFailStartMs == 0L) waitSpeechFailStartMs = now
    }

    /**
     * Clears every buffer, timer and flag and returns to [VoiceState.WAIT_WAKEUP].
     *
     * @param resetRealtimeStats callback that clears the caller's realtime statistics.
     * @param audioBuffer recording buffer to clear.
     * @param preBuffer pre-roll buffer to clear.
     * @param vadManager VAD wrapper to reset.
     * @param wakeupManager wake-word wrapper to reset.
     * @param envNoiseBuffer environment-noise window to clear.
     */
    fun resetAll(
        resetRealtimeStats: () -> Unit,
        audioBuffer: MutableList<Float>,
        preBuffer: ArrayDeque<Float>,
        vadManager: VadManager,
        wakeupManager: WakeupManager,
        envNoiseBuffer: ArrayDeque<Float>
    ) {
        LogUtils.d(VoiceConfig.TAG, "🔄 重置所有状态 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment | 本次超时类型: $currentTimeoutType")
        audioBuffer.clear()
        preBuffer.clear()
        vadManager.reset()
        wakeupManager.reset()
        vadStarted = false
        waitSpeechStartMs = 0L
        waitSpeechFailStartMs = 0L
        envNoiseBuffer.clear()
        currentEnvBaseline = 0.001f
        isNoisyEnvironment = false
        resetRealtimeStats()
        hasInvalidSpeech = false
        currentTimeoutType = TimeoutType.IDLE_TIMEOUT
        state = VoiceState.WAIT_WAKEUP
    }

    /** Prompt playback started: switch to [VoiceState.PLAYING_PROMPT]. */
    fun onPlayStartPrompt() {
        LogUtils.d(VoiceConfig.TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
        state = VoiceState.PLAYING_PROMPT
    }

    /** Prompt playback ended: arm the speech cool-down and switch to [VoiceState.WAIT_SPEECH_COOLDOWN]. */
    fun onPlayEndPrompt() {
        speechEnableAtMs = System.currentTimeMillis() + VoiceConfig.SPEECH_COOLDOWN_MS
        LogUtils.d(VoiceConfig.TAG, "🎵 提示音结束 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
        state = VoiceState.WAIT_SPEECH_COOLDOWN
    }

    /** Backend audio playback started; only accepted while in [VoiceState.UPLOADING]. */
    fun onPlayStartBackend() {
        if (state != VoiceState.UPLOADING) {
            LogUtils.w(VoiceConfig.TAG, "🎶 非上传完成状态,禁止切换到 PLAYING_BACKEND | 当前状态: $state")
            return
        }
        LogUtils.d(VoiceConfig.TAG, "🎶 开始播放后台音频 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
        state = VoiceState.PLAYING_BACKEND
    }

    /** Backend audio playback ended: arm the speech cool-down and switch to [VoiceState.WAIT_SPEECH_COOLDOWN]. */
    fun onPlayEndBackend() {
        speechEnableAtMs = System.currentTimeMillis() + VoiceConfig.SPEECH_COOLDOWN_MS
        LogUtils.d(VoiceConfig.TAG, "🎶 后台音频结束 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
        state = VoiceState.WAIT_SPEECH_COOLDOWN
    }

    /**
     * Upload finished. Ignored unless the state is [VoiceState.UPLOADING].
     *
     * @param success when `false`, the cool-down is armed and the state becomes
     * [VoiceState.WAIT_SPEECH_COOLDOWN]; when `true`, the state is left unchanged.
     */
    fun onUploadFinished(success: Boolean) {
        if (state != VoiceState.UPLOADING) return
        LogUtils.d(VoiceConfig.TAG, "📤 上传完成 | 成功: $success | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")

        if (!success) {
            speechEnableAtMs = System.currentTimeMillis() + VoiceConfig.SPEECH_COOLDOWN_MS
            state = VoiceState.WAIT_SPEECH_COOLDOWN
        }
    }

    /**
     * VAD reported speech start. Ignored unless the state is [VoiceState.WAIT_SPEECH].
     *
     * Seeds [audioBuffer] with the contents of [preBuffer], resets the realtime statistics and
     * switches to [VoiceState.RECORDING].
     *
     * @param audioBuffer recording buffer; cleared, then filled with the pre-roll samples.
     * @param preBuffer pre-roll samples captured before VAD fired.
     * @param resetRealtimeStats callback that clears the caller's realtime statistics.
     */
    fun onVadStart(audioBuffer: MutableList<Float>, preBuffer: ArrayDeque<Float>, resetRealtimeStats: () -> Unit) {
        if (state != VoiceState.WAIT_SPEECH) return
        LogUtils.d(VoiceConfig.TAG, "🎤 REAL VAD START | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
        vadStarted = true
        recordingStartMs = System.currentTimeMillis()
        audioBuffer.clear()
        audioBuffer.addAll(preBuffer)
        resetRealtimeStats()
        state = VoiceState.RECORDING
    }
}
\ No newline at end of file diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceUtils.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceUtils.kt new file mode 100644 index 0000000..25768c8 --- /dev/null +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceUtils.kt @@ -0,0 +1,355 @@ +package com.zs.smarthuman.sherpa + +import com.blankj.utilcode.util.LogUtils +import com.k2fsa.sherpa.onnx.OnlineStream +import com.k2fsa.sherpa.onnx.SpeakerEmbeddingExtractor +import com.k2fsa.sherpa.onnx.SpeakerEmbeddingManager +import java.util.ArrayDeque +import java.util.concurrent.locks.ReentrantLock +import kotlin.concurrent.withLock + +/** + * 
General-purpose voice-processing helpers (weak-voice filtering tuned so that normal speech passes).
 */
object VoiceUtils {
    /** Frames with an RMS at or above this value are not used for baseline calibration. */
    private const val BASELINE_SAMPLE_MAX_RMS = 0.015f

    /** A frame's own RMS enters the window only if it is below the current baseline plus this margin. */
    private const val BASELINE_RISE_MARGIN = 0.002f

    /** Baseline returned when the noise window is empty. */
    private const val DEFAULT_ENV_BASELINE = 0.001f

    /** In a noisy environment a frame counts as effective speech at baseline times this factor. */
    private const val NOISY_ENV_ENERGY_COEFF = 1.8f

    // Serializes access to the SpeakerEmbeddingManager from verifySpeaker/registerWakeupUser.
    // NOTE(review): this lock is private to VoiceUtils, while VoiceController.release() takes an
    // unqualified `speakerManagerLock` of its own, so release does not appear to be mutually
    // exclusive with verification/registration. Confirm and share a single lock if so.
    private val speakerManagerLock = ReentrantLock()

    /**
     * Updates the environment-noise window with one frame and returns the new baseline.
     *
     * Frames whose RMS is not below [BASELINE_SAMPLE_MAX_RMS] leave the window untouched and
     * return [currentEnvBaseline] unchanged. Otherwise the frame's RMS (or the current baseline,
     * if the RMS is more than [BASELINE_RISE_MARGIN] above it) is appended to the window, which
     * is capped at [VoiceConfig.BASELINE_WINDOW_SIZE] entries, and the window maximum is returned.
     *
     * @param samples one frame of audio samples.
     * @param vadManager provides `calcRms`.
     * @param envNoiseBuffer sliding window of recent noise RMS values; mutated in place.
     * @param currentEnvBaseline the baseline before this frame.
     * @return the baseline after this frame.
     */
    fun calibrateEnvBaseline(
        samples: FloatArray,
        vadManager: VadManager,
        envNoiseBuffer: ArrayDeque<Float>,
        currentEnvBaseline: Float
    ): Float {
        val rms = vadManager.calcRms(samples)
        val validRms = if (rms < currentEnvBaseline + BASELINE_RISE_MARGIN) rms else currentEnvBaseline
        if (rms < BASELINE_SAMPLE_MAX_RMS) {
            if (envNoiseBuffer.size >= VoiceConfig.BASELINE_WINDOW_SIZE) {
                envNoiseBuffer.removeFirst()
            }
            envNoiseBuffer.addLast(validRms)
            return envNoiseBuffer.maxOrNull() ?: DEFAULT_ENV_BASELINE
        }
        return currentEnvBaseline
    }

    /**
     * Folds one frame into the running energy statistics.
     *
     * The frame is counted only if its RMS reaches the effective threshold: baseline times
     * [NOISY_ENV_ENERGY_COEFF] in a noisy environment, otherwise
     * [VoiceConfig.MIN_EFFECTIVE_SPEECH_RMS].
     *
     * @return `(energySum, energyCount, peakRms)` after this frame; unchanged if the frame was
     * below the threshold.
     */
    fun updateRealtimeEnergy(
        samples: FloatArray,
        vadManager: VadManager,
        isNoisyEnvironment: Boolean,
        currentEnvBaseline: Float,
        realtimeEnergySum: Float,
        realtimeEnergyCount: Int,
        realtimePeakRms: Float
    ): Triple<Float, Int, Float> {
        val rms = vadManager.calcRms(samples)
        val effectiveThreshold = if (isNoisyEnvironment) {
            currentEnvBaseline * NOISY_ENV_ENERGY_COEFF
        } else {
            VoiceConfig.MIN_EFFECTIVE_SPEECH_RMS
        }
        return if (rms >= effectiveThreshold) {
            Triple(realtimeEnergySum + rms, realtimeEnergyCount + 1, maxOf(realtimePeakRms, rms))
        } else {
            Triple(realtimeEnergySum, realtimeEnergyCount, realtimePeakRms)
        }
    }

    /**
     * Takes a snapshot of the VAD frame counters.
     *
     * The continuous-speech count is 0 when the current frame is not speech; otherwise it is the
     * VAD's continuous count plus one, or 1 if that count is 0.
     */
    fun updateRealtimeFrameStats(vadManager: VadManager): FrameStats {
        val totalFrames = vadManager.getTotalFrames()
        val speechFrames = vadManager.getSpeechFrames()
        val continuousSpeechFrames = vadManager.getContinuousSpeechFrames()
        val currentFrameIsSpeech = vadManager.isSpeechDetected()
        val newContinuousFrames = when {
            !currentFrameIsSpeech -> 0
            continuousSpeechFrames > 0 -> continuousSpeechFrames + 1
            else -> 1
        }
        return FrameStats(
            totalFrames = totalFrames,
            speechFrames = speechFrames,
            continuousSpeechFrames = newContinuousFrames,
            lastFrameIsSpeech = currentFrameIsSpeech
        )
    }

    /**
     * Realtime multi-speaker dialogue check.
     *
     * Never fires before [VoiceConfig.MULTI_DIALOGUE_MIN_DURATION] ms of recording. After that
     * it returns `true` when the peak/average energy ratio is inside the configured band, the
     * continuous-frame ratio is at or below its maximum, and the VAD active ratio is at or above
     * its minimum.
     *
     * @param now current time in epoch milliseconds.
     * @param recordingStartMs time the recording started, in epoch milliseconds.
     */
    fun checkMultiPersonDialogue(
        now: Long,
        recordingStartMs: Long,
        realtimeEnergySum: Float,
        realtimeEnergyCount: Int,
        realtimePeakRms: Float,
        realtimeSpeechFrames: Int,
        realtimeContinuousSpeechFrames: Int,
        vadManager: VadManager
    ): Boolean {
        val duration = now - recordingStartMs
        if (duration < VoiceConfig.MULTI_DIALOGUE_MIN_DURATION) return false

        val avgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else 0f
        val peakAvgRatio = if (avgEnergy > 0) realtimePeakRms / avgEnergy else 0f
        val continuousRatio = if (realtimeSpeechFrames > 0) {
            realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames
        } else {
            0f
        }
        val vadRatio = vadManager.activeSpeechRatio()

        // The duration condition is already guaranteed by the early return above.
        return peakAvgRatio in VoiceConfig.MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO..VoiceConfig.MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO &&
            continuousRatio <= VoiceConfig.MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO &&
            vadRatio >= VoiceConfig.MULTI_DIALOGUE_MIN_VAD_RATIO
    }

    /**
     * Weak-voice filter.
     *
     * Checks, in order: minimum duration, a duration-dependent energy threshold, a combined
     * speech-frame-ratio/energy check, and a pure-background-noise check.
     *
     * [peakRms] and [realtimeContinuousSpeechFrames] are accepted for signature compatibility
     * but are not read by the current checks.
     *
     * @return `true` if the utterance should be rejected, `false` if it passes.
     */
    @Suppress("UNUSED_PARAMETER")
    fun filterWeakVoice(
        duration: Long,
        avgEnergy: Float,
        peakRms: Float,
        currentEnvBaseline: Float,
        realtimeTotalFrames: Int,
        realtimeSpeechFrames: Int,
        realtimeContinuousSpeechFrames: Int
    ): Boolean {
        // 1. Minimum duration.
        if (duration < VoiceConfig.MIN_EFFECTIVE_VOICE_DURATION) {
            LogUtils.w(VoiceConfig.TAG, "❌ 微弱人声过滤:时长${duration}ms < ${VoiceConfig.MIN_EFFECTIVE_VOICE_DURATION}ms")
            return true
        }

        // 2. Energy threshold, relaxed for short utterances. The log label is derived from the
        //    same condition as the threshold so the two cannot disagree.
        val isShortSpeech = duration < VoiceConfig.SHORT_LONG_SPEECH_CUTOFF_MS
        val dynamicEnergyThreshold = if (isShortSpeech) {
            VoiceConfig.SHORT_SPEECH_ENERGY_THRESHOLD
        } else {
            VoiceConfig.MIN_NORMAL_VOICE_ENERGY
        }
        if (avgEnergy < dynamicEnergyThreshold) {
            val thresholdLabel = if (isShortSpeech) "短语音能量阈值" else "正常语音能量阈值"
            LogUtils.w(VoiceConfig.TAG, "❌ 微弱人声过滤:平均能量${avgEnergy} < ${thresholdLabel}${dynamicEnergyThreshold}")
            return true
        }

        // 3. Share of frames the VAD marked as speech.
        val voiceFrameRatio = if (realtimeTotalFrames > 0) {
            realtimeSpeechFrames.toFloat() / realtimeTotalFrames
        } else {
            0f
        }

        // 4. Reject only when BOTH the speech-frame ratio and the energy are low.
        val dynamicVadRatioThreshold = when {
            isShortSpeech -> VoiceConfig.SHORT_SPEECH_VAD_RATIO
            currentEnvBaseline >= VoiceConfig.NOISE_BASELINE_THRESHOLD -> VoiceConfig.NOISY_ENV_VAD_RATIO
            else -> VoiceConfig.MIN_NORMAL_VOICE_VAD_RATIO
        }
        if (voiceFrameRatio < dynamicVadRatioThreshold && avgEnergy < VoiceConfig.NORMAL_VOICE_ENERGY_THRESHOLD) {
            LogUtils.w(VoiceConfig.TAG, "❌ 微弱人声过滤:语音帧占比${voiceFrameRatio} < ${dynamicVadRatioThreshold} | 平均能量${avgEnergy}")
            return true
        }

        // 5. Pure background noise: low absolute energy that is also close to the baseline.
        //    With a zero baseline the float division yields Infinity/NaN, so this check does not fire.
        val energyBaselineRatio = avgEnergy / currentEnvBaseline
        if (avgEnergy < VoiceConfig.PURE_NOISE_ENERGY_THRESHOLD && energyBaselineRatio < VoiceConfig.PURE_NOISE_BASELINE_RATIO) {
            LogUtils.w(VoiceConfig.TAG, "❌ 微弱人声过滤:能量/基线${energyBaselineRatio} < ${VoiceConfig.PURE_NOISE_BASELINE_RATIO}(纯底噪)")
            return true
        }

        // A peak/baseline check (peakRms / currentEnvBaseline against VoiceConfig.MIN_PEAK_ENERGY_RATIO)
        // was left disabled by the author; it is intentionally not applied here.

        LogUtils.d(VoiceConfig.TAG, "✅ 正常语音通过微弱人声过滤 | 时长${duration}ms | 能量${avgEnergy} | VAD占比${voiceFrameRatio} | 基线${currentEnvBaseline}")
        return false
    }

    /**
     * Verifies [audio] against the registered [VoiceConfig.CURRENT_USER_ID] embedding.
     *
     * The threshold is chosen by scenario: short audio (below
     * [VoiceConfig.SHORT_AUDIO_DURATION_MS]), noisy environment, or quiet environment.
     * Empty audio, a stream that is not ready, and any exception all yield `false`.
     *
     * @return `true` only if the manager accepts the embedding at the chosen threshold.
     */
    fun verifySpeaker(
        audio: FloatArray,
        isNoisyEnvironment: Boolean,
        extractor: SpeakerEmbeddingExtractor,
        manager: SpeakerEmbeddingManager
    ): Boolean {
        if (audio.isEmpty()) {
            LogUtils.w(VoiceConfig.TAG, "❌ 待验证音频为空,声纹验证失败")
            return false
        }

        val audioDurationMs = (audio.size.toFloat() / VoiceConfig.SAMPLE_RATE * 1000).toLong()
        // Keeps the trailing whole-millisecond portion of the audio. The sample count is derived
        // from the array's own length, so at most the sub-millisecond remainder at the front is
        // dropped.
        val validAudio = if (audioDurationMs > 0) {
            val validSampleCount = (audioDurationMs * VoiceConfig.SAMPLE_RATE / 1000).toInt()
            if (validSampleCount < audio.size) {
                audio.copyOfRange(audio.size - validSampleCount, audio.size)
            } else {
                audio
            }
        } else {
            audio
        }

        val finalThreshold = when {
            audioDurationMs < VoiceConfig.SHORT_AUDIO_DURATION_MS -> VoiceConfig.SPEAKER_THRESHOLD_SHORT
            isNoisyEnvironment -> VoiceConfig.SPEAKER_THRESHOLD_NOISY
            else -> VoiceConfig.SPEAKER_THRESHOLD_QUIET
        }

        var stream: OnlineStream? = null
        return try {
            stream = extractor.createStream()
            stream.acceptWaveform(samples = validAudio, sampleRate = VoiceConfig.SAMPLE_RATE)
            stream.inputFinished()

            if (!extractor.isReady(stream)) {
                LogUtils.w(VoiceConfig.TAG, "❌ 音频Stream未就绪,验证失败")
                false
            } else {
                val embedding = extractor.compute(stream)
                val verifyPass = speakerManagerLock.withLock {
                    manager.verify(
                        name = VoiceConfig.CURRENT_USER_ID,
                        embedding = embedding,
                        threshold = finalThreshold
                    )
                }
                LogUtils.d(VoiceConfig.TAG, "📊 声纹验证 | 阈值: $finalThreshold | 通过: $verifyPass | 嘈杂环境: $isNoisyEnvironment | 原始时长: ${audioDurationMs}ms | 验证时长: ${(validAudio.size.toFloat()/VoiceConfig.SAMPLE_RATE*1000).toLong()}ms")
                verifyPass
            }
        } catch (e: Exception) {
            // Any failure during extraction or verification is treated as a rejection.
            LogUtils.e(VoiceConfig.TAG, "❌ 声纹验证异常,拒绝", e)
            false
        } finally {
            stream?.release()
        }
    }

    /**
     * Registers the audio currently held in [preBuffer] as the voiceprint of
     * [VoiceConfig.CURRENT_USER_ID], replacing any previous entry.
     *
     * Failures are logged and never thrown; the stream is released on every path after it has
     * been created.
     */
    fun registerWakeupUser(
        preBuffer: ArrayDeque<Float>,
        extractor: SpeakerEmbeddingExtractor,
        manager: SpeakerEmbeddingManager
    ) {
        runCatching {
            val wakeupAudio = preBuffer.toFloatArray()
            if (wakeupAudio.isEmpty()) {
                LogUtils.w(VoiceConfig.TAG, "❌ 唤醒音频缓存为空,无法注册用户特征")
                return@runCatching
            }

            val stream = extractor.createStream()
            try {
                stream.acceptWaveform(samples = wakeupAudio, sampleRate = VoiceConfig.SAMPLE_RATE)
                stream.inputFinished()

                if (!extractor.isReady(stream)) {
                    LogUtils.w(VoiceConfig.TAG, "❌ 唤醒音频Stream未就绪,跳过用户注册")
                    return@runCatching
                }

                val embedding = extractor.compute(stream)
                speakerManagerLock.withLock {
                    // Remove-then-add so that only the latest wake-up voiceprint is kept.
                    manager.remove(VoiceConfig.CURRENT_USER_ID)
                    val ok = manager.add(
                        name = VoiceConfig.CURRENT_USER_ID,
                        embedding = arrayOf(embedding)
                    )
                    if (ok) {
                        LogUtils.d(VoiceConfig.TAG, "✅ 注册当前唤醒用户特征成功 | 特征长度: ${embedding.size}")
                    } else {
                        LogUtils.w(VoiceConfig.TAG, "❌ 注册当前唤醒用户特征失败")
                    }
                }
            } finally {
                stream.release()
            }
        }.onFailure {
            LogUtils.e(VoiceConfig.TAG, "❌ 唤醒用户特征注册失败", it)
        }
    }

    /**
     * Appends [samples] to [preBuffer], discarding the oldest samples so that the buffer never
     * holds more than [VoiceConfig.PRE_BUFFER_SIZE] entries.
     */
    fun cachePreBuffer(samples: FloatArray, preBuffer: ArrayDeque<Float>) {
        for (sample in samples) {
            preBuffer.addLast(sample)
            if (preBuffer.size > VoiceConfig.PRE_BUFFER_SIZE) preBuffer.removeFirst()
        }
    }

    /** Snapshot of the VAD frame counters returned by [updateRealtimeFrameStats]. */
    data class FrameStats(
        val totalFrames: Int,
        val speechFrames: Int,
        val continuousSpeechFrames: Int,
        val lastFrameIsSpeech: Boolean
    )

    /** Per-scenario acceptance thresholds returned by [getThresholdConfig]. */
    data class ThresholdConfig(
        val energyThreshold: Float,
        val vadRatioThreshold: Float,
        val minScore: Int,
        val scene: String
    )

    /**
     * Chooses the acceptance thresholds for an utterance.
     *
     * Durations inside [VoiceConfig.SHORT_SPEECH_MIN]..[VoiceConfig.SHORT_SPEECH_MAX] use the
     * short-speech settings; everything else uses the long-speech settings. The energy threshold
     * is the baseline multiplied by a coefficient that depends on whether the baseline is below
     * [VoiceConfig.BASELINE_QUIET_THRESHOLD].
     */
    fun getThresholdConfig(duration: Long, currentEnvBaseline: Float): ThresholdConfig {
        val isQuietEnv = currentEnvBaseline < VoiceConfig.BASELINE_QUIET_THRESHOLD
        return if (duration in VoiceConfig.SHORT_SPEECH_MIN..VoiceConfig.SHORT_SPEECH_MAX) {
            val coeff = if (isQuietEnv) {
                VoiceConfig.SHORT_SPEECH_ENERGY_COEFF_QUIET
            } else {
                VoiceConfig.SHORT_SPEECH_ENERGY_COEFF_NOISY
            }
            ThresholdConfig(
                energyThreshold = currentEnvBaseline * coeff,
                vadRatioThreshold = VoiceConfig.SHORT_SPEECH_VAD_COEFF,
                minScore = VoiceConfig.SHORT_SPEECH_MIN_SCORE,
                scene = "短语音"
            )
        } else {
            val coeff = if (isQuietEnv) {
                VoiceConfig.LONG_SPEECH_ENERGY_COEFF_QUIET
            } else {
                VoiceConfig.LONG_SPEECH_ENERGY_COEFF_NOISY
            }
            ThresholdConfig(
                energyThreshold = currentEnvBaseline * coeff,
                vadRatioThreshold = VoiceConfig.LONG_SPEECH_VAD_COEFF,
                minScore = VoiceConfig.LONG_SPEECH_MIN_SCORE,
                scene = "长语音"
            )
        }
    }

    /**
     * Scores an utterance: 1 to 3 points for duration, plus one point if the energy reaches the
     * scenario threshold, plus one point if the continuous-frame ratio reaches
     * [VoiceConfig.MIN_CONTINUOUS_FRAME_RATIO].
     */
    fun calculateSpeechScore(
        duration: Long,
        avgEnergy: Float,
        continuousRatio: Float,
        thresholdConfig: ThresholdConfig
    ): Int {
        val durationScore = when {
            duration >= VoiceConfig.LONG_SPEECH_SCORE_CUTOFF_MS -> 3
            duration >= VoiceConfig.MID_SPEECH_SCORE_CUTOFF_MS -> 2
            else -> 1
        }
        val energyScore = if (avgEnergy >= thresholdConfig.energyThreshold) 1 else 0
        val continuityScore = if (continuousRatio >= VoiceConfig.MIN_CONTINUOUS_FRAME_RATIO) 1 else 0
        return durationScore + energyScore + continuityScore
    }
}