From d47362ca38fc321369354a5cac876751f6a6c751 Mon Sep 17 00:00:00 2001 From: ross <3024454314@qq.com> Date: Wed, 14 Jan 2026 17:54:11 +0800 Subject: [PATCH] =?UTF-8?q?=E4=B8=B4=E6=97=B6=E6=8F=90=E4=BA=A4=E5=8F=AF?= =?UTF-8?q?=E4=BB=A5=E6=92=AD=E6=94=BEpcm=E7=9A=84=E7=B1=BB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/com/zs/smarthuman/bean/AudioDTO.kt | 9 + .../com/zs/smarthuman/sherpa/VoiceConfig.kt | 162 ---- .../zs/smarthuman/sherpa/VoiceController.kt | 755 ++++++++++++------ .../zs/smarthuman/sherpa/VoiceStateManager.kt | 211 ----- .../com/zs/smarthuman/sherpa/VoiceUtils.kt | 355 -------- .../java/com/zs/smarthuman/ui/MainActivity.kt | 27 +- .../zs/smarthuman/utils/PcmStreamPlayer.kt | 95 +++ .../zs/smarthuman/utils/VoiceStreamPlayer.kt | 101 +++ .../zs/smarthuman/viewmodel/MainViewModel.kt | 6 +- 9 files changed, 752 insertions(+), 969 deletions(-) create mode 100644 app/src/main/java/com/zs/smarthuman/bean/AudioDTO.kt delete mode 100644 app/src/main/java/com/zs/smarthuman/sherpa/VoiceConfig.kt delete mode 100644 app/src/main/java/com/zs/smarthuman/sherpa/VoiceStateManager.kt delete mode 100644 app/src/main/java/com/zs/smarthuman/sherpa/VoiceUtils.kt create mode 100644 app/src/main/java/com/zs/smarthuman/utils/PcmStreamPlayer.kt create mode 100644 app/src/main/java/com/zs/smarthuman/utils/VoiceStreamPlayer.kt diff --git a/app/src/main/java/com/zs/smarthuman/bean/AudioDTO.kt b/app/src/main/java/com/zs/smarthuman/bean/AudioDTO.kt new file mode 100644 index 0000000..581b324 --- /dev/null +++ b/app/src/main/java/com/zs/smarthuman/bean/AudioDTO.kt @@ -0,0 +1,9 @@ +package com.zs.smarthuman.bean + +/** + * @description: + * @author: lrs + * @date: 2026/1/14 11:04 + */ +data class AudioDTO(val samplingRate: Int = 0, val items: MutableList = mutableListOf()) +data class LmChatDTO(val id: Int = 0, val sortId: Int = 0, val text: String = "", val audioData: String = "", val isFinal: Boolean = false) \ No newline 
at end of file diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceConfig.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceConfig.kt deleted file mode 100644 index ac7f471..0000000 --- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceConfig.kt +++ /dev/null @@ -1,162 +0,0 @@ -package com.zs.smarthuman.sherpa - -/** - * @description: 语音控制器配置常量类(所有硬编码参数集中管理,无需修改业务逻辑即可适配场景) - * @author: lrs - * @date: 2026/1/12 17:33 - * @note: 所有参数适配 16kHz 单通道语音场景,调整后需在目标场景(安静/嘈杂/远场)测试验证 - */ -object VoiceConfig { - // ===================== 基础配置(通用核心,禁止随意修改) ===================== - /** 日志打印统一标签,便于筛选语音相关日志 */ - const val TAG = "VoiceController" - - /** 语音采样率(固定16kHz),与VAD/声纹模型强绑定,禁止修改 */ - const val SAMPLE_RATE = 16000 - - /** 预缓存音频大小(2秒),用于唤醒后补全唤醒前的语音片段,防止开头缺失 */ - const val PRE_BUFFER_SIZE = SAMPLE_RATE * 2 - - /** 声纹验证的当前唤醒用户ID,业务层可动态替换 */ - const val CURRENT_USER_ID = "current_wakeup_user" - - /** 是否开启严格的声纹验证:测试阶段设为false,生产环境建议设为true */ - const val ENABLE_STRICT_SPEAKER_VERIFY = true - - // ===================== 时间阈值(ms,时序控制核心) ===================== - /** 空闲超时默认秒数(实际超时=200*1000=200秒),建议调整为5-30秒 */ - const val IDLE_TIMEOUT_DEFAULT_SECONDS = 200 - - /** 最大录音时长(10秒),防止录音过长占用内存:短指令3-5秒,长语音10-20秒 */ - const val MAX_RECORDING_DEFAULT_SECONDS = 10 - - /** 短语音判定阈值(1秒),声纹验证分场景的临界值,无需调整 */ - const val SHORT_AUDIO_DURATION_MS = 1000L - - /** 无效语音重置防抖时间(1.5秒),避免1.5秒内重复重置状态,建议1-2秒 */ - const val INVALID_RESET_DEBOUNCE_MS = 1500L - - /** 有效语音最小时长(800ms),过滤极短的杂音/按键音,建议600-1000ms */ - const val MIN_SPEECH_MS = 800L - - /** 微弱人声过滤的基础时长阈值(400ms),建议为MIN_SPEECH_MS的一半 */ - const val MIN_EFFECTIVE_VOICE_DURATION = 400L - - /** 唤醒后观察期(500ms),期间不处理VAD防止误触发,建议300-800ms */ - const val KWS_OBSERVE_MS = 500L - - /** 说话冷却期(300ms),提示音/后台音频结束后延迟进入等待说话,建议200-400ms */ - const val SPEECH_COOLDOWN_MS = 300L - - /** 短语音时长范围(0.5-2秒),分场景阈值的判定依据,无需调整 */ - const val SHORT_SPEECH_MIN = 500L - const val SHORT_SPEECH_MAX = 2000L - - /** 多人对话检测的最小时长(2.5秒),太短易误判,太长漏判,建议2-3秒 */ - const val 
MULTI_DIALOGUE_MIN_DURATION = 2500L - - // ===================== 环境/噪音阈值(能量类,核心过滤参数) ===================== - /** 嘈杂环境判定阈值:环境基线≥0.01f则为嘈杂环境,安静场景0.008f,嘈杂场景0.015f */ - const val NOISE_BASELINE_THRESHOLD = 0.01f - - /** 环境基线校准的滑动窗口大小(50帧),太小基线波动大,太大校准滞后,建议30-80 */ - const val BASELINE_WINDOW_SIZE = 50 - - /** 安静环境判定阈值:环境基线<0.005f则为安静环境,建议为NOISE_BASELINE_THRESHOLD的一半 */ - const val BASELINE_QUIET_THRESHOLD = 0.005f - - /** 有效语音的最小RMS能量(0.0005f),太低统计背景噪,太高漏统计弱人声,建议0.0003-0.0008f */ - const val MIN_EFFECTIVE_SPEECH_RMS = 0.0005f - - /** 正常语音能量阈值(0.008f),联合过滤的临界值,嘈杂环境0.01f,安静环境0.006f */ - const val NORMAL_VOICE_ENERGY_THRESHOLD = 0.008f - - // ===================== 声纹验证阈值(分场景适配,核心体验参数) ===================== - /** 安静环境声纹验证阈值(0.50f),越高越严格:准确率优先0.55f,通过率优先0.45f */ - const val SPEAKER_THRESHOLD_QUIET = 0.50f - - /** 嘈杂环境声纹验证阈值(0.43f),放宽以提高通过率,比安静环境低0.05-0.1f */ - const val SPEAKER_THRESHOLD_NOISY = 0.43f - - /** 短语音声纹验证阈值(0.40f),进一步放宽,比嘈杂环境低0.03-0.05f */ - const val SPEAKER_THRESHOLD_SHORT = 0.40f - - // ===================== 能量/占比阈值(过滤核心,分场景适配) ===================== - /** 正常语音最低能量(0.03f),短语音适配0.01f,长语音保持0.03f */ - const val MIN_NORMAL_VOICE_ENERGY = 0.03f - - /** 正常语音VAD占比阈值(0.2f),短语音0.1f,嘈杂环境0.15f */ - const val MIN_NORMAL_VOICE_VAD_RATIO = 0.2f - - /** 远场语音最大能量阈值(0.015f),近场0.01f,远场0.02f */ - const val MAX_FAR_FIELD_ENERGY = 0.015f - - /** 有效语音最小峰均比(0.5f),过滤扁平背景噪,建议0.4-0.6f */ - const val MIN_VALID_PEAK_AVG_RATIO = 0.5f - - /** 有效语音最小连续帧占比(0.1f),非连续杂音过滤,建议0.08-0.12f */ - const val MIN_CONTINUOUS_FRAME_RATIO = 0.1f - - /** 语音峰值位置阈值(0.95f),过滤末尾突发杂音,建议0.9-0.98f */ - const val MAX_PEAK_POSITION_RATIO = 0.95f - - /** 有效语音最小帧数(3帧),过滤零星语音帧,建议2-5帧 */ - const val MIN_EFFECTIVE_SPEECH_FRAMES = 3 - - // ===================== 多人对话过滤(多维度判定) ===================== - /** 多人对话最大峰均比(2.5f),峰均比过高判定为多人对话,建议2.0-3.0f */ - const val MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO = 2.5f - - /** 多人对话最小峰均比(0.4f),峰均比过低判定为多人对话,建议0.3-0.5f */ - const val MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO = 0.4f - - /** 
多人对话最大连续帧占比(0.3f),连续帧低判定为多人对话,建议0.2-0.4f */ - const val MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO = 0.3f - - /** 多人对话最小VAD占比(0.55f),VAD占比高判定为多人对话,建议0.5-0.6f */ - const val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f - - // ===================== 分场景动态系数(阈值计算) ===================== - /** 短语音能量动态系数:安静环境1.5f,嘈杂环境2.0f(嘈杂环境系数更高) */ - const val SHORT_SPEECH_ENERGY_COEFF_QUIET = 1.5f - const val SHORT_SPEECH_ENERGY_COEFF_NOISY = 2.0f - - /** 长语音能量动态系数:安静环境2.5f,嘈杂环境3.5f(长语音系数更高) */ - const val LONG_SPEECH_ENERGY_COEFF_QUIET = 2.5f - const val LONG_SPEECH_ENERGY_COEFF_NOISY = 3.5f - - /** 短/长语音VAD占比动态系数:短语音0.05f,长语音0.10f(长语音要求更高) */ - const val SHORT_SPEECH_VAD_COEFF = 0.05f - const val LONG_SPEECH_VAD_COEFF = 0.10f - - /** 短/长语音最低评分:默认1分(宽松),严格场景可设为2分 */ - const val SHORT_SPEECH_MIN_SCORE = 1 - const val LONG_SPEECH_MIN_SCORE = 1 - - // ===================== 微弱人声过滤专用阈值(补充) ===================== - /** 短/长语音临界时长(2000ms),filterWeakVoice中判定短语音的依据 */ - const val SHORT_LONG_SPEECH_CUTOFF_MS = 2000L - - /** 短语音动态能量阈值(0.01f),filterWeakVoice中短语音的能量判定值 */ - const val SHORT_SPEECH_ENERGY_THRESHOLD = 0.01f - - /** 短语音VAD占比阈值(0.10f),filterWeakVoice中短语音的VAD判定值 */ - const val SHORT_SPEECH_VAD_RATIO = 0.10f - - /** 嘈杂环境VAD占比阈值(0.15f),filterWeakVoice中嘈杂环境的VAD判定值 */ - const val NOISY_ENV_VAD_RATIO = 0.15f - - /** 纯底噪过滤的能量阈值(0.005f),filterWeakVoice中底噪判定的能量值 */ - const val PURE_NOISE_ENERGY_THRESHOLD = 0.005f - - /** 纯底噪过滤的能量基线比(1.2f),filterWeakVoice中底噪判定的比值 */ - const val PURE_NOISE_BASELINE_RATIO = 1.2f - - // ===================== 语音评分专用阈值(补充) ===================== - /** 长语音评分临界时长(4000ms),calculateSpeechScore中评3分的依据 */ - const val LONG_SPEECH_SCORE_CUTOFF_MS = 4000L - - /** 中语音评分临界时长(2500ms),calculateSpeechScore中评2分的依据 */ - const val MID_SPEECH_SCORE_CUTOFF_MS = 2500L - -} \ No newline at end of file diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt index aaec1b4..beb3fa9 100644 --- 
a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt @@ -2,8 +2,7 @@ package com.zs.smarthuman.sherpa import android.content.res.AssetManager import com.blankj.utilcode.util.LogUtils -import com.k2fsa.sherpa.onnx.SpeakerEmbeddingExtractor -import com.k2fsa.sherpa.onnx.SpeakerEmbeddingManager +import com.k2fsa.sherpa.onnx.OnlineStream import com.k2fsa.sherpa.onnx.SpeakerRecognition import kotlinx.coroutines.CoroutineScope import kotlinx.coroutines.Dispatchers @@ -16,32 +15,45 @@ class VoiceController( assetManager: AssetManager, private val onWakeup: () -> Unit, private val onFinalAudio: (FloatArray) -> Unit, - idleTimeoutSeconds: Int = VoiceConfig.IDLE_TIMEOUT_DEFAULT_SECONDS, - maxRecordingSeconds: Int = VoiceConfig.MAX_RECORDING_DEFAULT_SECONDS, + idleTimeoutSeconds: Int = 200, + maxRecordingSeconds: Int = 10, private val onStateChanged: ((VoiceState) -> Unit)? = null, private val stopBackendAudio: (() -> Unit)? = null, private val onTimeoutTip: OnTimeoutTip? 
= null ) { - // 依赖组件 - private val wakeupManager = WakeupManager(assetManager, onWakeup) - private val vadManager = VadManager( - assetManager, - onSpeechStart = ::onVadStart, - onSpeechEnd = { avgEnergy, peakRms -> onVadEnd(avgEnergy, peakRms) } - ) - private val stateManager = VoiceStateManager( - idleTimeoutSeconds = idleTimeoutSeconds, - maxRecordingSeconds = maxRecordingSeconds, - onStateChanged = onStateChanged, - onTimeoutTip = onTimeoutTip - ) - // 音频缓存 - private val audioBuffer = mutableListOf() - private val preBuffer = ArrayDeque(VoiceConfig.PRE_BUFFER_SIZE) - private val envNoiseBuffer = ArrayDeque(VoiceConfig.BASELINE_WINDOW_SIZE) + companion object { + // 日志标签 + private const val TAG = "VoiceController" + // 采样率 + private const val SAMPLE_RATE = 16000 + // 预缓存大小(2秒) + private const val PRE_BUFFER_SIZE = SAMPLE_RATE * 2 - // 实时统计 + // ========== 核心:分场景声纹阈值(极简版) ========== + private const val SPEAKER_THRESHOLD_QUIET = 0.50f // 安静环境 + private const val SPEAKER_THRESHOLD_NOISY = 0.45f // 嘈杂环境(匹配你的真实相似度) + private const val SPEAKER_THRESHOLD_SHORT = 0.43f // 短语音(<1秒) + + // 短语音判定阈值 + private const val SHORT_AUDIO_DURATION_MS = 1000L + private const val INVALID_RESET_DEBOUNCE_MS = 1500L + // 最小语音时长 + private const val MIN_SPEECH_MS = 800L + private const val MIN_EFFECTIVE_VOICE_DURATION = 400L + + // 噪音场景判定阈值 + private const val NOISE_BASELINE_THRESHOLD = 0.01f + } + + var state: VoiceState = VoiceState.WAIT_WAKEUP + private set(value) { + field = value + LogUtils.d(TAG, "➡ State = $value") + onStateChanged?.invoke(value) + } + + // 实时能量与帧统计变量 private var realtimeEnergySum = 0f private var realtimeEnergyCount = 0 private var realtimePeakRms = 0f @@ -50,224 +62,359 @@ class VoiceController( private var realtimeContinuousSpeechFrames = 0 private var realtimeLastFrameIsSpeech = false private var isMultiPersonDialogueDetected = false - - // 声纹识别相关 + private var lastInvalidResetMs = 0L private val speakerManagerLock = ReentrantLock() - private lateinit var 
speakerExtractor: SpeakerEmbeddingExtractor - private lateinit var speakerManager: SpeakerEmbeddingManager + + // 环境噪音状态标记 + private var isNoisyEnvironment = false + + private val wakeupManager = WakeupManager(assetManager, onWakeup) + private val vadManager = VadManager( + assetManager, + onSpeechStart = { onVadStart() }, + onSpeechEnd = { avgEnergy, peakRms -> onVadEnd(avgEnergy, peakRms) } + ) + + private val audioBuffer = mutableListOf() + private val preBuffer = ArrayDeque(PRE_BUFFER_SIZE) + + private var recordingStartMs = 0L + private var waitSpeechFailStartMs = 0L + private var waitSpeechStartMs = 0L + + private var vadStarted = false + private var inKwsObserve = false + private var kwsObserveStartMs = 0L + private val KWS_OBSERVE_MS = 500L + private var speechEnableAtMs = 0L + private val SPEECH_COOLDOWN_MS = 300L + + private val idleTimeoutMs = idleTimeoutSeconds * 1000L + private val maxRecordingMs = maxRecordingSeconds * 1000L + + // 分场景动态系数(保留原有逻辑) + private val BASELINE_WINDOW_SIZE = 50 + private val envNoiseBuffer = ArrayDeque(BASELINE_WINDOW_SIZE) + private var currentEnvBaseline = 0.001f + + + // 分场景动态系数 + private val BASELINE_QUIET_THRESHOLD = 0.005f + private val SHORT_SPEECH_ENERGY_COEFF_QUIET = 1.5f + private val SHORT_SPEECH_ENERGY_COEFF_NOISY = 2.0f + private val LONG_SPEECH_ENERGY_COEFF_QUIET = 2.5f + private val LONG_SPEECH_ENERGY_COEFF_NOISY = 3.5f + private val SHORT_SPEECH_VAD_COEFF = 0.05f + private val LONG_SPEECH_VAD_COEFF = 0.10f + private val SHORT_SPEECH_MIN_SCORE = 1 + private val LONG_SPEECH_MIN_SCORE = 1 + + // 其他过滤参数 + private val MAX_FAR_FIELD_ENERGY = 0.015f + private val MIN_VALID_PEAK_AVG_RATIO = 0.5f + private val MIN_CONTINUOUS_FRAME_RATIO = 0.1f + private val MAX_PEAK_POSITION_RATIO = 0.95f + private val MIN_EFFECTIVE_SPEECH_FRAMES = 3 + private val SHORT_SPEECH_MIN = 500L + private val SHORT_SPEECH_MAX = 2000L + + // 多人对话过滤配置 + private val MULTI_DIALOGUE_MIN_DURATION = 2500L + private val 
MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO = 2.5f + private val MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO = 0.4f + private val MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO = 0.3f + private val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f + + // 微弱人声过滤配置 + private val MIN_VOICE_FRAME_RATIO = 0.08f + private val MIN_PEAK_ENERGY_RATIO = 1.5f + private val NORMAL_VOICE_ENERGY_THRESHOLD = 0.008f + private val MIN_CONTINUOUS_VOICE_FRAMES = 1 + private val MIN_EFFECTIVE_SPEECH_RMS = 0.0005f + + // 无效说话标记 + 超时类型 + private var hasInvalidSpeech = false + private var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT + + // 声纹验证相关 + private val CURRENT_USER_ID = "current_wakeup_user" + private val ENABLE_STRICT_SPEAKER_VERIFY = true init { - initSpeakerRecognition(assetManager) - } - - /** - * 初始化声纹识别器 - */ - private fun initSpeakerRecognition(assetManager: AssetManager) { try { SpeakerRecognition.initExtractor(assetManager) - speakerExtractor = SpeakerRecognition.extractor - speakerManager = SpeakerRecognition.manager - LogUtils.d(VoiceConfig.TAG, "✅ 声纹识别器初始化成功") + LogUtils.d(TAG, "✅ 声纹识别器初始化成功") } catch (e: Exception) { - LogUtils.e(VoiceConfig.TAG, "❌ 声纹识别器初始化失败", e) + LogUtils.e(TAG, "❌ 声纹识别器初始化失败", e) throw RuntimeException("声纹识别初始化失败", e) } } - /** - * 音频入口(对外API不变) - */ + /* ================= 音频入口 ================= */ fun acceptAudio(samples: FloatArray) { - // 缓存预缓冲 - VoiceUtils.cachePreBuffer(samples, preBuffer) - - // 唤醒检测 + cachePreBuffer(samples) wakeupManager.acceptAudio(samples) if (wakeupManager.consumeWakeupFlag()) { - val preBufferSnapshot = preBuffer.toFloatArray() + val preBufferShot = preBuffer.toFloatArray() handleWakeupEvent() // 注册唤醒用户特征 CoroutineScope(Dispatchers.IO).launch { - VoiceUtils.registerWakeupUser( - preBuffer = ArrayDeque(preBufferSnapshot.asList()), // 用快照创建新队列 - extractor = speakerExtractor, - manager = speakerManager - ) + var stream: OnlineStream? 
= null + runCatching { + val wakeupAudio = preBufferShot + if (wakeupAudio.isEmpty()) { + LogUtils.w(TAG, "❌ 唤醒音频缓存为空,无法注册用户特征") + return@launch + } + + stream = SpeakerRecognition.extractor.createStream() + stream?.acceptWaveform(samples = wakeupAudio, sampleRate = SAMPLE_RATE) + stream?.inputFinished() + + if (stream != null && SpeakerRecognition.extractor.isReady(stream)) { + val embedding = SpeakerRecognition.extractor.compute(stream) + speakerManagerLock.withLock { + SpeakerRecognition.manager.remove(CURRENT_USER_ID) + val embeddingList = mutableListOf(embedding) + val ok = SpeakerRecognition.manager.add( + name = CURRENT_USER_ID, + embedding = embeddingList.toTypedArray() + ) + if (ok) { + LogUtils.d(TAG, "✅ 注册当前唤醒用户特征成功 | 特征长度: ${embedding.size}") + } else { + LogUtils.w(TAG, "❌ 注册当前唤醒用户特征失败") + } + } + } else { + LogUtils.w(TAG, "❌ 唤醒音频Stream未就绪,跳过用户注册") + } + }.onFailure { + LogUtils.e(TAG, "❌ 唤醒用户特征注册失败", it) + }.also { + stream?.release() + } } return } val now = System.currentTimeMillis() - // 环境基线校准(仅等待唤醒状态) - if (stateManager.state == VoiceState.WAIT_WAKEUP) { - stateManager.currentEnvBaseline = VoiceUtils.calibrateEnvBaseline( - samples = samples, - vadManager = vadManager, - envNoiseBuffer = envNoiseBuffer, - currentEnvBaseline = stateManager.currentEnvBaseline - ) - stateManager.isNoisyEnvironment = stateManager.currentEnvBaseline >= VoiceConfig.NOISE_BASELINE_THRESHOLD -// LogUtils.d(VoiceConfig.TAG, "📊 环境状态 | 基线: ${stateManager.currentEnvBaseline} | 嘈杂环境: ${stateManager.isNoisyEnvironment}") + if (state == VoiceState.WAIT_WAKEUP) { + calibrateEnvBaseline(samples) + isNoisyEnvironment = currentEnvBaseline >= NOISE_BASELINE_THRESHOLD + LogUtils.d(TAG, "📊 环境状态 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") } - // 状态分发 - when (stateManager.state) { + when (state) { VoiceState.WAIT_WAKEUP, VoiceState.PLAYING_PROMPT, VoiceState.PLAYING_BACKEND, VoiceState.UPLOADING -> return VoiceState.WAIT_SPEECH_COOLDOWN -> { - 
stateManager.handleWaitSpeechCooldown(now) + if (now >= speechEnableAtMs) { + waitSpeechFailStartMs = now + state = VoiceState.WAIT_SPEECH + waitSpeechStartMs = now + } return } VoiceState.WAIT_SPEECH -> { - // 检查超时(修复点:超时后主动调用 resetAll() 并传参) - if (stateManager.checkWaitSpeechTimeout(now)) { - stateManager.resetAll( - resetRealtimeStats = ::resetRealtimeStats, - audioBuffer = audioBuffer, - preBuffer = preBuffer, - vadManager = vadManager, - wakeupManager = wakeupManager, - envNoiseBuffer = envNoiseBuffer - ) + if ((waitSpeechStartMs > 0 && now - waitSpeechStartMs >= idleTimeoutMs) || + (waitSpeechFailStartMs > 0 && now - waitSpeechFailStartMs >= idleTimeoutMs) + ) { + currentTimeoutType = if (hasInvalidSpeech) { + TimeoutType.INVALID_SPEECH_TIMEOUT + } else { + TimeoutType.IDLE_TIMEOUT + } + LogUtils.d(TAG, "⏱ WAIT_SPEECH timeout → WAIT_WAKEUP | 超时类型: $currentTimeoutType") + onTimeoutTip?.invoke(currentTimeoutType) + resetAll() return } - // 唤醒观察期 - if (stateManager.inKwsObserve && now - stateManager.kwsObserveStartMs < VoiceConfig.KWS_OBSERVE_MS) return - stateManager.inKwsObserve = false + if (inKwsObserve && now - kwsObserveStartMs < KWS_OBSERVE_MS) return + inKwsObserve = false - // VAD检测 vadManager.accept(samples) } VoiceState.RECORDING -> { - // 音频缓存 audioBuffer.addAll(samples.asList()) vadManager.accept(samples) - // 环境校准 - stateManager.currentEnvBaseline = VoiceUtils.calibrateEnvBaseline( - samples = samples, - vadManager = vadManager, - envNoiseBuffer = envNoiseBuffer, - currentEnvBaseline = stateManager.currentEnvBaseline - ) - stateManager.isNoisyEnvironment = stateManager.currentEnvBaseline >= VoiceConfig.NOISE_BASELINE_THRESHOLD + calibrateEnvBaseline(samples) + updateRealtimeEnergy(samples) + updateRealtimeFrameStats() + isNoisyEnvironment = currentEnvBaseline >= NOISE_BASELINE_THRESHOLD - // 更新实时统计 - val energyStats = VoiceUtils.updateRealtimeEnergy( - samples = samples, - vadManager = vadManager, - isNoisyEnvironment = 
stateManager.isNoisyEnvironment, - currentEnvBaseline = stateManager.currentEnvBaseline, - realtimeEnergySum = realtimeEnergySum, - realtimeEnergyCount = realtimeEnergyCount, - realtimePeakRms = realtimePeakRms - ) - realtimeEnergySum = energyStats.first - realtimeEnergyCount = energyStats.second - realtimePeakRms = energyStats.third - - val frameStats = VoiceUtils.updateRealtimeFrameStats(vadManager) - realtimeTotalFrames = frameStats.totalFrames - realtimeSpeechFrames = frameStats.speechFrames - realtimeContinuousSpeechFrames = frameStats.continuousSpeechFrames - realtimeLastFrameIsSpeech = frameStats.lastFrameIsSpeech - - // 多人对话检测 - isMultiPersonDialogueDetected = VoiceUtils.checkMultiPersonDialogue( - now = now, - recordingStartMs = stateManager.recordingStartMs, - realtimeEnergySum = realtimeEnergySum, - realtimeEnergyCount = realtimeEnergyCount, - realtimePeakRms = realtimePeakRms, - realtimeSpeechFrames = realtimeSpeechFrames, - realtimeContinuousSpeechFrames = realtimeContinuousSpeechFrames, - vadManager = vadManager - ) - - if (isMultiPersonDialogueDetected) { - LogUtils.w(VoiceConfig.TAG, "🚨 录音中识别出多人对话,提前终止") + if (checkMultiPersonDialogueRealtime(now)) { + LogUtils.w(TAG, "🚨 录音中识别出多人对话,提前终止") finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms) return } - // 最大录音时长检测 - if (System.currentTimeMillis() - stateManager.recordingStartMs > stateManager.maxRecordingMs) { - LogUtils.w(VoiceConfig.TAG, "⏱ Max recording reached | 当前环境基线: ${stateManager.currentEnvBaseline} | 嘈杂环境: ${stateManager.isNoisyEnvironment}") + if (System.currentTimeMillis() - recordingStartMs > maxRecordingMs) { + LogUtils.w(TAG, "⏱ Max recording reached | 当前环境基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms) } } } } - /** - * 处理唤醒事件 - */ + /* ================= 实时能量更新 ================= */ + private fun updateRealtimeEnergy(samples: FloatArray) { + val rms = vadManager.calcRms(samples) + val 
effectiveThreshold = if (isNoisyEnvironment) currentEnvBaseline * 1.8f else MIN_EFFECTIVE_SPEECH_RMS + if (rms >= effectiveThreshold) { + realtimeEnergySum += rms + realtimeEnergyCount++ + realtimePeakRms = maxOf(realtimePeakRms, rms) + } + } + + /* ================= 实时帧统计 ================= */ + private fun updateRealtimeFrameStats() { + realtimeTotalFrames = vadManager.getTotalFrames() + realtimeSpeechFrames = vadManager.getSpeechFrames() + realtimeContinuousSpeechFrames = vadManager.getContinuousSpeechFrames() + val currentFrameIsSpeech = vadManager.isSpeechDetected() + if (currentFrameIsSpeech) { + realtimeContinuousSpeechFrames = if (realtimeLastFrameIsSpeech) realtimeContinuousSpeechFrames + 1 else 1 + } else { + realtimeContinuousSpeechFrames = 0 + } + realtimeLastFrameIsSpeech = currentFrameIsSpeech + } + + /* ================= 多人对话检测 ================= */ + private fun checkMultiPersonDialogueRealtime(now: Long): Boolean { + val duration = now - recordingStartMs + if (duration < MULTI_DIALOGUE_MIN_DURATION) return false + + val avgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else 0f + val peakAvgRatio = if (avgEnergy > 0) realtimePeakRms / avgEnergy else 0f + val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f + val vadRatio = vadManager.activeSpeechRatio() + + isMultiPersonDialogueDetected = duration >= MULTI_DIALOGUE_MIN_DURATION && + peakAvgRatio in MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO..MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO && + continuousRatio <= MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO && + vadRatio >= MULTI_DIALOGUE_MIN_VAD_RATIO + + return isMultiPersonDialogueDetected + } + + /* ================= 环境基线校准 ================= */ + private fun calibrateEnvBaseline(samples: FloatArray) { + val rms = vadManager.calcRms(samples) + val validRms = if (rms < currentEnvBaseline + 0.002f) rms else currentEnvBaseline + if (rms < 0.015f) { + if (envNoiseBuffer.size >= 
BASELINE_WINDOW_SIZE) { + envNoiseBuffer.removeFirst() + } + envNoiseBuffer.addLast(validRms) + currentEnvBaseline = envNoiseBuffer.maxOrNull() ?: 0.001f + } + } + + /* ================= 唤醒处理 ================= */ private fun handleWakeupEvent() { - if (stateManager.state == VoiceState.UPLOADING) return + if (state == VoiceState.UPLOADING) return stopBackendAudio?.invoke() - stateManager.enterWakeup(interrupt = true, resetRealtimeStats = ::resetRealtimeStats) - preBuffer.clear() + enterWakeup(interrupt = true) + } + + private fun enterWakeup(interrupt: Boolean) { + waitSpeechFailStartMs = System.currentTimeMillis() + waitSpeechStartMs = System.currentTimeMillis() + + hasInvalidSpeech = false + currentTimeoutType = TimeoutType.IDLE_TIMEOUT + + if (interrupt) { + audioBuffer.clear() + vadManager.reset() + vadStarted = false + resetRealtimeStats() + } + + inKwsObserve = true + kwsObserveStartMs = System.currentTimeMillis() onWakeup() + LogUtils.d(TAG, "🔔 唤醒成功 | 环境基线: $currentEnvBaseline") } - /** - * VAD开始回调 - */ private fun onVadStart() { - stateManager.onVadStart( - audioBuffer = audioBuffer, - preBuffer = preBuffer, - resetRealtimeStats = ::resetRealtimeStats - ) + if (state != VoiceState.WAIT_SPEECH) return + LogUtils.d(TAG, "🎤 REAL VAD START | 环境基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") + vadStarted = true + recordingStartMs = System.currentTimeMillis() + audioBuffer.clear() + audioBuffer.addAll(preBuffer) + resetRealtimeStats() + state = VoiceState.RECORDING } - /** - * VAD结束回调 - */ private fun onVadEnd(avgEnergy: Float, peakRms: Float) { - if (stateManager.state != VoiceState.RECORDING) return - LogUtils.d(VoiceConfig.TAG, "🧠 VAD END | 环境基线: ${stateManager.currentEnvBaseline} | 嘈杂环境: ${stateManager.isNoisyEnvironment}") + if (state != VoiceState.RECORDING) return + LogUtils.d(TAG, "🧠 VAD END | 环境基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") val realAvgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else avgEnergy 
val realPeakRms = if (realtimePeakRms > 0) realtimePeakRms else peakRms finishSentence(realAvgEnergy, realPeakRms) } - /** - * 结束录音处理 - */ + /* ================= 微弱人声过滤 ================= */ + private fun filterWeakVoice(duration: Long, avgEnergy: Float, peakRms: Float): Boolean { + if (duration < MIN_EFFECTIVE_VOICE_DURATION) { + LogUtils.w(TAG, "❌ 微弱人声过滤:时长${duration}ms < ${MIN_EFFECTIVE_VOICE_DURATION}ms") + return true + } + + val voiceFrameRatio = if (realtimeTotalFrames > 0) realtimeSpeechFrames.toFloat() / realtimeTotalFrames else 0f + if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && voiceFrameRatio < MIN_VOICE_FRAME_RATIO) { + LogUtils.w(TAG, "❌ 微弱人声过滤:帧占比${voiceFrameRatio} < ${MIN_VOICE_FRAME_RATIO}") + return true + } + + val peakBaselineRatio = peakRms / currentEnvBaseline + if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && peakBaselineRatio < MIN_PEAK_ENERGY_RATIO) { + LogUtils.w(TAG, "❌ 微弱人声过滤:峰值/基线${peakBaselineRatio} < ${MIN_PEAK_ENERGY_RATIO}") + return true + } + + if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && realtimeContinuousSpeechFrames < MIN_CONTINUOUS_VOICE_FRAMES) { + LogUtils.w(TAG, "❌ 微弱人声过滤:连续帧${realtimeContinuousSpeechFrames} < ${MIN_CONTINUOUS_VOICE_FRAMES}") + return true + } + + val energyBaselineRatio = avgEnergy / currentEnvBaseline + if (avgEnergy < 0.005f && energyBaselineRatio < 1.2f) { + LogUtils.w(TAG, "❌ 微弱人声过滤:能量/基线${energyBaselineRatio} < 1.2") + return true + } + + return false + } + + /* ================= 结束录音 ================= */ private fun finishSentence(avgEnergy: Float = 0f, peakRms: Float = 0f) { val now = System.currentTimeMillis() - val duration = now - stateManager.recordingStartMs + val duration = now - recordingStartMs - // 基础过滤:语音过短 - if (!stateManager.vadStarted || duration < VoiceConfig.MIN_SPEECH_MS) { - LogUtils.d(VoiceConfig.TAG, "❌ 语音过短: $duration ms | 基线: ${stateManager.currentEnvBaseline} | 嘈杂环境: ${stateManager.isNoisyEnvironment}") - stateManager.hasInvalidSpeech = true - 
stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager) + if (!vadStarted || duration < MIN_SPEECH_MS) { + LogUtils.d(TAG, "❌ 语音过短: $duration ms | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") + hasInvalidSpeech = true + resetToWaitSpeech() return } - // 微弱人声过滤 - if (VoiceUtils.filterWeakVoice( - duration = duration, - avgEnergy = avgEnergy, - peakRms = peakRms, - currentEnvBaseline = stateManager.currentEnvBaseline, - realtimeTotalFrames = realtimeTotalFrames, - realtimeSpeechFrames = realtimeSpeechFrames, - realtimeContinuousSpeechFrames = realtimeContinuousSpeechFrames - ) - ) { - stateManager.hasInvalidSpeech = true - stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager) + if (filterWeakVoice(duration, avgEnergy, peakRms)) { + hasInvalidSpeech = true + resetToWaitSpeech() return } @@ -275,95 +422,104 @@ class VoiceController( val vadRatio = vadManager.activeSpeechRatio() val peakAvgRatio = if (avgEnergy > 0f) peakRms / avgEnergy else 0f - LogUtils.d(VoiceConfig.TAG, "📊 录音信息 | 时长: $duration ms | 能量: $avgEnergy | 峰均比: $peakAvgRatio | 基线: ${stateManager.currentEnvBaseline} | 嘈杂环境: ${stateManager.isNoisyEnvironment}") - LogUtils.d(VoiceConfig.TAG, "📊 实时帧统计 | 总帧: $realtimeTotalFrames | 语音帧: $realtimeSpeechFrames | 连续语音帧: $realtimeContinuousSpeechFrames") + LogUtils.d(TAG, "📊 录音信息 | 时长: $duration ms | 能量: $avgEnergy | 峰均比: $peakAvgRatio | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") + LogUtils.d(TAG, "📊 实时帧统计 | 总帧: $realtimeTotalFrames | 语音帧: $realtimeSpeechFrames | 连续语音帧: $realtimeContinuousSpeechFrames") - // 多人对话过滤 if (isMultiPersonDialogueDetected) { - LogUtils.w(VoiceConfig.TAG, "❌ 过滤多人对话垃圾语音 | 时长: $duration ms") - stateManager.hasInvalidSpeech = true - stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager) + LogUtils.w(TAG, "❌ 过滤多人对话垃圾语音 | 时长: $duration ms") + hasInvalidSpeech = true + resetToWaitSpeech() return } - // 声纹验证 - if (VoiceConfig.ENABLE_STRICT_SPEAKER_VERIFY) { - 
val isCurrentUser = VoiceUtils.verifySpeaker( - audio = audio, - isNoisyEnvironment = stateManager.isNoisyEnvironment, - extractor = speakerExtractor, - manager = speakerManager - ) + // 声纹验证(核心极简版) + if (ENABLE_STRICT_SPEAKER_VERIFY) { + val isCurrentUser = verifySpeaker(audio) if (!isCurrentUser) { - LogUtils.w(VoiceConfig.TAG, "❌ 非当前唤醒用户,拒绝语音 | 录音时长: $duration ms | 嘈杂环境: ${stateManager.isNoisyEnvironment}") - stateManager.hasInvalidSpeech = true - stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager) + LogUtils.w(TAG, "❌ 非当前唤醒用户,拒绝语音 | 录音时长: $duration ms | 嘈杂环境: $isNoisyEnvironment") + hasInvalidSpeech = true + resetToWaitSpeech() return } - LogUtils.d(VoiceConfig.TAG, "✅ 当前用户语音,继续处理 | 录音时长: $duration ms | 嘈杂环境: ${stateManager.isNoisyEnvironment}") + LogUtils.d(TAG, "✅ 当前用户语音,继续处理 | 录音时长: $duration ms | 嘈杂环境: $isNoisyEnvironment") } + // 远场过滤 - val isFarField = avgEnergy < VoiceConfig.MAX_FAR_FIELD_ENERGY - val isInvalidPeakRatio = peakAvgRatio < VoiceConfig.MIN_VALID_PEAK_AVG_RATIO + val isFarField = avgEnergy < MAX_FAR_FIELD_ENERGY + val isInvalidPeakRatio = peakAvgRatio < MIN_VALID_PEAK_AVG_RATIO if (isFarField && isInvalidPeakRatio) { - LogUtils.w(VoiceConfig.TAG, "❌ 远场/无效语音过滤 | 能量: $avgEnergy < ${VoiceConfig.MAX_FAR_FIELD_ENERGY}") - stateManager.hasInvalidSpeech = true - stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager) + LogUtils.w(TAG, "❌ 远场/无效语音过滤 | 能量: $avgEnergy < $MAX_FAR_FIELD_ENERGY") + hasInvalidSpeech = true + resetToWaitSpeech() return } // 非连续判定 val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f val peakPositionRatio = vadManager.getPeakPositionRatio() - val isDiscontinuous = continuousRatio < VoiceConfig.MIN_CONTINUOUS_FRAME_RATIO && - realtimeSpeechFrames < VoiceConfig.MIN_EFFECTIVE_SPEECH_FRAMES && - peakPositionRatio > VoiceConfig.MAX_PEAK_POSITION_RATIO + val isDiscontinuous = continuousRatio < 
MIN_CONTINUOUS_FRAME_RATIO && + realtimeSpeechFrames < MIN_EFFECTIVE_SPEECH_FRAMES && + peakPositionRatio > MAX_PEAK_POSITION_RATIO if (isDiscontinuous) { - LogUtils.w(VoiceConfig.TAG, "❌ 非连续杂音过滤 | 连续占比: $continuousRatio < ${VoiceConfig.MIN_CONTINUOUS_FRAME_RATIO}") - stateManager.hasInvalidSpeech = true - stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager) + LogUtils.w(TAG, "❌ 非连续杂音过滤 | 连续占比: $continuousRatio < $MIN_CONTINUOUS_FRAME_RATIO") + hasInvalidSpeech = true + resetToWaitSpeech() return } // 分场景阈值过滤 - val thresholdConfig = VoiceUtils.getThresholdConfig(duration, stateManager.currentEnvBaseline) + val isQuietEnv = currentEnvBaseline < BASELINE_QUIET_THRESHOLD + val thresholdConfig = when { + duration in SHORT_SPEECH_MIN..SHORT_SPEECH_MAX -> { + val coeff = if (isQuietEnv) SHORT_SPEECH_ENERGY_COEFF_QUIET else SHORT_SPEECH_ENERGY_COEFF_NOISY + val energyThreshold = currentEnvBaseline * coeff + ThresholdConfig(energyThreshold, SHORT_SPEECH_VAD_COEFF, SHORT_SPEECH_MIN_SCORE, "短语音") + } + else -> { + val coeff = if (isQuietEnv) LONG_SPEECH_ENERGY_COEFF_QUIET else LONG_SPEECH_ENERGY_COEFF_NOISY + val energyThreshold = currentEnvBaseline * coeff + ThresholdConfig(energyThreshold, LONG_SPEECH_VAD_COEFF, LONG_SPEECH_MIN_SCORE, "长语音") + } + } + val energyPass = avgEnergy >= thresholdConfig.energyThreshold val vadRatioPass = vadRatio >= thresholdConfig.vadRatioThreshold if (!energyPass || !vadRatioPass) { - LogUtils.w(VoiceConfig.TAG, "❌ 低能量语音阈值过滤 | 能量: $avgEnergy < ${thresholdConfig.energyThreshold} | 占比: $vadRatio < ${thresholdConfig.vadRatioThreshold} | 场景: ${thresholdConfig.scene}") - stateManager.hasInvalidSpeech = true - stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager) + LogUtils.w(TAG, "❌ 低能量语音阈值过滤 | 能量: $avgEnergy < ${thresholdConfig.energyThreshold} | 占比: $vadRatio < ${thresholdConfig.vadRatioThreshold} | 场景: ${thresholdConfig.scene}") + hasInvalidSpeech = true + resetToWaitSpeech() return } // 评分判定 - val 
score = VoiceUtils.calculateSpeechScore( - duration = duration, - avgEnergy = avgEnergy, - continuousRatio = continuousRatio, - thresholdConfig = thresholdConfig - ) + var score = 0 + score += when { + duration >= 4000 -> 3 + duration >= 2500 -> 2 + else -> 1 + } + score += if (avgEnergy >= thresholdConfig.energyThreshold) 1 else 0 + score += if (continuousRatio >= MIN_CONTINUOUS_FRAME_RATIO) 1 else 0 + val pass = score >= thresholdConfig.minScore if (!pass) { - LogUtils.w(VoiceConfig.TAG, "❌ 评分不足过滤 | 总分: $score < ${thresholdConfig.minScore} | 场景: ${thresholdConfig.scene}") - stateManager.hasInvalidSpeech = true - stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager) + LogUtils.w(TAG, "❌ 评分不足过滤 | 总分: $score < ${thresholdConfig.minScore} | 场景: ${thresholdConfig.scene}") + hasInvalidSpeech = true + resetToWaitSpeech() return } // 最终通过 audioBuffer.clear() - stateManager.state = VoiceState.UPLOADING + state = VoiceState.UPLOADING onFinalAudio(audio) resetRealtimeStats() - stateManager.hasInvalidSpeech = false - LogUtils.i(VoiceConfig.TAG, "✅ 语音通过 | 时长: $duration ms | 能量: $avgEnergy | 场景: ${thresholdConfig.scene} | 嘈杂环境: ${stateManager.isNoisyEnvironment}") + hasInvalidSpeech = false + LogUtils.i(TAG, "✅ 语音通过 | 时长: $duration ms | 能量: $avgEnergy | 场景: ${thresholdConfig.scene} | 嘈杂环境: $isNoisyEnvironment") } - /** - * 重置实时统计 - */ + /* ================= 重置实时统计 ================= */ private fun resetRealtimeStats() { realtimeEnergySum = 0f realtimeEnergyCount = 0 @@ -375,34 +531,95 @@ class VoiceController( isMultiPersonDialogueDetected = false } - // ================= 对外API(完全不变) ================= - fun onPlayStartPrompt() = stateManager.onPlayStartPrompt() - fun onPlayEndPrompt() = stateManager.onPlayEndPrompt() - fun onPlayStartBackend() = stateManager.onPlayStartBackend() - fun onPlayEndBackend() = stateManager.onPlayEndBackend() - fun onUploadFinished(success: Boolean) = stateManager.onUploadFinished(success) + /* ================= 播放/上传回调 
================= */
    /**
     * Called when the prompt tone starts playing: logs the current baseline / noisy flag and
     * assigns [VoiceState.PLAYING_PROMPT] to `state`.
     */
    fun onPlayStartPrompt() {
        LogUtils.d(TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
        state = VoiceState.PLAYING_PROMPT
    }

    /**
     * Called when the prompt tone finishes: arms the cooldown deadline
     * (`now + SPEECH_COOLDOWN_MS`) and assigns [VoiceState.WAIT_SPEECH_COOLDOWN].
     */
    fun onPlayEndPrompt() {
        speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
        LogUtils.d(TAG, "🎵 提示音结束 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
        state = VoiceState.WAIT_SPEECH_COOLDOWN
    }

    /**
     * Called when backend (response) audio starts playing.
     *
     * The transition to [VoiceState.PLAYING_BACKEND] is only accepted while `state` is
     * [VoiceState.UPLOADING]; in any other state the call logs a warning and returns.
     */
    fun onPlayStartBackend() {
        if (state != VoiceState.UPLOADING) {
            LogUtils.w(TAG, "🎶 非上传完成状态,禁止切换到 PLAYING_BACKEND | 当前状态: $state")
            return
        }
        LogUtils.d(TAG, "🎶 开始播放后台音频 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
        state = VoiceState.PLAYING_BACKEND
    }

    /**
     * Called when backend audio finishes: arms the cooldown deadline and assigns
     * [VoiceState.WAIT_SPEECH_COOLDOWN]. Note that, unlike [onPlayStartBackend], no state
     * precondition is checked here.
     */
    fun onPlayEndBackend() {
        speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
        LogUtils.d(TAG, "🎶 后台音频结束 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
        state = VoiceState.WAIT_SPEECH_COOLDOWN
    }

    /**
     * Reports the result of the audio upload. Ignored unless `state` is [VoiceState.UPLOADING].
     *
     * @param success when `false`, the controller arms the cooldown deadline and assigns
     *   [VoiceState.WAIT_SPEECH_COOLDOWN]; when `true`, this method only logs and leaves
     *   `state` untouched (the next transition is made by [onPlayStartBackend]).
     */
    fun onUploadFinished(success: Boolean) {
        if (state != VoiceState.UPLOADING) return
        LogUtils.d(TAG, "📤 上传完成 | 成功: $success | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")

        if (!success) {
            speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
            state = VoiceState.WAIT_SPEECH_COOLDOWN
        }
    }

    /**
     * Discards the current recording and assigns [VoiceState.WAIT_SPEECH].
     *
     * Debounced: if the previous accepted reset happened less than
     * `INVALID_RESET_DEBOUNCE_MS` ago, the call returns early WITHOUT clearing buffers or
     * changing `state`.
     */
    private fun resetToWaitSpeech() {
        LogUtils.d(TAG, "🔄 重置到等待说话 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment | 已标记无效说话: $hasInvalidSpeech")
        val now = System.currentTimeMillis()
        if (now - lastInvalidResetMs < INVALID_RESET_DEBOUNCE_MS) {
            LogUtils.d(TAG, "🛡 防抖:1.5秒内重复无效语音,跳过重置")
            return
        }
        lastInvalidResetMs = now
        audioBuffer.clear()
        vadManager.reset()
        vadStarted = false
        resetRealtimeStats()
        state = VoiceState.WAIT_SPEECH
        // Only start the "failed speech" timer if it is not already running.
        if (waitSpeechFailStartMs == 0L) waitSpeechFailStartMs = System.currentTimeMillis()
    }

    /**
     * Clears every buffer, timer and flag touched below, resets the VAD and wake-up managers,
     * restores the environment baseline to 0.001f, and finally assigns
     * [VoiceState.WAIT_WAKEUP]. The `state` assignment is deliberately the last statement.
     */
    private fun resetAll() {
        LogUtils.d(TAG, "🔄 重置所有状态 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment | 本次超时类型: $currentTimeoutType")
        audioBuffer.clear()
        preBuffer.clear()
        vadManager.reset()
        wakeupManager.reset()
        vadStarted = false
        waitSpeechStartMs = 0L
        waitSpeechFailStartMs = 0L
        envNoiseBuffer.clear()
        currentEnvBaseline = 0.001f
        isNoisyEnvironment = false
        resetRealtimeStats()
        hasInvalidSpeech = false
        currentTimeoutType = TimeoutType.IDLE_TIMEOUT
        state = VoiceState.WAIT_WAKEUP
    }

    /**
     * Releases the wake-up manager, resets the VAD and realtime statistics, and releases the
     * speaker-recognition extractor and manager held by `SpeakerRecognition`.
     *
     * Failures while releasing the speaker-recognition objects are logged, not rethrown.
     * NOTE(review): `SpeakerRecognition.extractor` / `.manager` look like shared objects;
     * confirm no other component still uses them after this call.
     */
    fun release() {
        LogUtils.d(TAG, "🔌 释放资源 | 最终基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
        wakeupManager.release()
        vadManager.reset()
        envNoiseBuffer.clear()
        resetRealtimeStats()
        hasInvalidSpeech = false
        currentTimeoutType = TimeoutType.IDLE_TIMEOUT
        isNoisyEnvironment = false

        runCatching {
            SpeakerRecognition.extractor.release()
            // The manager is released under the same lock that guards verification.
            speakerManagerLock.withLock {
                SpeakerRecognition.manager.release()
            }
            LogUtils.d(TAG, "✅ 声纹识别器资源已释放")
        }.onFailure {
            LogUtils.e(TAG, "❌ 释放声纹识别器资源失败", it)
        }
    }

    // NOTE(review): the lines below are the tail of a function whose declaration sits in elided
    // diff context (the log text suggests a finalizer). It calls release() and logs any failure.
        runCatching {
            release()
        }.onFailure {
            LogUtils.e(TAG, "❌ finalize 释放资源失败", it)
        }
    }

    /**
     * Appends [samples] to `preBuffer`, evicting the oldest sample whenever the buffer grows
     * beyond `PRE_BUFFER_SIZE`, so the buffer always holds the most recent samples.
     */
    private fun cachePreBuffer(samples: FloatArray) {
        for (s in samples) {
            preBuffer.addLast(s)
            if (preBuffer.size > PRE_BUFFER_SIZE) preBuffer.removeFirst()
        }
    }

    // Per-scenario threshold bundle: energy threshold, VAD-ratio threshold, minimum score,
    // and a scene label that is only used in log output.
    private data class ThresholdConfig(
        val energyThreshold: Float,
        val vadRatioThreshold: Float,
        val minScore: Int,
        val scene: String
    )

    /* ================= Core: minimal speaker verification ================= */
    /**
     * Verifies [audio] against the embedding registered under `CURRENT_USER_ID`.
     *
     * @param audio samples to verify, interpreted at `SAMPLE_RATE`.
     * @return the verifier's verdict; `false` when [audio] is empty, when the stream is not
     *   ready, or when any exception is thrown.
     */
    private fun verifySpeaker(audio: FloatArray): Boolean {
        if (audio.isEmpty()) {
            LogUtils.w(TAG, "❌ 待验证音频为空,声纹验证失败")
            return false
        }

        // 1. Crop the audio to the "valid" part of this recording.
        val audioDurationMs = (audio.size.toFloat() / SAMPLE_RATE * 1000).toLong()
        // Keep only the last N ms of audio (N = computed duration).
        // NOTE(review): validSampleCount is derived from audio.size itself, so this can only trim
        // the sub-millisecond rounding remainder at the front; it does not drop older buffered
        // audio as the original comment intended — confirm the intended duration source.
        val validAudio = if (audioDurationMs > 0) {
            val validSampleCount = (audioDurationMs * SAMPLE_RATE / 1000).toInt()
            if (validSampleCount < audio.size) {
                audio.copyOfRange(audio.size - validSampleCount, audio.size)
            } else {
                audio
            }
        } else {
            audio
        }

        // 2. Pick the threshold per scenario (no tolerance logic, only the threshold varies):
        //    short clips first, then noisy environment, otherwise the quiet threshold.
        val finalThreshold = when {
            audioDurationMs < SHORT_AUDIO_DURATION_MS -> SPEAKER_THRESHOLD_SHORT
            isNoisyEnvironment -> SPEAKER_THRESHOLD_NOISY
            else -> SPEAKER_THRESHOLD_QUIET
        }

        var stream: OnlineStream? = null
        return try {
            stream = SpeakerRecognition.extractor.createStream()
            stream.acceptWaveform(samples = validAudio, sampleRate = SAMPLE_RATE) // verify with the cropped audio
            stream.inputFinished()

            if (!SpeakerRecognition.extractor.isReady(stream)) {
                LogUtils.w(TAG, "❌ 音频Stream未就绪,验证失败")
                return false
            }

            val embedding = SpeakerRecognition.extractor.compute(stream)

            // 3. Plain verification: pass means pass, otherwise reject.
            speakerManagerLock.withLock {
                val verifyPass = SpeakerRecognition.manager.verify(
                    name = CURRENT_USER_ID,
                    embedding = embedding,
                    threshold = finalThreshold
                )

                // Log the key figures, including the duration after cropping.
                LogUtils.d(TAG, "📊 声纹验证 | 阈值: $finalThreshold | 通过: $verifyPass | 嘈杂环境: $isNoisyEnvironment | 原始时长: ${audioDurationMs}ms | 验证时长: ${(validAudio.size.toFloat()/SAMPLE_RATE*1000).toLong()}ms")

                // No tolerance at all: the verifier's result is the final result.
                return verifyPass
            }
        } catch (e: Exception) {
            LogUtils.e(TAG, "❌ 声纹验证异常,拒绝", e)
            return false
        } finally {
            // Always release the stream, including on the early returns above.
            stream?.release()
        }
    }
}
-) { - var state: VoiceState = VoiceState.WAIT_WAKEUP - set(value) { - field = value - LogUtils.d(VoiceConfig.TAG, "➡ State = $value") - onStateChanged?.invoke(value) - } - - // 超时相关 - val idleTimeoutMs = idleTimeoutSeconds * 1000L - val maxRecordingMs = maxRecordingSeconds * 1000L - var waitSpeechFailStartMs = 0L - var waitSpeechStartMs = 0L - var speechEnableAtMs = 0L - var lastInvalidResetMs = 0L - - // 无效说话标记 + 超时类型 - var hasInvalidSpeech = false - var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT - - // 唤醒观察标记 - var inKwsObserve = false - var kwsObserveStartMs = 0L - - // 环境状态 - var isNoisyEnvironment = false - var currentEnvBaseline = 0.001f - - // 录音状态 - var recordingStartMs = 0L - var vadStarted = false - - /** - * 检查等待说话超时 - * 修复点:返回是否超时,由外部调用 resetAll()(避免内部依赖外部对象) - */ - fun checkWaitSpeechTimeout(now: Long): Boolean { - val isTimeout = (waitSpeechStartMs > 0 && now - waitSpeechStartMs >= idleTimeoutMs) || - (waitSpeechFailStartMs > 0 && now - waitSpeechFailStartMs >= idleTimeoutMs) - - if (isTimeout) { - currentTimeoutType = if (hasInvalidSpeech) { - TimeoutType.INVALID_SPEECH_TIMEOUT - } else { - TimeoutType.IDLE_TIMEOUT - } - LogUtils.d(VoiceConfig.TAG, "⏱ WAIT_SPEECH timeout → WAIT_WAKEUP | 超时类型: $currentTimeoutType") - onTimeoutTip?.invoke(currentTimeoutType) - // 修复点:不再内部调用 resetAll(),改为返回超时状态,由外部处理 - return true - } - return false - } - - /** - * 处理等待说话冷却状态 - */ - fun handleWaitSpeechCooldown(now: Long): Boolean { - if (now >= speechEnableAtMs) { - waitSpeechFailStartMs = now - state = VoiceState.WAIT_SPEECH - waitSpeechStartMs = now - return true - } - return false - } - - /** - * 进入唤醒状态 - */ - fun enterWakeup(interrupt: Boolean, resetRealtimeStats: () -> Unit) { - val now = System.currentTimeMillis() - waitSpeechFailStartMs = now - waitSpeechStartMs = now - hasInvalidSpeech = false - currentTimeoutType = TimeoutType.IDLE_TIMEOUT - - if (interrupt) { - resetRealtimeStats() - vadStarted = false - } - - inKwsObserve = true - 
kwsObserveStartMs = now - } - - /** - * 重置到等待说话状态 - */ - fun resetToWaitSpeech(resetRealtimeStats: () -> Unit, audioBuffer: MutableList, vadManager: VadManager) { - LogUtils.d(VoiceConfig.TAG, "🔄 重置到等待说话 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment | 已标记无效说话: $hasInvalidSpeech") - val now = System.currentTimeMillis() - if (now - lastInvalidResetMs < VoiceConfig.INVALID_RESET_DEBOUNCE_MS) { - LogUtils.d(VoiceConfig.TAG, "🛡 防抖:1.5秒内重复无效语音,跳过重置") - return - } - lastInvalidResetMs = now - audioBuffer.clear() - vadManager.reset() - vadStarted = false - resetRealtimeStats() - state = VoiceState.WAIT_SPEECH - if (waitSpeechFailStartMs == 0L) waitSpeechFailStartMs = System.currentTimeMillis() - } - - /** - * 重置所有状态 - */ - fun resetAll( - resetRealtimeStats: () -> Unit, - audioBuffer: MutableList, - preBuffer: ArrayDeque, - vadManager: VadManager, - wakeupManager: WakeupManager, - envNoiseBuffer: ArrayDeque - ) { - LogUtils.d(VoiceConfig.TAG, "🔄 重置所有状态 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment | 本次超时类型: $currentTimeoutType") - audioBuffer.clear() - preBuffer.clear() - vadManager.reset() - wakeupManager.reset() - vadStarted = false - waitSpeechStartMs = 0L - waitSpeechFailStartMs = 0L - envNoiseBuffer.clear() - currentEnvBaseline = 0.001f - isNoisyEnvironment = false - resetRealtimeStats() - hasInvalidSpeech = false - currentTimeoutType = TimeoutType.IDLE_TIMEOUT - state = VoiceState.WAIT_WAKEUP - } - - /** - * 播放提示音开始 - */ - fun onPlayStartPrompt() { - LogUtils.d(VoiceConfig.TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") - state = VoiceState.PLAYING_PROMPT - } - - /** - * 播放提示音结束 - */ - fun onPlayEndPrompt() { - speechEnableAtMs = System.currentTimeMillis() + VoiceConfig.SPEECH_COOLDOWN_MS - LogUtils.d(VoiceConfig.TAG, "🎵 提示音结束 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") - state = VoiceState.WAIT_SPEECH_COOLDOWN - } - - /** - * 播放后台音频开始 - */ - fun onPlayStartBackend() { - if (state != VoiceState.UPLOADING) { - 
LogUtils.w(VoiceConfig.TAG, "🎶 非上传完成状态,禁止切换到 PLAYING_BACKEND | 当前状态: $state") - return - } - LogUtils.d(VoiceConfig.TAG, "🎶 开始播放后台音频 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") - state = VoiceState.PLAYING_BACKEND - } - - /** - * 播放后台音频结束 - */ - fun onPlayEndBackend() { - speechEnableAtMs = System.currentTimeMillis() + VoiceConfig.SPEECH_COOLDOWN_MS - LogUtils.d(VoiceConfig.TAG, "🎶 后台音频结束 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") - state = VoiceState.WAIT_SPEECH_COOLDOWN - } - - /** - * 上传完成 - */ - fun onUploadFinished(success: Boolean) { - if (state != VoiceState.UPLOADING) return - LogUtils.d(VoiceConfig.TAG, "📤 上传完成 | 成功: $success | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") - - if (!success) { - speechEnableAtMs = System.currentTimeMillis() + VoiceConfig.SPEECH_COOLDOWN_MS - state = VoiceState.WAIT_SPEECH_COOLDOWN - } - } - - /** - * VAD开始回调 - */ - fun onVadStart(audioBuffer: MutableList, preBuffer: ArrayDeque, resetRealtimeStats: () -> Unit) { - if (state != VoiceState.WAIT_SPEECH) return - LogUtils.d(VoiceConfig.TAG, "🎤 REAL VAD START | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") - vadStarted = true - recordingStartMs = System.currentTimeMillis() - audioBuffer.clear() - audioBuffer.addAll(preBuffer) - resetRealtimeStats() - state = VoiceState.RECORDING - } -} \ No newline at end of file diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceUtils.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceUtils.kt deleted file mode 100644 index 25768c8..0000000 --- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceUtils.kt +++ /dev/null @@ -1,355 +0,0 @@ -package com.zs.smarthuman.sherpa - -import com.blankj.utilcode.util.LogUtils -import com.k2fsa.sherpa.onnx.OnlineStream -import com.k2fsa.sherpa.onnx.SpeakerEmbeddingExtractor -import com.k2fsa.sherpa.onnx.SpeakerEmbeddingManager -import java.util.ArrayDeque -import java.util.concurrent.locks.ReentrantLock -import kotlin.concurrent.withLock - -/** - * 
语音处理通用工具类(优化微弱人声过滤逻辑,适配正常语音) - */ -object VoiceUtils { - private val speakerManagerLock = ReentrantLock() - - /** - * 环境基线校准 - */ - fun calibrateEnvBaseline( - samples: FloatArray, - vadManager: VadManager, - envNoiseBuffer: ArrayDeque, - currentEnvBaseline: Float - ): Float { - val rms = vadManager.calcRms(samples) - val validRms = if (rms < currentEnvBaseline + 0.002f) rms else currentEnvBaseline - if (rms < 0.015f) { - if (envNoiseBuffer.size >= VoiceConfig.BASELINE_WINDOW_SIZE) { - envNoiseBuffer.removeFirst() - } - envNoiseBuffer.addLast(validRms) - return envNoiseBuffer.maxOrNull() ?: 0.001f - } - return currentEnvBaseline - } - - /** - * 更新实时能量统计 - */ - fun updateRealtimeEnergy( - samples: FloatArray, - vadManager: VadManager, - isNoisyEnvironment: Boolean, - currentEnvBaseline: Float, - realtimeEnergySum: Float, - realtimeEnergyCount: Int, - realtimePeakRms: Float - ): Triple { - val rms = vadManager.calcRms(samples) - val effectiveThreshold = if (isNoisyEnvironment) currentEnvBaseline * 1.8f else VoiceConfig.MIN_EFFECTIVE_SPEECH_RMS - var newSum = realtimeEnergySum - var newCount = realtimeEnergyCount - var newPeak = realtimePeakRms - if (rms >= effectiveThreshold) { - newSum += rms - newCount++ - newPeak = maxOf(newPeak, rms) - } - return Triple(newSum, newCount, newPeak) - } - - /** - * 更新实时帧统计 - */ - fun updateRealtimeFrameStats(vadManager: VadManager): FrameStats { - val totalFrames = vadManager.getTotalFrames() - val speechFrames = vadManager.getSpeechFrames() - val continuousSpeechFrames = vadManager.getContinuousSpeechFrames() - val currentFrameIsSpeech = vadManager.isSpeechDetected() - val newContinuousFrames = if (currentFrameIsSpeech) { - if (vadManager.getContinuousSpeechFrames() > 0) continuousSpeechFrames + 1 else 1 - } else { - 0 - } - return FrameStats( - totalFrames = totalFrames, - speechFrames = speechFrames, - continuousSpeechFrames = newContinuousFrames, - lastFrameIsSpeech = currentFrameIsSpeech - ) - } - - /** - * 多人对话实时检测 - */ - fun 
checkMultiPersonDialogue( - now: Long, - recordingStartMs: Long, - realtimeEnergySum: Float, - realtimeEnergyCount: Int, - realtimePeakRms: Float, - realtimeSpeechFrames: Int, - realtimeContinuousSpeechFrames: Int, - vadManager: VadManager - ): Boolean { - val duration = now - recordingStartMs - if (duration < VoiceConfig.MULTI_DIALOGUE_MIN_DURATION) return false - - val avgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else 0f - val peakAvgRatio = if (avgEnergy > 0) realtimePeakRms / avgEnergy else 0f - val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f - val vadRatio = vadManager.activeSpeechRatio() - - return duration >= VoiceConfig.MULTI_DIALOGUE_MIN_DURATION && - peakAvgRatio in VoiceConfig.MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO..VoiceConfig.MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO && - continuousRatio <= VoiceConfig.MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO && - vadRatio >= VoiceConfig.MULTI_DIALOGUE_MIN_VAD_RATIO - } - - /** - * 微弱人声过滤(精简版:保留核心层,删除冗余层,避免过度过滤) - */ - fun filterWeakVoice( - duration: Long, - avgEnergy: Float, - peakRms: Float, - currentEnvBaseline: Float, - realtimeTotalFrames: Int, - realtimeSpeechFrames: Int, - realtimeContinuousSpeechFrames: Int - ): Boolean { - // 1. 基础时长过滤(必需:过滤极短杂音) - if (duration < VoiceConfig.MIN_EFFECTIVE_VOICE_DURATION) { - LogUtils.w("${VoiceConfig.TAG}", "❌ 微弱人声过滤:时长${duration}ms < ${VoiceConfig.MIN_EFFECTIVE_VOICE_DURATION}ms") - return true - } - - // 2. 动态能量阈值过滤(核心:分场景放宽短语音阈值) - val dynamicEnergyThreshold = if (duration < VoiceConfig.SHORT_LONG_SPEECH_CUTOFF_MS) - VoiceConfig.SHORT_SPEECH_ENERGY_THRESHOLD - else - VoiceConfig.MIN_NORMAL_VOICE_ENERGY - - if (avgEnergy < dynamicEnergyThreshold) { - LogUtils.w("${VoiceConfig.TAG}", "❌ 微弱人声过滤:平均能量${avgEnergy} < ${if (duration < 2000) "短语音能量阈值${dynamicEnergyThreshold}" else "正常语音能量阈值${dynamicEnergyThreshold}"}") - return true - } - - // 3. 
计算VAD占比(辅助:为后续过滤准备) - val voiceFrameRatio = if (realtimeTotalFrames > 0) realtimeSpeechFrames.toFloat() / realtimeTotalFrames else 0f - - // 4. 动态VAD占比+能量联合过滤(核心:分场景适配,避免单一维度误判) - val dynamicVadRatioThreshold = when { - duration < VoiceConfig.SHORT_LONG_SPEECH_CUTOFF_MS -> VoiceConfig.SHORT_SPEECH_VAD_RATIO - currentEnvBaseline >= VoiceConfig.NOISE_BASELINE_THRESHOLD -> VoiceConfig.NOISY_ENV_VAD_RATIO - else -> VoiceConfig.MIN_NORMAL_VOICE_VAD_RATIO - } - - if (voiceFrameRatio < dynamicVadRatioThreshold && avgEnergy < VoiceConfig.NORMAL_VOICE_ENERGY_THRESHOLD) { - LogUtils.w("${VoiceConfig.TAG}", "❌ 微弱人声过滤:语音帧占比${voiceFrameRatio} < ${dynamicVadRatioThreshold} | 平均能量${avgEnergy}") - return true - } - - // 5. 纯底噪过滤(必需:过滤无语音的环境底噪) - val energyBaselineRatio = avgEnergy / currentEnvBaseline - if (avgEnergy < VoiceConfig.PURE_NOISE_ENERGY_THRESHOLD && energyBaselineRatio < VoiceConfig.PURE_NOISE_BASELINE_RATIO) { - LogUtils.w("${VoiceConfig.TAG}", "❌ 微弱人声过滤:能量/基线${energyBaselineRatio} < 1.2(纯底噪)") - return true - } - - // (可选保留)峰值/基线过滤:仅对扁平背景音有效,可根据实际场景选择 - // val peakBaselineRatio = peakRms / currentEnvBaseline - // if (avgEnergy < VoiceConfig.NORMAL_VOICE_ENERGY_THRESHOLD && peakBaselineRatio < VoiceConfig.MIN_PEAK_ENERGY_RATIO) { - // LogUtils.w("${VoiceConfig.TAG}", "❌ 微弱人声过滤:峰值/基线${peakBaselineRatio} < ${VoiceConfig.MIN_PEAK_ENERGY_RATIO}") - // return true - // } - - // 正常语音通过所有核心过滤 - LogUtils.d("${VoiceConfig.TAG}", "✅ 正常语音通过微弱人声过滤 | 时长${duration}ms | 能量${avgEnergy} | VAD占比${voiceFrameRatio} | 基线${currentEnvBaseline}") - return false - } - - /** - * 声纹验证核心逻辑(无修改) - */ - fun verifySpeaker( - audio: FloatArray, - isNoisyEnvironment: Boolean, - extractor: SpeakerEmbeddingExtractor, - manager: SpeakerEmbeddingManager - ): Boolean { - if (audio.isEmpty()) { - LogUtils.w("${VoiceConfig.TAG}", "❌ 待验证音频为空,声纹验证失败") - return false - } - - // 裁剪音频:只保留本次录音的有效部分 - val audioDurationMs = (audio.size.toFloat() / VoiceConfig.SAMPLE_RATE * 1000).toLong() - val validAudio = if 
(audioDurationMs > 0) { - val validSampleCount = (audioDurationMs * VoiceConfig.SAMPLE_RATE / 1000).toInt() - if (validSampleCount < audio.size) { - audio.copyOfRange(audio.size - validSampleCount, audio.size) - } else { - audio - } - } else { - audio - } - - // 分场景选阈值 - val finalThreshold = when { - audioDurationMs < VoiceConfig.SHORT_AUDIO_DURATION_MS -> VoiceConfig.SPEAKER_THRESHOLD_SHORT - isNoisyEnvironment -> VoiceConfig.SPEAKER_THRESHOLD_NOISY - else -> VoiceConfig.SPEAKER_THRESHOLD_QUIET - } - - var stream: OnlineStream? = null - return try { - stream = extractor.createStream() - stream.acceptWaveform(samples = validAudio, sampleRate = VoiceConfig.SAMPLE_RATE) - stream.inputFinished() - - if (!extractor.isReady(stream)) { - LogUtils.w("${VoiceConfig.TAG}", "❌ 音频Stream未就绪,验证失败") - return false - } - - val embedding = extractor.compute(stream) - speakerManagerLock.withLock { - val verifyPass = manager.verify( - name = VoiceConfig.CURRENT_USER_ID, - embedding = embedding, - threshold = finalThreshold - ) - LogUtils.d("${VoiceConfig.TAG}", "📊 声纹验证 | 阈值: $finalThreshold | 通过: $verifyPass | 嘈杂环境: $isNoisyEnvironment | 原始时长: ${audioDurationMs}ms | 验证时长: ${(validAudio.size.toFloat()/VoiceConfig.SAMPLE_RATE*1000).toLong()}ms") - return verifyPass - } - } catch (e: Exception) { - LogUtils.e("${VoiceConfig.TAG}", "❌ 声纹验证异常,拒绝", e) - return false - } finally { - stream?.release() - } - } - - /** - * 注册唤醒用户声纹特征(无修改) - */ - fun registerWakeupUser( - preBuffer: ArrayDeque, - extractor: SpeakerEmbeddingExtractor, - manager: SpeakerEmbeddingManager - ) { - var stream: OnlineStream? 
= null - runCatching { - val wakeupAudio = preBuffer.toFloatArray() - if (wakeupAudio.isEmpty()) { - LogUtils.w("${VoiceConfig.TAG}", "❌ 唤醒音频缓存为空,无法注册用户特征") - return - } - - stream = extractor.createStream() - stream?.acceptWaveform(samples = wakeupAudio, sampleRate = VoiceConfig.SAMPLE_RATE) - stream?.inputFinished() - - if (stream != null && extractor.isReady(stream)) { - val embedding = extractor.compute(stream) - speakerManagerLock.withLock { - manager.remove(VoiceConfig.CURRENT_USER_ID) - val embeddingList = mutableListOf(embedding) - val ok = manager.add( - name = VoiceConfig.CURRENT_USER_ID, - embedding = embeddingList.toTypedArray() - ) - if (ok) { - LogUtils.d("${VoiceConfig.TAG}", "✅ 注册当前唤醒用户特征成功 | 特征长度: ${embedding.size}") - } else { - LogUtils.w("${VoiceConfig.TAG}", "❌ 注册当前唤醒用户特征失败") - } - } - } else { - LogUtils.w("${VoiceConfig.TAG}", "❌ 唤醒音频Stream未就绪,跳过用户注册") - } - }.onFailure { - LogUtils.e("${VoiceConfig.TAG}", "❌ 唤醒用户特征注册失败", it) - }.also { - stream?.release() - } - } - - /** - * 缓存预缓冲音频(无修改) - */ - fun cachePreBuffer(samples: FloatArray, preBuffer: ArrayDeque) { - for (s in samples) { - preBuffer.addLast(s) - if (preBuffer.size > VoiceConfig.PRE_BUFFER_SIZE) preBuffer.removeFirst() - } - } - - /** - * 帧统计数据类(无修改) - */ - data class FrameStats( - val totalFrames: Int, - val speechFrames: Int, - val continuousSpeechFrames: Int, - val lastFrameIsSpeech: Boolean - ) - - /** - * 阈值配置数据类(无修改) - */ - data class ThresholdConfig( - val energyThreshold: Float, - val vadRatioThreshold: Float, - val minScore: Int, - val scene: String - ) - - /** - * 获取分场景阈值配置(无修改) - */ - fun getThresholdConfig(duration: Long, currentEnvBaseline: Float): ThresholdConfig { - val isQuietEnv = currentEnvBaseline < VoiceConfig.BASELINE_QUIET_THRESHOLD - return if (duration in VoiceConfig.SHORT_SPEECH_MIN..VoiceConfig.SHORT_SPEECH_MAX) { - val coeff = if (isQuietEnv) VoiceConfig.SHORT_SPEECH_ENERGY_COEFF_QUIET else VoiceConfig.SHORT_SPEECH_ENERGY_COEFF_NOISY - ThresholdConfig( - 
energyThreshold = currentEnvBaseline * coeff, - vadRatioThreshold = VoiceConfig.SHORT_SPEECH_VAD_COEFF, - minScore = VoiceConfig.SHORT_SPEECH_MIN_SCORE, - scene = "短语音" - ) - } else { - val coeff = if (isQuietEnv) VoiceConfig.LONG_SPEECH_ENERGY_COEFF_QUIET else VoiceConfig.LONG_SPEECH_ENERGY_COEFF_NOISY - ThresholdConfig( - energyThreshold = currentEnvBaseline * coeff, - vadRatioThreshold = VoiceConfig.LONG_SPEECH_VAD_COEFF, - minScore = VoiceConfig.LONG_SPEECH_MIN_SCORE, - scene = "长语音" - ) - } - } - - /** - * 计算语音评分(无修改) - */ - fun calculateSpeechScore( - duration: Long, - avgEnergy: Float, - continuousRatio: Float, - thresholdConfig: ThresholdConfig - ): Int { - var score = 0 - score += when { - duration >= VoiceConfig.LONG_SPEECH_SCORE_CUTOFF_MS -> 3 - duration >= VoiceConfig.MID_SPEECH_SCORE_CUTOFF_MS -> 2 - else -> 1 - } - score += if (avgEnergy >= thresholdConfig.energyThreshold) 1 else 0 - score += if (continuousRatio >= VoiceConfig.MIN_CONTINUOUS_FRAME_RATIO) 1 else 0 - return score - } -} \ No newline at end of file diff --git a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt index 40397fd..e61512e 100644 --- a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt +++ b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt @@ -37,6 +37,7 @@ import com.zs.smarthuman.BuildConfig import com.zs.smarthuman.R import com.zs.smarthuman.base.BaseActivity import com.zs.smarthuman.base.BaseViewModelActivity +import com.zs.smarthuman.bean.AudioDTO import com.zs.smarthuman.bean.NetworkStatusEventMsg import com.zs.smarthuman.bean.UserInfoResp import com.zs.smarthuman.bean.VersionUpdateResp @@ -54,10 +55,12 @@ import com.zs.smarthuman.utils.AudioDebugUtil import com.zs.smarthuman.utils.AudioPcmUtil import com.zs.smarthuman.utils.DangerousUtils import com.zs.smarthuman.utils.LogFileUtils +import com.zs.smarthuman.utils.PcmStreamPlayer import com.zs.smarthuman.utils.UnityPlayerHolder import 
com.zs.smarthuman.utils.ViewSlideAnimator +import com.zs.smarthuman.utils.VoiceStreamPlayer import com.zs.smarthuman.viewmodel.MainViewModel import com.zs.smarthuman.widget.VersionUpdateDialog import kotlinx.coroutines.Dispatchers @@ -149,7 +152,7 @@ class MainActivity : BaseViewModelActivity() when (it) { is ApiResult.Error -> { Toaster.showShort("上传失败") - voiceController?.onUploadFinished(false) + voiceController?.onUploadFinished(true) } is ApiResult.Success -> { @@ -243,19 +246,26 @@ class MainActivity : BaseViewModelActivity() } ) } + private val voicePlayer = VoiceStreamPlayer().apply { + onPlayStart = { id -> + LogUtils.d("🎵 开始播放 audioId=$id") + startPlayTimeoutJob?.cancel() + voiceController?.onPlayStartBackend() + } + onPlayEnd = { id -> + LogUtils.d("✅ 播放结束 audioId=$id") + voiceController?.onPlayEndBackend() + } + + } override fun receivedIMMsg(msg: SingleMessage) { when (msg.msgContentType) { MessageContentType.RECEIVE_VOICE_STREAM.msgContentType -> { lifecycleScope.launch(Dispatchers.IO) { - -// LogFileUtils.logToFile2(this@MainActivity,msg.content) - UnityPlayerHolder.getInstance() - .startTalking(msg.content) -// loadLocalJsonAndPlay() - - + val audioDTO = GsonUtils.fromJson(msg.content, AudioDTO::class.java) + voicePlayer.onAudioDTO(audioDTO) } } } @@ -540,6 +550,7 @@ class MainActivity : BaseViewModelActivity() UnityPlayerHolder.getInstance().release() UnityPlayerHolder.getInstance().clearCache() releaseIM() + voicePlayer.release() } diff --git a/app/src/main/java/com/zs/smarthuman/utils/PcmStreamPlayer.kt b/app/src/main/java/com/zs/smarthuman/utils/PcmStreamPlayer.kt new file mode 100644 index 0000000..0616216 --- /dev/null +++ b/app/src/main/java/com/zs/smarthuman/utils/PcmStreamPlayer.kt @@ -0,0 +1,95 @@ +package com.zs.smarthuman.utils + +import android.media.AudioAttributes +import android.media.AudioFormat +import android.media.AudioManager +import android.media.AudioTrack +import kotlinx.coroutines.CoroutineScope +import 
// ====================== PCM player ======================
/**
 * Streams 16-bit mono PCM chunks to an [AudioTrack] on a background coroutine.
 *
 * Chunks are queued with [pushPcm] and written in FIFO order; while the queue is empty the
 * loop writes a short block of silence so the track keeps running.
 *
 * Thread-safety: the queue is guarded by [queueLock], so [pushPcm], [clearQueue],
 * [queueEmpty] and [release] may be called from any thread.
 *
 * @param sampleRate sample rate in Hz of the PCM data that will be pushed.
 */
class PcmStreamPlayer(
    private val sampleRate: Int
) {

    /**
     * Invoked once when the playback loop has terminated and the [AudioTrack] has been
     * released (after [release], including when the coroutine is cancelled).
     */
    var onPlayEnd: (() -> Unit)? = null

    private val scope = CoroutineScope(SupervisorJob() + Dispatchers.IO)
    private val bufferQueue: Queue<ByteArray> = ArrayDeque()
    private val queueLock = ReentrantLock()

    @Volatile
    private var playing = true

    init {
        scope.launch {
            val track = AudioTrack(
                AudioAttributes.Builder()
                    .setUsage(AudioAttributes.USAGE_MEDIA)
                    .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
                    .build(),
                AudioFormat.Builder()
                    .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
                    .setSampleRate(sampleRate)
                    .setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
                    .build(),
                AudioTrack.getMinBufferSize(
                    sampleRate,
                    AudioFormat.CHANNEL_OUT_MONO,
                    AudioFormat.ENCODING_PCM_16BIT
                ),
                AudioTrack.MODE_STREAM,
                AudioManager.AUDIO_SESSION_ID_GENERATE
            )

            try {
                track.play()

                val silent = ByteArray(2048)

                while (isActive && playing) {
                    val pcm = locked { bufferQueue.poll() }

                    if (pcm != null) {
                        track.write(pcm, 0, pcm.size)
                    } else {
                        // Nothing queued: feed silence and yield briefly instead of busy-spinning.
                        track.write(silent, 0, silent.size)
                        delay(5)
                    }
                }
            } finally {
                // Runs on normal exit AND on cancellation (release() cancels the scope, which
                // makes delay() throw); without the finally the track would leak in that case.
                try {
                    track.stop()
                } catch (_: IllegalStateException) {
                    // Track never reached the initialized/playing state; nothing to stop.
                }
                track.release()
                onPlayEnd?.invoke()
            }
        }
    }

    /** Queues one PCM chunk for playback. */
    fun pushPcm(pcm: ByteArray) {
        locked { bufferQueue.add(pcm) }
    }

    /** Drops every chunk that has not been written to the track yet. */
    fun clearQueue() {
        locked { bufferQueue.clear() }
    }

    /**
     * Returns `true` when no chunk is waiting in the queue. Audio already handed to the
     * [AudioTrack] may still be playing.
     */
    fun queueEmpty(): Boolean = locked { bufferQueue.isEmpty() }

    /** Stops the playback loop, drops queued audio and cancels the background coroutine. */
    fun release() {
        playing = false
        locked { bufferQueue.clear() }
        scope.cancel()
    }

    /**
     * Runs [action] while holding [queueLock].
     *
     * The previous code used `queueLock.run { … }`, which is Kotlin's scope function and never
     * acquires the lock, leaving the non-thread-safe [ArrayDeque] unprotected.
     */
    private inline fun <T> locked(action: () -> T): T {
        queueLock.lock()
        try {
            return action()
        } finally {
            queueLock.unlock()
        }
    }
}
// ====================== Voice stream player ======================
/**
 * Reassembles Base64-encoded PCM slices into an ordered stream and plays them through a
 * [PcmStreamPlayer].
 *
 * Slices are ordered by `sortId` (starting at 1) within one audio `id`; a slice with a
 * different `id` starts a new audio and discards everything buffered for the previous one.
 * Playback starts once roughly 250 ms of contiguous audio is buffered, or as soon as the
 * stream is complete if it is shorter than that.
 *
 * Thread-safety: all reassembly state is guarded by [stateLock]. [onPlayStart] is invoked
 * while that lock is held; [onPlayEnd] is invoked from a background coroutine.
 *
 * @param sampleRate sample rate in Hz of the incoming 16-bit mono PCM.
 */
class VoiceStreamPlayer(
    private val sampleRate: Int = 24000
) {
    /** Invoked once per audio id, right before its first PCM chunk is queued for playback. */
    var onPlayStart: ((audioId: Int) -> Unit)? = null

    /** Invoked once per audio id after its last chunk has left the playback queue. */
    var onPlayEnd: ((audioId: Int) -> Unit)? = null

    private val scope = CoroutineScope(SupervisorJob() + Dispatchers.IO)

    // Kept as an explicit Lazy so release() can tell whether the player was ever created.
    private val pcmPlayerLazy = lazy { PcmStreamPlayer(sampleRate) }
    private val pcmPlayer: PcmStreamPlayer by pcmPlayerLazy

    private val stateLock = Any()
    private var currentAudioId: Int? = null
    private val sliceBuffer = TreeMap<Int, ByteArray>()

    // In-order chunks held back until the prebuffer threshold is reached.
    private val pendingPcm = ArrayList<ByteArray>()
    private var nextSortId = 1
    private var inputFinished = false
    private var firstPlayTriggered = false
    private var bufferedBytes = 0
    private var playEndLaunched = false
    private var playEndJob: Job? = null

    // 16-bit mono => 2 bytes per sample; 250 ms worth of audio.
    private val prebufferBytes = sampleRate * 2 * 250 / 1000

    /**
     * Feeds one batch of slices. Processing happens on a background coroutine; batches are
     * serialized by [stateLock] so concurrent calls cannot corrupt the reassembly state.
     */
    fun onAudioDTO(dto: AudioDTO) {
        scope.launch {
            synchronized(stateLock) {
                dto.items.forEach { slice -> handleSlice(slice) }
            }
        }
    }

    /** Stores one slice and flushes whatever has become contiguous. Caller holds [stateLock]. */
    private fun handleSlice(slice: LmChatDTO) {
        if (currentAudioId != slice.id) {
            startNewAudio(slice.id)
        }

        decodePcm(slice.audioData)?.let { pcm -> sliceBuffer[slice.sortId] = pcm }

        if (slice.isFinal) inputFinished = true

        flushBufferIfPossible(slice.id)
    }

    /**
     * Decodes a Base64 payload. Returns `null` for null/blank or malformed input so that one
     * bad slice cannot crash the coroutine. The parameter is nullable on purpose: the DTO is
     * populated by Gson, which can leave a non-null Kotlin field null.
     */
    private fun decodePcm(encoded: String?): ByteArray? {
        if (encoded.isNullOrBlank()) return null
        return try {
            Base64.decode(encoded, Base64.DEFAULT)
        } catch (_: IllegalArgumentException) {
            null
        }
    }

    /** Resets all per-audio state and drops anything still queued from the previous audio. */
    private fun startNewAudio(audioId: Int) {
        // A superseded audio must not report "play end" while the new one is playing.
        playEndJob?.cancel()
        playEndJob = null
        currentAudioId = audioId
        sliceBuffer.clear()
        pendingPcm.clear()
        nextSortId = 1
        bufferedBytes = 0
        firstPlayTriggered = false
        playEndLaunched = false
        inputFinished = false
        pcmPlayer.clearQueue()
    }

    /**
     * Moves contiguous slices out of [sliceBuffer], starts playback when enough audio is
     * buffered (or the stream is complete), and schedules the end-of-playback notification.
     *
     * @param audioId id of the audio being processed; captured so callbacks report the id they
     *   were started for rather than whatever is current when they run.
     */
    private fun flushBufferIfPossible(audioId: Int) {
        while (true) {
            val pcm = sliceBuffer.remove(nextSortId) ?: break
            nextSortId++
            bufferedBytes += pcm.size

            if (firstPlayTriggered) {
                pcmPlayer.pushPcm(pcm)
            } else {
                // Hold the chunk instead of dropping it: previously every slice consumed before
                // the threshold was reached was lost, cutting off the start of the audio.
                pendingPcm.add(pcm)
            }
        }

        val streamComplete = inputFinished && sliceBuffer.isEmpty()

        // Start on threshold, or immediately when a short audio is already complete.
        val canStart = bufferedBytes >= prebufferBytes || streamComplete
        if (!firstPlayTriggered && pendingPcm.isNotEmpty() && canStart) {
            firstPlayTriggered = true
            onPlayStart?.invoke(audioId)
            pendingPcm.forEach { pcmPlayer.pushPcm(it) }
            pendingPcm.clear()
        }

        // Wrap-up: launch the completion watcher only once per audio.
        if (streamComplete && firstPlayTriggered && !playEndLaunched) {
            playEndLaunched = true
            playEndJob = scope.launch {
                while (!pcmPlayer.queueEmpty()) {
                    delay(10)
                }
                onPlayEnd?.invoke(audioId)
            }
        }
    }

    /** Cancels background work and releases the PCM player if it was ever created. */
    fun release() {
        scope.cancel()
        if (pcmPlayerLazy.isInitialized()) {
            pcmPlayer.release()
        }
    }
}