diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt index 3fa10b3..7ed71f3 100644 --- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt @@ -1,7 +1,6 @@ package com.zs.smarthuman.sherpa import android.content.res.AssetManager -import android.text.TextUtils import com.blankj.utilcode.util.LogUtils import com.k2fsa.sherpa.onnx.OnlineStream import com.k2fsa.sherpa.onnx.SpeakerRecognition @@ -16,7 +15,7 @@ class VoiceController( assetManager: AssetManager, private val onWakeup: () -> Unit, private val onFinalAudio: (FloatArray) -> Unit, - idleTimeoutSeconds: Int = 10, + idleTimeoutSeconds: Int = 200, maxRecordingSeconds: Int = 10, private val onStateChanged: ((VoiceState) -> Unit)? = null, private val stopBackendAudio: (() -> Unit)? = null, @@ -30,15 +29,21 @@ class VoiceController( private const val SAMPLE_RATE = 16000 // 预缓存大小(2秒) private const val PRE_BUFFER_SIZE = SAMPLE_RATE * 2 - // 声纹验证阈值 - private const val SPEAKER_VERIFY_THRESHOLD_NORMAL = 0.25f - private const val SPEAKER_VERIFY_THRESHOLD_SHORT = 0.20f - private const val SHORT_AUDIO_THRESHOLD = SAMPLE_RATE * 0.5f // 0.5秒音频长度 - // 防抖时间 + + // ========== 核心:分场景声纹阈值(极简版) ========== + private const val SPEAKER_THRESHOLD_QUIET = 0.50f // 安静环境 + private const val SPEAKER_THRESHOLD_NOISY = 0.43f // 嘈杂环境(匹配你的真实相似度) + private const val SPEAKER_THRESHOLD_SHORT = 0.40f // 短语音(<1秒) + + // 短语音判定阈值 + private const val SHORT_AUDIO_DURATION_MS = 1000L private const val INVALID_RESET_DEBOUNCE_MS = 1500L // 最小语音时长 private const val MIN_SPEECH_MS = 800L private const val MIN_EFFECTIVE_VOICE_DURATION = 400L + + // 噪音场景判定阈值 + private const val NOISE_BASELINE_THRESHOLD = 0.01f } var state: VoiceState = VoiceState.WAIT_WAKEUP @@ -48,23 +53,21 @@ class VoiceController( onStateChanged?.invoke(value) } - // ========== 缺失变量补充:实时能量与帧统计变量 ========== - // 实时能量统计 + // 实时能量与帧统计变量 private var realtimeEnergySum = 0f private var realtimeEnergyCount = 0 private var realtimePeakRms = 0f - // 实时帧统计 private var realtimeTotalFrames = 0 private var realtimeSpeechFrames = 0 private var realtimeContinuousSpeechFrames = 0 private var realtimeLastFrameIsSpeech = false - // 多人对话检测标记 private var isMultiPersonDialogueDetected = false - // 防抖重置标记 private var lastInvalidResetMs = 0L - // 声纹管理器锁(解决并发问题) private val speakerManagerLock = ReentrantLock() + // 环境噪音状态标记 + private var isNoisyEnvironment = false + private val wakeupManager = WakeupManager(assetManager, onWakeup) private val vadManager = VadManager( assetManager, @@ -89,16 +92,16 @@ class VoiceController( private val idleTimeoutMs = idleTimeoutSeconds * 1000L private val maxRecordingMs = maxRecordingSeconds * 1000L - // ================= 保留分场景动态系数 + 强制兜底配置(近距离优化版) ================= + // 分场景动态系数(保留原有逻辑) private val BASELINE_WINDOW_SIZE = 50 private val envNoiseBuffer = ArrayDeque(BASELINE_WINDOW_SIZE) private var currentEnvBaseline = 0.001f - // 强制兜底:正常语音最低门槛(近距离场景大幅降低) + // 强制兜底:正常语音最低门槛 private val MIN_NORMAL_VOICE_ENERGY = 0.03f private val MIN_NORMAL_VOICE_VAD_RATIO = 0.2f - // 分场景动态系数(安静环境系数极低,适配近距离轻声) + // 分场景动态系数 private val BASELINE_QUIET_THRESHOLD = 0.005f private val SHORT_SPEECH_ENERGY_COEFF_QUIET = 1.5f private val SHORT_SPEECH_ENERGY_COEFF_NOISY = 2.0f @@ -109,7 +112,7 @@ class VoiceController( private val SHORT_SPEECH_MIN_SCORE = 1 private val LONG_SPEECH_MIN_SCORE = 1 - // 其他过滤参数(近距离场景放宽) + // 其他过滤参数 private val MAX_FAR_FIELD_ENERGY = 0.015f private val MIN_VALID_PEAK_AVG_RATIO = 0.5f private val MIN_CONTINUOUS_FRAME_RATIO = 0.1f @@ -118,40 +121,32 @@ class VoiceController( private val SHORT_SPEECH_MIN = 500L private val SHORT_SPEECH_MAX = 2000L - // ========== 核心修改:多人对话过滤配置 ========== + // 多人对话过滤配置 private val MULTI_DIALOGUE_MIN_DURATION = 2500L private val MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO = 2.5f private val MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO = 0.4f private val MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO = 0.3f private val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f - // ========== 核心调整:近距离场景 微弱人声过滤配置(重点优化) ========== + // 微弱人声过滤配置 private val MIN_VOICE_FRAME_RATIO = 0.08f private val MIN_PEAK_ENERGY_RATIO = 1.5f private val NORMAL_VOICE_ENERGY_THRESHOLD = 0.008f private val MIN_CONTINUOUS_VOICE_FRAMES = 1 - - // ========== 核心新增:MIN_EFFECTIVE_SPEECH_RMS 常量 ========== private val MIN_EFFECTIVE_SPEECH_RMS = 0.0005f - // ========== 核心新增:无效说话标记 + 超时类型 ========== + // 无效说话标记 + 超时类型 private var hasInvalidSpeech = false private var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT - // ========== 核心配置:声纹验证相关 ========== - private val CURRENT_USER_ID = "current_wakeup_user" // 当前唤醒用户唯一标识 - private val ENABLE_STRICT_SPEAKER_VERIFY = true // 严格验证开关 + // 声纹验证相关 + private val CURRENT_USER_ID = "current_wakeup_user" + private val ENABLE_STRICT_SPEAKER_VERIFY = true init { - // 参数校验 - require(idleTimeoutSeconds > 0) { "idleTimeoutSeconds 必须大于0" } - require(maxRecordingSeconds > 0) { "maxRecordingSeconds 必须大于0" } - require(maxRecordingSeconds >= idleTimeoutSeconds) { "maxRecordingSeconds 必须大于等于 idleTimeoutSeconds" } - - // 初始化声纹识别器(适配你提供的API) try { - SpeakerRecognition.initExtractor(assetManager) // 对齐原生API - LogUtils.d(TAG, "✅ 声纹识别器初始化成功(原生Stream版本)") + SpeakerRecognition.initExtractor(assetManager) + LogUtils.d(TAG, "✅ 声纹识别器初始化成功") } catch (e: Exception) { LogUtils.e(TAG, "❌ 声纹识别器初始化失败", e) throw RuntimeException("声纹识别初始化失败", e) @@ -163,8 +158,8 @@ class VoiceController( cachePreBuffer(samples) wakeupManager.acceptAudio(samples) if (wakeupManager.consumeWakeupFlag()) { - handleWakeupEvent() // 仅调用一次 - // 注册唤醒用户特征(异步执行) + handleWakeupEvent() + // 注册唤醒用户特征 CoroutineScope(Dispatchers.IO).launch { var stream: OnlineStream? = null runCatching { @@ -174,18 +169,14 @@ class VoiceController( return@launch } - // 创建原生Stream stream = SpeakerRecognition.extractor.createStream() - stream.acceptWaveform(samples = wakeupAudio, sampleRate = SAMPLE_RATE) - stream.inputFinished() + stream?.acceptWaveform(samples = wakeupAudio, sampleRate = SAMPLE_RATE) + stream?.inputFinished() - // 计算特征并注册(仅当前用户) - if (SpeakerRecognition.extractor.isReady(stream)) { + if (stream != null && SpeakerRecognition.extractor.isReady(stream)) { val embedding = SpeakerRecognition.extractor.compute(stream) - // 加锁保护 manager 操作 speakerManagerLock.withLock { SpeakerRecognition.manager.remove(CURRENT_USER_ID) - // 注册当前唤醒用户 val embeddingList = mutableListOf(embedding) val ok = SpeakerRecognition.manager.add( name = CURRENT_USER_ID, @@ -194,7 +185,7 @@ class VoiceController( if (ok) { LogUtils.d(TAG, "✅ 注册当前唤醒用户特征成功 | 特征长度: ${embedding.size}") } else { - LogUtils.w(TAG, "❌ 注册当前唤醒用户特征失败(manager.add返回false)") + LogUtils.w(TAG, "❌ 注册当前唤醒用户特征失败") } } } else { @@ -203,9 +194,7 @@ class VoiceController( }.onFailure { LogUtils.e(TAG, "❌ 唤醒用户特征注册失败", it) }.also { - // 释放Stream stream?.release() - LogUtils.d(TAG, "🔄 唤醒注册Stream已释放") } } return @@ -215,6 +204,8 @@ class VoiceController( if (state == VoiceState.WAIT_WAKEUP) { calibrateEnvBaseline(samples) + isNoisyEnvironment = currentEnvBaseline >= NOISE_BASELINE_THRESHOLD + LogUtils.d(TAG, "📊 环境状态 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") } when (state) { @@ -257,10 +248,11 @@ class VoiceController( audioBuffer.addAll(samples.asList()) vadManager.accept(samples) - // ========== 核心优化:录音过程中实时计算 ========== calibrateEnvBaseline(samples) updateRealtimeEnergy(samples) updateRealtimeFrameStats() + isNoisyEnvironment = currentEnvBaseline >= NOISE_BASELINE_THRESHOLD + if (checkMultiPersonDialogueRealtime(now)) { LogUtils.w(TAG, "🚨 录音中识别出多人对话,提前终止") finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms) @@ -268,25 +260,25 @@ class VoiceController( } if (System.currentTimeMillis() - recordingStartMs > maxRecordingMs) { - LogUtils.w(TAG, "⏱ Max recording reached | 当前环境基线: $currentEnvBaseline") + LogUtils.w(TAG, "⏱ Max recording reached | 当前环境基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms) } } } } - /* ================= 新增:录音中实时更新能量统计(适配近距离轻声) ================= */ + /* ================= 实时能量更新 ================= */ private fun updateRealtimeEnergy(samples: FloatArray) { val rms = vadManager.calcRms(samples) - // 仅统计有效语音帧的能量(阈值降低) - if (rms >= MIN_EFFECTIVE_SPEECH_RMS) { + val effectiveThreshold = if (isNoisyEnvironment) currentEnvBaseline * 1.8f else MIN_EFFECTIVE_SPEECH_RMS + if (rms >= effectiveThreshold) { realtimeEnergySum += rms realtimeEnergyCount++ realtimePeakRms = maxOf(realtimePeakRms, rms) } } - /* ================= 新增:录音中实时更新帧统计 ================= */ + /* ================= 实时帧统计 ================= */ private fun updateRealtimeFrameStats() { realtimeTotalFrames = vadManager.getTotalFrames() realtimeSpeechFrames = vadManager.getSpeechFrames() @@ -300,7 +292,7 @@ class VoiceController( realtimeLastFrameIsSpeech = currentFrameIsSpeech } - /* ================= 新增:录音中实时判定多人对话 ================= */ + /* ================= 多人对话检测 ================= */ private fun checkMultiPersonDialogueRealtime(now: Long): Boolean { val duration = now - recordingStartMs if (duration < MULTI_DIALOGUE_MIN_DURATION) return false @@ -318,10 +310,9 @@ class VoiceController( return isMultiPersonDialogueDetected } - /* ================= 环境基线校准(适配近距离场景,降低噪音敏感度) ================= */ + /* ================= 环境基线校准 ================= */ private fun calibrateEnvBaseline(samples: FloatArray) { val rms = vadManager.calcRms(samples) - // 只保留低于基线+阈值的有效值,过滤突发噪音(阈值降低) val validRms = if (rms < currentEnvBaseline + 0.002f) rms else currentEnvBaseline if (rms < 0.015f) { if (envNoiseBuffer.size >= BASELINE_WINDOW_SIZE) { @@ -332,7 +323,7 @@ class VoiceController( } } - /* ================= 唤醒相关方法 ================= */ + /* ================= 唤醒处理 ================= */ private fun handleWakeupEvent() { if (state == VoiceState.UPLOADING) return stopBackendAudio?.invoke() @@ -361,7 +352,7 @@ class VoiceController( private fun onVadStart() { if (state != VoiceState.WAIT_SPEECH) return - LogUtils.d(TAG, "🎤 REAL VAD START | 环境基线: $currentEnvBaseline") + LogUtils.d(TAG, "🎤 REAL VAD START | 环境基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") vadStarted = true recordingStartMs = System.currentTimeMillis() audioBuffer.clear() @@ -372,64 +363,57 @@ class VoiceController( private fun onVadEnd(avgEnergy: Float, peakRms: Float) { if (state != VoiceState.RECORDING) return - LogUtils.d(TAG, "🧠 VAD END | 环境基线: $currentEnvBaseline") + LogUtils.d(TAG, "🧠 VAD END | 环境基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") val realAvgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else avgEnergy val realPeakRms = if (realtimePeakRms > 0) realtimePeakRms else peakRms finishSentence(realAvgEnergy, realPeakRms) } - /* ================= 核心优化:近距离场景 微弱人声过滤方法 ================= */ + /* ================= 微弱人声过滤 ================= */ private fun filterWeakVoice(duration: Long, avgEnergy: Float, peakRms: Float): Boolean { - // 1. 时长过滤:<400ms的极短杂音才过滤 if (duration < MIN_EFFECTIVE_VOICE_DURATION) { LogUtils.w(TAG, "❌ 微弱人声过滤:时长${duration}ms < ${MIN_EFFECTIVE_VOICE_DURATION}ms") return true } - // 2. 帧占比过滤:仅对极低能量语音生效 val voiceFrameRatio = if (realtimeTotalFrames > 0) realtimeSpeechFrames.toFloat() / realtimeTotalFrames else 0f if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && voiceFrameRatio < MIN_VOICE_FRAME_RATIO) { - LogUtils.w(TAG, "❌ 微弱人声过滤:帧占比${voiceFrameRatio} < ${MIN_VOICE_FRAME_RATIO}(极低能量)") + LogUtils.w(TAG, "❌ 微弱人声过滤:帧占比${voiceFrameRatio} < ${MIN_VOICE_FRAME_RATIO}") return true } - // 3. 峰值能量过滤:仅对极低能量语音生效,且阈值大幅降低 val peakBaselineRatio = peakRms / currentEnvBaseline if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && peakBaselineRatio < MIN_PEAK_ENERGY_RATIO) { - LogUtils.w(TAG, "❌ 微弱人声过滤:峰值/基线${peakBaselineRatio} < ${MIN_PEAK_ENERGY_RATIO}(极低能量)") + LogUtils.w(TAG, "❌ 微弱人声过滤:峰值/基线${peakBaselineRatio} < ${MIN_PEAK_ENERGY_RATIO}") return true } - // 4. 连续帧过滤:仅对极低能量语音生效,且阈值降到1 if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && realtimeContinuousSpeechFrames < MIN_CONTINUOUS_VOICE_FRAMES) { - LogUtils.w(TAG, "❌ 微弱人声过滤:连续帧${realtimeContinuousSpeechFrames} < ${MIN_CONTINUOUS_VOICE_FRAMES}(极低能量)") + LogUtils.w(TAG, "❌ 微弱人声过滤:连续帧${realtimeContinuousSpeechFrames} < ${MIN_CONTINUOUS_VOICE_FRAMES}") return true } - // 5. 平均能量过滤:仅对极极低能量语音生效 val energyBaselineRatio = avgEnergy / currentEnvBaseline if (avgEnergy < 0.005f && energyBaselineRatio < 1.2f) { - LogUtils.w(TAG, "❌ 微弱人声过滤:能量/基线${energyBaselineRatio} < 1.2(极极低能量)") + LogUtils.w(TAG, "❌ 微弱人声过滤:能量/基线${energyBaselineRatio} < 1.2") return true } - // 正常语音(包括近距离轻声)直接通过 return false } - /* ================= 结束录音(核心:适配近距离轻声) ================= */ + /* ================= 结束录音 ================= */ private fun finishSentence(avgEnergy: Float = 0f, peakRms: Float = 0f) { val now = System.currentTimeMillis() val duration = now - recordingStartMs if (!vadStarted || duration < MIN_SPEECH_MS) { - LogUtils.d(TAG, "❌ 语音过短: $duration ms | 基线: $currentEnvBaseline") + LogUtils.d(TAG, "❌ 语音过短: $duration ms | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") hasInvalidSpeech = true resetToWaitSpeech() return } - // ========== 第二步:微弱人声专项过滤(仅过滤极微弱杂音) ========== if (filterWeakVoice(duration, avgEnergy, peakRms)) { hasInvalidSpeech = true resetToWaitSpeech() @@ -440,42 +424,30 @@ class VoiceController( val vadRatio = vadManager.activeSpeechRatio() val peakAvgRatio = if (avgEnergy > 0f) peakRms / avgEnergy else 0f - LogUtils.d(TAG, "📊 录音信息 | 时长: $duration ms | 能量: $avgEnergy | 峰均比: $peakAvgRatio | 基线: $currentEnvBaseline") + LogUtils.d(TAG, "📊 录音信息 | 时长: $duration ms | 能量: $avgEnergy | 峰均比: $peakAvgRatio | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") LogUtils.d(TAG, "📊 实时帧统计 | 总帧: $realtimeTotalFrames | 语音帧: $realtimeSpeechFrames | 连续语音帧: $realtimeContinuousSpeechFrames") - // 多人对话过滤 if (isMultiPersonDialogueDetected) { - LogUtils.w(TAG, "❌ 过滤多人对话垃圾语音(实时识别) | 时长: $duration ms") + LogUtils.w(TAG, "❌ 过滤多人对话垃圾语音 | 时长: $duration ms") hasInvalidSpeech = true resetToWaitSpeech() return } - // ========== 步骤1:优先声纹验证(核心!仅当前用户可通过) ========== + // 声纹验证(核心极简版) if (ENABLE_STRICT_SPEAKER_VERIFY) { - val isCurrentUser = verifySpeaker(audioBuffer.toFloatArray()) + val isCurrentUser = verifySpeaker(audio) if (!isCurrentUser) { - LogUtils.w(TAG, "❌ 非当前唤醒用户,直接拒绝语音 | 录音时长: $duration ms") + LogUtils.w(TAG, "❌ 非当前唤醒用户,拒绝语音 | 录音时长: $duration ms | 嘈杂环境: $isNoisyEnvironment") hasInvalidSpeech = true resetToWaitSpeech() return } - LogUtils.d(TAG, "✅ 当前用户语音,继续处理 | 录音时长: $duration ms") + LogUtils.d(TAG, "✅ 当前用户语音,继续处理 | 录音时长: $duration ms | 嘈杂环境: $isNoisyEnvironment") } - // ========== 1. 强制兜底:正常语音直接通过(阈值降低) ========== - val isNormalVoice = avgEnergy >= MIN_NORMAL_VOICE_ENERGY && vadRatio >= MIN_NORMAL_VOICE_VAD_RATIO - if (isNormalVoice) { - LogUtils.i(TAG, "✅ 正常语音强制通过 | 能量: $avgEnergy ≥ $MIN_NORMAL_VOICE_ENERGY | 占比: $vadRatio ≥ $MIN_NORMAL_VOICE_VAD_RATIO") - audioBuffer.clear() - state = VoiceState.UPLOADING - onFinalAudio(audio) - resetRealtimeStats() - hasInvalidSpeech = false - return - } - // ========== 2. 远场过滤(近距离场景几乎不生效) ========== + // 远场过滤 val isFarField = avgEnergy < MAX_FAR_FIELD_ENERGY val isInvalidPeakRatio = peakAvgRatio < MIN_VALID_PEAK_AVG_RATIO if (isFarField && isInvalidPeakRatio) { @@ -485,7 +457,7 @@ class VoiceController( return } - // ========== 3. 非连续判定(大幅放宽) ========== + // 非连续判定 val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f val peakPositionRatio = vadManager.getPeakPositionRatio() val isDiscontinuous = continuousRatio < MIN_CONTINUOUS_FRAME_RATIO && @@ -498,34 +470,21 @@ class VoiceController( return } - // ========== 4. 分场景动态阈值计算(系数大幅降低) ========== + // 分场景阈值过滤 val isQuietEnv = currentEnvBaseline < BASELINE_QUIET_THRESHOLD val thresholdConfig = when { duration in SHORT_SPEECH_MIN..SHORT_SPEECH_MAX -> { val coeff = if (isQuietEnv) SHORT_SPEECH_ENERGY_COEFF_QUIET else SHORT_SPEECH_ENERGY_COEFF_NOISY val energyThreshold = currentEnvBaseline * coeff - LogUtils.d(TAG, "📏 短语音阈值 | 场景: ${if (isQuietEnv) "安静" else "嘈杂"} | 系数: $coeff | 阈值: $energyThreshold") - ThresholdConfig( - energyThreshold = energyThreshold, - vadRatioThreshold = SHORT_SPEECH_VAD_COEFF, - minScore = SHORT_SPEECH_MIN_SCORE, - scene = "短语音" - ) + ThresholdConfig(energyThreshold, SHORT_SPEECH_VAD_COEFF, SHORT_SPEECH_MIN_SCORE, "短语音") } else -> { val coeff = if (isQuietEnv) LONG_SPEECH_ENERGY_COEFF_QUIET else LONG_SPEECH_ENERGY_COEFF_NOISY val energyThreshold = currentEnvBaseline * coeff - LogUtils.d(TAG, "📏 长语音阈值 | 场景: ${if (isQuietEnv) "安静" else "嘈杂"} | 系数: $coeff | 阈值: $energyThreshold") - ThresholdConfig( - energyThreshold = energyThreshold, - vadRatioThreshold = LONG_SPEECH_VAD_COEFF, - minScore = LONG_SPEECH_MIN_SCORE, - scene = "长语音" - ) + ThresholdConfig(energyThreshold, LONG_SPEECH_VAD_COEFF, LONG_SPEECH_MIN_SCORE, "长语音") } } - // ========== 5. 分场景阈值过滤(阈值降低) ========== val energyPass = avgEnergy >= thresholdConfig.energyThreshold val vadRatioPass = vadRatio >= thresholdConfig.vadRatioThreshold if (!energyPass || !vadRatioPass) { @@ -535,7 +494,7 @@ class VoiceController( return } - // ========== 6. 评分判定(门槛降低到1) ========== + // 评分判定 var score = 0 score += when { duration >= 4000 -> 3 @@ -553,16 +512,16 @@ class VoiceController( return } - // ========== 最终通过 ========== + // 最终通过 audioBuffer.clear() state = VoiceState.UPLOADING onFinalAudio(audio) resetRealtimeStats() hasInvalidSpeech = false - LogUtils.i(TAG, "✅ 近距离轻声通过 | 时长: $duration ms | 能量: $avgEnergy | 场景: ${thresholdConfig.scene}") + LogUtils.i(TAG, "✅ 语音通过 | 时长: $duration ms | 能量: $avgEnergy | 场景: ${thresholdConfig.scene} | 嘈杂环境: $isNoisyEnvironment") } - /* ================= 重置实时统计变量 ================= */ + /* ================= 重置实时统计 ================= */ private fun resetRealtimeStats() { realtimeEnergySum = 0f realtimeEnergyCount = 0 @@ -574,15 +533,15 @@ class VoiceController( isMultiPersonDialogueDetected = false } - /* ================= 播放/上传/Reset 回调 ================= */ + /* ================= 播放/上传回调 ================= */ fun onPlayStartPrompt() { - LogUtils.d(TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline") + LogUtils.d(TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") state = VoiceState.PLAYING_PROMPT } fun onPlayEndPrompt() { speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS - LogUtils.d(TAG, "🎵 提示音结束 | 基线: $currentEnvBaseline") + LogUtils.d(TAG, "🎵 提示音结束 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") state = VoiceState.WAIT_SPEECH_COOLDOWN } @@ -591,19 +550,19 @@ class VoiceController( LogUtils.w(TAG, "🎶 非上传完成状态,禁止切换到 PLAYING_BACKEND | 当前状态: $state") return } - LogUtils.d(TAG, "🎶 开始播放后台音频 | 基线: $currentEnvBaseline") + LogUtils.d(TAG, "🎶 开始播放后台音频 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") state = VoiceState.PLAYING_BACKEND } fun onPlayEndBackend() { speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS - LogUtils.d(TAG, "🎶 后台音频结束 | 基线: $currentEnvBaseline") + LogUtils.d(TAG, "🎶 后台音频结束 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") state = VoiceState.WAIT_SPEECH_COOLDOWN } fun onUploadFinished(success: Boolean) { if (state != VoiceState.UPLOADING) return - LogUtils.d(TAG, "📤 上传完成 | 成功: $success | 基线: $currentEnvBaseline") + LogUtils.d(TAG, "📤 上传完成 | 成功: $success | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") if (!success) { speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS @@ -612,7 +571,7 @@ class VoiceController( } private fun resetToWaitSpeech() { - LogUtils.d(TAG, "🔄 重置到等待说话 | 基线: $currentEnvBaseline | 已标记无效说话: $hasInvalidSpeech") + LogUtils.d(TAG, "🔄 重置到等待说话 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment | 已标记无效说话: $hasInvalidSpeech") val now = System.currentTimeMillis() if (now - lastInvalidResetMs < INVALID_RESET_DEBOUNCE_MS) { LogUtils.d(TAG, "🛡 防抖:1.5秒内重复无效语音,跳过重置") @@ -628,7 +587,7 @@ class VoiceController( } private fun resetAll() { - LogUtils.d(TAG, "🔄 重置所有状态 | 基线: $currentEnvBaseline | 本次超时类型: $currentTimeoutType") + LogUtils.d(TAG, "🔄 重置所有状态 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment | 本次超时类型: $currentTimeoutType") audioBuffer.clear() preBuffer.clear() vadManager.reset() @@ -638,23 +597,23 @@ class VoiceController( waitSpeechFailStartMs = 0L envNoiseBuffer.clear() currentEnvBaseline = 0.001f + isNoisyEnvironment = false resetRealtimeStats() hasInvalidSpeech = false currentTimeoutType = TimeoutType.IDLE_TIMEOUT - LogUtils.d(TAG, "🔄 环境基线已重置 | 新基线: $currentEnvBaseline | 无效说话标记已重置") state = VoiceState.WAIT_WAKEUP } fun release() { - LogUtils.d(TAG, "🔌 释放资源 | 最终基线: $currentEnvBaseline") + LogUtils.d(TAG, "🔌 释放资源 | 最终基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment") wakeupManager.release() vadManager.reset() envNoiseBuffer.clear() resetRealtimeStats() hasInvalidSpeech = false currentTimeoutType = TimeoutType.IDLE_TIMEOUT + isNoisyEnvironment = false - // 释放声纹识别器资源 runCatching { SpeakerRecognition.extractor.release() speakerManagerLock.withLock { @@ -666,7 +625,6 @@ class VoiceController( } } - // 兜底释放(防止未调用release) protected fun finalize() { runCatching { release() @@ -690,60 +648,66 @@ class VoiceController( val scene: String ) - /* ================= 核心:原生Stream声纹验证(仅当前用户有效) ================= */ - /** - * 验证语音是否属于当前唤醒用户(完全适配你提供的API) - * @param audio 待验证的语音数据 - * @return true=是当前用户,false=非当前用户 - */ + /* ================= 核心:极简版声纹验证 ================= */ private fun verifySpeaker(audio: FloatArray): Boolean { if (audio.isEmpty()) { LogUtils.w(TAG, "❌ 待验证音频为空,声纹验证失败") return false } + // 1. 裁剪音频:只保留本次录音的有效部分(解决时长不匹配问题) + val audioDurationMs = (audio.size.toFloat() / SAMPLE_RATE * 1000).toLong() + // 只保留最后 N 毫秒的音频(N = 实际录音时长),避免缓存旧音频 + val validAudio = if (audioDurationMs > 0) { + val validSampleCount = (audioDurationMs * SAMPLE_RATE / 1000).toInt() + if (validSampleCount < audio.size) { + audio.copyOfRange(audio.size - validSampleCount, audio.size) + } else { + audio + } + } else { + audio + } + + // 2. 分场景选阈值(无容错,只调阈值) + val finalThreshold = when { + audioDurationMs < SHORT_AUDIO_DURATION_MS -> SPEAKER_THRESHOLD_SHORT + isNoisyEnvironment -> SPEAKER_THRESHOLD_NOISY + else -> SPEAKER_THRESHOLD_QUIET + } + var stream: OnlineStream? = null return try { stream = SpeakerRecognition.extractor.createStream() - stream.acceptWaveform(samples = audio, sampleRate = SAMPLE_RATE) + stream.acceptWaveform(samples = validAudio, sampleRate = SAMPLE_RATE) // 用裁剪后的音频验证 stream.inputFinished() if (!SpeakerRecognition.extractor.isReady(stream)) { - LogUtils.w(TAG, "❌ 验证音频Stream未就绪,验证失败") + LogUtils.w(TAG, "❌ 音频Stream未就绪,验证失败") return false } val embedding = SpeakerRecognition.extractor.compute(stream) - // 动态选择阈值 - val threshold = if (audio.size < SHORT_AUDIO_THRESHOLD) { - LogUtils.d(TAG, "📢 检测到短速语音,使用放宽阈值: $SPEAKER_VERIFY_THRESHOLD_SHORT") - SPEAKER_VERIFY_THRESHOLD_SHORT - } else { - SPEAKER_VERIFY_THRESHOLD_NORMAL - } - // 加锁验证 + // 3. 纯验证逻辑:过就过,不过就拒绝 speakerManagerLock.withLock { val verifyPass = SpeakerRecognition.manager.verify( name = CURRENT_USER_ID, embedding = embedding, - threshold = threshold + threshold = finalThreshold ) - if (verifyPass) { - LogUtils.d(TAG, "✅ 声纹验证通过 | 阈值: $threshold") - } else { - LogUtils.w(TAG, "❌ 声纹验证失败 | 阈值: $threshold") - } + + // 打印关键信息(补充裁剪后时长) + LogUtils.d(TAG, "📊 声纹验证 | 阈值: $finalThreshold | 通过: $verifyPass | 嘈杂环境: $isNoisyEnvironment | 原始时长: ${audioDurationMs}ms | 验证时长: ${(validAudio.size.toFloat()/SAMPLE_RATE*1000).toLong()}ms") + + // 无任何容错:验证结果就是最终结果 return verifyPass } } catch (e: Exception) { - LogUtils.e(TAG, "❌ 声纹验证异常", e) - false + LogUtils.e(TAG, "❌ 声纹验证异常,拒绝", e) + return false } finally { - // 释放Stream stream?.release() - LogUtils.d(TAG, "🔄 验证Stream已释放") } } - } \ No newline at end of file diff --git a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt index f34285b..6a3b05b 100644 --- a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt +++ b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt @@ -49,6 +49,7 @@ import com.zs.smarthuman.kt.releaseIM import com.zs.smarthuman.sherpa.TimeoutType import com.zs.smarthuman.sherpa.VoiceController import com.zs.smarthuman.toast.Toaster +import com.zs.smarthuman.utils.AudioDebugUtil import com.zs.smarthuman.utils.AudioPcmUtil import com.zs.smarthuman.utils.DangerousUtils import com.zs.smarthuman.utils.LogFileUtils @@ -213,12 +214,12 @@ class MainActivity : BaseViewModelActivity() 1 ) // loadLocalJsonAndPlay() -// val file = File( -// getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(), -// "xxx.wav" -// ) -// AudioDebugUtil.saveFloatPcmAsWav(audio, file) -// LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}") + val file = File( + getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(), + "xxx.wav" + ) + AudioDebugUtil.saveFloatPcmAsWav(audio, file) + LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}") lifecycleScope.launch(Dispatchers.Main) { mVerticalAnimator?.show() @@ -291,7 +292,7 @@ class MainActivity : BaseViewModelActivity() override fun onPause() { super.onPause() - stopRecording() +// stopRecording() UnityPlayerHolder.getInstance().pause() } diff --git a/app/src/main/java/com/zs/smarthuman/viewmodel/MainViewModel.kt b/app/src/main/java/com/zs/smarthuman/viewmodel/MainViewModel.kt index 5450f47..a08047f 100644 --- a/app/src/main/java/com/zs/smarthuman/viewmodel/MainViewModel.kt +++ b/app/src/main/java/com/zs/smarthuman/viewmodel/MainViewModel.kt @@ -41,6 +41,9 @@ class MainViewModel: BaseViewModel() { RxHttp.postJson(ApiService.UPLOAD_RECORD_VOICE_URL) .add("sessionCode",sessionCode) .add("audio", audioVoice) + .readTimeout(3000L) + .writeTimeout(3000L) + .connectTimeout(3000L) .toAwaitResponse() .awaitResult() .getOrThrow()