Remove redundant detection

林若思 2026-01-16 18:30:16 +08:00
parent cdc189c5f4
commit c450d8d620
7 changed files with 211 additions and 554 deletions


@@ -2,203 +2,100 @@ package com.zs.smarthuman.sherpa
 import android.content.res.AssetManager
 import com.blankj.utilcode.util.LogUtils
+import com.k2fsa.sherpa.onnx.SileroVadModelConfig
 import com.k2fsa.sherpa.onnx.Vad
+import com.k2fsa.sherpa.onnx.VadModelConfig
 import com.k2fsa.sherpa.onnx.getVadModelConfig
 import kotlin.math.sqrt
 class VadManager(
     assetManager: AssetManager,
     private val onSpeechStart: () -> Unit,
-    private val onSpeechEnd: (avgEnergy: Float, peakRms: Float) -> Unit
+    private val onSpeechEnd: () -> Unit
 ) {
-    private val TAG = "SmartHuman-VadManager"
+    private val TAG = "VadManager"
     private val vad: Vad
     private var isSpeaking = false
     private var lastSpeechTime = 0L
-    // ========== 核心调整:区分活跃期/收尾期阈值 ==========
-    // 说话活跃期(容忍停顿)
-    private val ACTIVE_END_SILENCE_MS = 1500L // 活跃期基础静默(保留停顿容忍)
-    private val ACTIVE_CONSECUTIVE_FRAMES = 10 // 活跃期连续静音帧
-    // 说话收尾期(快速结束)
-    private val FINAL_END_SILENCE_MS = 800L // 收尾期基础静默缩短到800ms
-    private val FINAL_CONSECUTIVE_FRAMES = 5 // 收尾期连续静音帧5帧=100ms
-    // 收尾期触发条件最后一次有效语音后超过X秒判定为进入收尾期
-    private val FINAL_PHASE_TRIGGER_MS = 1000L // 1秒无有效语音进入收尾期
-    private val MAX_SILENCE_AFTER_SPEECH_MS = 2000L // 兜底阈值从3秒降到2秒
-    // 原有基础配置
+    private val ACTIVE_END_SILENCE_MS = 1500L
+    private val ACTIVE_CONSECUTIVE_FRAMES = 10
+    private val FINAL_END_SILENCE_MS = 800L
+    private val FINAL_CONSECUTIVE_FRAMES = 5
+    private val FINAL_PHASE_TRIGGER_MS = 1000L
+    private val MAX_SILENCE_AFTER_SPEECH_MS = 2000L
     private val MIN_EFFECTIVE_SPEECH_RMS = 0.001f
-    private val ENV_BASELINE_FACTOR = 1.2f
-    private var envBaselineRms = 0.0005f
-    private var lastEffectiveSpeechTime = 0L
     private var consecutiveSilenceFrames = 0
-    // 新增:收尾期标记
     private var isInFinalPhase = false
+    private var lastEffectiveSpeechTime = 0L
-    // 统计变量
-    private var speechEnergySum = 0f
-    private var speechFrameCount = 0
-    private var peakRms = 0f
-    private var totalFrames = 0
-    private var speechFrames = 0
-    private var continuousSpeechFrames = 0
-    private var lastFrameIsSpeech = false
-    private var peakPosition = 0
-    private var frameIndex = 0
-    private var activeFrameCount = 0
-    private var activeSpeechFrameCount = 0
     init {
-        val config = getVadModelConfig(0) ?: throw IllegalStateException("[$TAG] VAD config not found")
-        vad = Vad(assetManager, config)
+        val config = getVadModelConfig(0)
+            ?: throw IllegalStateException("[$TAG] VAD config not found")
+        vad = Vad(assetManager, VadModelConfig(sileroVadModelConfig = SileroVadModelConfig(model = "silero_vad.onnx", threshold = 0.2f)))
         LogUtils.i(TAG, "✅ VAD 初始化成功")
     }
     fun accept(samples: FloatArray) {
         val now = System.currentTimeMillis()
         vad.acceptWaveform(samples)
         val vadHasSpeech = vad.isSpeechDetected()
         val rms = calcRms(samples)
-        // 环境基线更新
-        if (!vadHasSpeech || rms < MIN_EFFECTIVE_SPEECH_RMS) {
-            envBaselineRms = (envBaselineRms * 0.9f) + (rms * 0.1f)
-        }
-        val effectiveSpeechThreshold = maxOf(MIN_EFFECTIVE_SPEECH_RMS, envBaselineRms * ENV_BASELINE_FACTOR)
-        val isEffectiveSpeech = vadHasSpeech && rms >= effectiveSpeechThreshold
-        // ========== 核心优化:动态判定收尾期 ==========
+        val isEffectiveSpeech = vadHasSpeech && rms >= MIN_EFFECTIVE_SPEECH_RMS
         if (isEffectiveSpeech) {
             lastEffectiveSpeechTime = now
-            isInFinalPhase = false // 有有效语音,退出收尾期
+            isInFinalPhase = false
+            lastSpeechTime = now
+            consecutiveSilenceFrames = 0
         } else {
-            // 最后一次有效语音后超过1秒进入收尾期
+            consecutiveSilenceFrames++
             if (now - lastEffectiveSpeechTime >= FINAL_PHASE_TRIGGER_MS) {
                 isInFinalPhase = true
             }
         }
-        // 语音能量统计
-        if (isEffectiveSpeech) {
-            speechEnergySum += rms
-            speechFrameCount++
-            peakRms = maxOf(peakRms, rms)
-            lastSpeechTime = now
-            consecutiveSilenceFrames = 0
-            LogUtils.v(TAG, "🔊 有效语音帧 | RMS: $rms | 阈值: $effectiveSpeechThreshold | 收尾期: $isInFinalPhase")
-        } else {
-            consecutiveSilenceFrames++
-            LogUtils.v(TAG, if (vadHasSpeech) "⚠ 低能量语音帧 | RMS: $rms | 阈值: $effectiveSpeechThreshold"
-            else "🔇 静音帧 | 连续静音帧: $consecutiveSilenceFrames | 收尾期: $isInFinalPhase")
-        }
-        // 帧统计
-        totalFrames++
-        frameIndex++
-        if (isEffectiveSpeech) {
-            speechFrames++
-            continuousSpeechFrames = if (lastFrameIsSpeech) continuousSpeechFrames + 1 else 1
-            lastFrameIsSpeech = true
-            if (rms == peakRms) peakPosition = frameIndex
-        } else {
-            lastFrameIsSpeech = false
-        }
-        // ========== 核心优化:根据收尾期选择不同阈值 ==========
-        val (endSilenceMs, consecutiveFrames) = if (isInFinalPhase) {
-            Pair(FINAL_END_SILENCE_MS, FINAL_CONSECUTIVE_FRAMES)
-        } else {
-            Pair(ACTIVE_END_SILENCE_MS, ACTIVE_CONSECUTIVE_FRAMES)
-        }
-        // VAD状态流转
+        val (endSilenceMs, endFrames) =
+            if (isInFinalPhase)
+                FINAL_END_SILENCE_MS to FINAL_CONSECUTIVE_FRAMES
+            else
+                ACTIVE_END_SILENCE_MS to ACTIVE_CONSECUTIVE_FRAMES
         if (isEffectiveSpeech) {
             if (!isSpeaking) {
                 isSpeaking = true
-                LogUtils.d(TAG, "🎤 有效语音开始 | 阈值: $effectiveSpeechThreshold")
                 onSpeechStart()
             }
-            activeFrameCount++
-            activeSpeechFrameCount++
-        } else {
-            if (isSpeaking) {
-                activeFrameCount++
-                val vadSilenceDuration = now - lastSpeechTime
-                val effectiveSilenceDuration = now - lastEffectiveSpeechTime
-                // 触发结束条件:适配当前阶段的阈值
-                val isSilenceTimeout = (vadSilenceDuration >= endSilenceMs ||
-                        effectiveSilenceDuration >= MAX_SILENCE_AFTER_SPEECH_MS) &&
-                        consecutiveSilenceFrames >= consecutiveFrames
-                if (isSilenceTimeout) {
-                    isSpeaking = false
-                    isInFinalPhase = false // 重置收尾期
-                    val avgEnergy = if (speechFrameCount > 0) speechEnergySum / speechFrameCount else 0f
-                    LogUtils.d(TAG, """
-                        🛑 语音结束
-                        - 有效静默时长: ${effectiveSilenceDuration}ms
-                        - 连续静音帧: $consecutiveSilenceFrames
-                        - 平均能量: $avgEnergy | 峰值: $peakRms
-                        - 收尾期: $isInFinalPhase | 所用阈值: $endSilenceMs ms
-                    """.trimIndent())
-                    onSpeechEnd(avgEnergy, peakRms)
-                    resetStats()
-                } else {
-                    LogUtils.v(TAG, "⏳ 静默中 | 连续静音帧: $consecutiveSilenceFrames | 静默时长: ${effectiveSilenceDuration}ms | 所用阈值: $endSilenceMs ms")
-                }
-            }
-        }
-    }
-    // 保留原有方法...
-    fun isSpeechDetected(): Boolean {
-        return vad.isSpeechDetected()
-    }
-    fun activeSpeechRatio(): Float {
-        val ratio = if (activeFrameCount == 0) 0f else activeSpeechFrameCount.toFloat() / activeFrameCount
-        LogUtils.d(TAG, "📊 语音占比: $ratio | 有效语音帧: $activeSpeechFrameCount | 总帧: $activeFrameCount")
-        return ratio
-    }
-    fun getTotalFrames(): Int = totalFrames
-    fun getSpeechFrames(): Int = speechFrames
-    fun getContinuousSpeechFrames(): Int = continuousSpeechFrames
-    fun getPeakPositionRatio(): Float {
-        return if (totalFrames == 0) 0f else peakPosition.toFloat() / totalFrames
-    }
+        } else if (isSpeaking) {
+            val silenceMs = now - lastSpeechTime
+            val effectiveSilenceMs = now - lastEffectiveSpeechTime
+            val shouldEnd =
+                (silenceMs >= endSilenceMs ||
+                        effectiveSilenceMs >= MAX_SILENCE_AFTER_SPEECH_MS) &&
+                        consecutiveSilenceFrames >= endFrames
+            if (shouldEnd) {
+                onSpeechEnd()
+                reset()
+                isSpeaking = false
+                isInFinalPhase = false
+            }
+        }
+    }
     fun reset() {
         isSpeaking = false
         lastSpeechTime = 0L
         lastEffectiveSpeechTime = 0L
-        envBaselineRms = 0.0005f
         consecutiveSilenceFrames = 0
-        isInFinalPhase = false // 重置收尾期
-        resetStats()
+        isInFinalPhase = false
         vad.reset()
-        totalFrames = 0
-        speechFrames = 0
-        continuousSpeechFrames = 0
-        lastFrameIsSpeech = false
-        peakPosition = 0
-        frameIndex = 0
-        LogUtils.d(TAG, "🔄 VAD 状态已完全重置")
-    }
-    private fun resetStats() {
-        activeFrameCount = 0
-        activeSpeechFrameCount = 0
-        speechEnergySum = 0f
-        speechFrameCount = 0
-        peakRms = 0f
     }
     fun calcRms(samples: FloatArray): Float {
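
Note: the retained endpoint logic is two-phase. While the speaker is active, up to 1.5 s of silence (ACTIVE_END_SILENCE_MS) is tolerated so mid-sentence pauses do not cut the utterance; once no effective speech has arrived for FINAL_PHASE_TRIGGER_MS (1 s), the shorter 800 ms / 5-frame thresholds apply, and 2 s of effective silence ends the utterance regardless. Per the removed "5帧=100ms" comment, frames are 20 ms. A minimal, self-contained restatement of the rule (hypothetical names; the real class keeps these fields across accept() calls):

    data class EndpointState(
        val lastSpeechMs: Long,          // last frame the VAD flagged as speech
        val lastEffectiveSpeechMs: Long, // last speech frame above MIN_EFFECTIVE_SPEECH_RMS
        val silentFrames: Int            // consecutive non-speech frames
    )

    fun shouldEndUtterance(s: EndpointState, nowMs: Long): Boolean {
        val finalPhase = nowMs - s.lastEffectiveSpeechMs >= 1000L  // FINAL_PHASE_TRIGGER_MS
        val (endSilenceMs, endFrames) =
            if (finalPhase) 800L to 5    // FINAL_*: end quickly
            else 1500L to 10             // ACTIVE_*: tolerate pauses
        return (nowMs - s.lastSpeechMs >= endSilenceMs ||
                nowMs - s.lastEffectiveSpeechMs >= 2000L /* MAX_SILENCE_AFTER_SPEECH_MS */) &&
                s.silentFrames >= endFrames
    }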


@@ -15,7 +15,7 @@ class VoiceController(
     assetManager: AssetManager,
     private val onWakeup: () -> Unit,
     private val onFinalAudio: (FloatArray) -> Unit,
-    idleTimeoutSeconds: Int = 200,
+    idleTimeoutSeconds: Int = 10,
     maxRecordingSeconds: Int = 10,
     private val onStateChanged: ((VoiceState) -> Unit)? = null,
     private val stopBackendAudio: (() -> Unit)? = null,
@@ -30,20 +30,14 @@ class VoiceController(
         // 预缓存大小2秒
         private const val PRE_BUFFER_SIZE = SAMPLE_RATE * 2
-        // ========== 核心:分场景声纹阈值(极简版) ==========
-        private const val SPEAKER_THRESHOLD_QUIET = 0.50f // 安静环境
-        private const val SPEAKER_THRESHOLD_NOISY = 0.45f // 嘈杂环境(匹配你的真实相似度)
-        private const val SPEAKER_THRESHOLD_SHORT = 0.43f // 短语音(<1秒)
         // 短语音判定阈值
         private const val SHORT_AUDIO_DURATION_MS = 1000L
         private const val INVALID_RESET_DEBOUNCE_MS = 1500L
         // 最小语音时长
         private const val MIN_SPEECH_MS = 800L
-        private const val MIN_EFFECTIVE_VOICE_DURATION = 400L
-        // 噪音场景判定阈值
-        private const val NOISE_BASELINE_THRESHOLD = 0.01f
+        // 统一的声纹验证阈值(不再分场景)
+        private const val SPEAKER_THRESHOLD = 0.45f
     }
     var state: VoiceState = VoiceState.WAIT_WAKEUP
@@ -53,26 +47,17 @@ class VoiceController(
             onStateChanged?.invoke(value)
         }
-    // 实时能量与帧统计变量
-    private var realtimeEnergySum = 0f
-    private var realtimeEnergyCount = 0
-    private var realtimePeakRms = 0f
-    private var realtimeTotalFrames = 0
-    private var realtimeSpeechFrames = 0
-    private var realtimeContinuousSpeechFrames = 0
-    private var realtimeLastFrameIsSpeech = false
-    private var isMultiPersonDialogueDetected = false
+    // 无效说话标记 + 超时类型
+    private var hasInvalidSpeech = false
+    private var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT
     private var lastInvalidResetMs = 0L
     private val speakerManagerLock = ReentrantLock()
-    // 环境噪音状态标记
-    private var isNoisyEnvironment = false
     private val wakeupManager = WakeupManager(assetManager, onWakeup)
     private val vadManager = VadManager(
         assetManager,
         onSpeechStart = { onVadStart() },
-        onSpeechEnd = { avgEnergy, peakRms -> onVadEnd(avgEnergy, peakRms) }
+        onSpeechEnd = { onVadEnd() }
     )
     private val audioBuffer = mutableListOf<Float>()
@@ -92,50 +77,6 @@ class VoiceController(
     private val idleTimeoutMs = idleTimeoutSeconds * 1000L
     private val maxRecordingMs = maxRecordingSeconds * 1000L
-    // 分场景动态系数(保留原有逻辑)
-    private val BASELINE_WINDOW_SIZE = 50
-    private val envNoiseBuffer = ArrayDeque<Float>(BASELINE_WINDOW_SIZE)
-    private var currentEnvBaseline = 0.001f
-    // 分场景动态系数
-    private val BASELINE_QUIET_THRESHOLD = 0.005f
-    private val SHORT_SPEECH_ENERGY_COEFF_QUIET = 1.5f
-    private val SHORT_SPEECH_ENERGY_COEFF_NOISY = 2.0f
-    private val LONG_SPEECH_ENERGY_COEFF_QUIET = 2.5f
-    private val LONG_SPEECH_ENERGY_COEFF_NOISY = 3.5f
-    private val SHORT_SPEECH_VAD_COEFF = 0.05f
-    private val LONG_SPEECH_VAD_COEFF = 0.10f
-    private val SHORT_SPEECH_MIN_SCORE = 1
-    private val LONG_SPEECH_MIN_SCORE = 1
-    // 其他过滤参数
-    private val MAX_FAR_FIELD_ENERGY = 0.015f
-    private val MIN_VALID_PEAK_AVG_RATIO = 0.5f
-    private val MIN_CONTINUOUS_FRAME_RATIO = 0.1f
-    private val MAX_PEAK_POSITION_RATIO = 0.95f
-    private val MIN_EFFECTIVE_SPEECH_FRAMES = 3
-    private val SHORT_SPEECH_MIN = 500L
-    private val SHORT_SPEECH_MAX = 2000L
-    // 多人对话过滤配置
-    private val MULTI_DIALOGUE_MIN_DURATION = 2500L
-    private val MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO = 2.5f
-    private val MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO = 0.4f
-    private val MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO = 0.3f
-    private val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f
-    // 微弱人声过滤配置
-    private val MIN_VOICE_FRAME_RATIO = 0.08f
-    private val MIN_PEAK_ENERGY_RATIO = 1.5f
-    private val NORMAL_VOICE_ENERGY_THRESHOLD = 0.008f
-    private val MIN_CONTINUOUS_VOICE_FRAMES = 1
-    private val MIN_EFFECTIVE_SPEECH_RMS = 0.0005f
-    // 无效说话标记 + 超时类型
-    private var hasInvalidSpeech = false
-    private var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT
     // 声纹验证相关
     private val CURRENT_USER_ID = "current_wakeup_user"
     private val ENABLE_STRICT_SPEAKER_VERIFY = true
@@ -200,12 +141,6 @@ class VoiceController(
         val now = System.currentTimeMillis()
-        if (state == VoiceState.WAIT_WAKEUP) {
-            calibrateEnvBaseline(samples)
-            isNoisyEnvironment = currentEnvBaseline >= NOISE_BASELINE_THRESHOLD
-            LogUtils.d(TAG, "📊 环境状态 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
-        }
         when (state) {
             VoiceState.WAIT_WAKEUP,
             VoiceState.PLAYING_PROMPT,
@@ -246,81 +181,15 @@ class VoiceController(
                 audioBuffer.addAll(samples.asList())
                 vadManager.accept(samples)
-                calibrateEnvBaseline(samples)
-                updateRealtimeEnergy(samples)
-                updateRealtimeFrameStats()
-                isNoisyEnvironment = currentEnvBaseline >= NOISE_BASELINE_THRESHOLD
-                if (checkMultiPersonDialogueRealtime(now)) {
-                    LogUtils.w(TAG, "🚨 录音中识别出多人对话,提前终止")
-                    finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms)
-                    return
-                }
+                // 仅保留最大录音时长判断
                 if (System.currentTimeMillis() - recordingStartMs > maxRecordingMs) {
-                    LogUtils.w(TAG, "⏱ Max recording reached | 当前环境基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
-                    finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms)
+                    LogUtils.w(TAG, "⏱ Max recording reached")
+                    finishSentence()
                 }
             }
         }
     }
-    /* ================= 实时能量更新 ================= */
-    private fun updateRealtimeEnergy(samples: FloatArray) {
-        val rms = vadManager.calcRms(samples)
-        val effectiveThreshold = if (isNoisyEnvironment) currentEnvBaseline * 1.8f else MIN_EFFECTIVE_SPEECH_RMS
-        if (rms >= effectiveThreshold) {
-            realtimeEnergySum += rms
-            realtimeEnergyCount++
-            realtimePeakRms = maxOf(realtimePeakRms, rms)
-        }
-    }
-    /* ================= 实时帧统计 ================= */
-    private fun updateRealtimeFrameStats() {
-        realtimeTotalFrames = vadManager.getTotalFrames()
-        realtimeSpeechFrames = vadManager.getSpeechFrames()
-        realtimeContinuousSpeechFrames = vadManager.getContinuousSpeechFrames()
-        val currentFrameIsSpeech = vadManager.isSpeechDetected()
-        if (currentFrameIsSpeech) {
-            realtimeContinuousSpeechFrames = if (realtimeLastFrameIsSpeech) realtimeContinuousSpeechFrames + 1 else 1
-        } else {
-            realtimeContinuousSpeechFrames = 0
-        }
-        realtimeLastFrameIsSpeech = currentFrameIsSpeech
-    }
-    /* ================= 多人对话检测 ================= */
-    private fun checkMultiPersonDialogueRealtime(now: Long): Boolean {
-        val duration = now - recordingStartMs
-        if (duration < MULTI_DIALOGUE_MIN_DURATION) return false
-        val avgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else 0f
-        val peakAvgRatio = if (avgEnergy > 0) realtimePeakRms / avgEnergy else 0f
-        val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f
-        val vadRatio = vadManager.activeSpeechRatio()
-        isMultiPersonDialogueDetected = duration >= MULTI_DIALOGUE_MIN_DURATION &&
-                peakAvgRatio in MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO..MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO &&
-                continuousRatio <= MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO &&
-                vadRatio >= MULTI_DIALOGUE_MIN_VAD_RATIO
-        return isMultiPersonDialogueDetected
-    }
-    /* ================= 环境基线校准 ================= */
-    private fun calibrateEnvBaseline(samples: FloatArray) {
-        val rms = vadManager.calcRms(samples)
-        val validRms = if (rms < currentEnvBaseline + 0.002f) rms else currentEnvBaseline
-        if (rms < 0.015f) {
-            if (envNoiseBuffer.size >= BASELINE_WINDOW_SIZE) {
-                envNoiseBuffer.removeFirst()
-            }
-            envNoiseBuffer.addLast(validRms)
-            currentEnvBaseline = envNoiseBuffer.maxOrNull() ?: 0.001f
-        }
-    }
     /* ================= 唤醒处理 ================= */
     private fun handleWakeupEvent() {
         if (state == VoiceState.UPLOADING) return
@@ -339,207 +208,73 @@ class VoiceController(
             audioBuffer.clear()
             vadManager.reset()
             vadStarted = false
-            resetRealtimeStats()
         }
         inKwsObserve = true
         kwsObserveStartMs = System.currentTimeMillis()
         onWakeup()
-        LogUtils.d(TAG, "🔔 唤醒成功 | 环境基线: $currentEnvBaseline")
+        LogUtils.d(TAG, "🔔 唤醒成功")
     }
     private fun onVadStart() {
         if (state != VoiceState.WAIT_SPEECH) return
-        LogUtils.d(TAG, "🎤 REAL VAD START | 环境基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
+        LogUtils.d(TAG, "🎤 REAL VAD START")
         vadStarted = true
         recordingStartMs = System.currentTimeMillis()
         audioBuffer.clear()
         audioBuffer.addAll(preBuffer)
-        resetRealtimeStats()
         state = VoiceState.RECORDING
     }
-    private fun onVadEnd(avgEnergy: Float, peakRms: Float) {
+    private fun onVadEnd() {
         if (state != VoiceState.RECORDING) return
-        LogUtils.d(TAG, "🧠 VAD END | 环境基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
-        val realAvgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else avgEnergy
-        val realPeakRms = if (realtimePeakRms > 0) realtimePeakRms else peakRms
-        finishSentence(realAvgEnergy, realPeakRms)
-    }
-    /* ================= 微弱人声过滤 ================= */
-    private fun filterWeakVoice(duration: Long, avgEnergy: Float, peakRms: Float): Boolean {
-        if (duration < MIN_EFFECTIVE_VOICE_DURATION) {
-            LogUtils.w(TAG, "❌ 微弱人声过滤:时长${duration}ms < ${MIN_EFFECTIVE_VOICE_DURATION}ms")
-            return true
-        }
-        val voiceFrameRatio = if (realtimeTotalFrames > 0) realtimeSpeechFrames.toFloat() / realtimeTotalFrames else 0f
-        if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && voiceFrameRatio < MIN_VOICE_FRAME_RATIO) {
-            LogUtils.w(TAG, "❌ 微弱人声过滤:帧占比${voiceFrameRatio} < ${MIN_VOICE_FRAME_RATIO}")
-            return true
-        }
-        val peakBaselineRatio = peakRms / currentEnvBaseline
-        if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && peakBaselineRatio < MIN_PEAK_ENERGY_RATIO) {
-            LogUtils.w(TAG, "❌ 微弱人声过滤:峰值/基线${peakBaselineRatio} < ${MIN_PEAK_ENERGY_RATIO}")
-            return true
-        }
-        if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && realtimeContinuousSpeechFrames < MIN_CONTINUOUS_VOICE_FRAMES) {
-            LogUtils.w(TAG, "❌ 微弱人声过滤:连续帧${realtimeContinuousSpeechFrames} < ${MIN_CONTINUOUS_VOICE_FRAMES}")
-            return true
-        }
-        val energyBaselineRatio = avgEnergy / currentEnvBaseline
-        if (avgEnergy < 0.005f && energyBaselineRatio < 1.2f) {
-            LogUtils.w(TAG, "❌ 微弱人声过滤:能量/基线${energyBaselineRatio} < 1.2")
-            return true
-        }
-        return false
+        LogUtils.d(TAG, "🧠 VAD END")
+        finishSentence()
     }
     /* ================= 结束录音 ================= */
-    private fun finishSentence(avgEnergy: Float = 0f, peakRms: Float = 0f) {
+    private fun finishSentence() {
         val now = System.currentTimeMillis()
         val duration = now - recordingStartMs
         if (!vadStarted || duration < MIN_SPEECH_MS) {
-            LogUtils.d(TAG, "❌ 语音过短: $duration ms | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
-            hasInvalidSpeech = true
-            resetToWaitSpeech()
-            return
-        }
-        if (filterWeakVoice(duration, avgEnergy, peakRms)) {
+            LogUtils.d(TAG, "❌ 语音过短: $duration ms")
             hasInvalidSpeech = true
             resetToWaitSpeech()
             return
         }
         val audio = audioBuffer.toFloatArray()
-        val vadRatio = vadManager.activeSpeechRatio()
-        val peakAvgRatio = if (avgEnergy > 0f) peakRms / avgEnergy else 0f
-        LogUtils.d(TAG, "📊 录音信息 | 时长: $duration ms | 能量: $avgEnergy | 峰均比: $peakAvgRatio | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
-        LogUtils.d(TAG, "📊 实时帧统计 | 总帧: $realtimeTotalFrames | 语音帧: $realtimeSpeechFrames | 连续语音帧: $realtimeContinuousSpeechFrames")
-        if (isMultiPersonDialogueDetected) {
-            LogUtils.w(TAG, "❌ 过滤多人对话垃圾语音 | 时长: $duration ms")
-            hasInvalidSpeech = true
-            resetToWaitSpeech()
-            return
-        }
-        // 声纹验证(核心极简版)
+        // 声纹验证(保留核心逻辑)
         if (ENABLE_STRICT_SPEAKER_VERIFY) {
             val isCurrentUser = verifySpeaker(audio)
             if (!isCurrentUser) {
-                LogUtils.w(TAG, "❌ 非当前唤醒用户,拒绝语音 | 录音时长: $duration ms | 嘈杂环境: $isNoisyEnvironment")
+                LogUtils.w(TAG, "❌ 非当前唤醒用户,拒绝语音 | 录音时长: $duration ms")
                 hasInvalidSpeech = true
                 resetToWaitSpeech()
                 return
             }
-            LogUtils.d(TAG, "✅ 当前用户语音,继续处理 | 录音时长: $duration ms | 嘈杂环境: $isNoisyEnvironment")
+            LogUtils.d(TAG, "✅ 当前用户语音,继续处理 | 录音时长: $duration ms")
         }
-        // 远场过滤
-        val isFarField = avgEnergy < MAX_FAR_FIELD_ENERGY
-        val isInvalidPeakRatio = peakAvgRatio < MIN_VALID_PEAK_AVG_RATIO
-        if (isFarField && isInvalidPeakRatio) {
-            LogUtils.w(TAG, "❌ 远场/无效语音过滤 | 能量: $avgEnergy < $MAX_FAR_FIELD_ENERGY")
-            hasInvalidSpeech = true
-            resetToWaitSpeech()
-            return
-        }
-        // 非连续判定
-        val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f
-        val peakPositionRatio = vadManager.getPeakPositionRatio()
-        val isDiscontinuous = continuousRatio < MIN_CONTINUOUS_FRAME_RATIO &&
-                realtimeSpeechFrames < MIN_EFFECTIVE_SPEECH_FRAMES &&
-                peakPositionRatio > MAX_PEAK_POSITION_RATIO
-        if (isDiscontinuous) {
-            LogUtils.w(TAG, "❌ 非连续杂音过滤 | 连续占比: $continuousRatio < $MIN_CONTINUOUS_FRAME_RATIO")
-            hasInvalidSpeech = true
-            resetToWaitSpeech()
-            return
-        }
-        // 分场景阈值过滤
-        val isQuietEnv = currentEnvBaseline < BASELINE_QUIET_THRESHOLD
-        val thresholdConfig = when {
-            duration in SHORT_SPEECH_MIN..SHORT_SPEECH_MAX -> {
-                val coeff = if (isQuietEnv) SHORT_SPEECH_ENERGY_COEFF_QUIET else SHORT_SPEECH_ENERGY_COEFF_NOISY
-                val energyThreshold = currentEnvBaseline * coeff
-                ThresholdConfig(energyThreshold, SHORT_SPEECH_VAD_COEFF, SHORT_SPEECH_MIN_SCORE, "短语音")
-            }
-            else -> {
-                val coeff = if (isQuietEnv) LONG_SPEECH_ENERGY_COEFF_QUIET else LONG_SPEECH_ENERGY_COEFF_NOISY
-                val energyThreshold = currentEnvBaseline * coeff
-                ThresholdConfig(energyThreshold, LONG_SPEECH_VAD_COEFF, LONG_SPEECH_MIN_SCORE, "长语音")
-            }
-        }
-        val energyPass = avgEnergy >= thresholdConfig.energyThreshold
-        val vadRatioPass = vadRatio >= thresholdConfig.vadRatioThreshold
-        if (!energyPass || !vadRatioPass) {
-            LogUtils.w(TAG, "❌ 低能量语音阈值过滤 | 能量: $avgEnergy < ${thresholdConfig.energyThreshold} | 占比: $vadRatio < ${thresholdConfig.vadRatioThreshold} | 场景: ${thresholdConfig.scene}")
-            hasInvalidSpeech = true
-            resetToWaitSpeech()
-            return
-        }
-        // 评分判定
-        var score = 0
-        score += when {
-            duration >= 4000 -> 3
-            duration >= 2500 -> 2
-            else -> 1
-        }
-        score += if (avgEnergy >= thresholdConfig.energyThreshold) 1 else 0
-        score += if (continuousRatio >= MIN_CONTINUOUS_FRAME_RATIO) 1 else 0
-        val pass = score >= thresholdConfig.minScore
-        if (!pass) {
-            LogUtils.w(TAG, "❌ 评分不足过滤 | 总分: $score < ${thresholdConfig.minScore} | 场景: ${thresholdConfig.scene}")
-            hasInvalidSpeech = true
-            resetToWaitSpeech()
-            return
-        }
         // 最终通过
         audioBuffer.clear()
         state = VoiceState.UPLOADING
         onFinalAudio(audio)
-        resetRealtimeStats()
         hasInvalidSpeech = false
-        LogUtils.i(TAG, "✅ 语音通过 | 时长: $duration ms | 能量: $avgEnergy | 场景: ${thresholdConfig.scene} | 嘈杂环境: $isNoisyEnvironment")
-    }
-    /* ================= 重置实时统计 ================= */
-    private fun resetRealtimeStats() {
-        realtimeEnergySum = 0f
-        realtimeEnergyCount = 0
-        realtimePeakRms = 0f
-        realtimeTotalFrames = 0
-        realtimeSpeechFrames = 0
-        realtimeContinuousSpeechFrames = 0
-        realtimeLastFrameIsSpeech = false
-        isMultiPersonDialogueDetected = false
+        LogUtils.i(TAG, "✅ 语音通过 | 时长: $duration ms")
     }
     /* ================= 播放/上传回调 ================= */
     fun onPlayStartPrompt() {
-        LogUtils.d(TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
+        LogUtils.d(TAG, "🎵 播放提示音")
         state = VoiceState.PLAYING_PROMPT
     }
     fun onPlayEndPrompt() {
         speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
-        LogUtils.d(TAG, "🎵 提示音结束 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
+        LogUtils.d(TAG, "🎵 提示音结束")
         state = VoiceState.WAIT_SPEECH_COOLDOWN
     }
@@ -548,19 +283,19 @@ class VoiceController(
             LogUtils.w(TAG, "🎶 非上传完成状态,禁止切换到 PLAYING_BACKEND | 当前状态: $state")
             return
         }
-        LogUtils.d(TAG, "🎶 开始播放后台音频 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
+        LogUtils.d(TAG, "🎶 开始播放后台音频")
         state = VoiceState.PLAYING_BACKEND
     }
     fun onPlayEndBackend() {
         speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
-        LogUtils.d(TAG, "🎶 后台音频结束 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
+        LogUtils.d(TAG, "🎶 后台音频结束")
         state = VoiceState.WAIT_SPEECH_COOLDOWN
     }
     fun onUploadFinished(success: Boolean) {
         if (state != VoiceState.UPLOADING) return
-        LogUtils.d(TAG, "📤 上传完成 | 成功: $success | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
+        LogUtils.d(TAG, "📤 上传完成 | 成功: $success")
         if (!success) {
             speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
@@ -569,7 +304,7 @@ class VoiceController(
     }
     private fun resetToWaitSpeech() {
-        LogUtils.d(TAG, "🔄 重置到等待说话 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment | 已标记无效说话: $hasInvalidSpeech")
+        LogUtils.d(TAG, "🔄 重置到等待说话 | 已标记无效说话: $hasInvalidSpeech")
         val now = System.currentTimeMillis()
         if (now - lastInvalidResetMs < INVALID_RESET_DEBOUNCE_MS) {
             LogUtils.d(TAG, "🛡 防抖1.5秒内重复无效语音,跳过重置")
@@ -579,13 +314,12 @@ class VoiceController(
         audioBuffer.clear()
         vadManager.reset()
         vadStarted = false
-        resetRealtimeStats()
         state = VoiceState.WAIT_SPEECH
         if (waitSpeechFailStartMs == 0L) waitSpeechFailStartMs = System.currentTimeMillis()
     }
     private fun resetAll() {
-        LogUtils.d(TAG, "🔄 重置所有状态 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment | 本次超时类型: $currentTimeoutType")
+        LogUtils.d(TAG, "🔄 重置所有状态 | 本次超时类型: $currentTimeoutType")
         audioBuffer.clear()
         preBuffer.clear()
         vadManager.reset()
@@ -593,24 +327,17 @@ class VoiceController(
         vadStarted = false
         waitSpeechStartMs = 0L
         waitSpeechFailStartMs = 0L
-        envNoiseBuffer.clear()
-        currentEnvBaseline = 0.001f
-        isNoisyEnvironment = false
-        resetRealtimeStats()
         hasInvalidSpeech = false
         currentTimeoutType = TimeoutType.IDLE_TIMEOUT
         state = VoiceState.WAIT_WAKEUP
     }
     fun release() {
-        LogUtils.d(TAG, "🔌 释放资源 | 最终基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
+        LogUtils.d(TAG, "🔌 释放资源")
         wakeupManager.release()
         vadManager.reset()
-        envNoiseBuffer.clear()
-        resetRealtimeStats()
         hasInvalidSpeech = false
         currentTimeoutType = TimeoutType.IDLE_TIMEOUT
-        isNoisyEnvironment = false
         runCatching {
             SpeakerRecognition.extractor.release()
@@ -638,24 +365,14 @@ class VoiceController(
         }
     }
-    // 阈值配置数据类
-    private data class ThresholdConfig(
-        val energyThreshold: Float,
-        val vadRatioThreshold: Float,
-        val minScore: Int,
-        val scene: String
-    )
-    /* ================= 核心:极简版声纹验证 ================= */
     private fun verifySpeaker(audio: FloatArray): Boolean {
         if (audio.isEmpty()) {
             LogUtils.w(TAG, "❌ 待验证音频为空,声纹验证失败")
             return false
         }
-        // 1. 裁剪音频:只保留本次录音的有效部分(解决时长不匹配问题)
+        // 1. 裁剪音频:只保留本次录音的有效部分
         val audioDurationMs = (audio.size.toFloat() / SAMPLE_RATE * 1000).toLong()
-        // 只保留最后 N 毫秒的音频N = 实际录音时长),避免缓存旧音频
         val validAudio = if (audioDurationMs > 0) {
             val validSampleCount = (audioDurationMs * SAMPLE_RATE / 1000).toInt()
             if (validSampleCount < audio.size) {
@@ -667,45 +384,44 @@ class VoiceController(
             audio
         }
-        // 2. 分场景选阈值(无容错,只调阈值)
-        val finalThreshold = when {
-            audioDurationMs < SHORT_AUDIO_DURATION_MS -> SPEAKER_THRESHOLD_SHORT
-            isNoisyEnvironment -> SPEAKER_THRESHOLD_NOISY
-            else -> SPEAKER_THRESHOLD_QUIET
-        }
         var stream: OnlineStream? = null
-        return try {
-            stream = SpeakerRecognition.extractor.createStream()
-            stream.acceptWaveform(samples = validAudio, sampleRate = SAMPLE_RATE) // 用裁剪后的音频验证
-            stream.inputFinished()
-            if (!SpeakerRecognition.extractor.isReady(stream)) {
-                LogUtils.w(TAG, "❌ 音频Stream未就绪验证失败")
-                return false
-            }
-            val embedding = SpeakerRecognition.extractor.compute(stream)
-            // 3. 纯验证逻辑:过就过,不过就拒绝
-            speakerManagerLock.withLock {
-                val verifyPass = SpeakerRecognition.manager.verify(
-                    name = CURRENT_USER_ID,
-                    embedding = embedding,
-                    threshold = finalThreshold
-                )
-                // 打印关键信息(补充裁剪后时长)
-                LogUtils.d(TAG, "📊 声纹验证 | 阈值: $finalThreshold | 通过: $verifyPass | 嘈杂环境: $isNoisyEnvironment | 原始时长: ${audioDurationMs}ms | 验证时长: ${(validAudio.size.toFloat()/SAMPLE_RATE*1000).toLong()}ms")
-                // 无任何容错:验证结果就是最终结果
-                return verifyPass
-            }
-        } catch (e: Exception) {
-            // 处理所有异常情况
-            LogUtils.e(TAG, "❌ 声纹验证异常,拒绝", e)
-            return false
-        } finally {
-            stream?.release()
-        }
+        // 使用 runCatching 统一处理异常
+        return runCatching {
+            stream = SpeakerRecognition.extractor.createStream()
+            // 处理音频数据
+            stream?.acceptWaveform(samples = validAudio, sampleRate = SAMPLE_RATE)
+            stream?.inputFinished()
+            // 检查 stream 是否就绪
+            if (stream == null || !SpeakerRecognition.extractor.isReady(stream)) {
+                LogUtils.w(TAG, "❌ 音频Stream未就绪验证失败")
+                return@runCatching false
+            }
+            // 计算特征并验证
+            val embedding = SpeakerRecognition.extractor.compute(stream)
+            speakerManagerLock.withLock {
+                val verifyPass = SpeakerRecognition.manager.verify(
+                    name = CURRENT_USER_ID,
+                    embedding = embedding,
+                    threshold = SPEAKER_THRESHOLD
+                )
+                LogUtils.d(TAG, "📊 声纹验证 | 统一阈值: $SPEAKER_THRESHOLD | 通过: $verifyPass | 验证时长: ${(validAudio.size.toFloat()/SAMPLE_RATE*1000).toLong()}ms")
+                verifyPass
+            }
+        }.onFailure { e ->
+            LogUtils.e(TAG, "❌ 声纹验证异常,拒绝", e)
+        }.also {
+            // 确保 stream 资源释放
+            runCatching {
+                stream?.release()
+            }.onFailure { e ->
+                LogUtils.w(TAG, "⚠️ 释放 Stream 资源失败", e)
+            }
+        }.getOrDefault(false) // 异常时默认返回 false
     }
 }
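
Note: with the scene-dependent constants gone, every rejected utterance now exits through the same two gates (minimum duration, then verifySpeaker against the single SPEAKER_THRESHOLD = 0.45f), and verifySpeaker funnels all failures through one runCatching chain. The resource-cleanup shape of that chain generalizes; a hedged, stdlib-only sketch with generic names (not the sherpa-onnx types):

    fun <S> verifyWith(create: () -> S, release: (S) -> Unit, check: (S) -> Boolean): Boolean {
        var stream: S? = null
        return runCatching {
            stream = create()
            check(stream!!)                      // any exception falls through to onFailure
        }.onFailure { e ->
            println("verification failed: $e")   // reject, but keep the caller alive
        }.also {
            runCatching { stream?.let(release) } // best-effort release on success and failure
        }.getOrDefault(false)                    // errors count as "not the speaker"
    }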


@@ -26,7 +26,8 @@ class WakeupManager(assetManager: AssetManager, function: () -> Unit) {
         val config = KeywordSpotterConfig(
             featConfig = featConfig,
             modelConfig = modelConfig,
-            keywordsFile = keywordsFile
+            keywordsFile = keywordsFile,
+            keywordsThreshold = 0.2f
         )
         kws = KeywordSpotter(assetManager, config)
@@ -40,9 +41,9 @@
     /** ⭐ 永远喂 KWS */
     fun acceptAudio(samples: FloatArray) {
         val s = stream ?: return
-        for (i in samples.indices) {
-            samples[i] *= 2.5f
-        }
+//        for (i in samples.indices) {
+//            samples[i] *= 2.5f
+//        }
         s.acceptWaveform(samples, sampleRate)
         while (kws.isReady(s)) {
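
Note: the removed loop multiplied the incoming buffer in place by 2.5, which can clip float PCM at ±1.0 and mutates an array the caller may still feed to the VAD and recorder; the lowered keywordsThreshold = 0.2f appears to compensate by making the spotter more permissive instead, at some cost in false wakeups. If input gain is ever wanted again, a non-mutating, clamped variant would be safer — a hypothetical helper, not part of this commit:

    import kotlin.math.max
    import kotlin.math.min

    // Copies rather than mutating the shared buffer, and clamps to the
    // [-1, 1] float-PCM range so loud input cannot clip downstream.
    fun withGain(samples: FloatArray, gain: Float = 2.5f): FloatArray =
        FloatArray(samples.size) { i -> min(1f, max(-1f, samples[i] * gain)) }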


@@ -159,27 +159,27 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
             }
         }
-        mViewModel?.uploadVoiceLiveData?.observe(this) {
-            when (it) {
-                is ApiResult.Error -> {
-                    Toaster.showShort("上传失败")
-                    voiceController?.onUploadFinished(true)
-                }
-
-                is ApiResult.Success<String> -> {
-                    if (!TextUtils.isEmpty(it.data)) {
-                        Toaster.showShort(it.data)
-                    }
-                    Toaster.showShort(it)
-                    voiceController?.onUploadFinished(true)
-                    startPlayTimeoutJob?.cancel()
-                    startPlayTimeoutJob = lifecycleScope.launch {
-                        delay(PLAY_WAIT_TIMEOUT_MS)
-                        voiceController?.onPlayEndBackend()
-                    }
-                }
-            }
-        }
+//        mViewModel?.uploadVoiceLiveData?.observe(this) {
+//            when (it) {
+//                is ApiResult.Error -> {
+//                    Toaster.showShort("上传失败")
+//                    voiceController?.onUploadFinished(true)
+//                }
+//
+//                is ApiResult.Success<String> -> {
+//                    if (!TextUtils.isEmpty(it.data)) {
+//                        Toaster.showShort(it.data)
+//                    }
+//                    Toaster.showShort(it)
+//                    voiceController?.onUploadFinished(true)
+//                    startPlayTimeoutJob?.cancel()
+//                    startPlayTimeoutJob = lifecycleScope.launch {
+//                        delay(PLAY_WAIT_TIMEOUT_MS)
+//                        voiceController?.onPlayEndBackend()
+//                    }
+//                }
+//            }
+//        }
@@ -226,12 +226,12 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
 //            1
 //        )
 //        loadLocalJsonAndPlay()
-//        val file = File(
-//            getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
-//            "xxx.wav"
-//        )
-//        AudioDebugUtil.saveFloatPcmAsWav(audio, file)
-//        LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
+        val file = File(
+            getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
+            "xxx.wav"
+        )
+        AudioDebugUtil.saveFloatPcmAsWav(audio, file)
+        LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
 //        lifecycleScope.launch(Dispatchers.Main) {
 //
 //            mVerticalAnimator?.show()
@@ -280,9 +280,9 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
                 when (msg.msgContentType) {
                     MessageContentType.RECEIVE_VOICE_STREAM.msgContentType -> {
                         lifecycleScope.launch(Dispatchers.IO) {
-//                            UnityPlayerHolder.getInstance().startTalking(msg.content)
-                            val audioDTO = GsonUtils.fromJson(msg.content, AudioDTO::class.java)
-//                            voicePlayer.onAudioDTO(audioDTO)
+////                            UnityPlayerHolder.getInstance().startTalking(msg.content)
+//                            val audioDTO = GsonUtils.fromJson(msg.content, LmChatDTO::class.java)
+//                            voicePlayer.handleSlice(audioDTO)
                         }
                     }
                 }
@@ -586,9 +586,9 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
         super.onEvent(eventSource, id, type, data)
         LogUtils.eTag("lrsxxx", "onEvent:${data}")
         runCatching {
-//            val audioDTO = GsonUtils.fromJson(data, LmChatDTO::class.java)
-//            voicePlayer.handleSlice(audioDTO)
-            UnityPlayerHolder.getInstance().startTalking(data)
+            val audioDTO = GsonUtils.fromJson(data, LmChatDTO::class.java)
+            voicePlayer.handleSlice(audioDTO)
+//            UnityPlayerHolder.getInstance().startTalking(data)
         }.onFailure {
             LogUtils.eTag("lrsxxx", "解析音频数据失败", it)
             voiceController?.onUploadFinished(false)


@@ -4,92 +4,134 @@ import android.media.AudioAttributes
 import android.media.AudioFormat
 import android.media.AudioManager
 import android.media.AudioTrack
-import kotlinx.coroutines.CoroutineScope
-import kotlinx.coroutines.Dispatchers
-import kotlinx.coroutines.SupervisorJob
-import kotlinx.coroutines.cancel
-import kotlinx.coroutines.delay
-import kotlinx.coroutines.isActive
-import kotlinx.coroutines.launch
+import kotlinx.coroutines.*
 import java.util.ArrayDeque
 import java.util.Queue
 import java.util.concurrent.locks.ReentrantLock
+import kotlin.concurrent.withLock
+// ====================== PCM 播放器 ======================
 class PcmStreamPlayer(
     private val sampleRate: Int
 ) {
+    var onPlayEnd: (() -> Unit)? = null
     private val scope = CoroutineScope(SupervisorJob() + Dispatchers.IO)
     private val bufferQueue: Queue<ByteArray> = ArrayDeque()
-    private val queueLock = ReentrantLock()
+    private val lock = ReentrantLock()
     private var audioTrack: AudioTrack? = null
     @Volatile
     private var playing = true
+    // 新增:标记是否已释放,防止空指针
+    @Volatile
+    private var isReleased = false
     init {
         scope.launch {
+            val minBufferSize = AudioTrack.getMinBufferSize(
+                sampleRate,
+                AudioFormat.CHANNEL_OUT_MONO,
+                AudioFormat.ENCODING_PCM_16BIT
+            )
             audioTrack = AudioTrack(
                 AudioAttributes.Builder()
                     .setUsage(AudioAttributes.USAGE_MEDIA)
                     .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
                     .build(),
                 AudioFormat.Builder()
-                    .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
                     .setSampleRate(sampleRate)
+                    .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
                     .setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
                     .build(),
-                AudioTrack.getMinBufferSize(
-                    sampleRate,
-                    AudioFormat.CHANNEL_OUT_MONO,
-                    AudioFormat.ENCODING_PCM_16BIT
-                ),
+                minBufferSize * 2,
                 AudioTrack.MODE_STREAM,
                 AudioManager.AUDIO_SESSION_ID_GENERATE
             )
-            audioTrack?.play()
-            val silent = ByteArray(2048)
-            while (isActive && playing) {
-                val pcm = queueLock.run { bufferQueue.poll() }
-                if (pcm != null) {
+            // 空安全检查防止AudioTrack创建失败
+            audioTrack?.play() ?: run {
+                playing = false
+                isReleased = true
+                return@launch
+            }
+            // 🔥 AudioTrack 预热(非常关键)
+            warmUp()
+            val silent = ByteArray(1024)
+            while (isActive && playing && !isReleased) {
+                val pcm = lock.withLock {
+                    if (bufferQueue.isEmpty()) null else bufferQueue.poll()
+                }
+                if (pcm != null && !isReleased) {
                     audioTrack?.write(pcm, 0, pcm.size)
-                } else {
+                } else if (!isReleased) {
                     audioTrack?.write(silent, 0, silent.size)
                     delay(5)
                 }
             }
-            audioTrack?.stop()
-            audioTrack?.release()
+            // 释放前的空安全检查
+            audioTrack?.takeIf { !isReleased }?.stop()
+            audioTrack?.takeIf { !isReleased }?.release()
             audioTrack = null
+            onPlayEnd?.invoke()
         }
     }
+    private fun warmUp() {
+        if (isReleased) return
+        val warmUpMs = 50
+        val bytes = sampleRate * 2 * warmUpMs / 1000
+        val silence = ByteArray(bytes)
+        audioTrack?.write(silence, 0, silence.size)
+        audioTrack?.write(silence, 0, silence.size)
+    }
     fun pushPcm(pcm: ByteArray) {
-        queueLock.run { bufferQueue.add(pcm) }
+        if (isReleased) return
+        lock.withLock {
+            bufferQueue.add(pcm)
+        }
     }
     fun clearQueue() {
-        queueLock.run { bufferQueue.clear() }
+        if (isReleased) return
+        lock.withLock {
+            bufferQueue.clear()
+        }
    }
-    fun queueEmpty(): Boolean = queueLock.run { bufferQueue.isEmpty() }
+    fun queueEmpty(): Boolean {
+        if (isReleased) return true
+        return lock.withLock { bufferQueue.isEmpty() }
+    }
+    // 核心新增:强制停止当前播放,清空所有缓冲区
+    fun forceStop() {
+        if (isReleased) return
+        lock.withLock {
+            bufferQueue.clear()
+        }
+        // 清空AudioTrack内部缓冲区立即停止发声
+        audioTrack?.flush()
+        audioTrack?.pause() // 暂停硬件播放,避免残留静音数据
+    }
+    // 核心新增:重启播放器(用于停止后播放新音频)
+    fun restart() {
+        if (isReleased) return
+        audioTrack?.play()
+    }
     fun release() {
+        isReleased = true
         playing = false
-        queueLock.run { bufferQueue.clear() }
+        clearQueue()
         scope.cancel()
-        audioTrack?.stop()
-        audioTrack?.release()
-        audioTrack = null
     }
 }
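
Note: the rebuilt player now supports barge-in: forceStop() drops both the queued slices and AudioTrack's internal buffer and pauses the track, restart() resumes it, and onPlayEnd fires when the write loop exits. A hedged usage sketch (names are from the diff; the 16 kHz rate and the callback wiring are assumptions):

    fun demo() {
        val player = PcmStreamPlayer(sampleRate = 16000)
        player.onPlayEnd = { /* loop exited; e.g. re-enable the mic */ }

        // Normal streaming: slices are queued and drained by the IO coroutine.
        fun onTtsSlice(pcm: ByteArray) = player.pushPcm(pcm)

        // Barge-in: silence immediately, then start the new reply.
        fun onBargeIn(firstSlice: ByteArray) {
            player.forceStop()   // clear queue + flush track + pause
            player.restart()     // the track was only paused, so play() resumes
            player.pushPcm(firstSlice)
        }

        player.release()         // on shutdown; further calls are no-ops
    }

One ordering quirk may be worth a second look: release() sets isReleased = true and then calls clearQueue(), whose isReleased guard makes it return before clearing; the loop still exits via scope.cancel(), but the queue itself is only reclaimed by the garbage collector.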


@ -73,6 +73,7 @@ immersionbar-components = { module = "com.geyifeng.immersionbar:immersionbar-com
immersionbar-ktx = { module = "com.geyifeng.immersionbar:immersionbar-ktx", version.ref = "immersionbarKtx" } immersionbar-ktx = { module = "com.geyifeng.immersionbar:immersionbar-ktx", version.ref = "immersionbarKtx" }
immersionbar = { module = "com.geyifeng.immersionbar:immersionbar", version.ref = "immersionbar" } immersionbar = { module = "com.geyifeng.immersionbar:immersionbar", version.ref = "immersionbar" }
androidx-lifecycle-runtime-android = { group = "androidx.lifecycle", name = "lifecycle-runtime-android", version.ref = "lifecycleRuntimeAndroid" } androidx-lifecycle-runtime-android = { group = "androidx.lifecycle", name = "lifecycle-runtime-android", version.ref = "lifecycleRuntimeAndroid" }
[plugins] [plugins]
android-application = { id = "com.android.application", version.ref = "agp" } android-application = { id = "com.android.application", version.ref = "agp" }
kotlin-android = { id = "org.jetbrains.kotlin.android", version.ref = "kotlin" } kotlin-android = { id = "org.jetbrains.kotlin.android", version.ref = "kotlin" }