临时提交可以播放pcm的类

This commit is contained in:
林若思 2026-01-14 17:54:11 +08:00
parent 6f65a48686
commit d47362ca38
9 changed files with 752 additions and 969 deletions

View File

@ -0,0 +1,9 @@
package com.zs.smarthuman.bean

/**
 * Transfer objects for streamed TTS/PCM audio chunks.
 *
 * @author lrs
 * @since 2026/1/14
 */

/**
 * One audio payload: the PCM sampling rate plus the ordered chat items
 * that carry the encoded audio fragments.
 *
 * @property samplingRate PCM sampling rate in Hz; 0 when unknown.
 * @property items        mutable list of chat fragments belonging to this payload.
 */
data class AudioDTO(
    val samplingRate: Int = 0,
    val items: MutableList<LmChatDTO> = mutableListOf(),
)

/**
 * A single chat/audio fragment within an [AudioDTO].
 *
 * @property id        fragment identifier.
 * @property sortId    ordering key within the stream.
 * @property text      text associated with this fragment.
 * @property audioData encoded audio data (string form; format not shown here — confirm with producer).
 * @property isFinal   true when this is the last fragment of the utterance.
 */
data class LmChatDTO(
    val id: Int = 0,
    val sortId: Int = 0,
    val text: String = "",
    val audioData: String = "",
    val isFinal: Boolean = false,
)

View File

@ -1,162 +0,0 @@
package com.zs.smarthuman.sherpa

/**
 * Central configuration constants for the voice controller.
 *
 * Every hard-coded tuning parameter lives here so that a deployment can be
 * adapted to a new scene without touching business logic. All values target
 * 16 kHz mono speech; after changing any of them, re-validate in the target
 * environment (quiet / noisy / far-field).
 *
 * @author lrs
 * @since 2026/1/12
 */
object VoiceConfig {

    // ---------- Core (do not change casually) ----------

    /** Unified log tag for filtering voice-related logs. */
    const val TAG = "VoiceController"

    /** Fixed 16 kHz sampling rate; bound to the VAD / speaker models — never change. */
    const val SAMPLE_RATE = 16000

    /** Pre-roll buffer (2 s of samples) replayed after wake-up so the utterance start is not clipped. */
    const val PRE_BUFFER_SIZE = SAMPLE_RATE * 2

    /** Speaker-verification id of the currently woken user; business layer may swap it at runtime. */
    const val CURRENT_USER_ID = "current_wakeup_user"

    /** Strict speaker verification toggle (false while testing, true recommended in production). */
    const val ENABLE_STRICT_SPEAKER_VERIFY = true

    // ---------- Timing thresholds (ms) ----------

    /** Idle-timeout default in seconds (200 s as shipped; 5-30 s suggested for production). */
    const val IDLE_TIMEOUT_DEFAULT_SECONDS = 200

    /** Max recording length in seconds (10 s; short commands 3-5 s, long speech 10-20 s). */
    const val MAX_RECORDING_DEFAULT_SECONDS = 10

    /** Boundary (1 s) below which audio is treated as "short" for speaker verification. */
    const val SHORT_AUDIO_DURATION_MS = 1000L

    /** Debounce (1.5 s) so invalid-speech resets cannot repeat back-to-back. */
    const val INVALID_RESET_DEBOUNCE_MS = 1500L

    /** Minimum valid utterance length (800 ms) — filters key clicks and very short noise. */
    const val MIN_SPEECH_MS = 800L

    /** Base duration (400 ms) used by the weak-voice filter; roughly half of [MIN_SPEECH_MS]. */
    const val MIN_EFFECTIVE_VOICE_DURATION = 400L

    /** Observation window (500 ms) right after wake-up during which VAD output is ignored. */
    const val KWS_OBSERVE_MS = 500L

    /** Cooldown (300 ms) after prompt/backend audio before entering wait-for-speech. */
    const val SPEECH_COOLDOWN_MS = 300L

    /** Short-speech duration window (0.5-2 s) used by the per-scene threshold selection. */
    const val SHORT_SPEECH_MIN = 500L
    const val SHORT_SPEECH_MAX = 2000L

    /** Minimum duration (2.5 s) before multi-speaker detection may trigger. */
    const val MULTI_DIALOGUE_MIN_DURATION = 2500L

    // ---------- Environment / noise (energy) thresholds ----------

    /** Baseline at or above which the environment counts as noisy (quiet ~0.008f, noisy ~0.015f). */
    const val NOISE_BASELINE_THRESHOLD = 0.01f

    /** Sliding-window size (frames) for baseline calibration; 30-80 recommended. */
    const val BASELINE_WINDOW_SIZE = 50

    /** Baseline below which the environment counts as quiet (half of [NOISE_BASELINE_THRESHOLD]). */
    const val BASELINE_QUIET_THRESHOLD = 0.005f

    /** Minimum RMS for a frame to be counted as speech energy (0.0003-0.0008f suggested). */
    const val MIN_EFFECTIVE_SPEECH_RMS = 0.0005f

    /** Normal-voice energy threshold used in joint filtering (noisy 0.01f, quiet 0.006f). */
    const val NORMAL_VOICE_ENERGY_THRESHOLD = 0.008f

    // ---------- Speaker-verification thresholds (scene dependent) ----------

    /** Quiet-environment verification threshold; higher is stricter. */
    const val SPEAKER_THRESHOLD_QUIET = 0.50f

    /** Noisy-environment threshold, relaxed ~0.05-0.1f below the quiet one. */
    const val SPEAKER_THRESHOLD_NOISY = 0.43f

    /** Short-speech threshold, relaxed a further 0.03-0.05f. */
    const val SPEAKER_THRESHOLD_SHORT = 0.40f

    // ---------- Energy / ratio filters ----------

    /** Minimum energy for normal speech (short speech may use 0.01f instead). */
    const val MIN_NORMAL_VOICE_ENERGY = 0.03f

    /** Minimum VAD active ratio for normal speech (short 0.1f, noisy 0.15f). */
    const val MIN_NORMAL_VOICE_VAD_RATIO = 0.2f

    /** Energy below this counts as far-field (near-field 0.01f, far-field 0.02f). */
    const val MAX_FAR_FIELD_ENERGY = 0.015f

    /** Minimum peak-to-average ratio; filters flat background noise (0.4-0.6f suggested). */
    const val MIN_VALID_PEAK_AVG_RATIO = 0.5f

    /** Minimum ratio of continuous speech frames; filters sporadic noise (0.08-0.12f). */
    const val MIN_CONTINUOUS_FRAME_RATIO = 0.1f

    /** Peak-position ratio above which trailing bursts are rejected (0.9-0.98f). */
    const val MAX_PEAK_POSITION_RATIO = 0.95f

    /** Minimum count of speech frames; drops isolated frames (2-5 suggested). */
    const val MIN_EFFECTIVE_SPEECH_FRAMES = 3

    // ---------- Multi-speaker-dialogue filter ----------

    /** Peak/avg ratio above this suggests multiple speakers (2.0-3.0f). */
    const val MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO = 2.5f

    /** Peak/avg ratio below this also suggests multiple speakers (0.3-0.5f). */
    const val MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO = 0.4f

    /** Continuous-frame ratio at or below this suggests multiple speakers (0.2-0.4f). */
    const val MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO = 0.3f

    /** VAD ratio at or above this suggests multiple speakers (0.5-0.6f). */
    const val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f

    // ---------- Per-scene dynamic coefficients ----------

    /** Short-speech energy coefficients (noisy environments use the larger one). */
    const val SHORT_SPEECH_ENERGY_COEFF_QUIET = 1.5f
    const val SHORT_SPEECH_ENERGY_COEFF_NOISY = 2.0f

    /** Long-speech energy coefficients (higher than the short-speech pair). */
    const val LONG_SPEECH_ENERGY_COEFF_QUIET = 2.5f
    const val LONG_SPEECH_ENERGY_COEFF_NOISY = 3.5f

    /** VAD-ratio coefficients: long speech is held to a higher bar than short speech. */
    const val SHORT_SPEECH_VAD_COEFF = 0.05f
    const val LONG_SPEECH_VAD_COEFF = 0.10f

    /** Minimum scores per scene (1 = lenient default; strict scenes may use 2). */
    const val SHORT_SPEECH_MIN_SCORE = 1
    const val LONG_SPEECH_MIN_SCORE = 1

    // ---------- Weak-voice filter extras ----------

    /** Short/long cutoff (2000 ms) used by filterWeakVoice to classify speech length. */
    const val SHORT_LONG_SPEECH_CUTOFF_MS = 2000L

    /** Dynamic energy threshold for short speech in filterWeakVoice. */
    const val SHORT_SPEECH_ENERGY_THRESHOLD = 0.01f

    /** VAD-ratio threshold for short speech in filterWeakVoice. */
    const val SHORT_SPEECH_VAD_RATIO = 0.10f

    /** VAD-ratio threshold for noisy environments in filterWeakVoice. */
    const val NOISY_ENV_VAD_RATIO = 0.15f

    /** Energy value used by the pure-floor-noise check in filterWeakVoice. */
    const val PURE_NOISE_ENERGY_THRESHOLD = 0.005f

    /** Energy/baseline ratio used by the pure-floor-noise check in filterWeakVoice. */
    const val PURE_NOISE_BASELINE_RATIO = 1.2f

    // ---------- Speech-score extras ----------

    /** Duration (4000 ms) at which calculateSpeechScore awards 3 points. */
    const val LONG_SPEECH_SCORE_CUTOFF_MS = 4000L

    /** Duration (2500 ms) at which calculateSpeechScore awards 2 points. */
    const val MID_SPEECH_SCORE_CUTOFF_MS = 2500L
}

View File

@ -2,8 +2,7 @@ package com.zs.smarthuman.sherpa
import android.content.res.AssetManager import android.content.res.AssetManager
import com.blankj.utilcode.util.LogUtils import com.blankj.utilcode.util.LogUtils
import com.k2fsa.sherpa.onnx.SpeakerEmbeddingExtractor import com.k2fsa.sherpa.onnx.OnlineStream
import com.k2fsa.sherpa.onnx.SpeakerEmbeddingManager
import com.k2fsa.sherpa.onnx.SpeakerRecognition import com.k2fsa.sherpa.onnx.SpeakerRecognition
import kotlinx.coroutines.CoroutineScope import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.Dispatchers
@ -16,32 +15,45 @@ class VoiceController(
assetManager: AssetManager, assetManager: AssetManager,
private val onWakeup: () -> Unit, private val onWakeup: () -> Unit,
private val onFinalAudio: (FloatArray) -> Unit, private val onFinalAudio: (FloatArray) -> Unit,
idleTimeoutSeconds: Int = VoiceConfig.IDLE_TIMEOUT_DEFAULT_SECONDS, idleTimeoutSeconds: Int = 200,
maxRecordingSeconds: Int = VoiceConfig.MAX_RECORDING_DEFAULT_SECONDS, maxRecordingSeconds: Int = 10,
private val onStateChanged: ((VoiceState) -> Unit)? = null, private val onStateChanged: ((VoiceState) -> Unit)? = null,
private val stopBackendAudio: (() -> Unit)? = null, private val stopBackendAudio: (() -> Unit)? = null,
private val onTimeoutTip: OnTimeoutTip? = null private val onTimeoutTip: OnTimeoutTip? = null
) { ) {
// 依赖组件
private val wakeupManager = WakeupManager(assetManager, onWakeup)
private val vadManager = VadManager(
assetManager,
onSpeechStart = ::onVadStart,
onSpeechEnd = { avgEnergy, peakRms -> onVadEnd(avgEnergy, peakRms) }
)
private val stateManager = VoiceStateManager(
idleTimeoutSeconds = idleTimeoutSeconds,
maxRecordingSeconds = maxRecordingSeconds,
onStateChanged = onStateChanged,
onTimeoutTip = onTimeoutTip
)
// 音频缓存 companion object {
private val audioBuffer = mutableListOf<Float>() // 日志标签
private val preBuffer = ArrayDeque<Float>(VoiceConfig.PRE_BUFFER_SIZE) private const val TAG = "VoiceController"
private val envNoiseBuffer = ArrayDeque<Float>(VoiceConfig.BASELINE_WINDOW_SIZE) // 采样率
private const val SAMPLE_RATE = 16000
// 预缓存大小2秒
private const val PRE_BUFFER_SIZE = SAMPLE_RATE * 2
// 实时统计 // ========== 核心:分场景声纹阈值(极简版) ==========
private const val SPEAKER_THRESHOLD_QUIET = 0.50f // 安静环境
private const val SPEAKER_THRESHOLD_NOISY = 0.45f // 嘈杂环境(匹配你的真实相似度)
private const val SPEAKER_THRESHOLD_SHORT = 0.43f // 短语音(<1秒
// 短语音判定阈值
private const val SHORT_AUDIO_DURATION_MS = 1000L
private const val INVALID_RESET_DEBOUNCE_MS = 1500L
// 最小语音时长
private const val MIN_SPEECH_MS = 800L
private const val MIN_EFFECTIVE_VOICE_DURATION = 400L
// 噪音场景判定阈值
private const val NOISE_BASELINE_THRESHOLD = 0.01f
}
var state: VoiceState = VoiceState.WAIT_WAKEUP
private set(value) {
field = value
LogUtils.d(TAG, "➡ State = $value")
onStateChanged?.invoke(value)
}
// 实时能量与帧统计变量
private var realtimeEnergySum = 0f private var realtimeEnergySum = 0f
private var realtimeEnergyCount = 0 private var realtimeEnergyCount = 0
private var realtimePeakRms = 0f private var realtimePeakRms = 0f
@ -50,224 +62,359 @@ class VoiceController(
private var realtimeContinuousSpeechFrames = 0 private var realtimeContinuousSpeechFrames = 0
private var realtimeLastFrameIsSpeech = false private var realtimeLastFrameIsSpeech = false
private var isMultiPersonDialogueDetected = false private var isMultiPersonDialogueDetected = false
private var lastInvalidResetMs = 0L
// 声纹识别相关
private val speakerManagerLock = ReentrantLock() private val speakerManagerLock = ReentrantLock()
private lateinit var speakerExtractor: SpeakerEmbeddingExtractor
private lateinit var speakerManager: SpeakerEmbeddingManager // 环境噪音状态标记
private var isNoisyEnvironment = false
private val wakeupManager = WakeupManager(assetManager, onWakeup)
private val vadManager = VadManager(
assetManager,
onSpeechStart = { onVadStart() },
onSpeechEnd = { avgEnergy, peakRms -> onVadEnd(avgEnergy, peakRms) }
)
private val audioBuffer = mutableListOf<Float>()
private val preBuffer = ArrayDeque<Float>(PRE_BUFFER_SIZE)
private var recordingStartMs = 0L
private var waitSpeechFailStartMs = 0L
private var waitSpeechStartMs = 0L
private var vadStarted = false
private var inKwsObserve = false
private var kwsObserveStartMs = 0L
private val KWS_OBSERVE_MS = 500L
private var speechEnableAtMs = 0L
private val SPEECH_COOLDOWN_MS = 300L
private val idleTimeoutMs = idleTimeoutSeconds * 1000L
private val maxRecordingMs = maxRecordingSeconds * 1000L
// 分场景动态系数(保留原有逻辑)
private val BASELINE_WINDOW_SIZE = 50
private val envNoiseBuffer = ArrayDeque<Float>(BASELINE_WINDOW_SIZE)
private var currentEnvBaseline = 0.001f
// 分场景动态系数
private val BASELINE_QUIET_THRESHOLD = 0.005f
private val SHORT_SPEECH_ENERGY_COEFF_QUIET = 1.5f
private val SHORT_SPEECH_ENERGY_COEFF_NOISY = 2.0f
private val LONG_SPEECH_ENERGY_COEFF_QUIET = 2.5f
private val LONG_SPEECH_ENERGY_COEFF_NOISY = 3.5f
private val SHORT_SPEECH_VAD_COEFF = 0.05f
private val LONG_SPEECH_VAD_COEFF = 0.10f
private val SHORT_SPEECH_MIN_SCORE = 1
private val LONG_SPEECH_MIN_SCORE = 1
// 其他过滤参数
private val MAX_FAR_FIELD_ENERGY = 0.015f
private val MIN_VALID_PEAK_AVG_RATIO = 0.5f
private val MIN_CONTINUOUS_FRAME_RATIO = 0.1f
private val MAX_PEAK_POSITION_RATIO = 0.95f
private val MIN_EFFECTIVE_SPEECH_FRAMES = 3
private val SHORT_SPEECH_MIN = 500L
private val SHORT_SPEECH_MAX = 2000L
// 多人对话过滤配置
private val MULTI_DIALOGUE_MIN_DURATION = 2500L
private val MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO = 2.5f
private val MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO = 0.4f
private val MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO = 0.3f
private val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f
// 微弱人声过滤配置
private val MIN_VOICE_FRAME_RATIO = 0.08f
private val MIN_PEAK_ENERGY_RATIO = 1.5f
private val NORMAL_VOICE_ENERGY_THRESHOLD = 0.008f
private val MIN_CONTINUOUS_VOICE_FRAMES = 1
private val MIN_EFFECTIVE_SPEECH_RMS = 0.0005f
// 无效说话标记 + 超时类型
private var hasInvalidSpeech = false
private var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT
// 声纹验证相关
private val CURRENT_USER_ID = "current_wakeup_user"
private val ENABLE_STRICT_SPEAKER_VERIFY = true
init { init {
initSpeakerRecognition(assetManager)
}
/**
* 初始化声纹识别器
*/
private fun initSpeakerRecognition(assetManager: AssetManager) {
try { try {
SpeakerRecognition.initExtractor(assetManager) SpeakerRecognition.initExtractor(assetManager)
speakerExtractor = SpeakerRecognition.extractor LogUtils.d(TAG, "✅ 声纹识别器初始化成功")
speakerManager = SpeakerRecognition.manager
LogUtils.d(VoiceConfig.TAG, "✅ 声纹识别器初始化成功")
} catch (e: Exception) { } catch (e: Exception) {
LogUtils.e(VoiceConfig.TAG, "❌ 声纹识别器初始化失败", e) LogUtils.e(TAG, "❌ 声纹识别器初始化失败", e)
throw RuntimeException("声纹识别初始化失败", e) throw RuntimeException("声纹识别初始化失败", e)
} }
} }
/** /* ================= 音频入口 ================= */
* 音频入口对外API不变
*/
fun acceptAudio(samples: FloatArray) { fun acceptAudio(samples: FloatArray) {
// 缓存预缓冲 cachePreBuffer(samples)
VoiceUtils.cachePreBuffer(samples, preBuffer)
// 唤醒检测
wakeupManager.acceptAudio(samples) wakeupManager.acceptAudio(samples)
if (wakeupManager.consumeWakeupFlag()) { if (wakeupManager.consumeWakeupFlag()) {
val preBufferSnapshot = preBuffer.toFloatArray() val preBufferShot = preBuffer.toFloatArray()
handleWakeupEvent() handleWakeupEvent()
// 注册唤醒用户特征 // 注册唤醒用户特征
CoroutineScope(Dispatchers.IO).launch { CoroutineScope(Dispatchers.IO).launch {
VoiceUtils.registerWakeupUser( var stream: OnlineStream? = null
preBuffer = ArrayDeque(preBufferSnapshot.asList()), // 用快照创建新队列 runCatching {
extractor = speakerExtractor, val wakeupAudio = preBufferShot
manager = speakerManager if (wakeupAudio.isEmpty()) {
LogUtils.w(TAG, "❌ 唤醒音频缓存为空,无法注册用户特征")
return@launch
}
stream = SpeakerRecognition.extractor.createStream()
stream?.acceptWaveform(samples = wakeupAudio, sampleRate = SAMPLE_RATE)
stream?.inputFinished()
if (stream != null && SpeakerRecognition.extractor.isReady(stream)) {
val embedding = SpeakerRecognition.extractor.compute(stream)
speakerManagerLock.withLock {
SpeakerRecognition.manager.remove(CURRENT_USER_ID)
val embeddingList = mutableListOf(embedding)
val ok = SpeakerRecognition.manager.add(
name = CURRENT_USER_ID,
embedding = embeddingList.toTypedArray()
) )
if (ok) {
LogUtils.d(TAG, "✅ 注册当前唤醒用户特征成功 | 特征长度: ${embedding.size}")
} else {
LogUtils.w(TAG, "❌ 注册当前唤醒用户特征失败")
}
}
} else {
LogUtils.w(TAG, "❌ 唤醒音频Stream未就绪跳过用户注册")
}
}.onFailure {
LogUtils.e(TAG, "❌ 唤醒用户特征注册失败", it)
}.also {
stream?.release()
}
} }
return return
} }
val now = System.currentTimeMillis() val now = System.currentTimeMillis()
// 环境基线校准(仅等待唤醒状态) if (state == VoiceState.WAIT_WAKEUP) {
if (stateManager.state == VoiceState.WAIT_WAKEUP) { calibrateEnvBaseline(samples)
stateManager.currentEnvBaseline = VoiceUtils.calibrateEnvBaseline( isNoisyEnvironment = currentEnvBaseline >= NOISE_BASELINE_THRESHOLD
samples = samples, LogUtils.d(TAG, "📊 环境状态 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
vadManager = vadManager,
envNoiseBuffer = envNoiseBuffer,
currentEnvBaseline = stateManager.currentEnvBaseline
)
stateManager.isNoisyEnvironment = stateManager.currentEnvBaseline >= VoiceConfig.NOISE_BASELINE_THRESHOLD
// LogUtils.d(VoiceConfig.TAG, "📊 环境状态 | 基线: ${stateManager.currentEnvBaseline} | 嘈杂环境: ${stateManager.isNoisyEnvironment}")
} }
// 状态分发 when (state) {
when (stateManager.state) {
VoiceState.WAIT_WAKEUP, VoiceState.WAIT_WAKEUP,
VoiceState.PLAYING_PROMPT, VoiceState.PLAYING_PROMPT,
VoiceState.PLAYING_BACKEND, VoiceState.PLAYING_BACKEND,
VoiceState.UPLOADING -> return VoiceState.UPLOADING -> return
VoiceState.WAIT_SPEECH_COOLDOWN -> { VoiceState.WAIT_SPEECH_COOLDOWN -> {
stateManager.handleWaitSpeechCooldown(now) if (now >= speechEnableAtMs) {
waitSpeechFailStartMs = now
state = VoiceState.WAIT_SPEECH
waitSpeechStartMs = now
}
return return
} }
VoiceState.WAIT_SPEECH -> { VoiceState.WAIT_SPEECH -> {
// 检查超时(修复点:超时后主动调用 resetAll() 并传参) if ((waitSpeechStartMs > 0 && now - waitSpeechStartMs >= idleTimeoutMs) ||
if (stateManager.checkWaitSpeechTimeout(now)) { (waitSpeechFailStartMs > 0 && now - waitSpeechFailStartMs >= idleTimeoutMs)
stateManager.resetAll( ) {
resetRealtimeStats = ::resetRealtimeStats, currentTimeoutType = if (hasInvalidSpeech) {
audioBuffer = audioBuffer, TimeoutType.INVALID_SPEECH_TIMEOUT
preBuffer = preBuffer, } else {
vadManager = vadManager, TimeoutType.IDLE_TIMEOUT
wakeupManager = wakeupManager, }
envNoiseBuffer = envNoiseBuffer LogUtils.d(TAG, "⏱ WAIT_SPEECH timeout → WAIT_WAKEUP | 超时类型: $currentTimeoutType")
) onTimeoutTip?.invoke(currentTimeoutType)
resetAll()
return return
} }
// 唤醒观察期 if (inKwsObserve && now - kwsObserveStartMs < KWS_OBSERVE_MS) return
if (stateManager.inKwsObserve && now - stateManager.kwsObserveStartMs < VoiceConfig.KWS_OBSERVE_MS) return inKwsObserve = false
stateManager.inKwsObserve = false
// VAD检测
vadManager.accept(samples) vadManager.accept(samples)
} }
VoiceState.RECORDING -> { VoiceState.RECORDING -> {
// 音频缓存
audioBuffer.addAll(samples.asList()) audioBuffer.addAll(samples.asList())
vadManager.accept(samples) vadManager.accept(samples)
// 环境校准 calibrateEnvBaseline(samples)
stateManager.currentEnvBaseline = VoiceUtils.calibrateEnvBaseline( updateRealtimeEnergy(samples)
samples = samples, updateRealtimeFrameStats()
vadManager = vadManager, isNoisyEnvironment = currentEnvBaseline >= NOISE_BASELINE_THRESHOLD
envNoiseBuffer = envNoiseBuffer,
currentEnvBaseline = stateManager.currentEnvBaseline
)
stateManager.isNoisyEnvironment = stateManager.currentEnvBaseline >= VoiceConfig.NOISE_BASELINE_THRESHOLD
// 更新实时统计 if (checkMultiPersonDialogueRealtime(now)) {
val energyStats = VoiceUtils.updateRealtimeEnergy( LogUtils.w(TAG, "🚨 录音中识别出多人对话,提前终止")
samples = samples,
vadManager = vadManager,
isNoisyEnvironment = stateManager.isNoisyEnvironment,
currentEnvBaseline = stateManager.currentEnvBaseline,
realtimeEnergySum = realtimeEnergySum,
realtimeEnergyCount = realtimeEnergyCount,
realtimePeakRms = realtimePeakRms
)
realtimeEnergySum = energyStats.first
realtimeEnergyCount = energyStats.second
realtimePeakRms = energyStats.third
val frameStats = VoiceUtils.updateRealtimeFrameStats(vadManager)
realtimeTotalFrames = frameStats.totalFrames
realtimeSpeechFrames = frameStats.speechFrames
realtimeContinuousSpeechFrames = frameStats.continuousSpeechFrames
realtimeLastFrameIsSpeech = frameStats.lastFrameIsSpeech
// 多人对话检测
isMultiPersonDialogueDetected = VoiceUtils.checkMultiPersonDialogue(
now = now,
recordingStartMs = stateManager.recordingStartMs,
realtimeEnergySum = realtimeEnergySum,
realtimeEnergyCount = realtimeEnergyCount,
realtimePeakRms = realtimePeakRms,
realtimeSpeechFrames = realtimeSpeechFrames,
realtimeContinuousSpeechFrames = realtimeContinuousSpeechFrames,
vadManager = vadManager
)
if (isMultiPersonDialogueDetected) {
LogUtils.w(VoiceConfig.TAG, "🚨 录音中识别出多人对话,提前终止")
finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms) finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms)
return return
} }
// 最大录音时长检测 if (System.currentTimeMillis() - recordingStartMs > maxRecordingMs) {
if (System.currentTimeMillis() - stateManager.recordingStartMs > stateManager.maxRecordingMs) { LogUtils.w(TAG, "⏱ Max recording reached | 当前环境基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
LogUtils.w(VoiceConfig.TAG, "⏱ Max recording reached | 当前环境基线: ${stateManager.currentEnvBaseline} | 嘈杂环境: ${stateManager.isNoisyEnvironment}")
finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms) finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms)
} }
} }
} }
} }
/** /* ================= 实时能量更新 ================= */
* 处理唤醒事件 private fun updateRealtimeEnergy(samples: FloatArray) {
*/ val rms = vadManager.calcRms(samples)
val effectiveThreshold = if (isNoisyEnvironment) currentEnvBaseline * 1.8f else MIN_EFFECTIVE_SPEECH_RMS
if (rms >= effectiveThreshold) {
realtimeEnergySum += rms
realtimeEnergyCount++
realtimePeakRms = maxOf(realtimePeakRms, rms)
}
}
/* ================= 实时帧统计 ================= */
private fun updateRealtimeFrameStats() {
realtimeTotalFrames = vadManager.getTotalFrames()
realtimeSpeechFrames = vadManager.getSpeechFrames()
realtimeContinuousSpeechFrames = vadManager.getContinuousSpeechFrames()
val currentFrameIsSpeech = vadManager.isSpeechDetected()
if (currentFrameIsSpeech) {
realtimeContinuousSpeechFrames = if (realtimeLastFrameIsSpeech) realtimeContinuousSpeechFrames + 1 else 1
} else {
realtimeContinuousSpeechFrames = 0
}
realtimeLastFrameIsSpeech = currentFrameIsSpeech
}
/* ================= 多人对话检测 ================= */
private fun checkMultiPersonDialogueRealtime(now: Long): Boolean {
val duration = now - recordingStartMs
if (duration < MULTI_DIALOGUE_MIN_DURATION) return false
val avgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else 0f
val peakAvgRatio = if (avgEnergy > 0) realtimePeakRms / avgEnergy else 0f
val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f
val vadRatio = vadManager.activeSpeechRatio()
isMultiPersonDialogueDetected = duration >= MULTI_DIALOGUE_MIN_DURATION &&
peakAvgRatio in MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO..MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO &&
continuousRatio <= MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO &&
vadRatio >= MULTI_DIALOGUE_MIN_VAD_RATIO
return isMultiPersonDialogueDetected
}
/* ================= 环境基线校准 ================= */
private fun calibrateEnvBaseline(samples: FloatArray) {
val rms = vadManager.calcRms(samples)
val validRms = if (rms < currentEnvBaseline + 0.002f) rms else currentEnvBaseline
if (rms < 0.015f) {
if (envNoiseBuffer.size >= BASELINE_WINDOW_SIZE) {
envNoiseBuffer.removeFirst()
}
envNoiseBuffer.addLast(validRms)
currentEnvBaseline = envNoiseBuffer.maxOrNull() ?: 0.001f
}
}
/* ================= 唤醒处理 ================= */
private fun handleWakeupEvent() { private fun handleWakeupEvent() {
if (stateManager.state == VoiceState.UPLOADING) return if (state == VoiceState.UPLOADING) return
stopBackendAudio?.invoke() stopBackendAudio?.invoke()
stateManager.enterWakeup(interrupt = true, resetRealtimeStats = ::resetRealtimeStats) enterWakeup(interrupt = true)
preBuffer.clear() }
private fun enterWakeup(interrupt: Boolean) {
waitSpeechFailStartMs = System.currentTimeMillis()
waitSpeechStartMs = System.currentTimeMillis()
hasInvalidSpeech = false
currentTimeoutType = TimeoutType.IDLE_TIMEOUT
if (interrupt) {
audioBuffer.clear()
vadManager.reset()
vadStarted = false
resetRealtimeStats()
}
inKwsObserve = true
kwsObserveStartMs = System.currentTimeMillis()
onWakeup() onWakeup()
LogUtils.d(TAG, "🔔 唤醒成功 | 环境基线: $currentEnvBaseline")
} }
/**
* VAD开始回调
*/
private fun onVadStart() { private fun onVadStart() {
stateManager.onVadStart( if (state != VoiceState.WAIT_SPEECH) return
audioBuffer = audioBuffer, LogUtils.d(TAG, "🎤 REAL VAD START | 环境基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
preBuffer = preBuffer, vadStarted = true
resetRealtimeStats = ::resetRealtimeStats recordingStartMs = System.currentTimeMillis()
) audioBuffer.clear()
audioBuffer.addAll(preBuffer)
resetRealtimeStats()
state = VoiceState.RECORDING
} }
/**
* VAD结束回调
*/
private fun onVadEnd(avgEnergy: Float, peakRms: Float) { private fun onVadEnd(avgEnergy: Float, peakRms: Float) {
if (stateManager.state != VoiceState.RECORDING) return if (state != VoiceState.RECORDING) return
LogUtils.d(VoiceConfig.TAG, "🧠 VAD END | 环境基线: ${stateManager.currentEnvBaseline} | 嘈杂环境: ${stateManager.isNoisyEnvironment}") LogUtils.d(TAG, "🧠 VAD END | 环境基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
val realAvgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else avgEnergy val realAvgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else avgEnergy
val realPeakRms = if (realtimePeakRms > 0) realtimePeakRms else peakRms val realPeakRms = if (realtimePeakRms > 0) realtimePeakRms else peakRms
finishSentence(realAvgEnergy, realPeakRms) finishSentence(realAvgEnergy, realPeakRms)
} }
/** /* ================= 微弱人声过滤 ================= */
* 结束录音处理 private fun filterWeakVoice(duration: Long, avgEnergy: Float, peakRms: Float): Boolean {
*/ if (duration < MIN_EFFECTIVE_VOICE_DURATION) {
LogUtils.w(TAG, "❌ 微弱人声过滤:时长${duration}ms < ${MIN_EFFECTIVE_VOICE_DURATION}ms")
return true
}
val voiceFrameRatio = if (realtimeTotalFrames > 0) realtimeSpeechFrames.toFloat() / realtimeTotalFrames else 0f
if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && voiceFrameRatio < MIN_VOICE_FRAME_RATIO) {
LogUtils.w(TAG, "❌ 微弱人声过滤:帧占比${voiceFrameRatio} < ${MIN_VOICE_FRAME_RATIO}")
return true
}
val peakBaselineRatio = peakRms / currentEnvBaseline
if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && peakBaselineRatio < MIN_PEAK_ENERGY_RATIO) {
LogUtils.w(TAG, "❌ 微弱人声过滤:峰值/基线${peakBaselineRatio} < ${MIN_PEAK_ENERGY_RATIO}")
return true
}
if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && realtimeContinuousSpeechFrames < MIN_CONTINUOUS_VOICE_FRAMES) {
LogUtils.w(TAG, "❌ 微弱人声过滤:连续帧${realtimeContinuousSpeechFrames} < ${MIN_CONTINUOUS_VOICE_FRAMES}")
return true
}
val energyBaselineRatio = avgEnergy / currentEnvBaseline
if (avgEnergy < 0.005f && energyBaselineRatio < 1.2f) {
LogUtils.w(TAG, "❌ 微弱人声过滤:能量/基线${energyBaselineRatio} < 1.2")
return true
}
return false
}
/* ================= 结束录音 ================= */
private fun finishSentence(avgEnergy: Float = 0f, peakRms: Float = 0f) { private fun finishSentence(avgEnergy: Float = 0f, peakRms: Float = 0f) {
val now = System.currentTimeMillis() val now = System.currentTimeMillis()
val duration = now - stateManager.recordingStartMs val duration = now - recordingStartMs
// 基础过滤:语音过短 if (!vadStarted || duration < MIN_SPEECH_MS) {
if (!stateManager.vadStarted || duration < VoiceConfig.MIN_SPEECH_MS) { LogUtils.d(TAG, "❌ 语音过短: $duration ms | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
LogUtils.d(VoiceConfig.TAG, "❌ 语音过短: $duration ms | 基线: ${stateManager.currentEnvBaseline} | 嘈杂环境: ${stateManager.isNoisyEnvironment}") hasInvalidSpeech = true
stateManager.hasInvalidSpeech = true resetToWaitSpeech()
stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager)
return return
} }
// 微弱人声过滤 if (filterWeakVoice(duration, avgEnergy, peakRms)) {
if (VoiceUtils.filterWeakVoice( hasInvalidSpeech = true
duration = duration, resetToWaitSpeech()
avgEnergy = avgEnergy,
peakRms = peakRms,
currentEnvBaseline = stateManager.currentEnvBaseline,
realtimeTotalFrames = realtimeTotalFrames,
realtimeSpeechFrames = realtimeSpeechFrames,
realtimeContinuousSpeechFrames = realtimeContinuousSpeechFrames
)
) {
stateManager.hasInvalidSpeech = true
stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager)
return return
} }
@ -275,95 +422,104 @@ class VoiceController(
val vadRatio = vadManager.activeSpeechRatio() val vadRatio = vadManager.activeSpeechRatio()
val peakAvgRatio = if (avgEnergy > 0f) peakRms / avgEnergy else 0f val peakAvgRatio = if (avgEnergy > 0f) peakRms / avgEnergy else 0f
LogUtils.d(VoiceConfig.TAG, "📊 录音信息 | 时长: $duration ms | 能量: $avgEnergy | 峰均比: $peakAvgRatio | 基线: ${stateManager.currentEnvBaseline} | 嘈杂环境: ${stateManager.isNoisyEnvironment}") LogUtils.d(TAG, "📊 录音信息 | 时长: $duration ms | 能量: $avgEnergy | 峰均比: $peakAvgRatio | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
LogUtils.d(VoiceConfig.TAG, "📊 实时帧统计 | 总帧: $realtimeTotalFrames | 语音帧: $realtimeSpeechFrames | 连续语音帧: $realtimeContinuousSpeechFrames") LogUtils.d(TAG, "📊 实时帧统计 | 总帧: $realtimeTotalFrames | 语音帧: $realtimeSpeechFrames | 连续语音帧: $realtimeContinuousSpeechFrames")
// 多人对话过滤
if (isMultiPersonDialogueDetected) { if (isMultiPersonDialogueDetected) {
LogUtils.w(VoiceConfig.TAG, "❌ 过滤多人对话垃圾语音 | 时长: $duration ms") LogUtils.w(TAG, "❌ 过滤多人对话垃圾语音 | 时长: $duration ms")
stateManager.hasInvalidSpeech = true hasInvalidSpeech = true
stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager) resetToWaitSpeech()
return return
} }
// 声纹验证 // 声纹验证(核心极简版)
if (VoiceConfig.ENABLE_STRICT_SPEAKER_VERIFY) { if (ENABLE_STRICT_SPEAKER_VERIFY) {
val isCurrentUser = VoiceUtils.verifySpeaker( val isCurrentUser = verifySpeaker(audio)
audio = audio,
isNoisyEnvironment = stateManager.isNoisyEnvironment,
extractor = speakerExtractor,
manager = speakerManager
)
if (!isCurrentUser) { if (!isCurrentUser) {
LogUtils.w(VoiceConfig.TAG, "❌ 非当前唤醒用户,拒绝语音 | 录音时长: $duration ms | 嘈杂环境: ${stateManager.isNoisyEnvironment}") LogUtils.w(TAG, "❌ 非当前唤醒用户,拒绝语音 | 录音时长: $duration ms | 嘈杂环境: $isNoisyEnvironment")
stateManager.hasInvalidSpeech = true hasInvalidSpeech = true
stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager) resetToWaitSpeech()
return return
} }
LogUtils.d(VoiceConfig.TAG, "✅ 当前用户语音,继续处理 | 录音时长: $duration ms | 嘈杂环境: ${stateManager.isNoisyEnvironment}") LogUtils.d(TAG, "✅ 当前用户语音,继续处理 | 录音时长: $duration ms | 嘈杂环境: $isNoisyEnvironment")
} }
// 远场过滤 // 远场过滤
val isFarField = avgEnergy < VoiceConfig.MAX_FAR_FIELD_ENERGY val isFarField = avgEnergy < MAX_FAR_FIELD_ENERGY
val isInvalidPeakRatio = peakAvgRatio < VoiceConfig.MIN_VALID_PEAK_AVG_RATIO val isInvalidPeakRatio = peakAvgRatio < MIN_VALID_PEAK_AVG_RATIO
if (isFarField && isInvalidPeakRatio) { if (isFarField && isInvalidPeakRatio) {
LogUtils.w(VoiceConfig.TAG, "❌ 远场/无效语音过滤 | 能量: $avgEnergy < ${VoiceConfig.MAX_FAR_FIELD_ENERGY}") LogUtils.w(TAG, "❌ 远场/无效语音过滤 | 能量: $avgEnergy < $MAX_FAR_FIELD_ENERGY")
stateManager.hasInvalidSpeech = true hasInvalidSpeech = true
stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager) resetToWaitSpeech()
return return
} }
// 非连续判定 // 非连续判定
val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f
val peakPositionRatio = vadManager.getPeakPositionRatio() val peakPositionRatio = vadManager.getPeakPositionRatio()
val isDiscontinuous = continuousRatio < VoiceConfig.MIN_CONTINUOUS_FRAME_RATIO && val isDiscontinuous = continuousRatio < MIN_CONTINUOUS_FRAME_RATIO &&
realtimeSpeechFrames < VoiceConfig.MIN_EFFECTIVE_SPEECH_FRAMES && realtimeSpeechFrames < MIN_EFFECTIVE_SPEECH_FRAMES &&
peakPositionRatio > VoiceConfig.MAX_PEAK_POSITION_RATIO peakPositionRatio > MAX_PEAK_POSITION_RATIO
if (isDiscontinuous) { if (isDiscontinuous) {
LogUtils.w(VoiceConfig.TAG, "❌ 非连续杂音过滤 | 连续占比: $continuousRatio < ${VoiceConfig.MIN_CONTINUOUS_FRAME_RATIO}") LogUtils.w(TAG, "❌ 非连续杂音过滤 | 连续占比: $continuousRatio < $MIN_CONTINUOUS_FRAME_RATIO")
stateManager.hasInvalidSpeech = true hasInvalidSpeech = true
stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager) resetToWaitSpeech()
return return
} }
// 分场景阈值过滤 // 分场景阈值过滤
val thresholdConfig = VoiceUtils.getThresholdConfig(duration, stateManager.currentEnvBaseline) val isQuietEnv = currentEnvBaseline < BASELINE_QUIET_THRESHOLD
val thresholdConfig = when {
duration in SHORT_SPEECH_MIN..SHORT_SPEECH_MAX -> {
val coeff = if (isQuietEnv) SHORT_SPEECH_ENERGY_COEFF_QUIET else SHORT_SPEECH_ENERGY_COEFF_NOISY
val energyThreshold = currentEnvBaseline * coeff
ThresholdConfig(energyThreshold, SHORT_SPEECH_VAD_COEFF, SHORT_SPEECH_MIN_SCORE, "短语音")
}
else -> {
val coeff = if (isQuietEnv) LONG_SPEECH_ENERGY_COEFF_QUIET else LONG_SPEECH_ENERGY_COEFF_NOISY
val energyThreshold = currentEnvBaseline * coeff
ThresholdConfig(energyThreshold, LONG_SPEECH_VAD_COEFF, LONG_SPEECH_MIN_SCORE, "长语音")
}
}
val energyPass = avgEnergy >= thresholdConfig.energyThreshold val energyPass = avgEnergy >= thresholdConfig.energyThreshold
val vadRatioPass = vadRatio >= thresholdConfig.vadRatioThreshold val vadRatioPass = vadRatio >= thresholdConfig.vadRatioThreshold
if (!energyPass || !vadRatioPass) { if (!energyPass || !vadRatioPass) {
LogUtils.w(VoiceConfig.TAG, "❌ 低能量语音阈值过滤 | 能量: $avgEnergy < ${thresholdConfig.energyThreshold} | 占比: $vadRatio < ${thresholdConfig.vadRatioThreshold} | 场景: ${thresholdConfig.scene}") LogUtils.w(TAG, "❌ 低能量语音阈值过滤 | 能量: $avgEnergy < ${thresholdConfig.energyThreshold} | 占比: $vadRatio < ${thresholdConfig.vadRatioThreshold} | 场景: ${thresholdConfig.scene}")
stateManager.hasInvalidSpeech = true hasInvalidSpeech = true
stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager) resetToWaitSpeech()
return return
} }
// 评分判定 // 评分判定
val score = VoiceUtils.calculateSpeechScore( var score = 0
duration = duration, score += when {
avgEnergy = avgEnergy, duration >= 4000 -> 3
continuousRatio = continuousRatio, duration >= 2500 -> 2
thresholdConfig = thresholdConfig else -> 1
) }
score += if (avgEnergy >= thresholdConfig.energyThreshold) 1 else 0
score += if (continuousRatio >= MIN_CONTINUOUS_FRAME_RATIO) 1 else 0
val pass = score >= thresholdConfig.minScore val pass = score >= thresholdConfig.minScore
if (!pass) { if (!pass) {
LogUtils.w(VoiceConfig.TAG, "❌ 评分不足过滤 | 总分: $score < ${thresholdConfig.minScore} | 场景: ${thresholdConfig.scene}") LogUtils.w(TAG, "❌ 评分不足过滤 | 总分: $score < ${thresholdConfig.minScore} | 场景: ${thresholdConfig.scene}")
stateManager.hasInvalidSpeech = true hasInvalidSpeech = true
stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager) resetToWaitSpeech()
return return
} }
// 最终通过 // 最终通过
audioBuffer.clear() audioBuffer.clear()
stateManager.state = VoiceState.UPLOADING state = VoiceState.UPLOADING
onFinalAudio(audio) onFinalAudio(audio)
resetRealtimeStats() resetRealtimeStats()
stateManager.hasInvalidSpeech = false hasInvalidSpeech = false
LogUtils.i(VoiceConfig.TAG, "✅ 语音通过 | 时长: $duration ms | 能量: $avgEnergy | 场景: ${thresholdConfig.scene} | 嘈杂环境: ${stateManager.isNoisyEnvironment}") LogUtils.i(TAG, "✅ 语音通过 | 时长: $duration ms | 能量: $avgEnergy | 场景: ${thresholdConfig.scene} | 嘈杂环境: $isNoisyEnvironment")
} }
/** /* ================= 重置实时统计 ================= */
* 重置实时统计
*/
private fun resetRealtimeStats() { private fun resetRealtimeStats() {
realtimeEnergySum = 0f realtimeEnergySum = 0f
realtimeEnergyCount = 0 realtimeEnergyCount = 0
@ -375,34 +531,95 @@ class VoiceController(
isMultiPersonDialogueDetected = false isMultiPersonDialogueDetected = false
} }
// ================= 对外API完全不变 ================= /* ================= 播放/上传回调 ================= */
fun onPlayStartPrompt() = stateManager.onPlayStartPrompt() fun onPlayStartPrompt() {
fun onPlayEndPrompt() = stateManager.onPlayEndPrompt() LogUtils.d(TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
fun onPlayStartBackend() = stateManager.onPlayStartBackend() state = VoiceState.PLAYING_PROMPT
fun onPlayEndBackend() = stateManager.onPlayEndBackend() }
fun onUploadFinished(success: Boolean) = stateManager.onUploadFinished(success)
fun onPlayEndPrompt() {
speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
LogUtils.d(TAG, "🎵 提示音结束 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
state = VoiceState.WAIT_SPEECH_COOLDOWN
}
fun onPlayStartBackend() {
if (state != VoiceState.UPLOADING) {
LogUtils.w(TAG, "🎶 非上传完成状态,禁止切换到 PLAYING_BACKEND | 当前状态: $state")
return
}
LogUtils.d(TAG, "🎶 开始播放后台音频 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
state = VoiceState.PLAYING_BACKEND
}
fun onPlayEndBackend() {
speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
LogUtils.d(TAG, "🎶 后台音频结束 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
state = VoiceState.WAIT_SPEECH_COOLDOWN
}
fun onUploadFinished(success: Boolean) {
if (state != VoiceState.UPLOADING) return
LogUtils.d(TAG, "📤 上传完成 | 成功: $success | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
if (!success) {
speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
state = VoiceState.WAIT_SPEECH_COOLDOWN
}
}
private fun resetToWaitSpeech() {
LogUtils.d(TAG, "🔄 重置到等待说话 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment | 已标记无效说话: $hasInvalidSpeech")
val now = System.currentTimeMillis()
if (now - lastInvalidResetMs < INVALID_RESET_DEBOUNCE_MS) {
LogUtils.d(TAG, "🛡 防抖1.5秒内重复无效语音,跳过重置")
return
}
lastInvalidResetMs = now
audioBuffer.clear()
vadManager.reset()
vadStarted = false
resetRealtimeStats()
state = VoiceState.WAIT_SPEECH
if (waitSpeechFailStartMs == 0L) waitSpeechFailStartMs = System.currentTimeMillis()
}
private fun resetAll() {
LogUtils.d(TAG, "🔄 重置所有状态 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment | 本次超时类型: $currentTimeoutType")
audioBuffer.clear()
preBuffer.clear()
vadManager.reset()
wakeupManager.reset()
vadStarted = false
waitSpeechStartMs = 0L
waitSpeechFailStartMs = 0L
envNoiseBuffer.clear()
currentEnvBaseline = 0.001f
isNoisyEnvironment = false
resetRealtimeStats()
hasInvalidSpeech = false
currentTimeoutType = TimeoutType.IDLE_TIMEOUT
state = VoiceState.WAIT_WAKEUP
}
/**
* 资源释放
*/
fun release() { fun release() {
LogUtils.d(VoiceConfig.TAG, "🔌 释放资源 | 最终基线: ${stateManager.currentEnvBaseline} | 嘈杂环境: ${stateManager.isNoisyEnvironment}") LogUtils.d(TAG, "🔌 释放资源 | 最终基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
wakeupManager.release() wakeupManager.release()
vadManager.reset() vadManager.reset()
envNoiseBuffer.clear() envNoiseBuffer.clear()
resetRealtimeStats() resetRealtimeStats()
stateManager.hasInvalidSpeech = false hasInvalidSpeech = false
stateManager.currentTimeoutType = TimeoutType.IDLE_TIMEOUT currentTimeoutType = TimeoutType.IDLE_TIMEOUT
stateManager.isNoisyEnvironment = false isNoisyEnvironment = false
runCatching { runCatching {
speakerExtractor.release() SpeakerRecognition.extractor.release()
speakerManagerLock.withLock { speakerManagerLock.withLock {
speakerManager.release() SpeakerRecognition.manager.release()
} }
LogUtils.d(VoiceConfig.TAG, "✅ 声纹识别器资源已释放") LogUtils.d(TAG, "✅ 声纹识别器资源已释放")
}.onFailure { }.onFailure {
LogUtils.e(VoiceConfig.TAG, "❌ 释放声纹识别器资源失败", it) LogUtils.e(TAG, "❌ 释放声纹识别器资源失败", it)
} }
} }
@ -410,7 +627,85 @@ class VoiceController(
runCatching { runCatching {
release() release()
}.onFailure { }.onFailure {
LogUtils.e(VoiceConfig.TAG, "❌ finalize 释放资源失败", it) LogUtils.e(TAG, "❌ finalize 释放资源失败", it)
}
}
private fun cachePreBuffer(samples: FloatArray) {
for (s in samples) {
preBuffer.addLast(s)
if (preBuffer.size > PRE_BUFFER_SIZE) preBuffer.removeFirst()
}
}
// 阈值配置数据类
private data class ThresholdConfig(
val energyThreshold: Float,
val vadRatioThreshold: Float,
val minScore: Int,
val scene: String
)
/* ================= 核心:极简版声纹验证 ================= */
private fun verifySpeaker(audio: FloatArray): Boolean {
if (audio.isEmpty()) {
LogUtils.w(TAG, "❌ 待验证音频为空,声纹验证失败")
return false
}
// 1. 裁剪音频:只保留本次录音的有效部分(解决时长不匹配问题)
val audioDurationMs = (audio.size.toFloat() / SAMPLE_RATE * 1000).toLong()
// 只保留最后 N 毫秒的音频N = 实际录音时长),避免缓存旧音频
val validAudio = if (audioDurationMs > 0) {
val validSampleCount = (audioDurationMs * SAMPLE_RATE / 1000).toInt()
if (validSampleCount < audio.size) {
audio.copyOfRange(audio.size - validSampleCount, audio.size)
} else {
audio
}
} else {
audio
}
// 2. 分场景选阈值(无容错,只调阈值)
val finalThreshold = when {
audioDurationMs < SHORT_AUDIO_DURATION_MS -> SPEAKER_THRESHOLD_SHORT
isNoisyEnvironment -> SPEAKER_THRESHOLD_NOISY
else -> SPEAKER_THRESHOLD_QUIET
}
var stream: OnlineStream? = null
return try {
stream = SpeakerRecognition.extractor.createStream()
stream.acceptWaveform(samples = validAudio, sampleRate = SAMPLE_RATE) // 用裁剪后的音频验证
stream.inputFinished()
if (!SpeakerRecognition.extractor.isReady(stream)) {
LogUtils.w(TAG, "❌ 音频Stream未就绪验证失败")
return false
}
val embedding = SpeakerRecognition.extractor.compute(stream)
// 3. 纯验证逻辑:过就过,不过就拒绝
speakerManagerLock.withLock {
val verifyPass = SpeakerRecognition.manager.verify(
name = CURRENT_USER_ID,
embedding = embedding,
threshold = finalThreshold
)
// 打印关键信息(补充裁剪后时长)
LogUtils.d(TAG, "📊 声纹验证 | 阈值: $finalThreshold | 通过: $verifyPass | 嘈杂环境: $isNoisyEnvironment | 原始时长: ${audioDurationMs}ms | 验证时长: ${(validAudio.size.toFloat()/SAMPLE_RATE*1000).toLong()}ms")
// 无任何容错:验证结果就是最终结果
return verifyPass
}
} catch (e: Exception) {
LogUtils.e(TAG, "❌ 声纹验证异常,拒绝", e)
return false
} finally {
stream?.release()
} }
} }
} }

View File

@ -1,211 +0,0 @@
package com.zs.smarthuman.sherpa
import com.blankj.utilcode.util.LogUtils
import java.util.ArrayDeque
/**
 * State manager for the voice controller.
 *
 * Owns the [VoiceState] machine plus all timing bookkeeping (idle timeouts,
 * cooldowns, debounce timestamps) and environment flags. It deliberately holds
 * no audio buffers itself — callers pass in the buffers/managers they own so
 * this class only decides *when* to reset, not *what* to allocate.
 *
 * Thread-safety: fields are plain vars with no synchronization; the caller is
 * expected to drive this class from a single audio-processing thread.
 * NOTE(review): that single-thread assumption is not enforced anywhere visible
 * here — confirm against the caller.
 *
 * @param idleTimeoutSeconds  idle timeout in seconds (converted to ms below)
 * @param maxRecordingSeconds max recording duration in seconds
 * @param onStateChanged      optional observer invoked on every state transition
 * @param onTimeoutTip        optional callback fired when a wait-speech timeout is detected
 */
class VoiceStateManager(
    idleTimeoutSeconds: Int,
    maxRecordingSeconds: Int,
    private val onStateChanged: ((VoiceState) -> Unit)?,
    private val onTimeoutTip: OnTimeoutTip?
) {
    // Current state; every assignment is logged and forwarded to the observer.
    var state: VoiceState = VoiceState.WAIT_WAKEUP
        set(value) {
            field = value
            LogUtils.d(VoiceConfig.TAG, "➡ State = $value")
            onStateChanged?.invoke(value)
        }
    // --- Timeout bookkeeping (all milliseconds) ---
    val idleTimeoutMs = idleTimeoutSeconds * 1000L
    val maxRecordingMs = maxRecordingSeconds * 1000L
    // Start of the current "waiting after a failed/invalid attempt" window; 0 = inactive.
    var waitSpeechFailStartMs = 0L
    // Start of the current WAIT_SPEECH window; 0 = inactive.
    var waitSpeechStartMs = 0L
    // Earliest wall-clock time at which speech input is accepted again (cooldown gate).
    var speechEnableAtMs = 0L
    // Last time resetToWaitSpeech actually ran; used for debounce.
    var lastInvalidResetMs = 0L
    // --- Invalid-speech marker + which timeout type fired last ---
    var hasInvalidSpeech = false
    var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT
    // --- Keyword-spotting (wakeup) observation window flags ---
    var inKwsObserve = false
    var kwsObserveStartMs = 0L
    // --- Environment status (baseline RMS of ambient noise, noisy-env flag) ---
    var isNoisyEnvironment = false
    var currentEnvBaseline = 0.001f
    // --- Recording status ---
    var recordingStartMs = 0L
    var vadStarted = false
    /**
     * Checks whether the WAIT_SPEECH phase has timed out.
     *
     * A timeout occurs when either the normal wait window or the
     * post-invalid-speech window has been open longer than [idleTimeoutMs].
     * On timeout this sets [currentTimeoutType] (INVALID_SPEECH_TIMEOUT when an
     * invalid utterance was seen, IDLE_TIMEOUT otherwise) and fires [onTimeoutTip].
     *
     * Fix note (from original author): this method intentionally does NOT call
     * resetAll() itself; it returns true and the caller performs the reset, so
     * the manager has no dependency on externally owned buffers.
     *
     * @param now current wall-clock time in ms
     * @return true if a timeout fired (caller must reset), false otherwise
     */
    fun checkWaitSpeechTimeout(now: Long): Boolean {
        val isTimeout = (waitSpeechStartMs > 0 && now - waitSpeechStartMs >= idleTimeoutMs) ||
                (waitSpeechFailStartMs > 0 && now - waitSpeechFailStartMs >= idleTimeoutMs)
        if (isTimeout) {
            currentTimeoutType = if (hasInvalidSpeech) {
                TimeoutType.INVALID_SPEECH_TIMEOUT
            } else {
                TimeoutType.IDLE_TIMEOUT
            }
            LogUtils.d(VoiceConfig.TAG, "⏱ WAIT_SPEECH timeout → WAIT_WAKEUP | 超时类型: $currentTimeoutType")
            onTimeoutTip?.invoke(currentTimeoutType)
            // Deliberately no internal resetAll(): report the timeout, let the caller reset.
            return true
        }
        return false
    }
    /**
     * Advances from the cooldown phase into WAIT_SPEECH once the cooldown gate
     * ([speechEnableAtMs]) has passed. Both wait timers are (re)armed to [now].
     *
     * @return true if the transition happened, false if still cooling down
     */
    fun handleWaitSpeechCooldown(now: Long): Boolean {
        if (now >= speechEnableAtMs) {
            waitSpeechFailStartMs = now
            state = VoiceState.WAIT_SPEECH
            waitSpeechStartMs = now
            return true
        }
        return false
    }
    /**
     * Enters the post-wakeup state: arms both wait timers, clears the
     * invalid-speech marker, and opens the KWS observation window.
     *
     * @param interrupt          when true, also resets realtime stats and the VAD-started flag
     *                           (the wakeup interrupted something in progress)
     * @param resetRealtimeStats caller-supplied hook that clears realtime energy/frame stats
     */
    fun enterWakeup(interrupt: Boolean, resetRealtimeStats: () -> Unit) {
        val now = System.currentTimeMillis()
        waitSpeechFailStartMs = now
        waitSpeechStartMs = now
        hasInvalidSpeech = false
        currentTimeoutType = TimeoutType.IDLE_TIMEOUT
        if (interrupt) {
            resetRealtimeStats()
            vadStarted = false
        }
        inKwsObserve = true
        kwsObserveStartMs = now
    }
    /**
     * Resets back to WAIT_SPEECH after an invalid utterance was filtered out.
     *
     * Debounced: repeated invalid utterances within
     * [VoiceConfig.INVALID_RESET_DEBOUNCE_MS] are ignored (log only) so the
     * state machine isn't thrashed by back-to-back rejects. Clears the audio
     * buffer and VAD, then re-arms the fail-wait timer if it wasn't running.
     *
     * @param resetRealtimeStats hook that clears realtime energy/frame stats
     * @param audioBuffer        caller-owned recording buffer to clear
     * @param vadManager         caller-owned VAD to reset
     */
    fun resetToWaitSpeech(resetRealtimeStats: () -> Unit, audioBuffer: MutableList<Float>, vadManager: VadManager) {
        LogUtils.d(VoiceConfig.TAG, "🔄 重置到等待说话 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment | 已标记无效说话: $hasInvalidSpeech")
        val now = System.currentTimeMillis()
        if (now - lastInvalidResetMs < VoiceConfig.INVALID_RESET_DEBOUNCE_MS) {
            LogUtils.d(VoiceConfig.TAG, "🛡 防抖:1.5秒内重复无效语音,跳过重置")
            return
        }
        lastInvalidResetMs = now
        audioBuffer.clear()
        vadManager.reset()
        vadStarted = false
        resetRealtimeStats()
        state = VoiceState.WAIT_SPEECH
        if (waitSpeechFailStartMs == 0L) waitSpeechFailStartMs = System.currentTimeMillis()
    }
    /**
     * Full reset back to WAIT_WAKEUP: clears every caller-owned buffer/manager
     * passed in, zeroes all timers, and restores environment defaults
     * (baseline 0.001f, quiet). Used after timeouts and session teardown.
     */
    fun resetAll(
        resetRealtimeStats: () -> Unit,
        audioBuffer: MutableList<Float>,
        preBuffer: ArrayDeque<Float>,
        vadManager: VadManager,
        wakeupManager: WakeupManager,
        envNoiseBuffer: ArrayDeque<Float>
    ) {
        LogUtils.d(VoiceConfig.TAG, "🔄 重置所有状态 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment | 本次超时类型: $currentTimeoutType")
        audioBuffer.clear()
        preBuffer.clear()
        vadManager.reset()
        wakeupManager.reset()
        vadStarted = false
        waitSpeechStartMs = 0L
        waitSpeechFailStartMs = 0L
        envNoiseBuffer.clear()
        currentEnvBaseline = 0.001f
        isNoisyEnvironment = false
        resetRealtimeStats()
        hasInvalidSpeech = false
        currentTimeoutType = TimeoutType.IDLE_TIMEOUT
        state = VoiceState.WAIT_WAKEUP
    }
    /** Prompt-sound playback started: enter PLAYING_PROMPT. */
    fun onPlayStartPrompt() {
        LogUtils.d(VoiceConfig.TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
        state = VoiceState.PLAYING_PROMPT
    }
    /** Prompt-sound playback ended: arm the speech cooldown gate and enter WAIT_SPEECH_COOLDOWN. */
    fun onPlayEndPrompt() {
        speechEnableAtMs = System.currentTimeMillis() + VoiceConfig.SPEECH_COOLDOWN_MS
        LogUtils.d(VoiceConfig.TAG, "🎵 提示音结束 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
        state = VoiceState.WAIT_SPEECH_COOLDOWN
    }
    /**
     * Backend-audio playback started. Only legal from UPLOADING — any other
     * state is rejected with a warning so playback can't hijack the machine.
     */
    fun onPlayStartBackend() {
        if (state != VoiceState.UPLOADING) {
            LogUtils.w(VoiceConfig.TAG, "🎶 非上传完成状态,禁止切换到 PLAYING_BACKEND | 当前状态: $state")
            return
        }
        LogUtils.d(VoiceConfig.TAG, "🎶 开始播放后台音频 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
        state = VoiceState.PLAYING_BACKEND
    }
    /** Backend-audio playback ended: arm the cooldown gate and enter WAIT_SPEECH_COOLDOWN. */
    fun onPlayEndBackend() {
        speechEnableAtMs = System.currentTimeMillis() + VoiceConfig.SPEECH_COOLDOWN_MS
        LogUtils.d(VoiceConfig.TAG, "🎶 后台音频结束 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
        state = VoiceState.WAIT_SPEECH_COOLDOWN
    }
    /**
     * Upload finished. Ignored unless currently UPLOADING. On failure the
     * machine cools down back toward WAIT_SPEECH; on success it stays in
     * UPLOADING (the subsequent onPlayStartBackend() performs the transition).
     */
    fun onUploadFinished(success: Boolean) {
        if (state != VoiceState.UPLOADING) return
        LogUtils.d(VoiceConfig.TAG, "📤 上传完成 | 成功: $success | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
        if (!success) {
            speechEnableAtMs = System.currentTimeMillis() + VoiceConfig.SPEECH_COOLDOWN_MS
            state = VoiceState.WAIT_SPEECH_COOLDOWN
        }
    }
    /**
     * VAD detected speech onset. Only honored in WAIT_SPEECH. Seeds the
     * recording buffer with the pre-wakeup ring buffer (so the utterance's
     * beginning isn't lost), resets realtime stats, and enters RECORDING.
     */
    fun onVadStart(audioBuffer: MutableList<Float>, preBuffer: ArrayDeque<Float>, resetRealtimeStats: () -> Unit) {
        if (state != VoiceState.WAIT_SPEECH) return
        LogUtils.d(VoiceConfig.TAG, "🎤 REAL VAD START | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
        vadStarted = true
        recordingStartMs = System.currentTimeMillis()
        audioBuffer.clear()
        audioBuffer.addAll(preBuffer)
        resetRealtimeStats()
        state = VoiceState.RECORDING
    }
}

View File

@ -1,355 +0,0 @@
package com.zs.smarthuman.sherpa
import com.blankj.utilcode.util.LogUtils
import com.k2fsa.sherpa.onnx.OnlineStream
import com.k2fsa.sherpa.onnx.SpeakerEmbeddingExtractor
import com.k2fsa.sherpa.onnx.SpeakerEmbeddingManager
import java.util.ArrayDeque
import java.util.concurrent.locks.ReentrantLock
import kotlin.concurrent.withLock
/**
 * Stateless helpers shared by the voice pipeline: environment-baseline
 * calibration, realtime energy/frame statistics, weak-voice and
 * multi-speaker filtering, and speaker (voiceprint) verification/registration.
 *
 * All functions take their mutable state as parameters and return new values,
 * except the speaker functions which guard the shared embedding manager with
 * [speakerManagerLock].
 */
object VoiceUtils {
    // Serializes access to the SpeakerEmbeddingManager across verify/register calls.
    private val speakerManagerLock = ReentrantLock()
    /**
     * Calibrates the ambient-noise baseline from one frame of samples.
     *
     * Only quiet frames (RMS < 0.015f) contribute; the contributed value is
     * additionally clamped to at most baseline+0.002f so a sudden spike can't
     * jump the baseline. The returned baseline is the MAX over the sliding
     * window — i.e. a conservative (high) estimate of the noise floor.
     *
     * @return the updated baseline, or the unchanged [currentEnvBaseline] when
     *         the frame was too loud to be treated as ambient noise
     */
    fun calibrateEnvBaseline(
        samples: FloatArray,
        vadManager: VadManager,
        envNoiseBuffer: ArrayDeque<Float>,
        currentEnvBaseline: Float
    ): Float {
        val rms = vadManager.calcRms(samples)
        val validRms = if (rms < currentEnvBaseline + 0.002f) rms else currentEnvBaseline
        if (rms < 0.015f) {
            if (envNoiseBuffer.size >= VoiceConfig.BASELINE_WINDOW_SIZE) {
                envNoiseBuffer.removeFirst()
            }
            envNoiseBuffer.addLast(validRms)
            return envNoiseBuffer.maxOrNull() ?: 0.001f
        }
        return currentEnvBaseline
    }
    /**
     * Folds one frame into the realtime energy statistics.
     *
     * Frames below the effective threshold (1.8× baseline in noisy
     * environments, otherwise the configured minimum speech RMS) are ignored
     * so silence doesn't dilute the average.
     *
     * @return Triple(newEnergySum, newEnergyCount, newPeakRms)
     */
    fun updateRealtimeEnergy(
        samples: FloatArray,
        vadManager: VadManager,
        isNoisyEnvironment: Boolean,
        currentEnvBaseline: Float,
        realtimeEnergySum: Float,
        realtimeEnergyCount: Int,
        realtimePeakRms: Float
    ): Triple<Float, Int, Float> {
        val rms = vadManager.calcRms(samples)
        val effectiveThreshold = if (isNoisyEnvironment) currentEnvBaseline * 1.8f else VoiceConfig.MIN_EFFECTIVE_SPEECH_RMS
        var newSum = realtimeEnergySum
        var newCount = realtimeEnergyCount
        var newPeak = realtimePeakRms
        if (rms >= effectiveThreshold) {
            newSum += rms
            newCount++
            newPeak = maxOf(newPeak, rms)
        }
        return Triple(newSum, newCount, newPeak)
    }
    /**
     * Snapshots the VAD's frame counters into a [FrameStats].
     *
     * The continuous-speech count is incremented while the current frame is
     * speech and reset to 0 otherwise.
     * NOTE(review): `if (vadManager.getContinuousSpeechFrames() > 0)` re-reads
     * the same value already captured in `continuousSpeechFrames`, so the two
     * branches collapse to "continuousSpeechFrames + 1 or 1" — the inner
     * condition looks redundant; confirm whether resetting to 1 on the first
     * speech frame was the intent.
     */
    fun updateRealtimeFrameStats(vadManager: VadManager): FrameStats {
        val totalFrames = vadManager.getTotalFrames()
        val speechFrames = vadManager.getSpeechFrames()
        val continuousSpeechFrames = vadManager.getContinuousSpeechFrames()
        val currentFrameIsSpeech = vadManager.isSpeechDetected()
        val newContinuousFrames = if (currentFrameIsSpeech) {
            if (vadManager.getContinuousSpeechFrames() > 0) continuousSpeechFrames + 1 else 1
        } else {
            0
        }
        return FrameStats(
            totalFrames = totalFrames,
            speechFrames = speechFrames,
            continuousSpeechFrames = newContinuousFrames,
            lastFrameIsSpeech = currentFrameIsSpeech
        )
    }
    /**
     * Realtime multi-speaker (overlapping dialogue) heuristic.
     *
     * Flags a recording as multi-person when, after a minimum duration, the
     * peak/average energy ratio sits in a mid band (neither flat noise nor a
     * single dominant voice), continuity is low, and overall VAD activity is
     * high — i.e. lots of speech but fragmented between voices.
     * NOTE(review): the `duration >= MULTI_DIALOGUE_MIN_DURATION` term in the
     * return expression is always true here (the early return above already
     * guarantees it).
     */
    fun checkMultiPersonDialogue(
        now: Long,
        recordingStartMs: Long,
        realtimeEnergySum: Float,
        realtimeEnergyCount: Int,
        realtimePeakRms: Float,
        realtimeSpeechFrames: Int,
        realtimeContinuousSpeechFrames: Int,
        vadManager: VadManager
    ): Boolean {
        val duration = now - recordingStartMs
        if (duration < VoiceConfig.MULTI_DIALOGUE_MIN_DURATION) return false
        val avgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else 0f
        val peakAvgRatio = if (avgEnergy > 0) realtimePeakRms / avgEnergy else 0f
        val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f
        val vadRatio = vadManager.activeSpeechRatio()
        return duration >= VoiceConfig.MULTI_DIALOGUE_MIN_DURATION &&
                peakAvgRatio in VoiceConfig.MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO..VoiceConfig.MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO &&
                continuousRatio <= VoiceConfig.MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO &&
                vadRatio >= VoiceConfig.MULTI_DIALOGUE_MIN_VAD_RATIO
    }
    /**
     * Weak-voice filter (trimmed version: core layers only, to avoid
     * over-filtering normal speech).
     *
     * Layers, in order: (1) minimum duration, (2) duration-dependent energy
     * threshold, (3)+(4) joint VAD-ratio/energy filter with scene-dependent
     * ratio threshold, (5) pure-background-noise filter via energy/baseline
     * ratio. A commented-out peak/baseline layer is kept for reference.
     *
     * @return true when the audio should be REJECTED as weak/noise,
     *         false when it passes as normal speech
     */
    fun filterWeakVoice(
        duration: Long,
        avgEnergy: Float,
        peakRms: Float,
        currentEnvBaseline: Float,
        realtimeTotalFrames: Int,
        realtimeSpeechFrames: Int,
        realtimeContinuousSpeechFrames: Int
    ): Boolean {
        // 1. Base duration filter (required: drops extremely short clicks/noise).
        if (duration < VoiceConfig.MIN_EFFECTIVE_VOICE_DURATION) {
            LogUtils.w("${VoiceConfig.TAG}", "❌ 微弱人声过滤:时长${duration}ms < ${VoiceConfig.MIN_EFFECTIVE_VOICE_DURATION}ms")
            return true
        }
        // 2. Dynamic energy threshold (core: relaxed threshold for short speech).
        val dynamicEnergyThreshold = if (duration < VoiceConfig.SHORT_LONG_SPEECH_CUTOFF_MS)
            VoiceConfig.SHORT_SPEECH_ENERGY_THRESHOLD
        else
            VoiceConfig.MIN_NORMAL_VOICE_ENERGY
        if (avgEnergy < dynamicEnergyThreshold) {
            LogUtils.w("${VoiceConfig.TAG}", "❌ 微弱人声过滤:平均能量${avgEnergy} < ${if (duration < 2000) "短语音能量阈值${dynamicEnergyThreshold}" else "正常语音能量阈值${dynamicEnergyThreshold}"}")
            return true
        }
        // 3. VAD speech-frame ratio (input to the joint filter below).
        val voiceFrameRatio = if (realtimeTotalFrames > 0) realtimeSpeechFrames.toFloat() / realtimeTotalFrames else 0f
        // 4. Joint VAD-ratio + energy filter (scene-dependent ratio threshold avoids
        //    single-dimension misjudgment). Note: rejects only when BOTH are low.
        val dynamicVadRatioThreshold = when {
            duration < VoiceConfig.SHORT_LONG_SPEECH_CUTOFF_MS -> VoiceConfig.SHORT_SPEECH_VAD_RATIO
            currentEnvBaseline >= VoiceConfig.NOISE_BASELINE_THRESHOLD -> VoiceConfig.NOISY_ENV_VAD_RATIO
            else -> VoiceConfig.MIN_NORMAL_VOICE_VAD_RATIO
        }
        if (voiceFrameRatio < dynamicVadRatioThreshold && avgEnergy < VoiceConfig.NORMAL_VOICE_ENERGY_THRESHOLD) {
            LogUtils.w("${VoiceConfig.TAG}", "❌ 微弱人声过滤:语音帧占比${voiceFrameRatio} < ${dynamicVadRatioThreshold} | 平均能量${avgEnergy}")
            return true
        }
        // 5. Pure background-noise filter (required: drops speechless ambient noise).
        val energyBaselineRatio = avgEnergy / currentEnvBaseline
        if (avgEnergy < VoiceConfig.PURE_NOISE_ENERGY_THRESHOLD && energyBaselineRatio < VoiceConfig.PURE_NOISE_BASELINE_RATIO) {
            LogUtils.w("${VoiceConfig.TAG}", "❌ 微弱人声过滤:能量/基线${energyBaselineRatio} < 1.2(纯底噪)")
            return true
        }
        // (Optional, kept disabled) peak/baseline filter: only useful against flat
        // background audio; enable per deployment scenario.
        // val peakBaselineRatio = peakRms / currentEnvBaseline
        // if (avgEnergy < VoiceConfig.NORMAL_VOICE_ENERGY_THRESHOLD && peakBaselineRatio < VoiceConfig.MIN_PEAK_ENERGY_RATIO) {
        //     LogUtils.w("${VoiceConfig.TAG}", "❌ 微弱人声过滤:峰值/基线${peakBaselineRatio} < ${VoiceConfig.MIN_PEAK_ENERGY_RATIO}")
        //     return true
        // }
        // Passed every core layer: treat as normal speech.
        LogUtils.d("${VoiceConfig.TAG}", "✅ 正常语音通过微弱人声过滤 | 时长${duration}ms | 能量${avgEnergy} | VAD占比${voiceFrameRatio} | 基线${currentEnvBaseline}")
        return false
    }
    /**
     * Speaker (voiceprint) verification against the registered wakeup user.
     *
     * Steps: trims the audio to its nominal duration (guards against stale
     * cached samples — NOTE(review): as written, `validSampleCount` is derived
     * from the array's own length, so the trim is effectively a no-op; confirm
     * the intended trim source), picks a scene-dependent threshold
     * (short audio / noisy / quiet), computes an embedding and verifies it.
     * No fallback: any exception or unready stream means rejection.
     *
     * @return true only when the embedding passes the chosen threshold
     */
    fun verifySpeaker(
        audio: FloatArray,
        isNoisyEnvironment: Boolean,
        extractor: SpeakerEmbeddingExtractor,
        manager: SpeakerEmbeddingManager
    ): Boolean {
        if (audio.isEmpty()) {
            LogUtils.w("${VoiceConfig.TAG}", "❌ 待验证音频为空,声纹验证失败")
            return false
        }
        // Trim: keep only the most recent N ms of audio (N = nominal duration).
        val audioDurationMs = (audio.size.toFloat() / VoiceConfig.SAMPLE_RATE * 1000).toLong()
        val validAudio = if (audioDurationMs > 0) {
            val validSampleCount = (audioDurationMs * VoiceConfig.SAMPLE_RATE / 1000).toInt()
            if (validSampleCount < audio.size) {
                audio.copyOfRange(audio.size - validSampleCount, audio.size)
            } else {
                audio
            }
        } else {
            audio
        }
        // Scene-dependent threshold selection (no tolerance logic, threshold only).
        val finalThreshold = when {
            audioDurationMs < VoiceConfig.SHORT_AUDIO_DURATION_MS -> VoiceConfig.SPEAKER_THRESHOLD_SHORT
            isNoisyEnvironment -> VoiceConfig.SPEAKER_THRESHOLD_NOISY
            else -> VoiceConfig.SPEAKER_THRESHOLD_QUIET
        }
        var stream: OnlineStream? = null
        return try {
            stream = extractor.createStream()
            stream.acceptWaveform(samples = validAudio, sampleRate = VoiceConfig.SAMPLE_RATE)
            stream.inputFinished()
            if (!extractor.isReady(stream)) {
                LogUtils.w("${VoiceConfig.TAG}", "❌ 音频Stream未就绪,验证失败")
                return false
            }
            val embedding = extractor.compute(stream)
            speakerManagerLock.withLock {
                val verifyPass = manager.verify(
                    name = VoiceConfig.CURRENT_USER_ID,
                    embedding = embedding,
                    threshold = finalThreshold
                )
                LogUtils.d("${VoiceConfig.TAG}", "📊 声纹验证 | 阈值: $finalThreshold | 通过: $verifyPass | 嘈杂环境: $isNoisyEnvironment | 原始时长: ${audioDurationMs}ms | 验证时长: ${(validAudio.size.toFloat()/VoiceConfig.SAMPLE_RATE*1000).toLong()}ms")
                return verifyPass
            }
        } catch (e: Exception) {
            LogUtils.e("${VoiceConfig.TAG}", "❌ 声纹验证异常,拒绝", e)
            return false
        } finally {
            stream?.release()
        }
    }
    /**
     * Registers the wakeup user's voiceprint from the pre-wakeup ring buffer.
     *
     * Removes any previous registration under [VoiceConfig.CURRENT_USER_ID]
     * before adding the new embedding; all failures are logged, never thrown.
     * The stream is released in `also` regardless of outcome.
     */
    fun registerWakeupUser(
        preBuffer: ArrayDeque<Float>,
        extractor: SpeakerEmbeddingExtractor,
        manager: SpeakerEmbeddingManager
    ) {
        var stream: OnlineStream? = null
        runCatching {
            val wakeupAudio = preBuffer.toFloatArray()
            if (wakeupAudio.isEmpty()) {
                LogUtils.w("${VoiceConfig.TAG}", "❌ 唤醒音频缓存为空,无法注册用户特征")
                return
            }
            stream = extractor.createStream()
            stream?.acceptWaveform(samples = wakeupAudio, sampleRate = VoiceConfig.SAMPLE_RATE)
            stream?.inputFinished()
            if (stream != null && extractor.isReady(stream)) {
                val embedding = extractor.compute(stream)
                speakerManagerLock.withLock {
                    manager.remove(VoiceConfig.CURRENT_USER_ID)
                    val embeddingList = mutableListOf(embedding)
                    val ok = manager.add(
                        name = VoiceConfig.CURRENT_USER_ID,
                        embedding = embeddingList.toTypedArray()
                    )
                    if (ok) {
                        LogUtils.d("${VoiceConfig.TAG}", "✅ 注册当前唤醒用户特征成功 | 特征长度: ${embedding.size}")
                    } else {
                        LogUtils.w("${VoiceConfig.TAG}", "❌ 注册当前唤醒用户特征失败")
                    }
                }
            } else {
                LogUtils.w("${VoiceConfig.TAG}", "❌ 唤醒音频Stream未就绪,跳过用户注册")
            }
        }.onFailure {
            LogUtils.e("${VoiceConfig.TAG}", "❌ 唤醒用户特征注册失败", it)
        }.also {
            stream?.release()
        }
    }
    /**
     * Appends samples to the pre-wakeup ring buffer, evicting the oldest
     * samples beyond [VoiceConfig.PRE_BUFFER_SIZE] (a ~2 s window).
     */
    fun cachePreBuffer(samples: FloatArray, preBuffer: ArrayDeque<Float>) {
        for (s in samples) {
            preBuffer.addLast(s)
            if (preBuffer.size > VoiceConfig.PRE_BUFFER_SIZE) preBuffer.removeFirst()
        }
    }
    /** Snapshot of VAD frame counters for one point in time. */
    data class FrameStats(
        val totalFrames: Int,
        val speechFrames: Int,
        val continuousSpeechFrames: Int,
        val lastFrameIsSpeech: Boolean
    )
    /** Scene-dependent acceptance thresholds resolved by [getThresholdConfig]. */
    data class ThresholdConfig(
        val energyThreshold: Float,
        val vadRatioThreshold: Float,
        val minScore: Int,
        val scene: String
    )
    /**
     * Resolves the scene ("短语音" short vs "长语音" long) threshold set from the
     * utterance duration and the quiet/noisy environment baseline.
     */
    fun getThresholdConfig(duration: Long, currentEnvBaseline: Float): ThresholdConfig {
        val isQuietEnv = currentEnvBaseline < VoiceConfig.BASELINE_QUIET_THRESHOLD
        return if (duration in VoiceConfig.SHORT_SPEECH_MIN..VoiceConfig.SHORT_SPEECH_MAX) {
            val coeff = if (isQuietEnv) VoiceConfig.SHORT_SPEECH_ENERGY_COEFF_QUIET else VoiceConfig.SHORT_SPEECH_ENERGY_COEFF_NOISY
            ThresholdConfig(
                energyThreshold = currentEnvBaseline * coeff,
                vadRatioThreshold = VoiceConfig.SHORT_SPEECH_VAD_COEFF,
                minScore = VoiceConfig.SHORT_SPEECH_MIN_SCORE,
                scene = "短语音"
            )
        } else {
            val coeff = if (isQuietEnv) VoiceConfig.LONG_SPEECH_ENERGY_COEFF_QUIET else VoiceConfig.LONG_SPEECH_ENERGY_COEFF_NOISY
            ThresholdConfig(
                energyThreshold = currentEnvBaseline * coeff,
                vadRatioThreshold = VoiceConfig.LONG_SPEECH_VAD_COEFF,
                minScore = VoiceConfig.LONG_SPEECH_MIN_SCORE,
                scene = "长语音"
            )
        }
    }
    /**
     * Scores an utterance: 1–3 points for duration plus 1 point each for
     * meeting the energy threshold and the continuity-ratio threshold
     * (max score 5). Compared against [ThresholdConfig.minScore] by the caller.
     */
    fun calculateSpeechScore(
        duration: Long,
        avgEnergy: Float,
        continuousRatio: Float,
        thresholdConfig: ThresholdConfig
    ): Int {
        var score = 0
        score += when {
            duration >= VoiceConfig.LONG_SPEECH_SCORE_CUTOFF_MS -> 3
            duration >= VoiceConfig.MID_SPEECH_SCORE_CUTOFF_MS -> 2
            else -> 1
        }
        score += if (avgEnergy >= thresholdConfig.energyThreshold) 1 else 0
        score += if (continuousRatio >= VoiceConfig.MIN_CONTINUOUS_FRAME_RATIO) 1 else 0
        return score
    }
}

View File

@ -37,6 +37,7 @@ import com.zs.smarthuman.BuildConfig
import com.zs.smarthuman.R import com.zs.smarthuman.R
import com.zs.smarthuman.base.BaseActivity import com.zs.smarthuman.base.BaseActivity
import com.zs.smarthuman.base.BaseViewModelActivity import com.zs.smarthuman.base.BaseViewModelActivity
import com.zs.smarthuman.bean.AudioDTO
import com.zs.smarthuman.bean.NetworkStatusEventMsg import com.zs.smarthuman.bean.NetworkStatusEventMsg
import com.zs.smarthuman.bean.UserInfoResp import com.zs.smarthuman.bean.UserInfoResp
import com.zs.smarthuman.bean.VersionUpdateResp import com.zs.smarthuman.bean.VersionUpdateResp
@ -54,10 +55,12 @@ import com.zs.smarthuman.utils.AudioDebugUtil
import com.zs.smarthuman.utils.AudioPcmUtil import com.zs.smarthuman.utils.AudioPcmUtil
import com.zs.smarthuman.utils.DangerousUtils import com.zs.smarthuman.utils.DangerousUtils
import com.zs.smarthuman.utils.LogFileUtils import com.zs.smarthuman.utils.LogFileUtils
import com.zs.smarthuman.utils.PcmStreamPlayer
import com.zs.smarthuman.utils.UnityPlayerHolder import com.zs.smarthuman.utils.UnityPlayerHolder
import com.zs.smarthuman.utils.ViewSlideAnimator import com.zs.smarthuman.utils.ViewSlideAnimator
import com.zs.smarthuman.utils.VoiceStreamPlayer
import com.zs.smarthuman.viewmodel.MainViewModel import com.zs.smarthuman.viewmodel.MainViewModel
import com.zs.smarthuman.widget.VersionUpdateDialog import com.zs.smarthuman.widget.VersionUpdateDialog
import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.Dispatchers
@ -149,7 +152,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
when (it) { when (it) {
is ApiResult.Error -> { is ApiResult.Error -> {
Toaster.showShort("上传失败") Toaster.showShort("上传失败")
voiceController?.onUploadFinished(false) voiceController?.onUploadFinished(true)
} }
is ApiResult.Success<String> -> { is ApiResult.Success<String> -> {
@ -243,19 +246,26 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
} }
) )
} }
private val voicePlayer = VoiceStreamPlayer().apply {
onPlayStart = { id ->
LogUtils.d("🎵 开始播放 audioId=$id")
startPlayTimeoutJob?.cancel()
voiceController?.onPlayStartBackend()
}
onPlayEnd = { id ->
LogUtils.d("✅ 播放结束 audioId=$id")
voiceController?.onPlayEndBackend()
}
}
override fun receivedIMMsg(msg: SingleMessage) { override fun receivedIMMsg(msg: SingleMessage) {
when (msg.msgContentType) { when (msg.msgContentType) {
MessageContentType.RECEIVE_VOICE_STREAM.msgContentType -> { MessageContentType.RECEIVE_VOICE_STREAM.msgContentType -> {
lifecycleScope.launch(Dispatchers.IO) { lifecycleScope.launch(Dispatchers.IO) {
val audioDTO = GsonUtils.fromJson(msg.content, AudioDTO::class.java)
// LogFileUtils.logToFile2(this@MainActivity,msg.content) voicePlayer.onAudioDTO(audioDTO)
UnityPlayerHolder.getInstance()
.startTalking(msg.content)
// loadLocalJsonAndPlay()
} }
} }
} }
@ -540,6 +550,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
UnityPlayerHolder.getInstance().release() UnityPlayerHolder.getInstance().release()
UnityPlayerHolder.getInstance().clearCache() UnityPlayerHolder.getInstance().clearCache()
releaseIM() releaseIM()
voicePlayer.release()
} }

View File

@ -0,0 +1,95 @@
package com.zs.smarthuman.utils
import android.media.AudioAttributes
import android.media.AudioFormat
import android.media.AudioManager
import android.media.AudioTrack
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.SupervisorJob
import kotlinx.coroutines.cancel
import kotlinx.coroutines.delay
import kotlinx.coroutines.isActive
import kotlinx.coroutines.launch
import java.util.ArrayDeque
import java.util.Queue
import java.util.concurrent.locks.ReentrantLock
// ====================== PCM streaming player ======================
/**
 * Streams raw 16-bit mono PCM to an [AudioTrack].
 *
 * A single IO coroutine owns the AudioTrack: it drains [bufferQueue], writing
 * real PCM when available and short silent frames otherwise (keeps the track
 * fed and timing smooth). [onPlayEnd] fires once after the loop exits.
 *
 * Fixes vs. the previous revision:
 *  - `queueLock.run { ... }` never acquired the lock (`run` just invokes the
 *    lambda with the lock as receiver), so the non-thread-safe ArrayDeque was
 *    accessed unsynchronized from producer and player threads. All queue access
 *    now goes through [withQueueLock] (explicit lock/unlock in try/finally).
 *  - On scope cancellation the loop previously aborted inside `delay(5)` and
 *    skipped stop/release, leaking the AudioTrack. Cleanup now lives in a
 *    `finally` block so the track is always released and [onPlayEnd] always fires.
 *
 * @param sampleRate PCM sample rate in Hz (16-bit, mono assumed)
 */
class PcmStreamPlayer(
    private val sampleRate: Int
) {
    /** Invoked exactly once, after the playback loop terminates and the track is released. */
    var onPlayEnd: (() -> Unit)? = null

    private val scope = CoroutineScope(SupervisorJob() + Dispatchers.IO)
    private val bufferQueue: Queue<ByteArray> = ArrayDeque()
    private val queueLock = ReentrantLock()
    private var audioTrack: AudioTrack? = null

    @Volatile
    private var playing = true

    /** Runs [block] while actually holding [queueLock] (lock/unlock, not `run`). */
    private inline fun <T> withQueueLock(block: () -> T): T {
        queueLock.lock()
        try {
            return block()
        } finally {
            queueLock.unlock()
        }
    }

    init {
        scope.launch {
            audioTrack = AudioTrack(
                AudioAttributes.Builder()
                    .setUsage(AudioAttributes.USAGE_MEDIA)
                    .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
                    .build(),
                AudioFormat.Builder()
                    .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
                    .setSampleRate(sampleRate)
                    .setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
                    .build(),
                AudioTrack.getMinBufferSize(
                    sampleRate,
                    AudioFormat.CHANNEL_OUT_MONO,
                    AudioFormat.ENCODING_PCM_16BIT
                ),
                AudioTrack.MODE_STREAM,
                AudioManager.AUDIO_SESSION_ID_GENERATE
            )
            try {
                audioTrack?.play()
                val silent = ByteArray(2048)
                while (isActive && playing) {
                    val pcm = withQueueLock { bufferQueue.poll() }
                    if (pcm != null) {
                        audioTrack?.write(pcm, 0, pcm.size)
                    } else {
                        // Queue empty: feed silence so the track keeps running, then back off briefly.
                        audioTrack?.write(silent, 0, silent.size)
                        delay(5)
                    }
                }
            } finally {
                // Always runs — including on cancellation — so the AudioTrack is never leaked.
                runCatching { audioTrack?.stop() }
                audioTrack?.release()
                audioTrack = null
                onPlayEnd?.invoke()
            }
        }
    }

    /** Enqueues one PCM chunk for playback (thread-safe). */
    fun pushPcm(pcm: ByteArray) {
        withQueueLock { bufferQueue.add(pcm) }
    }

    /** Drops all PCM not yet written to the track (thread-safe). */
    fun clearQueue() {
        withQueueLock { bufferQueue.clear() }
    }

    /** @return true when no queued PCM remains (track-internal buffer may still be playing). */
    fun queueEmpty(): Boolean = withQueueLock { bufferQueue.isEmpty() }

    /** Stops the playback loop, clears the queue, and cancels the owning scope. */
    fun release() {
        playing = false
        withQueueLock { bufferQueue.clear() }
        scope.cancel()
    }
}

View File

@ -0,0 +1,101 @@
package com.zs.smarthuman.utils
import android.util.Base64
import com.zs.smarthuman.bean.AudioDTO
import com.zs.smarthuman.bean.LmChatDTO
import kotlinx.coroutines.*
import java.util.*
// ====================== Voice stream player ======================
/**
 * Reassembles base64 PCM slices arriving out of order (keyed by `sortId`) into
 * a contiguous stream and feeds them to a [PcmStreamPlayer].
 *
 * Playback starts once ~250 ms of contiguous audio is buffered ([PREBUFFER_BYTES]);
 * [onPlayStart]/[onPlayEnd] bracket each `audioId`.
 *
 * Fixes vs. the previous revision:
 *  - PCM consumed from `sliceBuffer` before the pre-buffer threshold was
 *    reached was discarded instead of played (the head of every audio was
 *    silently dropped). It is now parked in [pendingPcm] and flushed when
 *    playback triggers.
 *  - An audio whose total size never reaches [PREBUFFER_BYTES] previously
 *    never triggered playback at all; it now plays once the final slice arrives.
 *  - [handleSlice] is @Synchronized: onAudioDTO launches a coroutine per DTO
 *    on Dispatchers.IO, so slice handling could otherwise race on the shared
 *    TreeMap/counters.
 *
 * @param sampleRate PCM sample rate in Hz (16-bit mono assumed: 2 bytes/sample)
 */
class VoiceStreamPlayer(
    private val sampleRate: Int = 24000
) {
    var onPlayStart: ((audioId: Int) -> Unit)? = null
    var onPlayEnd: ((audioId: Int) -> Unit)? = null

    private val scope = CoroutineScope(SupervisorJob() + Dispatchers.IO)
    private var currentAudioId: Int? = null
    private val pcmPlayer: PcmStreamPlayer by lazy { PcmStreamPlayer(sampleRate) }

    // Out-of-order slices waiting for their predecessors, keyed by sortId.
    private val sliceBuffer = TreeMap<Int, ByteArray>()
    // In-order PCM decoded before playback triggered (would otherwise be lost).
    private val pendingPcm = mutableListOf<ByteArray>()
    private var nextSortId = 1
    private var inputFinished = false
    private var firstPlayTriggered = false
    private var bufferedBytes = 0
    private var playEndLaunched = false

    // 250 ms of 16-bit mono audio at [sampleRate].
    private val PREBUFFER_BYTES = (sampleRate * 2 * 250 / 1000)

    /** Entry point: dispatches every slice of the DTO to [handleSlice] off the caller thread. */
    fun onAudioDTO(dto: AudioDTO) {
        scope.launch {
            dto.items.forEach { slice ->
                handleSlice(slice)
            }
        }
    }

    /** Decodes one slice, tracks end-of-input, and attempts a flush. Synchronized: see class doc. */
    @Synchronized
    private fun handleSlice(slice: LmChatDTO) {
        if (currentAudioId != slice.id) {
            startNewAudio(slice.id)
        }
        slice.audioData.takeIf { it.isNotBlank() }?.let {
            sliceBuffer[slice.sortId] = Base64.decode(it, Base64.DEFAULT)
        }
        if (slice.isFinal) inputFinished = true
        flushBufferIfPossible()
    }

    /** Resets all per-audio state when a slice for a new audioId arrives. */
    private fun startNewAudio(audioId: Int) {
        currentAudioId = audioId
        sliceBuffer.clear()
        pendingPcm.clear()
        nextSortId = 1
        bufferedBytes = 0
        firstPlayTriggered = false
        playEndLaunched = false
        inputFinished = false
        pcmPlayer.clearQueue()
    }

    /** Marks playback started, notifies the listener, and pushes all parked PCM. */
    private fun triggerPlay() {
        firstPlayTriggered = true
        currentAudioId?.let { onPlayStart?.invoke(it) }
        pendingPcm.forEach { pcmPlayer.pushPcm(it) }
        pendingPcm.clear()
    }

    /**
     * Consumes contiguous slices starting at [nextSortId]. Before the trigger,
     * PCM accumulates in [pendingPcm]; after it, PCM streams straight to the
     * player. Once input is finished and everything contiguous is consumed, a
     * single watcher coroutine waits for the player queue to drain and fires
     * [onPlayEnd].
     */
    private fun flushBufferIfPossible() {
        while (true) {
            val pcm = sliceBuffer.remove(nextSortId) ?: break
            nextSortId++
            bufferedBytes += pcm.size
            if (firstPlayTriggered) {
                pcmPlayer.pushPcm(pcm)
            } else {
                pendingPcm.add(pcm)
                if (bufferedBytes >= PREBUFFER_BYTES) triggerPlay()
            }
        }
        // Short audio: all input arrived but the pre-buffer threshold was never met — play anyway.
        if (inputFinished && sliceBuffer.isEmpty() && !firstPlayTriggered && pendingPcm.isNotEmpty()) {
            triggerPlay()
        }
        // Finish-up: launch the end-of-playback watcher exactly once.
        if (inputFinished && sliceBuffer.isEmpty() && firstPlayTriggered && !playEndLaunched) {
            playEndLaunched = true
            val finishedId = currentAudioId ?: return
            scope.launch {
                while (!pcmPlayer.queueEmpty()) {
                    delay(10)
                }
                onPlayEnd?.invoke(finishedId)
            }
        }
    }

    /** Releases the underlying PCM player and cancels all coroutines. */
    fun release() {
        pcmPlayer.release()
        scope.cancel()
    }
}

View File

@ -41,9 +41,9 @@ class MainViewModel: BaseViewModel() {
RxHttp.postJson(ApiService.UPLOAD_RECORD_VOICE_URL) RxHttp.postJson(ApiService.UPLOAD_RECORD_VOICE_URL)
.add("sessionCode",sessionCode) .add("sessionCode",sessionCode)
.add("audio", audioVoice) .add("audio", audioVoice)
.readTimeout(3000L) .readTimeout(5000L)
.writeTimeout(3000L) .writeTimeout(5000L)
.connectTimeout(3000L) .connectTimeout(5000L)
.toAwaitResponse<String>() .toAwaitResponse<String>()
.awaitResult() .awaitResult()
.getOrThrow() .getOrThrow()