优化参数

This commit is contained in:
林若思 2026-01-10 17:54:30 +08:00
parent b7fc6d4ee0
commit 8f7b02f18b

View File

@ -23,8 +23,23 @@ class VoiceController(
private val onTimeoutTip: OnTimeoutTip? = null private val onTimeoutTip: OnTimeoutTip? = null
) { ) {
companion object {
    /** Tag attached to every log line written by this class. */
    private const val TAG = "VoiceController"

    /** Sample rate (Hz) handed to the speaker-embedding streams. */
    private const val SAMPLE_RATE = 16_000

    /** Initial capacity of the pre-buffer: two seconds of samples at [SAMPLE_RATE]. */
    private const val PRE_BUFFER_SIZE = 2 * SAMPLE_RATE

    /** Speaker-verification threshold for audio of normal length. */
    private const val SPEAKER_VERIFY_THRESHOLD_NORMAL = 0.25f

    /** Relaxed speaker-verification threshold for audio shorter than [SHORT_AUDIO_THRESHOLD]. */
    private const val SPEAKER_VERIFY_THRESHOLD_SHORT = 0.20f

    /** Sample count of 0.5 s of audio; anything shorter is verified with the relaxed threshold. */
    private const val SHORT_AUDIO_THRESHOLD = SAMPLE_RATE * 0.5f

    /** Debounce window for "invalid reset" handling, in milliseconds. */
    private const val INVALID_RESET_DEBOUNCE_MS = 1_500L

    /** A recording shorter than this many milliseconds is treated as too short. */
    private const val MIN_SPEECH_MS = 800L

    // NOTE(review): unit is presumably milliseconds, like MIN_SPEECH_MS — confirm at the use site.
    private const val MIN_EFFECTIVE_VOICE_DURATION = 400L
}
var state: VoiceState = VoiceState.WAIT_WAKEUP var state: VoiceState = VoiceState.WAIT_WAKEUP
private set(value) { private set(value) {
@ -47,7 +62,8 @@ class VoiceController(
private var isMultiPersonDialogueDetected = false private var isMultiPersonDialogueDetected = false
// 防抖重置标记 // 防抖重置标记
private var lastInvalidResetMs = 0L private var lastInvalidResetMs = 0L
private val INVALID_RESET_DEBOUNCE_MS = 1500L // 声纹管理器锁(解决并发问题)
private val speakerManagerLock = ReentrantLock()
private val wakeupManager = WakeupManager(assetManager, onWakeup) private val wakeupManager = WakeupManager(assetManager, onWakeup)
private val vadManager = VadManager( private val vadManager = VadManager(
@ -57,8 +73,7 @@ class VoiceController(
) )
private val audioBuffer = mutableListOf<Float>() private val audioBuffer = mutableListOf<Float>()
private val preBuffer = ArrayDeque<Float>() private val preBuffer = ArrayDeque<Float>(PRE_BUFFER_SIZE)
private val PRE_BUFFER_SIZE = sampleRate * 2
private var recordingStartMs = 0L private var recordingStartMs = 0L
private var waitSpeechFailStartMs = 0L private var waitSpeechFailStartMs = 0L
@ -71,7 +86,6 @@ class VoiceController(
private var speechEnableAtMs = 0L private var speechEnableAtMs = 0L
private val SPEECH_COOLDOWN_MS = 300L private val SPEECH_COOLDOWN_MS = 300L
private val MIN_SPEECH_MS = 800L
private val idleTimeoutMs = idleTimeoutSeconds * 1000L private val idleTimeoutMs = idleTimeoutSeconds * 1000L
private val maxRecordingMs = maxRecordingSeconds * 1000L private val maxRecordingMs = maxRecordingSeconds * 1000L
@ -112,7 +126,6 @@ class VoiceController(
private val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f private val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f
// ========== 核心调整:近距离场景 微弱人声过滤配置(重点优化) ========== // ========== 核心调整:近距离场景 微弱人声过滤配置(重点优化) ==========
private val MIN_EFFECTIVE_VOICE_DURATION = 400L
private val MIN_VOICE_FRAME_RATIO = 0.08f private val MIN_VOICE_FRAME_RATIO = 0.08f
private val MIN_PEAK_ENERGY_RATIO = 1.5f private val MIN_PEAK_ENERGY_RATIO = 1.5f
private val NORMAL_VOICE_ENERGY_THRESHOLD = 0.008f private val NORMAL_VOICE_ENERGY_THRESHOLD = 0.008f
@ -121,19 +134,20 @@ class VoiceController(
// ========== 核心新增MIN_EFFECTIVE_SPEECH_RMS 常量 ========== // ========== 核心新增MIN_EFFECTIVE_SPEECH_RMS 常量 ==========
private val MIN_EFFECTIVE_SPEECH_RMS = 0.0005f private val MIN_EFFECTIVE_SPEECH_RMS = 0.0005f
// ========== 核心新增:无效说话标记 + 超时类型 ========== // ========== 核心新增:无效说话标记 + 超时类型 ==========
private var hasInvalidSpeech = false private var hasInvalidSpeech = false
private var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT private var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT
// ========== 核心配置:声纹验证相关 ========== // ========== 核心配置:声纹验证相关 ==========
private val CURRENT_USER_ID = "current_wakeup_user" // 当前唤醒用户唯一标识 private val CURRENT_USER_ID = "current_wakeup_user" // 当前唤醒用户唯一标识
private val ENABLE_STRICT_SPEAKER_VERIFY = true // 严格验证开关 private val ENABLE_STRICT_SPEAKER_VERIFY = true // 严格验证开关
private val SPEAKER_VERIFY_THRESHOLD = 0.5f // 严格验证开关
init { init {
// 参数校验
require(idleTimeoutSeconds > 0) { "idleTimeoutSeconds 必须大于0" }
require(maxRecordingSeconds > 0) { "maxRecordingSeconds 必须大于0" }
require(maxRecordingSeconds >= idleTimeoutSeconds) { "maxRecordingSeconds 必须大于等于 idleTimeoutSeconds" }
// 初始化声纹识别器适配你提供的API // 初始化声纹识别器适配你提供的API
try { try {
SpeakerRecognition.initExtractor(assetManager) // 对齐原生API SpeakerRecognition.initExtractor(assetManager) // 对齐原生API
@ -149,10 +163,10 @@ class VoiceController(
cachePreBuffer(samples) cachePreBuffer(samples)
wakeupManager.acceptAudio(samples) wakeupManager.acceptAudio(samples)
if (wakeupManager.consumeWakeupFlag()) { if (wakeupManager.consumeWakeupFlag()) {
handleWakeupEvent() handleWakeupEvent() // 仅调用一次
// 注册唤醒用户特征(异步执行) // 注册唤醒用户特征(异步执行)
CoroutineScope(Dispatchers.IO).launch { CoroutineScope(Dispatchers.IO).launch {
var stream: OnlineStream? = null var stream: OnlineStream? = null
runCatching { runCatching {
val wakeupAudio = preBuffer.toFloatArray() val wakeupAudio = preBuffer.toFloatArray()
if (wakeupAudio.isEmpty()) { if (wakeupAudio.isEmpty()) {
@ -160,42 +174,40 @@ class VoiceController(
return@launch return@launch
} }
// 2. 创建原生Stream按你提供的API // 创建原生Stream
stream = SpeakerRecognition.extractor.createStream() stream = SpeakerRecognition.extractor.createStream()
stream.acceptWaveform(samples = wakeupAudio, sampleRate = SAMPLE_RATE)
stream.acceptWaveform(samples = preBuffer.toFloatArray(), sampleRate = sampleRate)
stream.inputFinished() stream.inputFinished()
// 4. 计算特征并注册(仅当前用户) // 计算特征并注册(仅当前用户)
if (SpeakerRecognition.extractor.isReady(stream)) { if (SpeakerRecognition.extractor.isReady(stream)) {
val embedding = SpeakerRecognition.extractor.compute(stream) val embedding = SpeakerRecognition.extractor.compute(stream)
// 清空历史特征,确保当前用户唯一 // 加锁保护 manager 操作
SpeakerRecognition.manager.remove(CURRENT_USER_ID) speakerManagerLock.withLock {
// 注册当前唤醒用户按你提供的add API SpeakerRecognition.manager.remove(CURRENT_USER_ID)
val embeddingList: MutableList<FloatArray> = mutableListOf() // 注册当前唤醒用户
embeddingList.add(embedding) val embeddingList = mutableListOf(embedding)
val ok = SpeakerRecognition.manager.add( val ok = SpeakerRecognition.manager.add(
name = CURRENT_USER_ID, name = CURRENT_USER_ID,
embedding = embeddingList.toTypedArray() embedding = embeddingList.toTypedArray()
) )
if (ok) { if (ok) {
LogUtils.d(TAG, "✅ 注册当前唤醒用户特征成功 | 特征长度: ${embedding.size}") LogUtils.d(TAG, "✅ 注册当前唤醒用户特征成功 | 特征长度: ${embedding.size}")
} else { } else {
LogUtils.w(TAG, "❌ 注册当前唤醒用户特征失败manager.add返回false") LogUtils.w(TAG, "❌ 注册当前唤醒用户特征失败manager.add返回false")
}
} }
} else { } else {
LogUtils.w(TAG, "❌ 唤醒音频Stream未就绪跳过用户注册") LogUtils.w(TAG, "❌ 唤醒音频Stream未就绪跳过用户注册")
} }
}.onFailure { }.onFailure {
LogUtils.e(TAG, "❌ 唤醒用户特征注册失败", it.message) LogUtils.e(TAG, "❌ 唤醒用户特征注册失败", it)
}.also { }.also {
// 释放Stream原生Stream用完即释放 // 释放Stream
stream?.release() stream?.release()
LogUtils.d(TAG, "🔄 唤醒注册Stream已释放") LogUtils.d(TAG, "🔄 唤醒注册Stream已释放")
} }
} }
handleWakeupEvent()
return return
} }
@ -410,7 +422,6 @@ class VoiceController(
val now = System.currentTimeMillis() val now = System.currentTimeMillis()
val duration = now - recordingStartMs val duration = now - recordingStartMs
if (!vadStarted || duration < MIN_SPEECH_MS) { if (!vadStarted || duration < MIN_SPEECH_MS) {
LogUtils.d(TAG, "❌ 语音过短: $duration ms | 基线: $currentEnvBaseline") LogUtils.d(TAG, "❌ 语音过短: $duration ms | 基线: $currentEnvBaseline")
hasInvalidSpeech = true hasInvalidSpeech = true
@ -563,7 +574,6 @@ class VoiceController(
isMultiPersonDialogueDetected = false isMultiPersonDialogueDetected = false
} }
/* ================= 播放/上传/Reset 回调 ================= */ /* ================= 播放/上传/Reset 回调 ================= */
fun onPlayStartPrompt() { fun onPlayStartPrompt() {
LogUtils.d(TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline") LogUtils.d(TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline")
@ -643,6 +653,26 @@ class VoiceController(
resetRealtimeStats() resetRealtimeStats()
hasInvalidSpeech = false hasInvalidSpeech = false
currentTimeoutType = TimeoutType.IDLE_TIMEOUT currentTimeoutType = TimeoutType.IDLE_TIMEOUT
// 释放声纹识别器资源
runCatching {
SpeakerRecognition.extractor.release()
speakerManagerLock.withLock {
SpeakerRecognition.manager.release()
}
LogUtils.d(TAG, "✅ 声纹识别器资源已释放")
}.onFailure {
LogUtils.e(TAG, "❌ 释放声纹识别器资源失败", it)
}
}
/**
 * Last-resort cleanup: invokes [release] when the instance is finalized, in case the
 * owner never called it explicitly. Any failure is logged and swallowed.
 *
 * NOTE(review): [release] frees resources reached through the static `SpeakerRecognition`
 * accessors. If those are shared across instances, or [release] was already called,
 * this may release them a second time — confirm that [release] is idempotent.
 */
protected fun finalize() {
    try {
        release()
    } catch (failure: Throwable) {
        // Throwable (not Exception) on purpose: nothing may escape a finalizer.
        LogUtils.e(TAG, "❌ finalize 释放资源失败", failure)
    }
}
private fun cachePreBuffer(samples: FloatArray) { private fun cachePreBuffer(samples: FloatArray) {
@ -667,36 +697,53 @@ class VoiceController(
* @return true=是当前用户false=非当前用户 * @return true=是当前用户false=非当前用户
*/ */
private fun verifySpeaker(audio: FloatArray): Boolean {
    // Guard: nothing to verify.
    if (audio.isEmpty()) {
        LogUtils.w(TAG, "❌ 待验证音频为空,声纹验证失败")
        return false
    }

    // Kept outside the try so the finally block can always release it.
    var verifyStream: OnlineStream? = null
    try {
        val activeStream = SpeakerRecognition.extractor.createStream()
        verifyStream = activeStream
        activeStream.acceptWaveform(samples = audio, sampleRate = SAMPLE_RATE)
        activeStream.inputFinished()

        if (!SpeakerRecognition.extractor.isReady(activeStream)) {
            LogUtils.w(TAG, "❌ 验证音频Stream未就绪验证失败")
            return false
        }
        val voiceprint = SpeakerRecognition.extractor.compute(activeStream)

        // Short clips are checked against the relaxed threshold.
        val isShortClip = audio.size < SHORT_AUDIO_THRESHOLD
        if (isShortClip) {
            LogUtils.d(TAG, "📢 检测到短速语音,使用放宽阈值: $SPEAKER_VERIFY_THRESHOLD_SHORT")
        }
        val threshold = if (isShortClip) SPEAKER_VERIFY_THRESHOLD_SHORT else SPEAKER_VERIFY_THRESHOLD_NORMAL

        // The manager is only touched while holding speakerManagerLock.
        val matched = speakerManagerLock.withLock {
            SpeakerRecognition.manager.verify(
                name = CURRENT_USER_ID,
                embedding = voiceprint,
                threshold = threshold
            ).also { passed ->
                if (passed) {
                    LogUtils.d(TAG, "✅ 声纹验证通过 | 阈值: $threshold")
                } else {
                    LogUtils.w(TAG, "❌ 声纹验证失败 | 阈值: $threshold")
                }
            }
        }
        return matched
    } catch (e: Exception) {
        // Any failure during extraction or verification counts as "not the current user".
        LogUtils.e(TAG, "❌ 声纹验证异常", e)
        return false
    } finally {
        // Runs on every exit path, including the early returns above.
        verifyStream?.release()
        LogUtils.d(TAG, "🔄 验证Stream已释放")
    }
}
} }