优化参数
This commit is contained in:
parent
b7fc6d4ee0
commit
8f7b02f18b
@ -23,8 +23,23 @@ class VoiceController(
|
|||||||
private val onTimeoutTip: OnTimeoutTip? = null
|
private val onTimeoutTip: OnTimeoutTip? = null
|
||||||
) {
|
) {
|
||||||
|
|
||||||
private val TAG = "VoiceController"
|
companion object {
|
||||||
private val sampleRate = 16000
|
// 日志标签
|
||||||
|
private const val TAG = "VoiceController"
|
||||||
|
// 采样率
|
||||||
|
private const val SAMPLE_RATE = 16000
|
||||||
|
// 预缓存大小(2秒)
|
||||||
|
private const val PRE_BUFFER_SIZE = SAMPLE_RATE * 2
|
||||||
|
// 声纹验证阈值
|
||||||
|
private const val SPEAKER_VERIFY_THRESHOLD_NORMAL = 0.25f
|
||||||
|
private const val SPEAKER_VERIFY_THRESHOLD_SHORT = 0.20f
|
||||||
|
private const val SHORT_AUDIO_THRESHOLD = SAMPLE_RATE * 0.5f // 0.5秒音频长度
|
||||||
|
// 防抖时间
|
||||||
|
private const val INVALID_RESET_DEBOUNCE_MS = 1500L
|
||||||
|
// 最小语音时长
|
||||||
|
private const val MIN_SPEECH_MS = 800L
|
||||||
|
private const val MIN_EFFECTIVE_VOICE_DURATION = 400L
|
||||||
|
}
|
||||||
|
|
||||||
var state: VoiceState = VoiceState.WAIT_WAKEUP
|
var state: VoiceState = VoiceState.WAIT_WAKEUP
|
||||||
private set(value) {
|
private set(value) {
|
||||||
@ -47,7 +62,8 @@ class VoiceController(
|
|||||||
private var isMultiPersonDialogueDetected = false
|
private var isMultiPersonDialogueDetected = false
|
||||||
// 防抖重置标记
|
// 防抖重置标记
|
||||||
private var lastInvalidResetMs = 0L
|
private var lastInvalidResetMs = 0L
|
||||||
private val INVALID_RESET_DEBOUNCE_MS = 1500L
|
// 声纹管理器锁(解决并发问题)
|
||||||
|
private val speakerManagerLock = ReentrantLock()
|
||||||
|
|
||||||
private val wakeupManager = WakeupManager(assetManager, onWakeup)
|
private val wakeupManager = WakeupManager(assetManager, onWakeup)
|
||||||
private val vadManager = VadManager(
|
private val vadManager = VadManager(
|
||||||
@ -57,8 +73,7 @@ class VoiceController(
|
|||||||
)
|
)
|
||||||
|
|
||||||
private val audioBuffer = mutableListOf<Float>()
|
private val audioBuffer = mutableListOf<Float>()
|
||||||
private val preBuffer = ArrayDeque<Float>()
|
private val preBuffer = ArrayDeque<Float>(PRE_BUFFER_SIZE)
|
||||||
private val PRE_BUFFER_SIZE = sampleRate * 2
|
|
||||||
|
|
||||||
private var recordingStartMs = 0L
|
private var recordingStartMs = 0L
|
||||||
private var waitSpeechFailStartMs = 0L
|
private var waitSpeechFailStartMs = 0L
|
||||||
@ -71,7 +86,6 @@ class VoiceController(
|
|||||||
private var speechEnableAtMs = 0L
|
private var speechEnableAtMs = 0L
|
||||||
private val SPEECH_COOLDOWN_MS = 300L
|
private val SPEECH_COOLDOWN_MS = 300L
|
||||||
|
|
||||||
private val MIN_SPEECH_MS = 800L
|
|
||||||
private val idleTimeoutMs = idleTimeoutSeconds * 1000L
|
private val idleTimeoutMs = idleTimeoutSeconds * 1000L
|
||||||
private val maxRecordingMs = maxRecordingSeconds * 1000L
|
private val maxRecordingMs = maxRecordingSeconds * 1000L
|
||||||
|
|
||||||
@ -112,7 +126,6 @@ class VoiceController(
|
|||||||
private val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f
|
private val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f
|
||||||
|
|
||||||
// ========== 核心调整:近距离场景 微弱人声过滤配置(重点优化) ==========
|
// ========== 核心调整:近距离场景 微弱人声过滤配置(重点优化) ==========
|
||||||
private val MIN_EFFECTIVE_VOICE_DURATION = 400L
|
|
||||||
private val MIN_VOICE_FRAME_RATIO = 0.08f
|
private val MIN_VOICE_FRAME_RATIO = 0.08f
|
||||||
private val MIN_PEAK_ENERGY_RATIO = 1.5f
|
private val MIN_PEAK_ENERGY_RATIO = 1.5f
|
||||||
private val NORMAL_VOICE_ENERGY_THRESHOLD = 0.008f
|
private val NORMAL_VOICE_ENERGY_THRESHOLD = 0.008f
|
||||||
@ -121,19 +134,20 @@ class VoiceController(
|
|||||||
// ========== 核心新增:MIN_EFFECTIVE_SPEECH_RMS 常量 ==========
|
// ========== 核心新增:MIN_EFFECTIVE_SPEECH_RMS 常量 ==========
|
||||||
private val MIN_EFFECTIVE_SPEECH_RMS = 0.0005f
|
private val MIN_EFFECTIVE_SPEECH_RMS = 0.0005f
|
||||||
|
|
||||||
|
|
||||||
// ========== 核心新增:无效说话标记 + 超时类型 ==========
|
// ========== 核心新增:无效说话标记 + 超时类型 ==========
|
||||||
private var hasInvalidSpeech = false
|
private var hasInvalidSpeech = false
|
||||||
private var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT
|
private var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT
|
||||||
|
|
||||||
|
|
||||||
// ========== 核心配置:声纹验证相关 ==========
|
// ========== 核心配置:声纹验证相关 ==========
|
||||||
private val CURRENT_USER_ID = "current_wakeup_user" // 当前唤醒用户唯一标识
|
private val CURRENT_USER_ID = "current_wakeup_user" // 当前唤醒用户唯一标识
|
||||||
private val ENABLE_STRICT_SPEAKER_VERIFY = true // 严格验证开关
|
private val ENABLE_STRICT_SPEAKER_VERIFY = true // 严格验证开关
|
||||||
private val SPEAKER_VERIFY_THRESHOLD = 0.5f // 严格验证开关
|
|
||||||
|
|
||||||
|
|
||||||
init {
|
init {
|
||||||
|
// 参数校验
|
||||||
|
require(idleTimeoutSeconds > 0) { "idleTimeoutSeconds 必须大于0" }
|
||||||
|
require(maxRecordingSeconds > 0) { "maxRecordingSeconds 必须大于0" }
|
||||||
|
require(maxRecordingSeconds >= idleTimeoutSeconds) { "maxRecordingSeconds 必须大于等于 idleTimeoutSeconds" }
|
||||||
|
|
||||||
// 初始化声纹识别器(适配你提供的API)
|
// 初始化声纹识别器(适配你提供的API)
|
||||||
try {
|
try {
|
||||||
SpeakerRecognition.initExtractor(assetManager) // 对齐原生API
|
SpeakerRecognition.initExtractor(assetManager) // 对齐原生API
|
||||||
@ -149,10 +163,10 @@ class VoiceController(
|
|||||||
cachePreBuffer(samples)
|
cachePreBuffer(samples)
|
||||||
wakeupManager.acceptAudio(samples)
|
wakeupManager.acceptAudio(samples)
|
||||||
if (wakeupManager.consumeWakeupFlag()) {
|
if (wakeupManager.consumeWakeupFlag()) {
|
||||||
handleWakeupEvent()
|
handleWakeupEvent() // 仅调用一次
|
||||||
// 注册唤醒用户特征(异步执行)
|
// 注册唤醒用户特征(异步执行)
|
||||||
CoroutineScope(Dispatchers.IO).launch {
|
CoroutineScope(Dispatchers.IO).launch {
|
||||||
var stream: OnlineStream? = null
|
var stream: OnlineStream? = null
|
||||||
runCatching {
|
runCatching {
|
||||||
val wakeupAudio = preBuffer.toFloatArray()
|
val wakeupAudio = preBuffer.toFloatArray()
|
||||||
if (wakeupAudio.isEmpty()) {
|
if (wakeupAudio.isEmpty()) {
|
||||||
@ -160,42 +174,40 @@ class VoiceController(
|
|||||||
return@launch
|
return@launch
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2. 创建原生Stream(按你提供的API)
|
// 创建原生Stream
|
||||||
stream = SpeakerRecognition.extractor.createStream()
|
stream = SpeakerRecognition.extractor.createStream()
|
||||||
|
stream.acceptWaveform(samples = wakeupAudio, sampleRate = SAMPLE_RATE)
|
||||||
stream.acceptWaveform(samples = preBuffer.toFloatArray(), sampleRate = sampleRate)
|
|
||||||
stream.inputFinished()
|
stream.inputFinished()
|
||||||
|
|
||||||
// 4. 计算特征并注册(仅当前用户)
|
// 计算特征并注册(仅当前用户)
|
||||||
if (SpeakerRecognition.extractor.isReady(stream)) {
|
if (SpeakerRecognition.extractor.isReady(stream)) {
|
||||||
val embedding = SpeakerRecognition.extractor.compute(stream)
|
val embedding = SpeakerRecognition.extractor.compute(stream)
|
||||||
// 清空历史特征,确保当前用户唯一
|
// 加锁保护 manager 操作
|
||||||
SpeakerRecognition.manager.remove(CURRENT_USER_ID)
|
speakerManagerLock.withLock {
|
||||||
// 注册当前唤醒用户(按你提供的add API)
|
SpeakerRecognition.manager.remove(CURRENT_USER_ID)
|
||||||
val embeddingList: MutableList<FloatArray> = mutableListOf()
|
// 注册当前唤醒用户
|
||||||
embeddingList.add(embedding)
|
val embeddingList = mutableListOf(embedding)
|
||||||
val ok = SpeakerRecognition.manager.add(
|
val ok = SpeakerRecognition.manager.add(
|
||||||
name = CURRENT_USER_ID,
|
name = CURRENT_USER_ID,
|
||||||
embedding = embeddingList.toTypedArray()
|
embedding = embeddingList.toTypedArray()
|
||||||
)
|
)
|
||||||
if (ok) {
|
if (ok) {
|
||||||
LogUtils.d(TAG, "✅ 注册当前唤醒用户特征成功 | 特征长度: ${embedding.size}")
|
LogUtils.d(TAG, "✅ 注册当前唤醒用户特征成功 | 特征长度: ${embedding.size}")
|
||||||
} else {
|
} else {
|
||||||
LogUtils.w(TAG, "❌ 注册当前唤醒用户特征失败(manager.add返回false)")
|
LogUtils.w(TAG, "❌ 注册当前唤醒用户特征失败(manager.add返回false)")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LogUtils.w(TAG, "❌ 唤醒音频Stream未就绪,跳过用户注册")
|
LogUtils.w(TAG, "❌ 唤醒音频Stream未就绪,跳过用户注册")
|
||||||
}
|
}
|
||||||
}.onFailure {
|
}.onFailure {
|
||||||
LogUtils.e(TAG, "❌ 唤醒用户特征注册失败", it.message)
|
LogUtils.e(TAG, "❌ 唤醒用户特征注册失败", it)
|
||||||
}.also {
|
}.also {
|
||||||
// 释放Stream(原生Stream用完即释放)
|
// 释放Stream
|
||||||
stream?.release()
|
stream?.release()
|
||||||
LogUtils.d(TAG, "🔄 唤醒注册Stream已释放")
|
LogUtils.d(TAG, "🔄 唤醒注册Stream已释放")
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
handleWakeupEvent()
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -410,7 +422,6 @@ class VoiceController(
|
|||||||
val now = System.currentTimeMillis()
|
val now = System.currentTimeMillis()
|
||||||
val duration = now - recordingStartMs
|
val duration = now - recordingStartMs
|
||||||
|
|
||||||
|
|
||||||
if (!vadStarted || duration < MIN_SPEECH_MS) {
|
if (!vadStarted || duration < MIN_SPEECH_MS) {
|
||||||
LogUtils.d(TAG, "❌ 语音过短: $duration ms | 基线: $currentEnvBaseline")
|
LogUtils.d(TAG, "❌ 语音过短: $duration ms | 基线: $currentEnvBaseline")
|
||||||
hasInvalidSpeech = true
|
hasInvalidSpeech = true
|
||||||
@ -563,7 +574,6 @@ class VoiceController(
|
|||||||
isMultiPersonDialogueDetected = false
|
isMultiPersonDialogueDetected = false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* ================= 播放/上传/Reset 回调 ================= */
|
/* ================= 播放/上传/Reset 回调 ================= */
|
||||||
fun onPlayStartPrompt() {
|
fun onPlayStartPrompt() {
|
||||||
LogUtils.d(TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline")
|
LogUtils.d(TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline")
|
||||||
@ -643,6 +653,26 @@ class VoiceController(
|
|||||||
resetRealtimeStats()
|
resetRealtimeStats()
|
||||||
hasInvalidSpeech = false
|
hasInvalidSpeech = false
|
||||||
currentTimeoutType = TimeoutType.IDLE_TIMEOUT
|
currentTimeoutType = TimeoutType.IDLE_TIMEOUT
|
||||||
|
|
||||||
|
// 释放声纹识别器资源
|
||||||
|
runCatching {
|
||||||
|
SpeakerRecognition.extractor.release()
|
||||||
|
speakerManagerLock.withLock {
|
||||||
|
SpeakerRecognition.manager.release()
|
||||||
|
}
|
||||||
|
LogUtils.d(TAG, "✅ 声纹识别器资源已释放")
|
||||||
|
}.onFailure {
|
||||||
|
LogUtils.e(TAG, "❌ 释放声纹识别器资源失败", it)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 兜底释放(防止未调用release)
|
||||||
|
protected fun finalize() {
|
||||||
|
runCatching {
|
||||||
|
release()
|
||||||
|
}.onFailure {
|
||||||
|
LogUtils.e(TAG, "❌ finalize 释放资源失败", it)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private fun cachePreBuffer(samples: FloatArray) {
|
private fun cachePreBuffer(samples: FloatArray) {
|
||||||
@ -667,36 +697,53 @@ class VoiceController(
|
|||||||
* @return true=是当前用户,false=非当前用户
|
* @return true=是当前用户,false=非当前用户
|
||||||
*/
|
*/
|
||||||
private fun verifySpeaker(audio: FloatArray): Boolean {
|
private fun verifySpeaker(audio: FloatArray): Boolean {
|
||||||
var stream: OnlineStream? = null
|
if (audio.isEmpty()) {
|
||||||
return try {
|
LogUtils.w(TAG, "❌ 待验证音频为空,声纹验证失败")
|
||||||
stream = SpeakerRecognition.extractor.createStream()
|
return false
|
||||||
stream.acceptWaveform(samples = audio, sampleRate = sampleRate)
|
}
|
||||||
stream.inputFinished()
|
|
||||||
|
|
||||||
// 4. 计算特征并验证(按你提供的API)
|
var stream: OnlineStream? = null
|
||||||
if (!SpeakerRecognition.extractor.isReady(stream)) {
|
return try {
|
||||||
LogUtils.w(TAG, "❌ 验证音频Stream未就绪,验证失败")
|
stream = SpeakerRecognition.extractor.createStream()
|
||||||
return false
|
stream.acceptWaveform(samples = audio, sampleRate = SAMPLE_RATE)
|
||||||
}
|
stream.inputFinished()
|
||||||
|
|
||||||
val embedding = SpeakerRecognition.extractor.compute(stream)
|
if (!SpeakerRecognition.extractor.isReady(stream)) {
|
||||||
|
LogUtils.w(TAG, "❌ 验证音频Stream未就绪,验证失败")
|
||||||
val verifyPass = SpeakerRecognition.manager.verify(name = CURRENT_USER_ID, embedding = embedding, threshold = SPEAKER_VERIFY_THRESHOLD)
|
return false
|
||||||
if (verifyPass) {
|
|
||||||
LogUtils.d(TAG, "✅ 声纹验证通过")
|
|
||||||
} else {
|
|
||||||
LogUtils.w(TAG, "❌ 声纹验证失败")
|
|
||||||
}
|
|
||||||
verifyPass
|
|
||||||
} catch (e: Exception) {
|
|
||||||
LogUtils.e(TAG, "❌ 声纹验证异常", e)
|
|
||||||
false
|
|
||||||
} finally {
|
|
||||||
// 释放Stream(原生Stream用完即释放)
|
|
||||||
stream?.release()
|
|
||||||
LogUtils.d(TAG, "🔄 验证Stream已释放")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
val embedding = SpeakerRecognition.extractor.compute(stream)
|
||||||
|
// 动态选择阈值
|
||||||
|
val threshold = if (audio.size < SHORT_AUDIO_THRESHOLD) {
|
||||||
|
LogUtils.d(TAG, "📢 检测到短速语音,使用放宽阈值: $SPEAKER_VERIFY_THRESHOLD_SHORT")
|
||||||
|
SPEAKER_VERIFY_THRESHOLD_SHORT
|
||||||
|
} else {
|
||||||
|
SPEAKER_VERIFY_THRESHOLD_NORMAL
|
||||||
|
}
|
||||||
|
|
||||||
|
// 加锁验证
|
||||||
|
speakerManagerLock.withLock {
|
||||||
|
val verifyPass = SpeakerRecognition.manager.verify(
|
||||||
|
name = CURRENT_USER_ID,
|
||||||
|
embedding = embedding,
|
||||||
|
threshold = threshold
|
||||||
|
)
|
||||||
|
if (verifyPass) {
|
||||||
|
LogUtils.d(TAG, "✅ 声纹验证通过 | 阈值: $threshold")
|
||||||
|
} else {
|
||||||
|
LogUtils.w(TAG, "❌ 声纹验证失败 | 阈值: $threshold")
|
||||||
|
}
|
||||||
|
return verifyPass
|
||||||
|
}
|
||||||
|
} catch (e: Exception) {
|
||||||
|
LogUtils.e(TAG, "❌ 声纹验证异常", e)
|
||||||
|
false
|
||||||
|
} finally {
|
||||||
|
// 释放Stream
|
||||||
|
stream?.release()
|
||||||
|
LogUtils.d(TAG, "🔄 验证Stream已释放")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user