实时校验
This commit is contained in:
parent
222dcc2143
commit
7a0bd086e7
@ -12,13 +12,15 @@ class VadManager(
|
|||||||
) {
|
) {
|
||||||
|
|
||||||
// Log tag used by this class.
private val TAG = "VadManager"

// Underlying voice-activity detector (no initializer here; presumably assigned in the init block — confirm).
private val vad: Vad

// --- Mutable detection state ---------------------------------------------

// True between onSpeechStart() and onSpeechEnd().
private var isSpeaking = false

// Timestamp (ms) of the last frame the VAD classified as speech.
private var lastSpeechMs = 0L

// Timestamp (ms) of the last speech frame or idle reset; drives the idle VAD reset.
private var lastActiveMs = 0L

// --- Tuning ---------------------------------------------------------------

// Silence (ms) after the last speech frame before the utterance is considered finished.
private val END_SILENCE_MS = 350L

// Idle time (ms) without speech after which the VAD is reset.
private val RESET_IDLE_MS = 3_000L

// Frames whose RMS is below this value are treated as silence without consulting the VAD.
private val MIN_RMS = 0.002f
|
||||||
|
|
||||||
init {
|
init {
|
||||||
@ -43,31 +45,62 @@ class VadManager(
|
|||||||
/**
 * Feeds one frame of audio into the detector and fires the speech callbacks.
 *
 * A cheap RMS gate runs first: frames that are empty or whose RMS is below
 * [MIN_RMS] are handled as silence and are never passed to the VAD. Frames
 * with enough energy are fed to the VAD; if it reports speech, the speech
 * timestamps are refreshed and [onSpeechStart] is invoked on the transition
 * from not-speaking to speaking. Otherwise the frame is handled as silence.
 *
 * @param samples audio samples for this frame; may be empty.
 */
fun accept(samples: FloatArray) {
    val now = System.currentTimeMillis()

    // 1) Energy gate. The explicit isEmpty() check matters: the RMS of an empty
    //    frame is 0/0 = NaN, and `NaN < MIN_RMS` is false, which would let an
    //    empty buffer through to the VAD.
    if (samples.isEmpty() || fastRms(samples) < MIN_RMS) {
        handleSilence(now)
        return
    }

    // 2) Only frames with energy are fed to the VAD.
    vad.acceptWaveform(samples)
    if (!vad.isSpeechDetected()) {
        handleSilence(now)
        return
    }

    lastSpeechMs = now
    lastActiveMs = now
    if (!isSpeaking) {
        isSpeaking = true
        onSpeechStart()
    }
}
|
||||||
|
|
||||||
|
/**
 * Handles a frame that was classified as non-speech.
 *
 * Ends the current utterance once more than [END_SILENCE_MS] has elapsed since
 * the last speech frame, and resets the VAD after more than [RESET_IDLE_MS]
 * without activity while not speaking.
 *
 * @param now timestamp of the current frame, in milliseconds.
 */
private fun handleSilence(now: Long) {
    if (isSpeaking && now - lastSpeechMs > END_SILENCE_MS) {
        // Clear the flag before the callback so the callback observes the ended state.
        isSpeaking = false
        onSpeechEnd()
    }

    if (isSpeaking) return

    // lastActiveMs == 0 means "no activity recorded yet" (initial value and the
    // value written by reset()). Start the idle clock here instead of comparing
    // against 0, which would make the idle reset fire on the first silent frame.
    if (lastActiveMs == 0L) {
        lastActiveMs = now
        return
    }

    // Reset only after a genuinely long idle period.
    if (now - lastActiveMs > RESET_IDLE_MS) {
        vad.reset()
        lastActiveMs = now
        LogUtils.d(TAG, "🔄 VAD reset (idle)")
    }
}
|
||||||
|
|
||||||
|
/**
 * Approximate root-mean-square of [samples], computed from every 4th sample
 * (indices 0, 4, 8, ...) to keep the per-frame cost low.
 *
 * @param samples audio samples; may be empty.
 * @return the RMS of the sub-sampled values, or 0 for an empty array (instead
 *         of the NaN that dividing by a zero sample count would produce).
 */
private fun fastRms(samples: FloatArray): Float {
    if (samples.isEmpty()) return 0f

    var sumSquares = 0f
    var taken = 0
    for (index in samples.indices step 4) {
        val value = samples[index]
        sumSquares += value * value
        taken++
    }
    return sqrt(sumSquares / taken)
}
|
||||||
|
|
||||||
/**
 * Returns the manager to its initial state: clears both timestamps and the
 * speaking flag, then resets the underlying VAD.
 */
fun reset() {
    lastActiveMs = 0L
    lastSpeechMs = 0L
    isSpeaking = false
    vad.reset()
}
|
||||||
|
|
||||||
private fun calcRms(samples: FloatArray): Float {
|
|
||||||
var sum = 0f
|
|
||||||
for (v in samples) sum += v * v
|
|
||||||
return sqrt(sum / samples.size)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -34,11 +34,12 @@ class VoiceController(
|
|||||||
|
|
||||||
private const val INVALID_RESET_DEBOUNCE_MS = 1500L
|
private const val INVALID_RESET_DEBOUNCE_MS = 1500L
|
||||||
|
|
||||||
// 最小语音时长
|
|
||||||
private const val MIN_SPEECH_MS = 600L
|
|
||||||
|
|
||||||
// 统一的声纹验证阈值(不再分场景)
|
// 统一的声纹验证阈值(不再分场景)
|
||||||
private const val SPEAKER_THRESHOLD = 0.45f
|
private const val SPEAKER_THRESHOLD = 0.38f
|
||||||
|
|
||||||
|
private const val MIN_VERIFY_MS = 600L
|
||||||
|
private const val MAX_VERIFY_MS = 1200L
|
||||||
}
|
}
|
||||||
|
|
||||||
var state: VoiceState = VoiceState.WAIT_WAKEUP
|
var state: VoiceState = VoiceState.WAIT_WAKEUP
|
||||||
@ -219,23 +220,59 @@ class VoiceController(
|
|||||||
onWakeup()
|
onWakeup()
|
||||||
LogUtils.d(TAG, "🔔 唤醒成功")
|
LogUtils.d(TAG, "🔔 唤醒成功")
|
||||||
}
|
}
|
||||||
|
// Set to true once the asynchronous speaker verification has produced a result.
@Volatile
private var speakerVerifyFinished = false

// Result of the asynchronous speaker verification; defaults to true (fail-open).
@Volatile
private var speakerVerifyPassed = true
|
||||||
|
|
||||||
/**
 * VAD callback for the start of speech. Only acts while waiting for speech:
 * marks the recording as started, seeds the audio buffer with the pre-buffer,
 * kicks off the asynchronous speaker verification and switches to RECORDING.
 */
private fun onVadStart() {
    if (state == VoiceState.WAIT_SPEECH) {
        vadStarted = true
        recordingStartMs = System.currentTimeMillis()

        // Start the utterance from the pre-buffered audio.
        with(audioBuffer) {
            clear()
            addAll(preBuffer)
        }

        startAsyncSpeakerVerify()

        state = VoiceState.RECORDING
    }
}
|
||||||
|
|
||||||
|
|
||||||
/**
 * VAD callback for the end of speech. Finishes the current sentence, but only
 * while a recording is in progress.
 */
private fun onVadEnd() {
    if (state == VoiceState.RECORDING) {
        LogUtils.d(TAG, "🧠 VAD END")
        finishSentence()
    }
}
|
||||||
|
|
||||||
|
// Handle of the in-flight verification coroutine, kept so that a new utterance
// can cancel the previous one before it publishes a stale result.
@Volatile
private var speakerVerifyJob: kotlinx.coroutines.Job? = null

/**
 * Starts speaker verification for the current utterance in the background.
 *
 * Resets the result flags to "not finished / passed" (fail-open), then waits
 * up to 800 ms for [MIN_VERIFY_MS] worth of audio to accumulate in the audio
 * buffer. If enough audio arrives, the most recent samples are verified and
 * the outcome is published through `speakerVerifyPassed` /
 * `speakerVerifyFinished`; otherwise verification finishes as passed.
 *
 * Any previous verification still running is cancelled first, and a cancelled
 * run never writes the flags, so a slow run from an earlier utterance cannot
 * overwrite the result of the current one. Failures other than cancellation
 * are logged and treated as passed.
 */
private fun startAsyncSpeakerVerify() {
    speakerVerifyJob?.cancel()
    speakerVerifyFinished = false
    speakerVerifyPassed = true // fail-open

    speakerVerifyJob = CoroutineScope(Dispatchers.IO).launch {
        val self = coroutineContext[kotlinx.coroutines.Job]
        try {
            // Same sample-count arithmetic as the duration check in verifySpeaker.
            val needSamples = (SAMPLE_RATE * MIN_VERIFY_MS / 1000).toInt()

            var waited = 0L
            while (audioBuffer.size < needSamples && waited < 800) {
                kotlinx.coroutines.delay(20) // cancellable: exits here if a new run started
                waited += 20
            }

            val pass = if (audioBuffer.size < needSamples) {
                true // not enough audio in time: keep fail-open
            } else {
                // NOTE(review): audioBuffer is read here off the producer's thread;
                // if it is not a thread-safe collection this snapshot can throw,
                // which is caught below. Confirm the buffer's type.
                verifySpeaker(audioBuffer.takeLast(needSamples).toFloatArray())
            }

            // verifySpeaker is blocking and cannot be interrupted, so re-check
            // cancellation before publishing.
            if (self?.isActive != false) {
                speakerVerifyPassed = pass
                speakerVerifyFinished = true
            }
        } catch (e: java.util.concurrent.CancellationException) {
            throw e
        } catch (e: Exception) {
            // Without this, the exception would escape the coroutine uncaught.
            LogUtils.e(TAG, "❌ 声纹异步校验异常,放行", e)
            speakerVerifyFinished = true
        }
    }
}
|
||||||
|
|
||||||
|
|
||||||
/* ================= 结束录音 ================= */
|
/* ================= 结束录音 ================= */
|
||||||
private fun finishSentence() {
|
private fun finishSentence() {
|
||||||
val now = System.currentTimeMillis()
|
val now = System.currentTimeMillis()
|
||||||
@ -251,17 +288,17 @@ class VoiceController(
|
|||||||
val audio = audioBuffer.toFloatArray()
|
val audio = audioBuffer.toFloatArray()
|
||||||
|
|
||||||
// 声纹验证(保留核心逻辑)
|
// 声纹验证(保留核心逻辑)
|
||||||
if (ENABLE_STRICT_SPEAKER_VERIFY) {
|
if (ENABLE_STRICT_SPEAKER_VERIFY &&
|
||||||
val isCurrentUser = verifySpeaker(audio)
|
speakerVerifyFinished &&
|
||||||
if (!isCurrentUser) {
|
!speakerVerifyPassed
|
||||||
LogUtils.w(TAG, "❌ 非当前唤醒用户,拒绝语音 | 录音时长: $duration ms")
|
) {
|
||||||
hasInvalidSpeech = true
|
LogUtils.w(TAG, "❌ 声纹失败(已完成),拒绝")
|
||||||
resetToWaitSpeech()
|
hasInvalidSpeech = true
|
||||||
return
|
resetToWaitSpeech()
|
||||||
}
|
return
|
||||||
LogUtils.d(TAG, "✅ 当前用户语音,继续处理 | 录音时长: $duration ms")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// 最终通过
|
// 最终通过
|
||||||
audioBuffer.clear()
|
audioBuffer.clear()
|
||||||
state = VoiceState.UPLOADING
|
state = VoiceState.UPLOADING
|
||||||
@ -390,65 +427,62 @@ class VoiceController(
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
 * Checks whether [audio] matches the voice print of the current user.
 *
 * The check is fail-open: empty audio, audio shorter than [MIN_VERIFY_MS],
 * a stream that is not ready, and any exception all return `true`. Only the
 * last [MAX_VERIFY_MS] of audio is used for the embedding.
 *
 * @param audio samples of the utterance, at [SAMPLE_RATE].
 * @return `false` only when the speaker manager explicitly rejects the
 *         embedding against [SPEAKER_THRESHOLD]; `true` otherwise.
 */
private fun verifySpeaker(audio: FloatArray): Boolean {
    if (audio.isEmpty()) return true

    val audioMs = audio.size * 1000L / SAMPLE_RATE
    if (audioMs < MIN_VERIFY_MS) {
        LogUtils.d(TAG, "🟡 短音频 $audioMs ms,跳过声纹")
        return true
    }

    val verifyStartMs = System.currentTimeMillis()

    // Keep only the tail of the utterance, capped at MAX_VERIFY_MS.
    val maxSamples = (SAMPLE_RATE * MAX_VERIFY_MS / 1000).toInt()
    val input = if (audio.size > maxSamples) {
        audio.copyOfRange(audio.size - maxSamples, audio.size)
    } else {
        audio
    }

    // Nullable holder exists only so `finally` can release whatever was created;
    // all work goes through the non-null local `active`.
    var stream: OnlineStream? = null
    return try {
        val active = SpeakerRecognition.extractor.createStream()
        stream = active
        active.acceptWaveform(input, SAMPLE_RATE)
        active.inputFinished()

        if (!SpeakerRecognition.extractor.isReady(active)) {
            LogUtils.w(TAG, "⚠️ stream not ready,放行")
            true
        } else {
            val embedding = SpeakerRecognition.extractor.compute(active)

            val pass = speakerManagerLock.withLock {
                SpeakerRecognition.manager.verify(
                    CURRENT_USER_ID,
                    embedding,
                    SPEAKER_THRESHOLD
                )
            }

            val cost = System.currentTimeMillis() - verifyStartMs
            LogUtils.d(
                TAG,
                "📊 声纹 | pass=$pass | 音频=${audioMs}ms | 输入=${input.size} | 耗时=${cost}ms"
            )

            pass
        }
    } catch (t: Throwable) {
        // Catches Throwable to keep the same breadth as the runCatching it replaces.
        LogUtils.e(TAG, "❌ 声纹异常,放行", t)
        true
    } finally {
        // Releasing is best-effort, but a failure is logged rather than dropped.
        try {
            stream?.release()
        } catch (t: Throwable) {
            LogUtils.w(TAG, "⚠️ 释放 Stream 资源失败", t)
        }
    }
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user