强化代码

This commit is contained in:
林若思 2026-01-30 17:02:50 +08:00
parent 13b08c8e7a
commit 99cf06456d
2 changed files with 125 additions and 61 deletions

View File

@ -2,7 +2,9 @@ package com.zs.smarthuman.sherpa
import android.content.res.AssetManager import android.content.res.AssetManager
import com.blankj.utilcode.util.LogUtils import com.blankj.utilcode.util.LogUtils
import com.k2fsa.sherpa.onnx.* import com.k2fsa.sherpa.onnx.SileroVadModelConfig
import com.k2fsa.sherpa.onnx.Vad
import com.k2fsa.sherpa.onnx.VadModelConfig
import kotlin.math.sqrt import kotlin.math.sqrt
class VadManager( class VadManager(
@ -12,19 +14,22 @@ class VadManager(
) { ) {
private val TAG = "VadManager" private val TAG = "VadManager"
private val vad: Vad private val vad: Vad
private var isSpeaking = false private var isSpeaking = false
private var lastSpeechMs = 0L private var lastSpeechMs = 0L
private var lastActiveMs = 0L private var lastActiveMs = 0L
private var speechStartMs = 0L // 新增:记录语音启动时间,用于启动保护 private var speechStartMs = 0L // 语音启动保护
// 核心参数优化:适配中文说话停顿 // **************** 超短语音300ms专属参数优化 ****************
private val END_SILENCE_MS = 350L // 静音多久判定结束350→600留足停顿缓冲 // 1. 大幅提高RMS阈值彻底过滤麦克风底噪核心中的核心
private val RESET_IDLE_MS = 3_000L // 超长空闲重置时间,保持不变 private val MIN_RMS = 0.003f
private val MIN_RMS = 0.001f // 能量检测阈值0.002→0.001,降低误判静音) // 2. 缩短静音结束判定时间适配短语音350→200ms有效语音结束后200ms静音就触发结束
private val SPEECH_START_PROTECT_MS = 90L // 语音启动保护期,避免开头停顿误判 private val END_SILENCE_MS = 200L
// 3. 保留启动保护防止语音开头轻微停顿误判短语音也需要90ms足够
private val SPEECH_START_PROTECT_MS = 90L
// 4. 空闲重置时间保留,不影响短语音
private val RESET_IDLE_MS = 3_000L
init { init {
vad = Vad( vad = Vad(
@ -32,33 +37,41 @@ class VadManager(
VadModelConfig( VadModelConfig(
sileroVadModelConfig = SileroVadModelConfig( sileroVadModelConfig = SileroVadModelConfig(
model = "silero_vad.onnx", model = "silero_vad.onnx",
threshold = 0.40F, // 模型敏感度0.45→0.40,降低误判) threshold = 0.45F, // 轻微提高敏感度让VAD更快识别语音结束0.40→0.45
minSilenceDuration = 0.1F, minSilenceDuration = 0.05F, // 适配短静音判定0.1→0.05
minSpeechDuration = 0.25F, minSpeechDuration = 0.1F, // 允许超短语音0.25→0.1支持100ms以上语音
windowSize = 512, windowSize = 512
),
sampleRate = 16000,
numThreads = 1,
provider = "cpu"
) )
) )
LogUtils.i(TAG, "✅ VAD init已优化抗停顿参数") )
LogUtils.i(TAG, "✅ VAD init适配300ms超短语音+中文抗停顿)")
} }
fun accept(samples: FloatArray) { fun accept(samples: FloatArray) {
if (samples.isEmpty()) return if (samples.isEmpty()) return
val now = System.currentTimeMillis() val now = System.currentTimeMillis()
// 1⃣ 快速RMS能量判断过滤纯静音 // 1⃣ 快速RMS能量判断彻底过滤底噪+纯静音
val rms = fastRms(samples) val rms = fastRms(samples)
if (rms < MIN_RMS) { if (rms < MIN_RMS) {
handleSilence(now) handleSilence(now)
return return
} }
// 2⃣ 有有效能量才喂给VAD模型减少计算 // 2⃣ 有有效能量才喂给VAD模型减少计算+防原生异常
try {
vad.acceptWaveform(samples) vad.acceptWaveform(samples)
val hasSpeech = vad.isSpeechDetected() } catch (e: Exception) {
LogUtils.e(TAG, "❌ VAD acceptWaveform异常", e)
return
}
val hasSpeech = try {
vad.isSpeechDetected()
} catch (e: Exception) {
LogUtils.e(TAG, "❌ VAD isSpeechDetected异常", e)
false
}
if (hasSpeech) { if (hasSpeech) {
lastSpeechMs = now lastSpeechMs = now
@ -67,32 +80,38 @@ class VadManager(
isSpeaking = true isSpeaking = true
speechStartMs = now speechStartMs = now
onSpeechStart() onSpeechStart()
LogUtils.d(TAG, "🗣 VAD检测到语音开始") LogUtils.d(TAG, "🗣 VAD检测到语音开始 | RMS: $rms | 采样数: ${samples.size}")
} }
} else { } else {
handleSilence(now) handleSilence(now)
lastActiveMs = now
} }
} }
/**
 * Handles a frame in which no speech was detected.
 *
 * Two independent checks are made:
 * 1. If an utterance is in progress, the start-protection window
 *    ([SPEECH_START_PROTECT_MS]) has elapsed, and no speech has been seen for
 *    more than [END_SILENCE_MS], the utterance is ended and [onSpeechEnd] fires.
 * 2. If no utterance is in progress and more than [RESET_IDLE_MS] has passed
 *    since [lastActiveMs], the VAD model is reset to drop stale state.
 *
 * @param now timestamp of the current frame in milliseconds, on the same clock
 *            as [speechStartMs], [lastSpeechMs] and [lastActiveMs].
 */
private fun handleSilence(now: Long) {
    val pastStartProtection = now - speechStartMs > SPEECH_START_PROTECT_MS
    val silentLongEnough = now - lastSpeechMs > END_SILENCE_MS

    if (isSpeaking && pastStartProtection && silentLongEnough) {
        isSpeaking = false
        onSpeechEnd()
        // Detailed log for troubleshooting: total duration and trailing silence.
        LogUtils.d(TAG, "🔇 VAD检测到语音结束 | 静音超${END_SILENCE_MS}ms | 总时长: ${now - speechStartMs}ms | 有效语音后静音: ${now - lastSpeechMs}ms")
    }

    // Long idle period: reset the model so no state lingers.
    if (!isSpeaking && now - lastActiveMs > RESET_IDLE_MS) {
        try {
            vad.reset()
            // Report the reset only when it actually succeeded.
            LogUtils.d(TAG, "🔄 VAD reset (idle)")
        } catch (e: Exception) {
            LogUtils.e(TAG, "❌ VAD reset异常", e)
        }
        // Restart the idle window even after a failed reset, so the reset is
        // not retried on every subsequent silent frame.
        lastActiveMs = now
    }
}
// 快速RMS计算步采样减少计算量保持原有逻辑 // 快速RMS计算步采样减少计算量保持原有逻辑不变
private fun fastRms(samples: FloatArray): Float { private fun fastRms(samples: FloatArray): Float {
var sum = 0f var sum = 0f
var count = 0 var count = 0
@ -104,17 +123,20 @@ class VadManager(
count++ count++
i += step i += step
} }
// 避免除0异常极端情况count=0
return if (count == 0) 0f else sqrt(sum / count) return if (count == 0) 0f else sqrt(sum / count)
} }
/**
 * Manually resets the manager: clears the speaking flag and all timestamps,
 * then resets the underlying VAD model.
 *
 * A failure while resetting the model is logged and not rethrown.
 */
fun reset() {
    // Clear the bookkeeping first; it does not depend on the model reset.
    speechStartMs = 0L
    lastActiveMs = 0L
    lastSpeechMs = 0L
    isSpeaking = false

    try {
        vad.reset()
    } catch (e: Exception) {
        LogUtils.e(TAG, "❌ VAD 手动reset异常", e)
    }

    LogUtils.d(TAG, "🔄 VAD手动重置所有状态")
}
} }

View File

@ -6,8 +6,11 @@ import com.k2fsa.sherpa.onnx.OnlineStream
import com.k2fsa.sherpa.onnx.SpeakerRecognition import com.k2fsa.sherpa.onnx.SpeakerRecognition
import kotlinx.coroutines.CoroutineScope import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.Job
import kotlinx.coroutines.delay
import kotlinx.coroutines.launch import kotlinx.coroutines.launch
import java.util.ArrayDeque import java.util.ArrayDeque
import java.util.concurrent.CopyOnWriteArrayList
import java.util.concurrent.locks.ReentrantLock import java.util.concurrent.locks.ReentrantLock
import kotlin.concurrent.withLock import kotlin.concurrent.withLock
@ -34,9 +37,8 @@ class VoiceController(
private const val INVALID_RESET_DEBOUNCE_MS = 1500L private const val INVALID_RESET_DEBOUNCE_MS = 1500L
// 统一的声纹验证阈值可根据实际效果微调原0.36
// 统一的声纹验证阈值(不再分场景) private const val SPEAKER_THRESHOLD = 0.35f
private const val SPEAKER_THRESHOLD = 0.36f
private const val MIN_VERIFY_MS = 600L private const val MIN_VERIFY_MS = 600L
private const val MAX_VERIFY_MS = 1200L private const val MAX_VERIFY_MS = 1200L
@ -56,13 +58,14 @@ class VoiceController(
private val speakerManagerLock = ReentrantLock() private val speakerManagerLock = ReentrantLock()
private val wakeupManager = WakeupManager(assetManager, onWakeup) private val wakeupManager = WakeupManager(assetManager, onWakeup)
// 【修改】保留vadManager可访问用于手动触发结束
private val vadManager = VadManager( private val vadManager = VadManager(
assetManager, assetManager,
onSpeechStart = { onVadStart() }, onSpeechStart = { onVadStart() },
onSpeechEnd = { onVadEnd() } onSpeechEnd = { onVadEnd() }
) )
private val audioBuffer = mutableListOf<Float>() private val audioBuffer = CopyOnWriteArrayList<Float>()
private val preBuffer = ArrayDeque<Float>(PRE_BUFFER_SIZE) private val preBuffer = ArrayDeque<Float>(PRE_BUFFER_SIZE)
private var recordingStartMs = 0L private var recordingStartMs = 0L
@ -84,6 +87,14 @@ class VoiceController(
private val ENABLE_STRICT_SPEAKER_VERIFY = true private val ENABLE_STRICT_SPEAKER_VERIFY = true
private val preBufferLock = Any() private val preBufferLock = Any()
// 声纹验证标记
@Volatile private var speakerVerifyFinished = false
@Volatile private var speakerVerifyPassed = true // fail-open
@Volatile private var speakerVerifyFailed = false
@Volatile private var currentSpeakerVerifyJob: Job? = null
init { init {
try { try {
SpeakerRecognition.initExtractor(assetManager) SpeakerRecognition.initExtractor(assetManager)
@ -220,8 +231,6 @@ class VoiceController(
onWakeup() onWakeup()
LogUtils.d(TAG, "🔔 唤醒成功") LogUtils.d(TAG, "🔔 唤醒成功")
} }
@Volatile private var speakerVerifyFinished = false
@Volatile private var speakerVerifyPassed = true
private fun onVadStart() { private fun onVadStart() {
if (state != VoiceState.WAIT_SPEECH) return if (state != VoiceState.WAIT_SPEECH) return
@ -236,24 +245,39 @@ class VoiceController(
state = VoiceState.RECORDING state = VoiceState.RECORDING
} }
/**
 * End-of-speech callback: finalizes the current sentence, but only while the
 * controller is in [VoiceState.RECORDING]; in any other state it does nothing.
 */
private fun onVadEnd() {
    if (state == VoiceState.RECORDING) {
        LogUtils.d(TAG, "🧠 VAD END")
        finishSentence()
    }
}
/**
 * Forces the current utterance to end, as if the VAD had reported end-of-speech.
 *
 * The work is launched on [Dispatchers.Main]. Because it runs asynchronously,
 * the recording may already have finished by the time it executes, so the
 * state is re-checked inside the coroutine and the request is dropped (with a
 * warning) when no recording is in progress.
 */
private fun manualTriggerVadEnd() {
    CoroutineScope(Dispatchers.Main).launch {
        if (vadStarted && state == VoiceState.RECORDING) {
            LogUtils.d(TAG, "🔴 声纹失败手动触发VAD结束")
            // Reset the detector before ending the sentence ourselves.
            vadManager.reset()
            onVadEnd()
        } else {
            // Log both values explicitly so the rejected request can be diagnosed.
            LogUtils.w(TAG, "🟡 尝试手动触发VAD结束但非录音状态(state=$state, vadStarted=$vadStarted),放弃")
        }
    }
}
private fun startAsyncSpeakerVerify() { private fun startAsyncSpeakerVerify() {
currentSpeakerVerifyJob?.cancel()
currentSpeakerVerifyJob = null
speakerVerifyFinished = false speakerVerifyFinished = false
speakerVerifyPassed = true // fail-open speakerVerifyPassed = true // fail-open
speakerVerifyFailed = false // 重置失败标记
CoroutineScope(Dispatchers.IO).launch { currentSpeakerVerifyJob = CoroutineScope(Dispatchers.IO).launch {
// 等 600ms 音频 // 可优化500ms原600ms更快触发验证需Samples对应修改
val needSamples = SAMPLE_RATE * 600 / 1000 val needSamples = SAMPLE_RATE * 600 / 1000
var waited = 0L var waited = 0L
while (audioBuffer.size < needSamples && waited < 800) { while (audioBuffer.size < needSamples && waited < 800) {
kotlinx.coroutines.delay(20) delay(20)
waited += 20 waited += 20
} }
@ -272,22 +296,26 @@ class VoiceController(
} }
} }
/* ================= 结束录音 ================= */ /* ================= 结束录音 ================= */
private fun finishSentence() { private fun finishSentence() {
currentSpeakerVerifyJob?.cancel()
currentSpeakerVerifyJob = null
val now = System.currentTimeMillis() val now = System.currentTimeMillis()
val duration = now - recordingStartMs val duration = now - recordingStartMs
speakerVerifyFailed = false
// if (!vadStarted || duration < MIN_SPEECH_MS) { // 【修改1】声纹失败快速终止直接跳过所有校验
// LogUtils.d(TAG, "❌ 语音过短: $duration ms") if (speakerVerifyFailed) {
// hasInvalidSpeech = true LogUtils.w(TAG, "🔴 声纹失败快速终止,跳过语音时长校验 | 实际录音时长: $duration ms")
// resetToWaitSpeech() hasInvalidSpeech = true
// return resetToWaitSpeech()
// } speakerVerifyFailed = false // 重置标记,避免状态残留
return
}
val audio = audioBuffer.toFloatArray() val audio = audioBuffer.toFloatArray().copyOf()
// 声纹验证(保留核心逻辑 // 原有声纹最终校验(双重保障,极端情况兜底
if (ENABLE_STRICT_SPEAKER_VERIFY && if (ENABLE_STRICT_SPEAKER_VERIFY &&
speakerVerifyFinished && speakerVerifyFinished &&
!speakerVerifyPassed !speakerVerifyPassed
@ -298,8 +326,7 @@ class VoiceController(
return return
} }
// 最终通过逻辑
// 最终通过
audioBuffer.clear() audioBuffer.clear()
state = VoiceState.UPLOADING state = VoiceState.UPLOADING
onFinalAudio(audio) onFinalAudio(audio)
@ -356,10 +383,16 @@ class VoiceController(
LogUtils.d(TAG, "🛡 防抖1.5秒内重复无效语音,跳过重置") LogUtils.d(TAG, "🛡 防抖1.5秒内重复无效语音,跳过重置")
return return
} }
currentSpeakerVerifyJob?.cancel()
currentSpeakerVerifyJob = null
lastInvalidResetMs = now lastInvalidResetMs = now
audioBuffer.clear() audioBuffer.clear()
vadManager.reset() vadManager.reset()
vadStarted = false vadStarted = false
// 【修改2】重置声纹所有标记避免状态残留
speakerVerifyFinished = false
speakerVerifyPassed = true
speakerVerifyFailed = false
state = VoiceState.WAIT_SPEECH state = VoiceState.WAIT_SPEECH
if (waitSpeechFailStartMs == 0L) waitSpeechFailStartMs = System.currentTimeMillis() if (waitSpeechFailStartMs == 0L) waitSpeechFailStartMs = System.currentTimeMillis()
} }
@ -379,6 +412,10 @@ class VoiceController(
waitSpeechFailStartMs = 0L waitSpeechFailStartMs = 0L
hasInvalidSpeech = false hasInvalidSpeech = false
currentTimeoutType = TimeoutType.IDLE_TIMEOUT currentTimeoutType = TimeoutType.IDLE_TIMEOUT
// 【修改3】重置声纹所有标记
speakerVerifyFinished = false
speakerVerifyPassed = true
speakerVerifyFailed = false
state = VoiceState.WAIT_WAKEUP state = VoiceState.WAIT_WAKEUP
} }
@ -409,7 +446,6 @@ class VoiceController(
} }
private fun cachePreBuffer(samples: FloatArray) { private fun cachePreBuffer(samples: FloatArray) {
// 空数据快速返回,避免无效循环
if (samples.isEmpty()) return if (samples.isEmpty()) return
synchronized(preBufferLock) { synchronized(preBufferLock) {
for (s in samples) { for (s in samples) {
@ -421,14 +457,10 @@ class VoiceController(
preBuffer.removeFirst() preBuffer.removeFirst()
} }
} }
} }
} }
} }
private fun verifySpeaker(audio: FloatArray): Boolean { private fun verifySpeaker(audio: FloatArray): Boolean {
if (audio.isEmpty()) return true if (audio.isEmpty()) return true
@ -447,7 +479,6 @@ class VoiceController(
audio audio
} }
var stream: OnlineStream? = null var stream: OnlineStream? = null
return runCatching { return runCatching {
@ -476,6 +507,19 @@ class VoiceController(
"📊 声纹 | pass=$pass | 音频=${audioMs}ms | 输入=${input.size} | 耗时=${cost}ms" "📊 声纹 | pass=$pass | 音频=${audioMs}ms | 输入=${input.size} | 耗时=${cost}ms"
) )
// 【修改4】声纹失败核心处理置位标记 + 手动触发VAD结束
if (!pass) {
// 只有当前还在录音中,才做后续处理;否则直接阻断,不影响任何状态
if (state == VoiceState.RECORDING) {
speakerVerifyFailed = true
manualTriggerVadEnd()
LogUtils.d(TAG, "🔴 声纹失败,当前处于录音状态,执行终止流程")
} else {
// 窗口期已过,仅打印日志,不做任何操作
LogUtils.w(TAG, "🟡 声纹验证失败,但录音流程已完成(当前状态:$state),不处理")
}
}
pass pass
}.onFailure { }.onFailure {
LogUtils.e(TAG, "❌ 声纹异常,放行", it) LogUtils.e(TAG, "❌ 声纹异常,放行", it)
@ -483,6 +527,4 @@ class VoiceController(
runCatching { stream?.release() } runCatching { stream?.release() }
}.getOrDefault(true) }.getOrDefault(true)
} }
} }