提升检测速度

This commit is contained in:
林若思 2026-01-19 11:06:26 +08:00
parent c450d8d620
commit 0dfdccc75b
7 changed files with 72 additions and 102 deletions

View File

@ -181,5 +181,6 @@ dependencies {
implementation libs.androidautosize implementation libs.androidautosize
implementation files('libs/sherpa-onnx-1.12.20.aar') implementation files('libs/sherpa-onnx-1.12.20.aar')
implementation 'com.github.yyued:SVGAPlayer-Android:2.6.1'
} }

View File

@ -39,7 +39,7 @@
<application <application
android:name=".App" android:name=".App"
android:allowBackup="false" android:allowBackup="true"
android:dataExtractionRules="@xml/data_extraction_rules" android:dataExtractionRules="@xml/data_extraction_rules"
android:fullBackupContent="@xml/backup_rules" android:fullBackupContent="@xml/backup_rules"
android:icon="@mipmap/ic_launcher" android:icon="@mipmap/ic_launcher"

View File

@ -2,10 +2,7 @@ package com.zs.smarthuman.sherpa
import android.content.res.AssetManager import android.content.res.AssetManager
import com.blankj.utilcode.util.LogUtils import com.blankj.utilcode.util.LogUtils
import com.k2fsa.sherpa.onnx.SileroVadModelConfig import com.k2fsa.sherpa.onnx.*
import com.k2fsa.sherpa.onnx.Vad
import com.k2fsa.sherpa.onnx.VadModelConfig
import com.k2fsa.sherpa.onnx.getVadModelConfig
import kotlin.math.sqrt import kotlin.math.sqrt
class VadManager( class VadManager(
@ -13,92 +10,62 @@ class VadManager(
private val onSpeechStart: () -> Unit, private val onSpeechStart: () -> Unit,
private val onSpeechEnd: () -> Unit private val onSpeechEnd: () -> Unit
) { ) {
private val TAG = "VadManager" private val TAG = "VadManager"
private val vad: Vad private val vad: Vad
private var isSpeaking = false private var isSpeaking = false
private var lastSpeechTime = 0L private var lastSpeechMs = 0L
private val ACTIVE_END_SILENCE_MS = 1500L /** 更果断结束 */
private val ACTIVE_CONSECUTIVE_FRAMES = 10 private val END_SILENCE_MS = 350L
private val FINAL_END_SILENCE_MS = 800L private val MIN_RMS = 0.002f
private val FINAL_CONSECUTIVE_FRAMES = 5
private val FINAL_PHASE_TRIGGER_MS = 1000L
private val MAX_SILENCE_AFTER_SPEECH_MS = 2000L
private val MIN_EFFECTIVE_SPEECH_RMS = 0.001f
private var consecutiveSilenceFrames = 0
private var isInFinalPhase = false
private var lastEffectiveSpeechTime = 0L
init { init {
val config = getVadModelConfig(0) vad = Vad(
?: throw IllegalStateException("[$TAG] VAD config not found") assetManager,
vad = Vad(assetManager, VadModelConfig(sileroVadModelConfig = SileroVadModelConfig(model = "silero_vad.onnx", threshold = 0.2f))) VadModelConfig(
LogUtils.i(TAG, "✅ VAD 初始化成功") sileroVadModelConfig = SileroVadModelConfig(
model = "silero_vad.onnx",
threshold = 0.5F,
minSilenceDuration = 0.25F,
minSpeechDuration = 0.25F,
windowSize = 512,
),
sampleRate = 16000,
numThreads = 1,
provider = "cpu"
)
)
LogUtils.i(TAG, "✅ VAD init")
} }
fun accept(samples: FloatArray) { fun accept(samples: FloatArray) {
val now = System.currentTimeMillis() val now = System.currentTimeMillis()
vad.acceptWaveform(samples) vad.acceptWaveform(samples)
val vadHasSpeech = vad.isSpeechDetected() val hasSpeech = vad.isSpeechDetected()
val rms = calcRms(samples) val rms = calcRms(samples)
val isEffectiveSpeech = vadHasSpeech && rms >= MIN_EFFECTIVE_SPEECH_RMS if (hasSpeech && rms >= MIN_RMS) {
lastSpeechMs = now
if (isEffectiveSpeech) {
lastEffectiveSpeechTime = now
isInFinalPhase = false
lastSpeechTime = now
consecutiveSilenceFrames = 0
} else {
consecutiveSilenceFrames++
if (now - lastEffectiveSpeechTime >= FINAL_PHASE_TRIGGER_MS) {
isInFinalPhase = true
}
}
val (endSilenceMs, endFrames) =
if (isInFinalPhase)
FINAL_END_SILENCE_MS to FINAL_CONSECUTIVE_FRAMES
else
ACTIVE_END_SILENCE_MS to ACTIVE_CONSECUTIVE_FRAMES
if (isEffectiveSpeech) {
if (!isSpeaking) { if (!isSpeaking) {
isSpeaking = true isSpeaking = true
onSpeechStart() onSpeechStart()
} }
} else if (isSpeaking) { } else if (isSpeaking && now - lastSpeechMs > END_SILENCE_MS) {
val silenceMs = now - lastSpeechTime onSpeechEnd()
val effectiveSilenceMs = now - lastEffectiveSpeechTime reset()
val shouldEnd =
(silenceMs >= endSilenceMs ||
effectiveSilenceMs >= MAX_SILENCE_AFTER_SPEECH_MS) &&
consecutiveSilenceFrames >= endFrames
if (shouldEnd) {
onSpeechEnd()
reset()
isSpeaking = false
isInFinalPhase = false
}
} }
} }
fun reset() { fun reset() {
isSpeaking = false isSpeaking = false
lastSpeechTime = 0L lastSpeechMs = 0
lastEffectiveSpeechTime = 0L
consecutiveSilenceFrames = 0
isInFinalPhase = false
vad.reset() vad.reset()
} }
fun calcRms(samples: FloatArray): Float { private fun calcRms(samples: FloatArray): Float {
var sum = 0f var sum = 0f
for (v in samples) sum += v * v for (v in samples) sum += v * v
return sqrt(sum / samples.size) return sqrt(sum / samples.size)

View File

@ -30,11 +30,9 @@ class VoiceController(
// 预缓存大小2秒 // 预缓存大小2秒
private const val PRE_BUFFER_SIZE = SAMPLE_RATE * 2 private const val PRE_BUFFER_SIZE = SAMPLE_RATE * 2
// 短语音判定阈值
private const val SHORT_AUDIO_DURATION_MS = 1000L
private const val INVALID_RESET_DEBOUNCE_MS = 1500L private const val INVALID_RESET_DEBOUNCE_MS = 1500L
// 最小语音时长 // 最小语音时长
private const val MIN_SPEECH_MS = 800L private const val MIN_SPEECH_MS = 600L
// 统一的声纹验证阈值(不再分场景) // 统一的声纹验证阈值(不再分场景)
private const val SPEAKER_THRESHOLD = 0.45f private const val SPEAKER_THRESHOLD = 0.45f
@ -275,6 +273,9 @@ class VoiceController(
fun onPlayEndPrompt() { fun onPlayEndPrompt() {
speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
LogUtils.d(TAG, "🎵 提示音结束") LogUtils.d(TAG, "🎵 提示音结束")
if (!preBuffer.isEmpty()) {
preBuffer.clear()
}
state = VoiceState.WAIT_SPEECH_COOLDOWN state = VoiceState.WAIT_SPEECH_COOLDOWN
} }
@ -321,7 +322,9 @@ class VoiceController(
private fun resetAll() { private fun resetAll() {
LogUtils.d(TAG, "🔄 重置所有状态 | 本次超时类型: $currentTimeoutType") LogUtils.d(TAG, "🔄 重置所有状态 | 本次超时类型: $currentTimeoutType")
audioBuffer.clear() audioBuffer.clear()
preBuffer.clear() if (!preBuffer.isEmpty()) {
preBuffer.clear()
}
vadManager.reset() vadManager.reset()
wakeupManager.reset() wakeupManager.reset()
vadStarted = false vadStarted = false
@ -359,9 +362,15 @@ class VoiceController(
} }
private fun cachePreBuffer(samples: FloatArray) { private fun cachePreBuffer(samples: FloatArray) {
// 空数据快速返回,避免无效循环
if (samples.isEmpty()) return
for (s in samples) { for (s in samples) {
preBuffer.addLast(s) preBuffer.addLast(s)
if (preBuffer.size > PRE_BUFFER_SIZE) preBuffer.removeFirst() // 关键修复:移除前先检查队列是否非空
if (preBuffer.size > PRE_BUFFER_SIZE && !preBuffer.isEmpty()) {
preBuffer.removeFirst()
}
} }
} }
@ -371,7 +380,10 @@ class VoiceController(
return false return false
} }
// 1. 裁剪音频:只保留本次录音的有效部分 // 1. 记录验证开始时间(关键:统计处理耗时)
val verifyStartMs = System.currentTimeMillis()
// 2. 原有音频裁剪逻辑(保留)
val audioDurationMs = (audio.size.toFloat() / SAMPLE_RATE * 1000).toLong() val audioDurationMs = (audio.size.toFloat() / SAMPLE_RATE * 1000).toLong()
val validAudio = if (audioDurationMs > 0) { val validAudio = if (audioDurationMs > 0) {
val validSampleCount = (audioDurationMs * SAMPLE_RATE / 1000).toInt() val validSampleCount = (audioDurationMs * SAMPLE_RATE / 1000).toInt()
@ -386,21 +398,16 @@ class VoiceController(
var stream: OnlineStream? = null var stream: OnlineStream? = null
// 使用 runCatching 统一处理异常
return runCatching { return runCatching {
stream = SpeakerRecognition.extractor.createStream() stream = SpeakerRecognition.extractor.createStream()
stream.acceptWaveform(samples = validAudio, sampleRate = SAMPLE_RATE)
stream.inputFinished()
// 处理音频数据 if (!SpeakerRecognition.extractor.isReady(stream)) {
stream?.acceptWaveform(samples = validAudio, sampleRate = SAMPLE_RATE)
stream?.inputFinished()
// 检查 stream 是否就绪
if (stream == null || !SpeakerRecognition.extractor.isReady(stream)) {
LogUtils.w(TAG, "❌ 音频Stream未就绪验证失败") LogUtils.w(TAG, "❌ 音频Stream未就绪验证失败")
return@runCatching false return@runCatching false
} }
// 计算特征并验证
val embedding = SpeakerRecognition.extractor.compute(stream) val embedding = SpeakerRecognition.extractor.compute(stream)
speakerManagerLock.withLock { speakerManagerLock.withLock {
val verifyPass = SpeakerRecognition.manager.verify( val verifyPass = SpeakerRecognition.manager.verify(
@ -409,19 +416,20 @@ class VoiceController(
threshold = SPEAKER_THRESHOLD threshold = SPEAKER_THRESHOLD
) )
LogUtils.d(TAG, "📊 声纹验证 | 统一阈值: $SPEAKER_THRESHOLD | 通过: $verifyPass | 验证时长: ${(validAudio.size.toFloat()/SAMPLE_RATE*1000).toLong()}ms") // 3. 计算真实处理耗时(结束时间 - 开始时间)
val verifyCostMs = System.currentTimeMillis() - verifyStartMs
// 日志区分:音频时长 vs 处理耗时
LogUtils.d(TAG, "📊 声纹验证 | 统一阈值: $SPEAKER_THRESHOLD | 通过: $verifyPass | 音频时长: $audioDurationMs ms | 处理耗时: $verifyCostMs ms")
verifyPass verifyPass
} }
}.onFailure { e -> }.onFailure { e ->
// 处理所有异常情况
LogUtils.e(TAG, "❌ 声纹验证异常,拒绝", e) LogUtils.e(TAG, "❌ 声纹验证异常,拒绝", e)
}.also { }.also {
// 确保 stream 资源释放
runCatching { runCatching {
stream?.release() stream?.release()
}.onFailure { e -> }.onFailure { e ->
LogUtils.w(TAG, "⚠️ 释放 Stream 资源失败", e) LogUtils.w(TAG, "⚠️ 释放 Stream 资源失败", e)
} }
}.getOrDefault(false) // 异常时默认返回 false }.getOrDefault(false)
} }
} }

View File

@ -102,7 +102,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
private var startPlayTimeoutJob: Job? = null // 统一管理所有播放场景的超时Job private var startPlayTimeoutJob: Job? = null // 统一管理所有播放场景的超时Job
private var mEventSources: EventSource? = null private var mEventSources: EventSource? = null
private var isManualCancel = false
override fun getViewBinding(): ActivityMainBinding = ActivityMainBinding.inflate(layoutInflater) override fun getViewBinding(): ActivityMainBinding = ActivityMainBinding.inflate(layoutInflater)
override fun initView() { override fun initView() {
@ -219,7 +219,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
} }
}, },
onFinalAudio = { audio -> onFinalAudio = { audio ->
sendRecordVoiceToServer(AudioPcmUtil.floatToPcm16Base64(audio)) sendRecordVoiceToServer(AudioPcmUtil.floatToPcm16(audio))
// mViewModel?.uploadVoice( // mViewModel?.uploadVoice(
// //
// AudioPcmUtil.floatToPcm16Base64(audio), // AudioPcmUtil.floatToPcm16Base64(audio),
@ -231,7 +231,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
"xxx.wav" "xxx.wav"
) )
AudioDebugUtil.saveFloatPcmAsWav(audio, file) AudioDebugUtil.saveFloatPcmAsWav(audio, file)
LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}") // LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
// lifecycleScope.launch(Dispatchers.Main) { // lifecycleScope.launch(Dispatchers.Main) {
// //
// mVerticalAnimator?.show() // mVerticalAnimator?.show()
@ -560,16 +560,13 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
} }
private fun sendRecordVoiceToServer(audio: String) { private fun sendRecordVoiceToServer(audio: ByteArray) {
cancelSSE() cancelSSE()
val request: Request? = RxHttp.postJson(ApiService.UPLOAD_RECORD_VOICE_URL) val request: Request? = RxHttp.postBody(ApiService.UPLOAD_RECORD_VOICE_URL)
.add("audio",audio) .setBody(audio)
.buildRequest() .buildRequest()
request?.let { request?.let {
// 重置手动取消标记
isManualCancel = false
mEventSources = createFactory(RxHttpPlugins.getOkHttpClient()) mEventSources = createFactory(RxHttpPlugins.getOkHttpClient())
.newEventSource(it, object : EventSourceListener() { .newEventSource(it, object : EventSourceListener() {
override fun onOpen(eventSource: EventSource, response: Response) { override fun onOpen(eventSource: EventSource, response: Response) {
@ -601,22 +598,19 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
response: Response? response: Response?
) { ) {
super.onFailure(eventSource, t, response) super.onFailure(eventSource, t, response)
// 关键修复2忽略手动取消导致的异常
if (isManualCancel) {
LogUtils.eTag("lrsxxx", "SSE手动取消忽略失败回调")
return
}
// 正常失败逻辑 // 正常失败逻辑
val errorMsg = t?.message ?: response?.message ?: "未知错误" val errorMsg = t?.message ?: response?.message ?: "未知错误"
voiceController?.onUploadFinished(false) LogUtils.eTag("lrsxxx", "流式请求失败:${errorMsg}")
if (backPlaying){
voiceController?.onPlayEndBackend()
backPlaying = false
}else{
voiceController?.onUploadFinished(false)
}
} }
override fun onClosed(eventSource: EventSource) { override fun onClosed(eventSource: EventSource) {
super.onClosed(eventSource) super.onClosed(eventSource)
// 关键修复3区分手动取消和正常关闭
val isSuccess = !isManualCancel
// 关键修复4关闭后置空引用避免内存泄漏
mEventSources = null mEventSources = null
} }
}) })
@ -625,7 +619,6 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
private fun cancelSSE() { private fun cancelSSE() {
isManualCancel = true
mEventSources?.cancel() mEventSources?.cancel()
mEventSources = null mEventSources = null
} }

View File

@ -26,6 +26,7 @@
android:layout_width="match_parent" android:layout_width="match_parent"
android:layout_height="wrap_content" android:layout_height="wrap_content"
android:textColor="@color/white" android:textColor="@color/white"
android:padding="20dp"
android:textSize="14sp" /> android:textSize="14sp" />
</LinearLayout> </LinearLayout>