提升检测速度

This commit is contained in:
林若思 2026-01-19 11:06:26 +08:00
parent c450d8d620
commit 0dfdccc75b
7 changed files with 72 additions and 102 deletions

View File

@ -181,5 +181,6 @@ dependencies {
implementation libs.androidautosize
implementation files('libs/sherpa-onnx-1.12.20.aar')
implementation 'com.github.yyued:SVGAPlayer-Android:2.6.1'
}

View File

@ -39,7 +39,7 @@
<application
android:name=".App"
android:allowBackup="false"
android:allowBackup="true"
android:dataExtractionRules="@xml/data_extraction_rules"
android:fullBackupContent="@xml/backup_rules"
android:icon="@mipmap/ic_launcher"

View File

@ -2,10 +2,7 @@ package com.zs.smarthuman.sherpa
import android.content.res.AssetManager
import com.blankj.utilcode.util.LogUtils
import com.k2fsa.sherpa.onnx.SileroVadModelConfig
import com.k2fsa.sherpa.onnx.Vad
import com.k2fsa.sherpa.onnx.VadModelConfig
import com.k2fsa.sherpa.onnx.getVadModelConfig
import com.k2fsa.sherpa.onnx.*
import kotlin.math.sqrt
class VadManager(
@ -13,92 +10,62 @@ class VadManager(
private val onSpeechStart: () -> Unit,
private val onSpeechEnd: () -> Unit
) {
private val TAG = "VadManager"
private val vad: Vad
private var isSpeaking = false
private var lastSpeechTime = 0L
private var lastSpeechMs = 0L
private val ACTIVE_END_SILENCE_MS = 1500L
private val ACTIVE_CONSECUTIVE_FRAMES = 10
private val FINAL_END_SILENCE_MS = 800L
private val FINAL_CONSECUTIVE_FRAMES = 5
private val FINAL_PHASE_TRIGGER_MS = 1000L
private val MAX_SILENCE_AFTER_SPEECH_MS = 2000L
private val MIN_EFFECTIVE_SPEECH_RMS = 0.001f
private var consecutiveSilenceFrames = 0
private var isInFinalPhase = false
private var lastEffectiveSpeechTime = 0L
/** 更果断结束 */
private val END_SILENCE_MS = 350L
private val MIN_RMS = 0.002f
init {
val config = getVadModelConfig(0)
?: throw IllegalStateException("[$TAG] VAD config not found")
vad = Vad(assetManager, VadModelConfig(sileroVadModelConfig = SileroVadModelConfig(model = "silero_vad.onnx", threshold = 0.2f)))
LogUtils.i(TAG, "✅ VAD 初始化成功")
vad = Vad(
assetManager,
VadModelConfig(
sileroVadModelConfig = SileroVadModelConfig(
model = "silero_vad.onnx",
threshold = 0.5F,
minSilenceDuration = 0.25F,
minSpeechDuration = 0.25F,
windowSize = 512,
),
sampleRate = 16000,
numThreads = 1,
provider = "cpu"
)
)
LogUtils.i(TAG, "✅ VAD init")
}
fun accept(samples: FloatArray) {
val now = System.currentTimeMillis()
vad.acceptWaveform(samples)
val vadHasSpeech = vad.isSpeechDetected()
val hasSpeech = vad.isSpeechDetected()
val rms = calcRms(samples)
val isEffectiveSpeech = vadHasSpeech && rms >= MIN_EFFECTIVE_SPEECH_RMS
if (isEffectiveSpeech) {
lastEffectiveSpeechTime = now
isInFinalPhase = false
lastSpeechTime = now
consecutiveSilenceFrames = 0
} else {
consecutiveSilenceFrames++
if (now - lastEffectiveSpeechTime >= FINAL_PHASE_TRIGGER_MS) {
isInFinalPhase = true
}
}
val (endSilenceMs, endFrames) =
if (isInFinalPhase)
FINAL_END_SILENCE_MS to FINAL_CONSECUTIVE_FRAMES
else
ACTIVE_END_SILENCE_MS to ACTIVE_CONSECUTIVE_FRAMES
if (isEffectiveSpeech) {
if (hasSpeech && rms >= MIN_RMS) {
lastSpeechMs = now
if (!isSpeaking) {
isSpeaking = true
onSpeechStart()
}
} else if (isSpeaking) {
val silenceMs = now - lastSpeechTime
val effectiveSilenceMs = now - lastEffectiveSpeechTime
val shouldEnd =
(silenceMs >= endSilenceMs ||
effectiveSilenceMs >= MAX_SILENCE_AFTER_SPEECH_MS) &&
consecutiveSilenceFrames >= endFrames
if (shouldEnd) {
} else if (isSpeaking && now - lastSpeechMs > END_SILENCE_MS) {
onSpeechEnd()
reset()
isSpeaking = false
isInFinalPhase = false
}
}
}
fun reset() {
isSpeaking = false
lastSpeechTime = 0L
lastEffectiveSpeechTime = 0L
consecutiveSilenceFrames = 0
isInFinalPhase = false
lastSpeechMs = 0
vad.reset()
}
fun calcRms(samples: FloatArray): Float {
private fun calcRms(samples: FloatArray): Float {
var sum = 0f
for (v in samples) sum += v * v
return sqrt(sum / samples.size)

View File

@ -30,11 +30,9 @@ class VoiceController(
// 预缓存大小:2秒
private const val PRE_BUFFER_SIZE = SAMPLE_RATE * 2
// 短语音判定阈值
private const val SHORT_AUDIO_DURATION_MS = 1000L
private const val INVALID_RESET_DEBOUNCE_MS = 1500L
// 最小语音时长
private const val MIN_SPEECH_MS = 800L
private const val MIN_SPEECH_MS = 600L
// 统一的声纹验证阈值(不再分场景)
private const val SPEAKER_THRESHOLD = 0.45f
@ -275,6 +273,9 @@ class VoiceController(
fun onPlayEndPrompt() {
speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
LogUtils.d(TAG, "🎵 提示音结束")
if (!preBuffer.isEmpty()) {
preBuffer.clear()
}
state = VoiceState.WAIT_SPEECH_COOLDOWN
}
@ -321,7 +322,9 @@ class VoiceController(
private fun resetAll() {
LogUtils.d(TAG, "🔄 重置所有状态 | 本次超时类型: $currentTimeoutType")
audioBuffer.clear()
if (!preBuffer.isEmpty()) {
preBuffer.clear()
}
vadManager.reset()
wakeupManager.reset()
vadStarted = false
@ -359,9 +362,15 @@ class VoiceController(
}
private fun cachePreBuffer(samples: FloatArray) {
// 空数据快速返回,避免无效循环
if (samples.isEmpty()) return
for (s in samples) {
preBuffer.addLast(s)
if (preBuffer.size > PRE_BUFFER_SIZE) preBuffer.removeFirst()
// 关键修复:移除前先检查队列是否非空
if (preBuffer.size > PRE_BUFFER_SIZE && !preBuffer.isEmpty()) {
preBuffer.removeFirst()
}
}
}
@ -371,7 +380,10 @@ class VoiceController(
return false
}
// 1. 裁剪音频:只保留本次录音的有效部分
// 1. 记录验证开始时间(关键:统计处理耗时)
val verifyStartMs = System.currentTimeMillis()
// 2. 原有音频裁剪逻辑(保留)
val audioDurationMs = (audio.size.toFloat() / SAMPLE_RATE * 1000).toLong()
val validAudio = if (audioDurationMs > 0) {
val validSampleCount = (audioDurationMs * SAMPLE_RATE / 1000).toInt()
@ -386,21 +398,16 @@ class VoiceController(
var stream: OnlineStream? = null
// 使用 runCatching 统一处理异常
return runCatching {
stream = SpeakerRecognition.extractor.createStream()
stream.acceptWaveform(samples = validAudio, sampleRate = SAMPLE_RATE)
stream.inputFinished()
// 处理音频数据
stream?.acceptWaveform(samples = validAudio, sampleRate = SAMPLE_RATE)
stream?.inputFinished()
// 检查 stream 是否就绪
if (stream == null || !SpeakerRecognition.extractor.isReady(stream)) {
if (!SpeakerRecognition.extractor.isReady(stream)) {
LogUtils.w(TAG, "❌ 音频Stream未就绪验证失败")
return@runCatching false
}
// 计算特征并验证
val embedding = SpeakerRecognition.extractor.compute(stream)
speakerManagerLock.withLock {
val verifyPass = SpeakerRecognition.manager.verify(
@ -409,19 +416,20 @@ class VoiceController(
threshold = SPEAKER_THRESHOLD
)
LogUtils.d(TAG, "📊 声纹验证 | 统一阈值: $SPEAKER_THRESHOLD | 通过: $verifyPass | 验证时长: ${(validAudio.size.toFloat()/SAMPLE_RATE*1000).toLong()}ms")
// 3. 计算真实处理耗时(结束时间 - 开始时间)
val verifyCostMs = System.currentTimeMillis() - verifyStartMs
// 日志区分:音频时长 vs 处理耗时
LogUtils.d(TAG, "📊 声纹验证 | 统一阈值: $SPEAKER_THRESHOLD | 通过: $verifyPass | 音频时长: $audioDurationMs ms | 处理耗时: $verifyCostMs ms")
verifyPass
}
}.onFailure { e ->
// 处理所有异常情况
LogUtils.e(TAG, "❌ 声纹验证异常,拒绝", e)
}.also {
// 确保 stream 资源释放
runCatching {
stream?.release()
}.onFailure { e ->
LogUtils.w(TAG, "⚠️ 释放 Stream 资源失败", e)
}
}.getOrDefault(false) // 异常时默认返回 false
}.getOrDefault(false)
}
}

View File

@ -102,7 +102,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
private var startPlayTimeoutJob: Job? = null // 统一管理所有播放场景的超时Job
private var mEventSources: EventSource? = null
private var isManualCancel = false
override fun getViewBinding(): ActivityMainBinding = ActivityMainBinding.inflate(layoutInflater)
override fun initView() {
@ -219,7 +219,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
}
},
onFinalAudio = { audio ->
sendRecordVoiceToServer(AudioPcmUtil.floatToPcm16Base64(audio))
sendRecordVoiceToServer(AudioPcmUtil.floatToPcm16(audio))
// mViewModel?.uploadVoice(
//
// AudioPcmUtil.floatToPcm16Base64(audio),
@ -231,7 +231,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
"xxx.wav"
)
AudioDebugUtil.saveFloatPcmAsWav(audio, file)
LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
// LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
// lifecycleScope.launch(Dispatchers.Main) {
//
// mVerticalAnimator?.show()
@ -560,16 +560,13 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
}
private fun sendRecordVoiceToServer(audio: String) {
private fun sendRecordVoiceToServer(audio: ByteArray) {
cancelSSE()
val request: Request? = RxHttp.postJson(ApiService.UPLOAD_RECORD_VOICE_URL)
.add("audio",audio)
val request: Request? = RxHttp.postBody(ApiService.UPLOAD_RECORD_VOICE_URL)
.setBody(audio)
.buildRequest()
request?.let {
// 重置手动取消标记
isManualCancel = false
mEventSources = createFactory(RxHttpPlugins.getOkHttpClient())
.newEventSource(it, object : EventSourceListener() {
override fun onOpen(eventSource: EventSource, response: Response) {
@ -601,22 +598,19 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
response: Response?
) {
super.onFailure(eventSource, t, response)
// 关键修复2:忽略手动取消导致的异常
if (isManualCancel) {
LogUtils.eTag("lrsxxx", "SSE手动取消忽略失败回调")
return
}
// 正常失败逻辑
val errorMsg = t?.message ?: response?.message ?: "未知错误"
LogUtils.eTag("lrsxxx", "流式请求失败:${errorMsg}")
if (backPlaying){
voiceController?.onPlayEndBackend()
backPlaying = false
}else{
voiceController?.onUploadFinished(false)
}
}
override fun onClosed(eventSource: EventSource) {
super.onClosed(eventSource)
// 关键修复3:区分手动取消和正常关闭
val isSuccess = !isManualCancel
// 关键修复4:关闭后置空引用,避免内存泄漏
mEventSources = null
}
})
@ -625,7 +619,6 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
private fun cancelSSE() {
isManualCancel = true
mEventSources?.cancel()
mEventSources = null
}

View File

@ -26,6 +26,7 @@
android:layout_width="match_parent"
android:layout_height="wrap_content"
android:textColor="@color/white"
android:padding="20dp"
android:textSize="14sp" />
</LinearLayout>