Improve detection speed
This commit is contained in:
parent
c450d8d620
commit
0dfdccc75b
@@ -181,5 +181,6 @@ dependencies {
    implementation libs.androidautosize

    implementation files('libs/sherpa-onnx-1.12.20.aar')
    implementation 'com.github.yyued:SVGAPlayer-Android:2.6.1'

}
@@ -39,7 +39,7 @@

    <application
        android:name=".App"
        android:allowBackup="false"
        android:allowBackup="true"
        android:dataExtractionRules="@xml/data_extraction_rules"
        android:fullBackupContent="@xml/backup_rules"
        android:icon="@mipmap/ic_launcher"

Binary file not shown.
@@ -2,10 +2,7 @@ package com.zs.smarthuman.sherpa

import android.content.res.AssetManager
import com.blankj.utilcode.util.LogUtils
import com.k2fsa.sherpa.onnx.SileroVadModelConfig
import com.k2fsa.sherpa.onnx.Vad
import com.k2fsa.sherpa.onnx.VadModelConfig
import com.k2fsa.sherpa.onnx.getVadModelConfig
import com.k2fsa.sherpa.onnx.*
import kotlin.math.sqrt

class VadManager(
@@ -13,92 +10,62 @@ class VadManager(
    private val onSpeechStart: () -> Unit,
    private val onSpeechEnd: () -> Unit
) {

    private val TAG = "VadManager"
    private val vad: Vad

    private var isSpeaking = false
    private var lastSpeechTime = 0L
    private var lastSpeechMs = 0L

    private val ACTIVE_END_SILENCE_MS = 1500L
    private val ACTIVE_CONSECUTIVE_FRAMES = 10
    private val FINAL_END_SILENCE_MS = 800L
    private val FINAL_CONSECUTIVE_FRAMES = 5
    private val FINAL_PHASE_TRIGGER_MS = 1000L
    private val MAX_SILENCE_AFTER_SPEECH_MS = 2000L

    private val MIN_EFFECTIVE_SPEECH_RMS = 0.001f

    private var consecutiveSilenceFrames = 0
    private var isInFinalPhase = false
    private var lastEffectiveSpeechTime = 0L
    /** End speech more decisively */
    private val END_SILENCE_MS = 350L
    private val MIN_RMS = 0.002f

    init {
        val config = getVadModelConfig(0)
            ?: throw IllegalStateException("[$TAG] VAD config not found")
        vad = Vad(assetManager, VadModelConfig(sileroVadModelConfig = SileroVadModelConfig(model = "silero_vad.onnx", threshold = 0.2f)))
        LogUtils.i(TAG, "✅ VAD initialized successfully")
        vad = Vad(
            assetManager,
            VadModelConfig(
                sileroVadModelConfig = SileroVadModelConfig(
                    model = "silero_vad.onnx",
                    threshold = 0.5F,
                    minSilenceDuration = 0.25F,
                    minSpeechDuration = 0.25F,
                    windowSize = 512,
                ),
                sampleRate = 16000,
                numThreads = 1,
                provider = "cpu"
            )
        )
        LogUtils.i(TAG, "✅ VAD init")
    }

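    // Illustrative aside, not from the commit: what the config above implies.
    // At sampleRate = 16000 with windowSize = 512, each Silero VAD decision covers 32 ms,
    // so minSilenceDuration = 0.25F corresponds to roughly 8 consecutive silent windows
    // before the model itself reports end of speech.
    fun vadWindowArithmetic() {
        val windowMs = 512 * 1000.0 / 16_000            // 32.0 ms per Silero window
        val silenceWindows = 0.25 * 1000 / windowMs     // ≈ 7.8 windows of silence
        println("window = $windowMs ms, silent windows before end-of-speech ≈ $silenceWindows")
    }
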
    fun accept(samples: FloatArray) {
        val now = System.currentTimeMillis()

        vad.acceptWaveform(samples)
        val vadHasSpeech = vad.isSpeechDetected()
        val hasSpeech = vad.isSpeechDetected()
        val rms = calcRms(samples)

        val isEffectiveSpeech = vadHasSpeech && rms >= MIN_EFFECTIVE_SPEECH_RMS

        if (isEffectiveSpeech) {
            lastEffectiveSpeechTime = now
            isInFinalPhase = false
            lastSpeechTime = now
            consecutiveSilenceFrames = 0
        } else {
            consecutiveSilenceFrames++
            if (now - lastEffectiveSpeechTime >= FINAL_PHASE_TRIGGER_MS) {
                isInFinalPhase = true
            }
        }

        val (endSilenceMs, endFrames) =
            if (isInFinalPhase)
                FINAL_END_SILENCE_MS to FINAL_CONSECUTIVE_FRAMES
            else
                ACTIVE_END_SILENCE_MS to ACTIVE_CONSECUTIVE_FRAMES

        if (isEffectiveSpeech) {
        if (hasSpeech && rms >= MIN_RMS) {
            lastSpeechMs = now
            if (!isSpeaking) {
                isSpeaking = true
                onSpeechStart()
            }
        } else if (isSpeaking) {
            val silenceMs = now - lastSpeechTime
            val effectiveSilenceMs = now - lastEffectiveSpeechTime

            val shouldEnd =
                (silenceMs >= endSilenceMs ||
                 effectiveSilenceMs >= MAX_SILENCE_AFTER_SPEECH_MS) &&
                consecutiveSilenceFrames >= endFrames

            if (shouldEnd) {
        } else if (isSpeaking && now - lastSpeechMs > END_SILENCE_MS) {
            onSpeechEnd()
            reset()
            isSpeaking = false
            isInFinalPhase = false
            }
        }
    }

    fun reset() {
        isSpeaking = false
        lastSpeechTime = 0L
        lastEffectiveSpeechTime = 0L
        consecutiveSilenceFrames = 0
        isInFinalPhase = false
        lastSpeechMs = 0
        vad.reset()
    }

    fun calcRms(samples: FloatArray): Float {
    private fun calcRms(samples: FloatArray): Float {
        var sum = 0f
        for (v in samples) sum += v * v
        return sqrt(sum / samples.size)

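The class exposes only accept(), reset(), and the two callbacks, so driving it from a capture loop is straightforward. A minimal sketch of such a caller, assuming a 16 kHz mono source delivering 16-bit PCM frames (the capture plumbing below is illustrative, not taken from this commit):

// Illustrative caller: converts 16-bit PCM frames to floats and feeds the VAD.
fun pumpAudio(vadManager: VadManager, readFrame: () -> ShortArray?) {
    while (true) {
        val pcm = readFrame() ?: break                          // e.g. 512 samples per frame
        val floats = FloatArray(pcm.size) { i -> pcm[i] / 32768f }
        vadManager.accept(floats)                               // fires onSpeechStart/onSpeechEnd
    }
}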
@@ -30,11 +30,9 @@ class VoiceController(
        // Pre-buffer size (2 seconds)
        private const val PRE_BUFFER_SIZE = SAMPLE_RATE * 2

        // Threshold for classifying short audio
        private const val SHORT_AUDIO_DURATION_MS = 1000L
        private const val INVALID_RESET_DEBOUNCE_MS = 1500L
        // Minimum speech duration
        private const val MIN_SPEECH_MS = 800L
        private const val MIN_SPEECH_MS = 600L

        // Unified speaker-verification threshold (no longer per-scenario)
        private const val SPEAKER_THRESHOLD = 0.45f
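MIN_SPEECH_MS drops from 800 ms to 600 ms, so shorter utterances clear the duration gate sooner. The method that applies it is not part of this diff; a hypothetical gate of that shape, purely for illustration:

// Hypothetical helper, not from the project: reject clips shorter than MIN_SPEECH_MS.
fun isLongEnough(audio: FloatArray, sampleRate: Int = 16_000, minSpeechMs: Long = 600L): Boolean {
    val durationMs = audio.size * 1000L / sampleRate
    return durationMs >= minSpeechMs
}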
@@ -275,6 +273,9 @@ class VoiceController(
    fun onPlayEndPrompt() {
        speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
        LogUtils.d(TAG, "🎵 Prompt tone finished")
        if (!preBuffer.isEmpty()) {
            preBuffer.clear()
        }
        state = VoiceState.WAIT_SPEECH_COOLDOWN
    }

@@ -321,7 +322,9 @@ class VoiceController(
    private fun resetAll() {
        LogUtils.d(TAG, "🔄 Resetting all state | timeout type this round: $currentTimeoutType")
        audioBuffer.clear()
        if (!preBuffer.isEmpty()) {
            preBuffer.clear()
        }
        vadManager.reset()
        wakeupManager.reset()
        vadStarted = false
@@ -359,9 +362,15 @@ class VoiceController(
    }

    private fun cachePreBuffer(samples: FloatArray) {
        // Return early on empty input to avoid a pointless loop
        if (samples.isEmpty()) return

        for (s in samples) {
            preBuffer.addLast(s)
            if (preBuffer.size > PRE_BUFFER_SIZE) preBuffer.removeFirst()
            // Key fix: check the deque is non-empty before removing
            if (preBuffer.size > PRE_BUFFER_SIZE && !preBuffer.isEmpty()) {
                preBuffer.removeFirst()
            }
        }
    }

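The pre-buffer keeps the last PRE_BUFFER_SIZE samples (two seconds' worth) so audio captured just before speech is detected is not lost. The same bounded-deque idea as a standalone helper, sketched here for reference (class and method names are illustrative, not the project's actual types):

// Illustrative bounded pre-buffer using Kotlin's ArrayDeque.
class RollingPreBuffer(private val capacity: Int) {
    private val deque = ArrayDeque<Float>()

    fun append(samples: FloatArray) {
        if (samples.isEmpty()) return
        for (s in samples) {
            deque.addLast(s)
            // size > capacity implies the deque is non-empty, so removeFirst() is safe here
            if (deque.size > capacity) deque.removeFirst()
        }
    }

    fun snapshot(): FloatArray = FloatArray(deque.size) { i -> deque[i] }
}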
@@ -371,7 +380,10 @@ class VoiceController(
            return false
        }

        // 1. Trim the audio: keep only the valid part of this recording
        // 1. Record the verification start time (key: measure processing cost)
        val verifyStartMs = System.currentTimeMillis()

        // 2. Original audio-trimming logic (kept)
        val audioDurationMs = (audio.size.toFloat() / SAMPLE_RATE * 1000).toLong()
        val validAudio = if (audioDurationMs > 0) {
            val validSampleCount = (audioDurationMs * SAMPLE_RATE / 1000).toInt()
@@ -386,21 +398,16 @@ class VoiceController(

        var stream: OnlineStream? = null

        // Use runCatching to handle exceptions in one place
        return runCatching {
            stream = SpeakerRecognition.extractor.createStream()
            stream.acceptWaveform(samples = validAudio, sampleRate = SAMPLE_RATE)
            stream.inputFinished()

            // Feed the audio data
            stream?.acceptWaveform(samples = validAudio, sampleRate = SAMPLE_RATE)
            stream?.inputFinished()

            // Check whether the stream is ready
            if (stream == null || !SpeakerRecognition.extractor.isReady(stream)) {
            if (!SpeakerRecognition.extractor.isReady(stream)) {
                LogUtils.w(TAG, "❌ Audio stream not ready, verification failed")
                return@runCatching false
            }

            // Compute the embedding and verify
            val embedding = SpeakerRecognition.extractor.compute(stream)
            speakerManagerLock.withLock {
                val verifyPass = SpeakerRecognition.manager.verify(
@@ -409,19 +416,20 @@ class VoiceController(
                    threshold = SPEAKER_THRESHOLD
                )

                LogUtils.d(TAG, "📊 Speaker verification | unified threshold: $SPEAKER_THRESHOLD | pass: $verifyPass | verification duration: ${(validAudio.size.toFloat()/SAMPLE_RATE*1000).toLong()}ms")
                // 3. Compute the real processing cost (end time - start time)
                val verifyCostMs = System.currentTimeMillis() - verifyStartMs
                // Log audio duration and processing cost separately
                LogUtils.d(TAG, "📊 Speaker verification | unified threshold: $SPEAKER_THRESHOLD | pass: $verifyPass | audio duration: $audioDurationMs ms | processing cost: $verifyCostMs ms")
                verifyPass
            }
        }.onFailure { e ->
            // Handle all exception cases
            LogUtils.e(TAG, "❌ Speaker verification threw an exception, rejecting", e)
        }.also {
            // Make sure the stream resource is released
            runCatching {
                stream?.release()
            }.onFailure { e ->
                LogUtils.w(TAG, "⚠️ Failed to release the Stream resource", e)
            }
        }.getOrDefault(false) // Default to false on exception
        }.getOrDefault(false)
    }
}
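The logging change above is the heart of the commit: the old line reported the clip length as if it were the verification time, while the new one reports audio duration and processing cost separately. A generic version of that measurement, sketched with kotlin.system.measureTimeMillis (the verify lambda stands in for the sherpa-onnx call chain and is not the project's API):

import kotlin.system.measureTimeMillis

// Sketch: report audio duration and processing cost as two separate numbers.
fun timedVerify(validAudio: FloatArray, sampleRate: Int, verify: (FloatArray) -> Boolean): Boolean {
    val audioDurationMs = validAudio.size * 1000L / sampleRate          // length of the clip itself
    var pass = false
    val verifyCostMs = measureTimeMillis { pass = verify(validAudio) }  // wall-clock processing time
    println("audio=${audioDurationMs}ms cost=${verifyCostMs}ms pass=$pass")
    return pass
}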
@@ -102,7 +102,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
    private var startPlayTimeoutJob: Job? = null // Manages the timeout Job for every playback scenario in one place

    private var mEventSources: EventSource? = null
    private var isManualCancel = false


    override fun getViewBinding(): ActivityMainBinding = ActivityMainBinding.inflate(layoutInflater)
    override fun initView() {
@@ -219,7 +219,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
            }
        },
        onFinalAudio = { audio ->
            sendRecordVoiceToServer(AudioPcmUtil.floatToPcm16Base64(audio))
            sendRecordVoiceToServer(AudioPcmUtil.floatToPcm16(audio))
            // mViewModel?.uploadVoice(
            //
            //     AudioPcmUtil.floatToPcm16Base64(audio),
@@ -231,7 +231,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
                "xxx.wav"
            )
            AudioDebugUtil.saveFloatPcmAsWav(audio, file)
            LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
            // LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
            // lifecycleScope.launch(Dispatchers.Main) {
            //
            //     mVerticalAnimator?.show()
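The upload path switches from a Base64 string (floatToPcm16Base64) to raw 16-bit PCM bytes (floatToPcm16). Neither helper's body appears in this diff; a typical float-to-PCM16 conversion of the kind floatToPcm16 presumably performs looks like this (an assumption, little-endian output, not the project's source):

// Assumed shape of AudioPcmUtil.floatToPcm16: clamp to [-1, 1], pack as 16-bit little-endian PCM.
fun floatToPcm16(samples: FloatArray): ByteArray {
    val out = ByteArray(samples.size * 2)
    for (i in samples.indices) {
        val s = (samples[i].coerceIn(-1f, 1f) * 32767f).toInt()
        out[2 * i] = (s and 0xFF).toByte()               // low byte first
        out[2 * i + 1] = ((s shr 8) and 0xFF).toByte()   // high byte
    }
    return out
}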
@@ -560,16 +560,13 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
    }


    private fun sendRecordVoiceToServer(audio: String) {
    private fun sendRecordVoiceToServer(audio: ByteArray) {
        cancelSSE()
        val request: Request? = RxHttp.postJson(ApiService.UPLOAD_RECORD_VOICE_URL)
            .add("audio", audio)
        val request: Request? = RxHttp.postBody(ApiService.UPLOAD_RECORD_VOICE_URL)
            .setBody(audio)
            .buildRequest()

        request?.let {
            // Reset the manual-cancel flag
            isManualCancel = false

            mEventSources = createFactory(RxHttpPlugins.getOkHttpClient())
                .newEventSource(it, object : EventSourceListener() {
                    override fun onOpen(eventSource: EventSource, response: Response) {
@@ -601,22 +598,19 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
                        response: Response?
                    ) {
                        super.onFailure(eventSource, t, response)
                        // Key fix 2: ignore failures caused by manual cancellation
                        if (isManualCancel) {
                            LogUtils.eTag("lrsxxx", "SSE cancelled manually, ignoring the failure callback")
                            return
                        }

                        // Normal failure path
                        val errorMsg = t?.message ?: response?.message ?: "unknown error"
                        LogUtils.eTag("lrsxxx", "Streaming request failed: ${errorMsg}")
                        if (backPlaying) {
                            voiceController?.onPlayEndBackend()
                            backPlaying = false
                        } else {
                            voiceController?.onUploadFinished(false)
                        }
                    }

                    override fun onClosed(eventSource: EventSource) {
                        super.onClosed(eventSource)
                        // Key fix 3: distinguish manual cancel from a normal close
                        val isSuccess = !isManualCancel
                        // Key fix 4: null out the reference after close to avoid a memory leak
                        mEventSources = null
                    }
                })
@@ -625,7 +619,6 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()


    private fun cancelSSE() {
        isManualCancel = true
        mEventSources?.cancel()
        mEventSources = null
    }

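The manual-cancel flag works because tearing down an EventSource locally typically surfaces as a failure callback; setting the flag before calling cancel() lets the listener tell a deliberate teardown apart from a real network error. A condensed sketch of the same pattern as a reusable wrapper, assuming the okhttp3.sse types already used above (the SseSession class and its callbacks are illustrative, not from the project):

import okhttp3.OkHttpClient
import okhttp3.Request
import okhttp3.Response
import okhttp3.sse.EventSource
import okhttp3.sse.EventSourceListener
import okhttp3.sse.EventSources

// Illustrative wrapper owning one EventSource at a time.
class SseSession(private val client: OkHttpClient) {
    private var source: EventSource? = null
    private var manuallyCancelled = false

    fun start(request: Request, onData: (String) -> Unit, onDone: (success: Boolean) -> Unit) {
        cancel()                                    // tear down any previous stream first
        manuallyCancelled = false
        source = EventSources.createFactory(client).newEventSource(request, object : EventSourceListener() {
            override fun onEvent(eventSource: EventSource, id: String?, type: String?, data: String) = onData(data)
            override fun onClosed(eventSource: EventSource) { if (!manuallyCancelled) onDone(true) }
            override fun onFailure(eventSource: EventSource, t: Throwable?, response: Response?) {
                if (!manuallyCancelled) onDone(false)   // deliberate cancels are not failures
            }
        })
    }

    fun cancel() {
        manuallyCancelled = true                    // set before cancel() so callbacks can check it
        source?.cancel()
        source = null
    }
}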
@@ -26,6 +26,7 @@
        android:layout_width="match_parent"
        android:layout_height="wrap_content"
        android:textColor="@color/white"
        android:padding="20dp"
        android:textSize="14sp" />

</LinearLayout>