Improve detection speed
parent c450d8d620
commit 0dfdccc75b
@@ -181,5 +181,6 @@ dependencies {
     implementation libs.androidautosize

     implementation files('libs/sherpa-onnx-1.12.20.aar')
+    implementation 'com.github.yyued:SVGAPlayer-Android:2.6.1'

 }
@@ -39,7 +39,7 @@

     <application
         android:name=".App"
-        android:allowBackup="false"
+        android:allowBackup="true"
        android:dataExtractionRules="@xml/data_extraction_rules"
        android:fullBackupContent="@xml/backup_rules"
        android:icon="@mipmap/ic_launcher"
Binary file not shown.
@@ -2,10 +2,7 @@ package com.zs.smarthuman.sherpa

 import android.content.res.AssetManager
 import com.blankj.utilcode.util.LogUtils
-import com.k2fsa.sherpa.onnx.SileroVadModelConfig
-import com.k2fsa.sherpa.onnx.Vad
-import com.k2fsa.sherpa.onnx.VadModelConfig
-import com.k2fsa.sherpa.onnx.getVadModelConfig
+import com.k2fsa.sherpa.onnx.*
 import kotlin.math.sqrt

 class VadManager(
@@ -13,92 +10,62 @@ class VadManager(
     private val onSpeechStart: () -> Unit,
     private val onSpeechEnd: () -> Unit
 ) {

     private val TAG = "VadManager"
     private val vad: Vad

     private var isSpeaking = false
-    private var lastSpeechTime = 0L
+    private var lastSpeechMs = 0L

-    private val ACTIVE_END_SILENCE_MS = 1500L
-    private val ACTIVE_CONSECUTIVE_FRAMES = 10
-    private val FINAL_END_SILENCE_MS = 800L
-    private val FINAL_CONSECUTIVE_FRAMES = 5
-    private val FINAL_PHASE_TRIGGER_MS = 1000L
-    private val MAX_SILENCE_AFTER_SPEECH_MS = 2000L
-
-    private val MIN_EFFECTIVE_SPEECH_RMS = 0.001f
-
-    private var consecutiveSilenceFrames = 0
-    private var isInFinalPhase = false
-    private var lastEffectiveSpeechTime = 0L
+    /** End speech more decisively */
+    private val END_SILENCE_MS = 350L
+    private val MIN_RMS = 0.002f

     init {
-        val config = getVadModelConfig(0)
-            ?: throw IllegalStateException("[$TAG] VAD config not found")
-        vad = Vad(assetManager, VadModelConfig(sileroVadModelConfig = SileroVadModelConfig(model = "silero_vad.onnx", threshold = 0.2f)))
-        LogUtils.i(TAG, "✅ VAD 初始化成功")
+        vad = Vad(
+            assetManager,
+            VadModelConfig(
+                sileroVadModelConfig = SileroVadModelConfig(
+                    model = "silero_vad.onnx",
+                    threshold = 0.5F,
+                    minSilenceDuration = 0.25F,
+                    minSpeechDuration = 0.25F,
+                    windowSize = 512,
+                ),
+                sampleRate = 16000,
+                numThreads = 1,
+                provider = "cpu"
+            )
+        )
+        LogUtils.i(TAG, "✅ VAD init")
     }

     fun accept(samples: FloatArray) {
         val now = System.currentTimeMillis()

         vad.acceptWaveform(samples)
-        val vadHasSpeech = vad.isSpeechDetected()
+        val hasSpeech = vad.isSpeechDetected()
         val rms = calcRms(samples)

-        val isEffectiveSpeech = vadHasSpeech && rms >= MIN_EFFECTIVE_SPEECH_RMS
-        if (isEffectiveSpeech) {
-            lastEffectiveSpeechTime = now
-            isInFinalPhase = false
-            lastSpeechTime = now
-            consecutiveSilenceFrames = 0
-        } else {
-            consecutiveSilenceFrames++
-            if (now - lastEffectiveSpeechTime >= FINAL_PHASE_TRIGGER_MS) {
-                isInFinalPhase = true
-            }
-        }
-
-        val (endSilenceMs, endFrames) =
-            if (isInFinalPhase)
-                FINAL_END_SILENCE_MS to FINAL_CONSECUTIVE_FRAMES
-            else
-                ACTIVE_END_SILENCE_MS to ACTIVE_CONSECUTIVE_FRAMES
-
-        if (isEffectiveSpeech) {
+        if (hasSpeech && rms >= MIN_RMS) {
+            lastSpeechMs = now
             if (!isSpeaking) {
                 isSpeaking = true
                 onSpeechStart()
             }
-        } else if (isSpeaking) {
-            val silenceMs = now - lastSpeechTime
-            val effectiveSilenceMs = now - lastEffectiveSpeechTime
-
-            val shouldEnd =
-                (silenceMs >= endSilenceMs ||
-                    effectiveSilenceMs >= MAX_SILENCE_AFTER_SPEECH_MS) &&
-                consecutiveSilenceFrames >= endFrames
-
-            if (shouldEnd) {
-                onSpeechEnd()
-                reset()
-                isSpeaking = false
-                isInFinalPhase = false
-            }
+        } else if (isSpeaking && now - lastSpeechMs > END_SILENCE_MS) {
+            onSpeechEnd()
+            reset()
         }
     }

     fun reset() {
         isSpeaking = false
-        lastSpeechTime = 0L
-        lastEffectiveSpeechTime = 0L
-        consecutiveSilenceFrames = 0
-        isInFinalPhase = false
+        lastSpeechMs = 0
         vad.reset()
     }

-    fun calcRms(samples: FloatArray): Float {
+    private fun calcRms(samples: FloatArray): Float {
         var sum = 0f
         for (v in samples) sum += v * v
         return sqrt(sum / samples.size)
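For context, the simplified VadManager.accept() above expects fixed-size frames matching the configured windowSize (512 samples at 16 kHz, roughly 32 ms per frame). A minimal caller sketch follows; the frame source and wiring are illustrative assumptions, not part of this commit:

// Hypothetical driver loop: pull 512-sample frames from some 16 kHz float
// source and push them into the manager; the frame size matches windowSize above.
fun pumpFrames(manager: VadManager, nextFrame: () -> FloatArray?) {
    while (true) {
        val frame = nextFrame() ?: break // 512 samples / 16 kHz ≈ 32 ms of audio
        manager.accept(frame)            // fires onSpeechStart / onSpeechEnd callbacks
    }
}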
@@ -30,11 +30,9 @@ class VoiceController(
         // Pre-buffer size (2 seconds)
         private const val PRE_BUFFER_SIZE = SAMPLE_RATE * 2

-        // Short-audio duration threshold
-        private const val SHORT_AUDIO_DURATION_MS = 1000L
         private const val INVALID_RESET_DEBOUNCE_MS = 1500L
         // Minimum speech duration
-        private const val MIN_SPEECH_MS = 800L
+        private const val MIN_SPEECH_MS = 600L

         // Unified speaker-verification threshold (no longer per scenario)
         private const val SPEAKER_THRESHOLD = 0.45f
@@ -275,6 +273,9 @@ class VoiceController(
     fun onPlayEndPrompt() {
         speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
         LogUtils.d(TAG, "🎵 提示音结束")
+        if (!preBuffer.isEmpty()) {
+            preBuffer.clear()
+        }
         state = VoiceState.WAIT_SPEECH_COOLDOWN
     }

@@ -321,7 +322,9 @@ class VoiceController(
     private fun resetAll() {
         LogUtils.d(TAG, "🔄 重置所有状态 | 本次超时类型: $currentTimeoutType")
         audioBuffer.clear()
-        preBuffer.clear()
+        if (!preBuffer.isEmpty()) {
+            preBuffer.clear()
+        }
         vadManager.reset()
         wakeupManager.reset()
         vadStarted = false
@@ -359,9 +362,15 @@ class VoiceController(
     }

     private fun cachePreBuffer(samples: FloatArray) {
+        // Return early on empty input to avoid a useless loop
+        if (samples.isEmpty()) return
+
         for (s in samples) {
             preBuffer.addLast(s)
-            if (preBuffer.size > PRE_BUFFER_SIZE) preBuffer.removeFirst()
+            // Key fix: check the queue is non-empty before removing
+            if (preBuffer.size > PRE_BUFFER_SIZE && !preBuffer.isEmpty()) {
+                preBuffer.removeFirst()
+            }
         }
     }

@@ -371,7 +380,10 @@ class VoiceController(
             return false
         }

-        // 1. Trim the audio: keep only the valid part of this recording
+        // 1. Record the verification start time (to measure processing cost)
+        val verifyStartMs = System.currentTimeMillis()
+
+        // 2. Original audio-trimming logic (kept)
         val audioDurationMs = (audio.size.toFloat() / SAMPLE_RATE * 1000).toLong()
         val validAudio = if (audioDurationMs > 0) {
             val validSampleCount = (audioDurationMs * SAMPLE_RATE / 1000).toInt()
@@ -386,21 +398,16 @@ class VoiceController(

         var stream: OnlineStream? = null

-        // Use runCatching to handle exceptions uniformly
         return runCatching {
             stream = SpeakerRecognition.extractor.createStream()
+            stream.acceptWaveform(samples = validAudio, sampleRate = SAMPLE_RATE)
+            stream.inputFinished()

-            // Process the audio data
-            stream?.acceptWaveform(samples = validAudio, sampleRate = SAMPLE_RATE)
-            stream?.inputFinished()
-
-            // Check whether the stream is ready
-            if (stream == null || !SpeakerRecognition.extractor.isReady(stream)) {
+            if (!SpeakerRecognition.extractor.isReady(stream)) {
                 LogUtils.w(TAG, "❌ 音频Stream未就绪,验证失败")
                 return@runCatching false
             }

-            // Compute the embedding and verify it
             val embedding = SpeakerRecognition.extractor.compute(stream)
             speakerManagerLock.withLock {
                 val verifyPass = SpeakerRecognition.manager.verify(
@@ -409,19 +416,20 @@ class VoiceController(
                     threshold = SPEAKER_THRESHOLD
                 )

-                LogUtils.d(TAG, "📊 声纹验证 | 统一阈值: $SPEAKER_THRESHOLD | 通过: $verifyPass | 验证时长: ${(validAudio.size.toFloat()/SAMPLE_RATE*1000).toLong()}ms")
+                // 3. Compute the real processing cost (end time minus start time)
+                val verifyCostMs = System.currentTimeMillis() - verifyStartMs
+                // Log audio duration and processing cost separately
+                LogUtils.d(TAG, "📊 声纹验证 | 统一阈值: $SPEAKER_THRESHOLD | 通过: $verifyPass | 音频时长: $audioDurationMs ms | 处理耗时: $verifyCostMs ms")
                 verifyPass
             }
         }.onFailure { e ->
-            // Handle all exception cases
             LogUtils.e(TAG, "❌ 声纹验证异常,拒绝", e)
         }.also {
-            // Make sure the stream resource is released
             runCatching {
                 stream?.release()
             }.onFailure { e ->
                 LogUtils.w(TAG, "⚠️ 释放 Stream 资源失败", e)
             }
-        }.getOrDefault(false) // Return false by default on exception
+        }.getOrDefault(false)
     }
 }
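The verification change above times the processing cost by hand with two System.currentTimeMillis() calls. An equivalent pattern using the Kotlin standard library is sketched below; the pipeline body is a placeholder, not project code:

import kotlin.system.measureTimeMillis

// Sketch: report wall-clock processing cost separately from audio duration.
fun verifyWithTiming(audio: FloatArray, sampleRate: Int = 16000): Boolean {
    var pass = false
    val costMs = measureTimeMillis {
        // Placeholder for the real extract-embedding + verify pipeline.
        pass = audio.isNotEmpty()
    }
    val audioMs = audio.size * 1000L / sampleRate
    println("audio: ${audioMs}ms, processing: ${costMs}ms, pass=$pass")
    return pass
}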
@@ -102,7 +102,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
     private var startPlayTimeoutJob: Job? = null // Single Job managing the timeout for every playback scenario

     private var mEventSources: EventSource? = null
-    private var isManualCancel = false
+

     override fun getViewBinding(): ActivityMainBinding = ActivityMainBinding.inflate(layoutInflater)
     override fun initView() {
@@ -219,7 +219,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
                 }
             },
             onFinalAudio = { audio ->
-                sendRecordVoiceToServer(AudioPcmUtil.floatToPcm16Base64(audio))
+                sendRecordVoiceToServer(AudioPcmUtil.floatToPcm16(audio))
                 // mViewModel?.uploadVoice(
                 //
                 //     AudioPcmUtil.floatToPcm16Base64(audio),
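The call above now passes raw bytes (AudioPcmUtil.floatToPcm16) instead of a Base64 string. That utility is not part of this diff; a typical float-to-PCM16 conversion would look roughly like the sketch below, an assumption about its behavior rather than the project's actual implementation:

import java.nio.ByteBuffer
import java.nio.ByteOrder

// Hypothetical equivalent of AudioPcmUtil.floatToPcm16: normalized floats in
// [-1, 1] converted to 16-bit little-endian PCM bytes for a raw request body.
fun floatToPcm16(samples: FloatArray): ByteArray {
    val buffer = ByteBuffer.allocate(samples.size * 2).order(ByteOrder.LITTLE_ENDIAN)
    for (s in samples) {
        val clamped = s.coerceIn(-1f, 1f)
        buffer.putShort((clamped * 32767f).toInt().toShort())
    }
    return buffer.array()
}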
@@ -231,7 +231,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
                     "xxx.wav"
                 )
                 AudioDebugUtil.saveFloatPcmAsWav(audio, file)
-                LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
+                // LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
                 // lifecycleScope.launch(Dispatchers.Main) {
                 //
                 //     mVerticalAnimator?.show()
@@ -560,16 +560,13 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
     }


-    private fun sendRecordVoiceToServer(audio: String) {
+    private fun sendRecordVoiceToServer(audio: ByteArray) {
         cancelSSE()
-        val request: Request? = RxHttp.postJson(ApiService.UPLOAD_RECORD_VOICE_URL)
-            .add("audio",audio)
+        val request: Request? = RxHttp.postBody(ApiService.UPLOAD_RECORD_VOICE_URL)
+            .setBody(audio)
             .buildRequest()

         request?.let {
-            // Reset the manual-cancel flag
-            isManualCancel = false
-
             mEventSources = createFactory(RxHttpPlugins.getOkHttpClient())
                 .newEventSource(it, object : EventSourceListener() {
                     override fun onOpen(eventSource: EventSource, response: Response) {
@@ -601,22 +598,19 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
                         response: Response?
                     ) {
                         super.onFailure(eventSource, t, response)
-                        // Key fix 2: ignore failures caused by manual cancellation
-                        if (isManualCancel) {
-                            LogUtils.eTag("lrsxxx", "SSE手动取消,忽略失败回调")
-                            return
-                        }
-
                         // Normal failure path
                         val errorMsg = t?.message ?: response?.message ?: "未知错误"
-                        voiceController?.onUploadFinished(false)
+                        LogUtils.eTag("lrsxxx", "流式请求失败:${errorMsg}")
+                        if (backPlaying){
+                            voiceController?.onPlayEndBackend()
+                            backPlaying = false
+                        }else{
+                            voiceController?.onUploadFinished(false)
+                        }
                     }

                     override fun onClosed(eventSource: EventSource) {
                         super.onClosed(eventSource)
-                        // Key fix 3: distinguish manual cancel from normal close
-                        val isSuccess = !isManualCancel
-                        // Key fix 4: null the reference after close to avoid a memory leak
                         mEventSources = null
                     }
                 })
@@ -625,7 +619,6 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()


     private fun cancelSSE() {
-        isManualCancel = true
         mEventSources?.cancel()
         mEventSources = null
     }
@@ -26,6 +26,7 @@
         android:layout_width="match_parent"
         android:layout_height="wrap_content"
         android:textColor="@color/white"
+        android:padding="20dp"
         android:textSize="14sp" />

 </LinearLayout>