From 956dd78c1bff909ff5fbbe5fab847f0aa4701fd5 Mon Sep 17 00:00:00 2001 From: ross <3024454314@qq.com> Date: Sat, 3 Jan 2026 10:11:28 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=E5=90=8E=E7=9A=84=E4=BB=A3?= =?UTF-8?q?=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/src/main/AndroidManifest.xml | 6 +- .../com/zs/smarthuman/sherpa/VadManager.kt | 63 ++++--- .../zs/smarthuman/sherpa/VoiceController.kt | 163 ++++++++++++++---- .../java/com/zs/smarthuman/ui/MainActivity.kt | 19 +- .../zs/smarthuman/utils/SerialNumberUtil.kt | 16 +- 5 files changed, 197 insertions(+), 70 deletions(-) diff --git a/app/src/main/AndroidManifest.xml b/app/src/main/AndroidManifest.xml index d698cc6..d64e047 100644 --- a/app/src/main/AndroidManifest.xml +++ b/app/src/main/AndroidManifest.xml @@ -52,7 +52,7 @@ tools:targetApi="31"> @@ -66,9 +66,9 @@ - + android:screenOrientation="portrait"/> diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt index 7848ca6..ae254ed 100644 --- a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt @@ -4,11 +4,6 @@ import android.content.res.AssetManager import com.k2fsa.sherpa.onnx.Vad import com.k2fsa.sherpa.onnx.getVadModelConfig -/** - * @description: - * @author: lrs - * @date: 2025/12/17 10:22 - */ class VadManager( assetManager: AssetManager, private val onSpeechStart: () -> Unit, @@ -19,9 +14,13 @@ class VadManager( private var isSpeaking = false private var lastSpeechTime = 0L - // ⭐ 统计用 - private var speechFrameCount = 0 - private var totalFrameCount = 0 + /** ⭐ 仅统计“有效语音段” */ + private var activeFrameCount = 0 + private var activeSpeechFrameCount = 0 + + /** ⭐ 用于调试(可选) */ + private var rawFrameCount = 0 + private var rawSpeechFrameCount = 0 private val END_SILENCE_MS = 600L @@ -37,36 +36,58 @@ class VadManager( vad.acceptWaveform(samples) val hasSpeech = vad.isSpeechDetected() - totalFrameCount++ + /* ===== raw 统计(仅日志) ===== */ + rawFrameCount++ + if (hasSpeech) rawSpeechFrameCount++ if (hasSpeech) { - speechFrameCount++ lastSpeechTime = now if (!isSpeaking) { isSpeaking = true onSpeechStart() } + + activeFrameCount++ + activeSpeechFrameCount++ } else { - if (isSpeaking && now - lastSpeechTime >= END_SILENCE_MS) { - isSpeaking = false - onSpeechEnd() - vad.clear() + if (isSpeaking) { + activeFrameCount++ + + if (now - lastSpeechTime >= END_SILENCE_MS) { + isSpeaking = false + onSpeechEnd() + } } } } - /** 👉 人声占比(真正用到 VAD 的地方) */ - fun speechRatio(): Float { - if (totalFrameCount == 0) return 0f - return speechFrameCount.toFloat() / totalFrameCount + /** + * ✅ 真正用于判断「是不是有效人声」 + * 只统计 VAD 激活期间 + */ + fun activeSpeechRatio(): Float { + if (activeFrameCount == 0) return 0f + return activeSpeechFrameCount.toFloat() / activeFrameCount + } + + /** + * ⚠️ 仅用于调参观察 + */ + fun rawSpeechRatio(): Float { + if (rawFrameCount == 0) return 0f + return rawSpeechFrameCount.toFloat() / rawFrameCount } fun reset() { isSpeaking = false - lastSpeechTime = 0 - speechFrameCount = 0 - totalFrameCount = 0 + lastSpeechTime = 0L + + activeFrameCount = 0 + activeSpeechFrameCount = 0 + rawFrameCount = 0 + rawSpeechFrameCount = 0 + vad.reset() } } diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt index a88e03f..77524ec 100644 --- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt +++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt @@ -8,7 +8,7 @@ class VoiceController( assetManager: AssetManager, private val onWakeup: () -> Unit, private val onFinalAudio: (FloatArray) -> Unit, - private val idleTimeoutSeconds: Int = 5, + private val idleTimeoutSeconds: Int = 8, private val maxRecordingSeconds: Int = 10, private val onStateChanged: ((VoiceState) -> Unit)? = null, private val stopBackendAudio: (() -> Unit)? = null @@ -53,28 +53,39 @@ class VoiceController( private var recordingStartMs = 0L private var silenceStartMs = 0L - - /** ⭐ WAIT_SPEECH 连续失败起点(关键) */ private var waitSpeechFailStartMs = 0L + /* ================= 近讲统计(⭐关键新增) ================= */ + + private var speechEnergySum = 0f + private var speechFrameCount = 0 + /* ================= 控制 ================= */ private var vadStarted = false - /** 唤醒观察期 */ private var inKwsObserve = false private var kwsObserveStartMs = 0L private val KWS_OBSERVE_MS = 500L - /** 播放冷却 */ private var speechEnableAtMs = 0L private val SPEECH_COOLDOWN_MS = 300L - /* ================= 阈值 ================= */ + /* ================= 阈值(⭐已校正) ================= */ - private val RMS_SILENCE_THRESHOLD = 0.005f + private val RMS_SILENCE_THRESHOLD = 0.012f // 静音阈值(修正) private val SILENCE_END_MS = 1200L - private val MIN_SPEECH_MS = 300L + private val MIN_SPEECH_MS = 1000L // 句子级 + private val MIN_AVG_ENERGY = 0.02f // 近讲能量门 + + + /** ⭐ 唤醒后等待人声起点 */ + private var waitSpeechStartMs = 0L + + /** ⭐ 唤醒后最大等待时间(没说一句话) */ + private val WAIT_SPEECH_TIMEOUT_MS = 8000L + + /* ================= 音频入口 ================= */ @@ -100,12 +111,22 @@ class VoiceController( VoiceState.WAIT_SPEECH_COOLDOWN -> { if (now >= speechEnableAtMs) { state = VoiceState.WAIT_SPEECH + waitSpeechStartMs = now // ⭐ 关键:开始等人说话 } return } VoiceState.WAIT_SPEECH -> { + // ⭐ 唤醒后长时间没人说话 → 自动退出 + if (waitSpeechStartMs > 0 && + now - waitSpeechStartMs >= WAIT_SPEECH_TIMEOUT_MS + ) { + Log.d(TAG, "⏱ Wakeup but no speech, exit to WAIT_WAKEUP") + resetAll() + return + } + if (inKwsObserve) { if (now - kwsObserveStartMs < KWS_OBSERVE_MS) return inKwsObserve = false @@ -114,26 +135,30 @@ class VoiceController( vadManager.accept(samples) } + VoiceState.RECORDING -> { audioBuffer.addAll(samples.asList()) vadManager.accept(samples) + val rms = calcRms(samples) + + if (rms > RMS_SILENCE_THRESHOLD) { + speechEnergySum += rms + speechFrameCount++ + silenceStartMs = 0L + } else { + if (silenceStartMs == 0L) silenceStartMs = now + else if (now - silenceStartMs >= SILENCE_END_MS) { + Log.d(TAG, "🔇 Silence end") + finishSentence() + return + } + } + if (now - recordingStartMs > maxRecordingSeconds * 1000) { Log.w(TAG, "⏱ Max recording reached") finishSentence() - return - } - - val rms = calcRms(samples) - if (rms < RMS_SILENCE_THRESHOLD) { - if (silenceStartMs == 0L) silenceStartMs = now - else if (now - silenceStartMs >= SILENCE_END_MS) { - Log.d(TAG, "🔇 RMS silence end") - finishSentence() - } - } else { - silenceStartMs = 0L } } } @@ -146,8 +171,12 @@ class VoiceController( VoiceState.UPLOADING -> return + // ⭐ 关键:只要不是纯等待唤醒,一律打断 VoiceState.RECORDING, + VoiceState.WAIT_SPEECH, + VoiceState.WAIT_SPEECH_COOLDOWN, VoiceState.PLAYING_BACKEND -> { + Log.d(TAG, "⚠ WakeWord interrupt state=$state") stopBackendAudio?.invoke() enterWakeup(interrupt = true) } @@ -156,13 +185,21 @@ class VoiceController( } } + private fun enterWakeup(interrupt: Boolean) { if (interrupt) { + Log.d(TAG, "🛑 Interrupt current speech / recording") + audioBuffer.clear() + preBuffer.clear() // ⭐ 防止把旧唤醒词带进去 vadManager.reset() + resetEnergyStat() + vadStarted = false silenceStartMs = 0L + waitSpeechStartMs = 0L // ⭐ + waitSpeechFailStartMs = 0L // ⭐ } inKwsObserve = true @@ -172,6 +209,7 @@ class VoiceController( onWakeup() } + /* ================= VAD START ================= */ private fun onVadStart() { @@ -182,6 +220,8 @@ class VoiceController( vadStarted = true recordingStartMs = System.currentTimeMillis() silenceStartMs = 0L + waitSpeechStartMs = 0L // ⭐ 清掉“等待说话”超时 + resetEnergyStat() audioBuffer.clear() audioBuffer.addAll(preBuffer) @@ -189,26 +229,73 @@ class VoiceController( state = VoiceState.RECORDING } - /* ================= 结束录音 ================= */ + + /* ================= 结束录音(⭐核心) ================= */ private fun finishSentence() { - val duration = System.currentTimeMillis() - recordingStartMs + val now = System.currentTimeMillis() + val duration = now - recordingStartMs + if (!vadStarted || duration < MIN_SPEECH_MS) { + Log.d(TAG, "❌ Too short or no VAD start: ${duration}ms") resetToWaitSpeech() return } - val vadRatio = vadManager.speechRatio() - Log.d(TAG, "🎙 VAD speech ratio=$vadRatio") + val vadRatio = vadManager.activeSpeechRatio() + val avgEnergy = + if (speechFrameCount > 0) speechEnergySum / speechFrameCount else 0f - if (vadRatio < 0.25f) { - Log.d(TAG, "❌ VAD says NOT human speech") + /* ================= 评分制判定 ================= */ + + var score = 0 + + // 1️⃣ 时长评分(最重要) + when { + duration >= 4000 -> score += 3 + duration >= 2500 -> score += 2 + duration >= 1500 -> score += 1 + } + + // 2️⃣ 能量评分(近讲人声强信号) + when { + avgEnergy >= 0.10f -> score += 3 + avgEnergy >= 0.06f -> score += 2 + avgEnergy >= MIN_AVG_ENERGY -> score += 1 + } + + // 3️⃣ VAD 评分(只作为辅助) + when { + vadRatio >= 0.55f -> score += 2 + vadRatio >= 0.40f -> score += 1 + } + + Log.d( + TAG, + "📊 duration=$duration ms, vadRatio=$vadRatio, avgEnergy=$avgEnergy, score=$score" + ) + + /** + * 评分阈值: + * - >=4 : 必然是真实人声 + * - 3 : 在近讲/长句条件下允许 + * - <3 : 拦截 + */ + val pass = when { + score >= 4 -> true + score == 3 && avgEnergy >= 0.06f -> true + else -> false + } + + if (!pass) { + Log.d(TAG, "❌ Sentence rejected (score=$score)") resetToWaitSpeech() return } - // ✅ 成功一次,清空失败计时 + /* ================= 通过,进入上传 ================= */ + waitSpeechFailStartMs = 0L val finalAudio = audioBuffer.toFloatArray() @@ -218,6 +305,7 @@ class VoiceController( onFinalAudio(finalAudio) } + /* ================= 播放回调 ================= */ fun onPlayStartPrompt() { @@ -238,7 +326,7 @@ class VoiceController( state = VoiceState.WAIT_SPEECH_COOLDOWN } - /* ================= 上传回调(保留 public) ================= */ + /* ================= 上传回调 ================= */ fun onUploadFinished(success: Boolean) { if (state != VoiceState.UPLOADING) return @@ -251,15 +339,16 @@ class VoiceController( } } - /* ================= Idle 超时(关键修复) ================= */ + /* ================= Idle 超时 ================= */ fun checkIdleTimeout() { if (state != VoiceState.WAIT_SPEECH) return if (waitSpeechFailStartMs == 0L) return - val now = System.currentTimeMillis() - if (now - waitSpeechFailStartMs > idleTimeoutSeconds * 1000) { - Log.d(TAG, "⏱ WAIT_SPEECH continuous fail timeout") + if (System.currentTimeMillis() - waitSpeechFailStartMs > + idleTimeoutSeconds * 1000 + ) { + Log.d(TAG, "⏱ WAIT_SPEECH timeout") resetAll() waitSpeechFailStartMs = 0L } @@ -270,11 +359,11 @@ class VoiceController( private fun resetToWaitSpeech() { audioBuffer.clear() vadManager.reset() + resetEnergyStat() vadStarted = false silenceStartMs = 0L state = VoiceState.WAIT_SPEECH - // ⭐ 只在第一次失败时记录 if (waitSpeechFailStartMs == 0L) { waitSpeechFailStartMs = System.currentTimeMillis() } @@ -284,11 +373,14 @@ class VoiceController( audioBuffer.clear() preBuffer.clear() vadManager.reset() + resetEnergyStat() vadStarted = false silenceStartMs = 0L + waitSpeechStartMs = 0L // ⭐ state = VoiceState.WAIT_WAKEUP } + fun release() { wakeupManager.release() vadManager.reset() @@ -296,6 +388,11 @@ class VoiceController( /* ================= Utils ================= */ + private fun resetEnergyStat() { + speechEnergySum = 0f + speechFrameCount = 0 + } + private fun cachePreBuffer(samples: FloatArray) { for (s in samples) { preBuffer.addLast(s) diff --git a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt index 23d3316..74183a0 100644 --- a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt +++ b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt @@ -161,7 +161,7 @@ class MainActivity : BaseViewModelActivity() voiceInfo = mutableListOf().apply { add( VoiceBeanResp( - audioUrl = /*UserInfoManager.userInfo?.wakeUpAudioUrl ?:*/ "https://static.seerteach.net/aidialogue/systemVoice/aliyun-nv.mp3" + audioUrl = UserInfoManager.userInfo?.wakeUpAudioUrl ?: "https://static.seerteach.net/aidialogue/systemVoice/aliyun-nv.mp3" ) ) } @@ -187,8 +187,17 @@ class MainActivity : BaseViewModelActivity() VoiceState.WAIT_WAKEUP -> { Log.d("lrs", "当前状态: 等待唤醒") lifecycleScope.launch(Dispatchers.Main) { - mVerticalAnimator?.hide() + UnityPlayerHolder.getInstance() + .sendVoiceToUnity( + voiceInfo = mutableListOf().apply { + add( + VoiceBeanResp( + audioUrl = "https://static.seerteach.net/aidialogue/userWakeUpAudio/ttsmaker-file-2025-12-31-16-2-51.mp3" + ) + ) + } + ) } } @@ -334,9 +343,9 @@ class MainActivity : BaseViewModelActivity() word: String, audioUrl: String ) { -// val wakeupUrl = UserInfoManager.userInfo?.wakeUpAudioUrl ?: return -// -// if (audioUrl != wakeupUrl) return + val wakeupUrl = UserInfoManager.userInfo?.wakeUpAudioUrl ?: "https://static.seerteach.net/aidialogue/systemVoice/aliyun-nv.mp3" ?: return + + if (audioUrl != wakeupUrl) return when (state) { 1 -> { // play diff --git a/app/src/main/java/com/zs/smarthuman/utils/SerialNumberUtil.kt b/app/src/main/java/com/zs/smarthuman/utils/SerialNumberUtil.kt index a78f6b3..f3228c2 100644 --- a/app/src/main/java/com/zs/smarthuman/utils/SerialNumberUtil.kt +++ b/app/src/main/java/com/zs/smarthuman/utils/SerialNumberUtil.kt @@ -25,14 +25,14 @@ object SerialNumberUtil { * 外部调用,获取最终序列号 */ fun getSerialNumber(): String { - for (key in snKeys) { - val sn = getProp(key) - if (!sn.isNullOrBlank()) { - return limitSerialDigit(sn) - } - } - return "" -// return "zd09312051870556" +// for (key in snKeys) { +// val sn = getProp(key) +// if (!sn.isNullOrBlank()) { +// return limitSerialDigit(sn) +// } +// } +// return "" + return "zd09312051870556" } /**