From 956dd78c1bff909ff5fbbe5fab847f0aa4701fd5 Mon Sep 17 00:00:00 2001
From: ross <3024454314@qq.com>
Date: Sat, 3 Jan 2026 10:11:28 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=E5=90=8E=E7=9A=84=E4=BB=A3?=
=?UTF-8?q?=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
app/src/main/AndroidManifest.xml | 6 +-
.../com/zs/smarthuman/sherpa/VadManager.kt | 63 ++++---
.../zs/smarthuman/sherpa/VoiceController.kt | 163 ++++++++++++++----
.../java/com/zs/smarthuman/ui/MainActivity.kt | 19 +-
.../zs/smarthuman/utils/SerialNumberUtil.kt | 16 +-
5 files changed, 197 insertions(+), 70 deletions(-)
diff --git a/app/src/main/AndroidManifest.xml b/app/src/main/AndroidManifest.xml
index d698cc6..d64e047 100644
--- a/app/src/main/AndroidManifest.xml
+++ b/app/src/main/AndroidManifest.xml
@@ -52,7 +52,7 @@
tools:targetApi="31">
@@ -66,9 +66,9 @@
-
+ android:screenOrientation="portrait"/>
diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt
index 7848ca6..ae254ed 100644
--- a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt
@@ -4,11 +4,6 @@ import android.content.res.AssetManager
import com.k2fsa.sherpa.onnx.Vad
import com.k2fsa.sherpa.onnx.getVadModelConfig
-/**
- * @description:
- * @author: lrs
- * @date: 2025/12/17 10:22
- */
class VadManager(
assetManager: AssetManager,
private val onSpeechStart: () -> Unit,
@@ -19,9 +14,13 @@ class VadManager(
private var isSpeaking = false
private var lastSpeechTime = 0L
- // ⭐ 统计用
- private var speechFrameCount = 0
- private var totalFrameCount = 0
+ /** ⭐ 仅统计“有效语音段” */
+ private var activeFrameCount = 0
+ private var activeSpeechFrameCount = 0
+
+ /** ⭐ 用于调试(可选) */
+ private var rawFrameCount = 0
+ private var rawSpeechFrameCount = 0
private val END_SILENCE_MS = 600L
@@ -37,36 +36,58 @@ class VadManager(
vad.acceptWaveform(samples)
val hasSpeech = vad.isSpeechDetected()
- totalFrameCount++
+ /* ===== raw 统计(仅日志) ===== */
+ rawFrameCount++
+ if (hasSpeech) rawSpeechFrameCount++
if (hasSpeech) {
- speechFrameCount++
lastSpeechTime = now
if (!isSpeaking) {
isSpeaking = true
onSpeechStart()
}
+
+ activeFrameCount++
+ activeSpeechFrameCount++
} else {
- if (isSpeaking && now - lastSpeechTime >= END_SILENCE_MS) {
- isSpeaking = false
- onSpeechEnd()
- vad.clear()
+ if (isSpeaking) {
+ activeFrameCount++
+
+ if (now - lastSpeechTime >= END_SILENCE_MS) {
+ isSpeaking = false
+ onSpeechEnd()
+ }
}
}
}
- /** 👉 人声占比(真正用到 VAD 的地方) */
- fun speechRatio(): Float {
- if (totalFrameCount == 0) return 0f
- return speechFrameCount.toFloat() / totalFrameCount
+ /**
+ * Fraction of speech-flagged frames among the frames counted while a speech
+ * segment was active (onset through trailing silence); 0 when none were counted.
+ */
+ fun activeSpeechRatio(): Float = when (activeFrameCount) {
+ 0 -> 0f
+ else -> activeSpeechFrameCount.toFloat() / activeFrameCount
+ }
+
+ /**
+ * Speech-flagged share of every frame fed in since the last reset; tuning aid only.
+ */
+ fun rawSpeechRatio(): Float {
+ val seen = rawFrameCount
+ return if (seen == 0) 0f else rawSpeechFrameCount.toFloat() / seen
}
fun reset() {
isSpeaking = false
- lastSpeechTime = 0
- speechFrameCount = 0
- totalFrameCount = 0
+ // Zero the debug (raw) and in-segment (active) counters so both ratios start over.
+ rawFrameCount = 0
+ rawSpeechFrameCount = 0
+ activeFrameCount = 0
+ activeSpeechFrameCount = 0
+ // Forget when speech was last heard.
+ lastSpeechTime = 0L
vad.reset()
}
}
diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
index a88e03f..77524ec 100644
--- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
@@ -8,7 +8,7 @@ class VoiceController(
assetManager: AssetManager,
private val onWakeup: () -> Unit,
private val onFinalAudio: (FloatArray) -> Unit,
- private val idleTimeoutSeconds: Int = 5,
+ private val idleTimeoutSeconds: Int = 8,
private val maxRecordingSeconds: Int = 10,
private val onStateChanged: ((VoiceState) -> Unit)? = null,
private val stopBackendAudio: (() -> Unit)? = null
@@ -53,28 +53,39 @@ class VoiceController(
private var recordingStartMs = 0L
private var silenceStartMs = 0L
-
- /** ⭐ WAIT_SPEECH 连续失败起点(关键) */
private var waitSpeechFailStartMs = 0L
+ /* ================= 近讲统计(⭐关键新增) ================= */
+
+ private var speechEnergySum = 0f
+ private var speechFrameCount = 0
+
/* ================= 控制 ================= */
private var vadStarted = false
- /** 唤醒观察期 */
private var inKwsObserve = false
private var kwsObserveStartMs = 0L
private val KWS_OBSERVE_MS = 500L
- /** 播放冷却 */
private var speechEnableAtMs = 0L
private val SPEECH_COOLDOWN_MS = 300L
- /* ================= 阈值 ================= */
+ /* ================= 阈值(⭐已校正) ================= */
- private val RMS_SILENCE_THRESHOLD = 0.005f
+ private val RMS_SILENCE_THRESHOLD = 0.012f // 静音阈值(修正)
private val SILENCE_END_MS = 1200L
- private val MIN_SPEECH_MS = 300L
+ private val MIN_SPEECH_MS = 1000L // 句子级
+ private val MIN_AVG_ENERGY = 0.02f // 近讲能量门
+
+
+ /** ⭐ 唤醒后等待人声起点 */
+ private var waitSpeechStartMs = 0L
+
+ /** ⭐ 唤醒后最大等待时间(没说一句话) */
+ private val WAIT_SPEECH_TIMEOUT_MS = 8000L
+
+
/* ================= 音频入口 ================= */
@@ -100,12 +111,22 @@ class VoiceController(
VoiceState.WAIT_SPEECH_COOLDOWN -> {
if (now >= speechEnableAtMs) {
state = VoiceState.WAIT_SPEECH
+ waitSpeechStartMs = now // ⭐ 关键:开始等人说话
}
return
}
VoiceState.WAIT_SPEECH -> {
+ // ⭐ 唤醒后长时间没人说话 → 自动退出
+ if (waitSpeechStartMs > 0 &&
+ now - waitSpeechStartMs >= WAIT_SPEECH_TIMEOUT_MS
+ ) {
+ Log.d(TAG, "⏱ Wakeup but no speech, exit to WAIT_WAKEUP")
+ resetAll()
+ return
+ }
+
if (inKwsObserve) {
if (now - kwsObserveStartMs < KWS_OBSERVE_MS) return
inKwsObserve = false
@@ -114,26 +135,30 @@ class VoiceController(
vadManager.accept(samples)
}
+
VoiceState.RECORDING -> {
audioBuffer.addAll(samples.asList())
vadManager.accept(samples)
+ val rms = calcRms(samples)
+
+ if (rms > RMS_SILENCE_THRESHOLD) {
+ speechEnergySum += rms
+ speechFrameCount++
+ silenceStartMs = 0L
+ } else {
+ if (silenceStartMs == 0L) silenceStartMs = now
+ else if (now - silenceStartMs >= SILENCE_END_MS) {
+ Log.d(TAG, "🔇 Silence end")
+ finishSentence()
+ return
+ }
+ }
+
if (now - recordingStartMs > maxRecordingSeconds * 1000) {
Log.w(TAG, "⏱ Max recording reached")
finishSentence()
- return
- }
-
- val rms = calcRms(samples)
- if (rms < RMS_SILENCE_THRESHOLD) {
- if (silenceStartMs == 0L) silenceStartMs = now
- else if (now - silenceStartMs >= SILENCE_END_MS) {
- Log.d(TAG, "🔇 RMS silence end")
- finishSentence()
- }
- } else {
- silenceStartMs = 0L
}
}
}
@@ -146,8 +171,12 @@ class VoiceController(
VoiceState.UPLOADING -> return
+ // ⭐ 关键:只要不是纯等待唤醒,一律打断
VoiceState.RECORDING,
+ VoiceState.WAIT_SPEECH,
+ VoiceState.WAIT_SPEECH_COOLDOWN,
VoiceState.PLAYING_BACKEND -> {
+ Log.d(TAG, "⚠ WakeWord interrupt state=$state")
stopBackendAudio?.invoke()
enterWakeup(interrupt = true)
}
@@ -156,13 +185,21 @@ class VoiceController(
}
}
+
private fun enterWakeup(interrupt: Boolean) {
if (interrupt) {
+ Log.d(TAG, "🛑 Interrupt current speech / recording")
+
audioBuffer.clear()
+ preBuffer.clear() // ⭐ 防止把旧唤醒词带进去
vadManager.reset()
+ resetEnergyStat()
+
vadStarted = false
silenceStartMs = 0L
+ waitSpeechStartMs = 0L // ⭐
+ waitSpeechFailStartMs = 0L // ⭐
}
inKwsObserve = true
@@ -172,6 +209,7 @@ class VoiceController(
onWakeup()
}
+
/* ================= VAD START ================= */
private fun onVadStart() {
@@ -182,6 +220,8 @@ class VoiceController(
vadStarted = true
recordingStartMs = System.currentTimeMillis()
silenceStartMs = 0L
+ waitSpeechStartMs = 0L // ⭐ 清掉“等待说话”超时
+ resetEnergyStat()
audioBuffer.clear()
audioBuffer.addAll(preBuffer)
@@ -189,26 +229,73 @@ class VoiceController(
state = VoiceState.RECORDING
}
- /* ================= 结束录音 ================= */
+
+ /* ================= 结束录音(⭐核心) ================= */
private fun finishSentence() {
- val duration = System.currentTimeMillis() - recordingStartMs
+ val now = System.currentTimeMillis()
+ val duration = now - recordingStartMs
+
if (!vadStarted || duration < MIN_SPEECH_MS) {
+ Log.d(TAG, "❌ Too short or no VAD start: ${duration}ms")
resetToWaitSpeech()
return
}
- val vadRatio = vadManager.speechRatio()
- Log.d(TAG, "🎙 VAD speech ratio=$vadRatio")
+ val vadRatio = vadManager.activeSpeechRatio()
+ val avgEnergy =
+ if (speechFrameCount > 0) speechEnergySum / speechFrameCount else 0f
- if (vadRatio < 0.25f) {
- Log.d(TAG, "❌ VAD says NOT human speech")
+ /* ================= 评分制判定 ================= */
+
+ var score = 0
+
+ // 1️⃣ 时长评分(最重要)
+ when {
+ duration >= 4000 -> score += 3
+ duration >= 2500 -> score += 2
+ duration >= 1500 -> score += 1
+ }
+
+ // 2️⃣ 能量评分(近讲人声强信号)
+ when {
+ avgEnergy >= 0.10f -> score += 3
+ avgEnergy >= 0.06f -> score += 2
+ avgEnergy >= MIN_AVG_ENERGY -> score += 1
+ }
+
+ // 3️⃣ VAD 评分(只作为辅助)
+ when {
+ vadRatio >= 0.55f -> score += 2
+ vadRatio >= 0.40f -> score += 1
+ }
+
+ Log.d(
+ TAG,
+ "📊 duration=$duration ms, vadRatio=$vadRatio, avgEnergy=$avgEnergy, score=$score"
+ )
+
+ /**
+ * 评分阈值:
+ * - >=4 : 必然是真实人声
+ * - 3 : 在近讲/长句条件下允许
+ * - <3 : 拦截
+ */
+ val pass = when {
+ score >= 4 -> true
+ score == 3 && avgEnergy >= 0.06f -> true
+ else -> false
+ }
+
+ if (!pass) {
+ Log.d(TAG, "❌ Sentence rejected (score=$score)")
resetToWaitSpeech()
return
}
- // ✅ 成功一次,清空失败计时
+ /* ================= 通过,进入上传 ================= */
+
waitSpeechFailStartMs = 0L
val finalAudio = audioBuffer.toFloatArray()
@@ -218,6 +305,7 @@ class VoiceController(
onFinalAudio(finalAudio)
}
+
/* ================= 播放回调 ================= */
fun onPlayStartPrompt() {
@@ -238,7 +326,7 @@ class VoiceController(
state = VoiceState.WAIT_SPEECH_COOLDOWN
}
- /* ================= 上传回调(保留 public) ================= */
+ /* ================= 上传回调 ================= */
fun onUploadFinished(success: Boolean) {
if (state != VoiceState.UPLOADING) return
@@ -251,15 +339,16 @@ class VoiceController(
}
}
- /* ================= Idle 超时(关键修复) ================= */
+ /* ================= Idle 超时 ================= */
fun checkIdleTimeout() {
if (state != VoiceState.WAIT_SPEECH) return
if (waitSpeechFailStartMs == 0L) return
- val now = System.currentTimeMillis()
- if (now - waitSpeechFailStartMs > idleTimeoutSeconds * 1000) {
- Log.d(TAG, "⏱ WAIT_SPEECH continuous fail timeout")
+ if (System.currentTimeMillis() - waitSpeechFailStartMs >
+ idleTimeoutSeconds * 1000
+ ) {
+ Log.d(TAG, "⏱ WAIT_SPEECH timeout")
resetAll()
waitSpeechFailStartMs = 0L
}
@@ -270,11 +359,11 @@ class VoiceController(
private fun resetToWaitSpeech() {
audioBuffer.clear()
vadManager.reset()
+ resetEnergyStat()
vadStarted = false
silenceStartMs = 0L
state = VoiceState.WAIT_SPEECH
- // ⭐ 只在第一次失败时记录
if (waitSpeechFailStartMs == 0L) {
waitSpeechFailStartMs = System.currentTimeMillis()
}
@@ -284,11 +373,14 @@ class VoiceController(
audioBuffer.clear()
preBuffer.clear()
vadManager.reset()
+ resetEnergyStat()
vadStarted = false
silenceStartMs = 0L
+ waitSpeechStartMs = 0L // ⭐
state = VoiceState.WAIT_WAKEUP
}
+
fun release() {
wakeupManager.release()
vadManager.reset()
@@ -296,6 +388,11 @@ class VoiceController(
/* ================= Utils ================= */
+ private fun resetEnergyStat() {
+ speechFrameCount = 0 // frames whose RMS exceeded the silence threshold
+ speechEnergySum = 0f // summed RMS of those frames
+ }
+
private fun cachePreBuffer(samples: FloatArray) {
for (s in samples) {
preBuffer.addLast(s)
diff --git a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt
index 23d3316..74183a0 100644
--- a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt
+++ b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt
@@ -161,7 +161,7 @@ class MainActivity : BaseViewModelActivity()
voiceInfo = mutableListOf().apply {
add(
VoiceBeanResp(
- audioUrl = /*UserInfoManager.userInfo?.wakeUpAudioUrl ?:*/ "https://static.seerteach.net/aidialogue/systemVoice/aliyun-nv.mp3"
+ audioUrl = UserInfoManager.userInfo?.wakeUpAudioUrl ?: "https://static.seerteach.net/aidialogue/systemVoice/aliyun-nv.mp3"
)
)
}
@@ -187,8 +187,17 @@ class MainActivity : BaseViewModelActivity()
VoiceState.WAIT_WAKEUP -> {
Log.d("lrs", "当前状态: 等待唤醒")
lifecycleScope.launch(Dispatchers.Main) {
-
mVerticalAnimator?.hide()
+ UnityPlayerHolder.getInstance()
+ .sendVoiceToUnity(
+ voiceInfo = mutableListOf().apply {
+ add(
+ VoiceBeanResp(
+ audioUrl = "https://static.seerteach.net/aidialogue/userWakeUpAudio/ttsmaker-file-2025-12-31-16-2-51.mp3"
+ )
+ )
+ }
+ )
}
}
@@ -334,9 +343,9 @@ class MainActivity : BaseViewModelActivity()
word: String,
audioUrl: String
) {
-// val wakeupUrl = UserInfoManager.userInfo?.wakeUpAudioUrl ?: return
-//
-// if (audioUrl != wakeupUrl) return
+ val wakeupUrl = UserInfoManager.userInfo?.wakeUpAudioUrl ?: "https://static.seerteach.net/aidialogue/systemVoice/aliyun-nv.mp3" ?: return
+
+ if (audioUrl != wakeupUrl) return
when (state) {
1 -> { // play
diff --git a/app/src/main/java/com/zs/smarthuman/utils/SerialNumberUtil.kt b/app/src/main/java/com/zs/smarthuman/utils/SerialNumberUtil.kt
index a78f6b3..f3228c2 100644
--- a/app/src/main/java/com/zs/smarthuman/utils/SerialNumberUtil.kt
+++ b/app/src/main/java/com/zs/smarthuman/utils/SerialNumberUtil.kt
@@ -25,14 +25,14 @@ object SerialNumberUtil {
* 外部调用,获取最终序列号
*/
fun getSerialNumber(): String {
- for (key in snKeys) {
- val sn = getProp(key)
- if (!sn.isNullOrBlank()) {
- return limitSerialDigit(sn)
- }
- }
- return ""
-// return "zd09312051870556"
+ // Return the first non-blank value found under snKeys, passed through
+ // limitSerialDigit(); "" when no key is set. Never ship a hard-coded debug serial.
+ for (key in snKeys) {
+ val serial = getProp(key)
+ if (serial.isNullOrBlank()) continue
+ return limitSerialDigit(serial)
+ }
+ return ""
}
/**