diff --git a/app/build.gradle b/app/build.gradle
index 58e31ef..c332a7d 100644
--- a/app/build.gradle
+++ b/app/build.gradle
@@ -180,6 +180,6 @@ dependencies {
implementation libs.androidautosize
- implementation files('libs/sherpa19.aar')
+ implementation files('libs/sherpa-onnx-1.12.20.aar')
}
\ No newline at end of file
diff --git a/app/libs/sherpa19.aar b/app/libs/sherpa-onnx-1.12.20.aar
similarity index 85%
rename from app/libs/sherpa19.aar
rename to app/libs/sherpa-onnx-1.12.20.aar
index 134df6c..134fe69 100644
Binary files a/app/libs/sherpa19.aar and b/app/libs/sherpa-onnx-1.12.20.aar differ
diff --git a/app/src/main/AndroidManifest.xml b/app/src/main/AndroidManifest.xml
index d64e047..d698cc6 100644
--- a/app/src/main/AndroidManifest.xml
+++ b/app/src/main/AndroidManifest.xml
@@ -52,7 +52,7 @@
tools:targetApi="31">
@@ -66,9 +66,9 @@
-
+ android:screenOrientation="portrait"/>-->
diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt
index 72f0021..7848ca6 100644
--- a/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/VadManager.kt
@@ -10,42 +10,63 @@ import com.k2fsa.sherpa.onnx.getVadModelConfig
* @date: 2025/12/17 10:22
*/
class VadManager(
- private val assetManager: AssetManager,
+ assetManager: AssetManager,
private val onSpeechStart: () -> Unit,
private val onSpeechEnd: () -> Unit
) {
private val vad: Vad
+
private var isSpeaking = false
+ private var lastSpeechTime = 0L
+
+ // ⭐ Frame statistics
+ private var speechFrameCount = 0
+ private var totalFrameCount = 0
+
+ private val END_SILENCE_MS = 600L
init {
- val config = getVadModelConfig(0)
- if (config == null) {
- throw IllegalStateException("VAD config not found")
- }
- vad = Vad(assetManager = assetManager, config = config)
+ val config = getVadModelConfig(1)
+ ?: throw IllegalStateException("VAD config not found")
+ vad = Vad(assetManager, config)
}
- /** Feed an audio frame (16 kHz PCM float) */
fun accept(samples: FloatArray) {
- vad.acceptWaveform(samples)
- val speechDetected = vad.isSpeechDetected()
+ val now = System.currentTimeMillis()
- if (speechDetected && !isSpeaking) {
- isSpeaking = true
- onSpeechStart()
- } else if (!speechDetected && isSpeaking) {
- isSpeaking = false
- onSpeechEnd()
- // ⭐ Only clear the VAD when a sentence ends
- vad.clear()
+ vad.acceptWaveform(samples)
+ val hasSpeech = vad.isSpeechDetected()
+
+ totalFrameCount++
+
+ if (hasSpeech) {
+ speechFrameCount++
+ lastSpeechTime = now
+
+ if (!isSpeaking) {
+ isSpeaking = true
+ onSpeechStart()
+ }
+ } else {
+ if (isSpeaking && now - lastSpeechTime >= END_SILENCE_MS) {
+ isSpeaking = false
+ onSpeechEnd()
+ vad.clear()
+ }
}
}
+ /** 👉 Speech-frame ratio (where the VAD output is actually used) */
+ fun speechRatio(): Float {
+ if (totalFrameCount == 0) return 0f
+ return speechFrameCount.toFloat() / totalFrameCount
+ }
- /** Reset internal state */
fun reset() {
isSpeaking = false
+ lastSpeechTime = 0
+ speechFrameCount = 0
+ totalFrameCount = 0
vad.reset()
}
}
-
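Reviewer note on VadManager: the rework replaces the edge-triggered onSpeechEnd with a 600 ms silence hangover (END_SILENCE_MS) and adds a per-utterance speech-frame ratio. A minimal usage sketch of the intended call pattern; the wrapper class and log tag are illustrative, not repo code:

```kotlin
import android.content.res.AssetManager
import android.util.Log
import com.zs.smarthuman.sherpa.VadManager

// Illustrative wrapper (not repo code) showing how the reworked VadManager
// is meant to be driven by a 16 kHz mono float capture loop.
class VadDemo(assets: AssetManager) {

    private val vad = VadManager(
        assetManager = assets,
        onSpeechStart = { Log.d("VadDemo", "speech started") },
        onSpeechEnd = { onUtteranceEnd() }
    )

    /** Feed each captured frame: 16 kHz mono PCM floats in [-1, 1]. */
    fun onAudioFrame(frame: FloatArray) = vad.accept(frame)

    private fun onUtteranceEnd() {
        // onSpeechEnd now fires only after END_SILENCE_MS (600 ms) of
        // consecutive non-speech frames, not on the first silent frame.
        Log.d("VadDemo", "speech ended, speech ratio=${vad.speechRatio()}")
        vad.reset() // clear the frame counters before the next utterance
    }
}
```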
diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
index 8c731f9..a88e03f 100644
--- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceController.kt
@@ -8,15 +8,17 @@ class VoiceController(
assetManager: AssetManager,
private val onWakeup: () -> Unit,
private val onFinalAudio: (FloatArray) -> Unit,
- private val idleTimeoutSeconds: Int = 15,
+ private val idleTimeoutSeconds: Int = 5,
private val maxRecordingSeconds: Int = 10,
private val onStateChanged: ((VoiceState) -> Unit)? = null,
private val stopBackendAudio: (() -> Unit)? = null
) {
+
private val TAG = "VoiceController"
private val sampleRate = 16000
/* ================= State ================= */
+
private var state: VoiceState = VoiceState.WAIT_WAKEUP
set(value) {
field = value
@@ -24,160 +26,258 @@ class VoiceController(
onStateChanged?.invoke(value)
}
- /* ================= Wakeup ================= */
+ /* ================= KWS ================= */
+
private val wakeupManager = WakeupManager(assetManager) {
Log.d(TAG, "🔥 WakeWord detected")
- stopBackendAudio?.invoke()
- if (state != VoiceState.UPLOADING) { // do not reset while uploading
- resetAll()
- state = VoiceState.PLAYING_PROMPT
- }
- onWakeup()
+ handleWakeupEvent()
}
- /* ================= VAD (handles START only) ================= */
+ /* ================= VAD ================= */
+
private val vadManager = VadManager(
assetManager,
onSpeechStart = { onVadStart() },
- onSpeechEnd = { /* no longer used to end a sentence */ }
+ onSpeechEnd = {}
)
- /* ================= Audio buffers ================= */
+ /* ================= Buffer ================= */
+
private val audioBuffer = mutableListOf<Float>()
+
+ /** Pre-roll audio buffer (2 seconds) */
private val preBuffer = ArrayDeque<Float>()
- private val PRE_BUFFER_SIZE = sampleRate // 1-second pre-buffer
+ private val PRE_BUFFER_SIZE = sampleRate * 2
/* ================= Timing ================= */
- private var idleTimer = 0L
- private var recordingStartTime = 0L
+
+ private var recordingStartMs = 0L
+ private var silenceStartMs = 0L
+
+ /** ⭐ Start time of consecutive WAIT_SPEECH failures (key) */
+ private var waitSpeechFailStartMs = 0L
+
+ /* ================= Control ================= */
+
private var vadStarted = false
- /* ================= RMS silence detection ================= */
- private var silenceStartMs = 0L
- private val SILENCE_END_MS = 1200L // how long silence must last before the sentence is considered finished
- private val RMS_SILENCE_THRESHOLD = 0.005f // more sensitive
- private val MIN_SPEECH_DURATION_MS = 300L // minimum valid speech duration
- private val MIN_SPEECH_RATIO = 0.15f // at least 15% of frames must contain speech
+ /** Post-wakeup observation window */
+ private var inKwsObserve = false
+ private var kwsObserveStartMs = 0L
+ private val KWS_OBSERVE_MS = 500L
+
+ /** Playback cooldown */
+ private var speechEnableAtMs = 0L
+ private val SPEECH_COOLDOWN_MS = 300L
+
+ /* ================= Thresholds ================= */
+
+ private val RMS_SILENCE_THRESHOLD = 0.005f
+ private val SILENCE_END_MS = 1200L
+ private val MIN_SPEECH_MS = 300L
/* ================= Audio entry point ================= */
+
fun acceptAudio(samples: FloatArray) {
- // Wakeup is handled independently; always fed
+
+ cachePreBuffer(samples)
+
wakeupManager.acceptAudio(samples)
-
- if (state == VoiceState.UPLOADING ||
- state == VoiceState.PLAYING_PROMPT ||
- state == VoiceState.PLAYING_BACKEND
- ) return
-
- if (state == VoiceState.WAIT_SPEECH) {
- cachePreBuffer(samples)
- vadManager.accept(samples)
+ if (wakeupManager.consumeWakeupFlag()) {
+ handleWakeupEvent()
return
}
- if (state != VoiceState.RECORDING) return
-
- // ===== RECORDING =====
- audioBuffer.addAll(samples.asList())
- vadManager.accept(samples)
-
val now = System.currentTimeMillis()
- // 1️⃣ Max-recording-length fallback
- if (now - recordingStartTime >= maxRecordingSeconds * 1000) {
- Log.w(TAG, "⏱ Max recording reached")
- finishSentence()
- return
- }
+ when (state) {
- // 2️⃣ RMS-based end-of-silence detection
- val rms = calcRms(samples)
- if (rms < RMS_SILENCE_THRESHOLD) {
- if (silenceStartMs == 0L) silenceStartMs = now
- else if (now - silenceStartMs >= SILENCE_END_MS) {
- Log.d(TAG, "🔇 RMS silence end")
- finishSentence()
+ VoiceState.WAIT_WAKEUP,
+ VoiceState.PLAYING_PROMPT,
+ VoiceState.PLAYING_BACKEND,
+ VoiceState.UPLOADING -> return
+
+ VoiceState.WAIT_SPEECH_COOLDOWN -> {
+ if (now >= speechEnableAtMs) {
+ state = VoiceState.WAIT_SPEECH
+ }
+ return
}
- } else {
+
+ VoiceState.WAIT_SPEECH -> {
+
+ if (inKwsObserve) {
+ if (now - kwsObserveStartMs < KWS_OBSERVE_MS) return
+ inKwsObserve = false
+ }
+
+ vadManager.accept(samples)
+ }
+
+ VoiceState.RECORDING -> {
+
+ audioBuffer.addAll(samples.asList())
+ vadManager.accept(samples)
+
+ if (now - recordingStartMs > maxRecordingSeconds * 1000) {
+ Log.w(TAG, "⏱ Max recording reached")
+ finishSentence()
+ return
+ }
+
+ val rms = calcRms(samples)
+ if (rms < RMS_SILENCE_THRESHOLD) {
+ if (silenceStartMs == 0L) silenceStartMs = now
+ else if (now - silenceStartMs >= SILENCE_END_MS) {
+ Log.d(TAG, "🔇 RMS silence end")
+ finishSentence()
+ }
+ } else {
+ silenceStartMs = 0L
+ }
+ }
+ }
+ }
+
+ /* ================= Wakeup ================= */
+
+ private fun handleWakeupEvent() {
+ when (state) {
+
+ VoiceState.UPLOADING -> return
+
+ VoiceState.RECORDING,
+ VoiceState.PLAYING_BACKEND -> {
+ stopBackendAudio?.invoke()
+ enterWakeup(interrupt = true)
+ }
+
+ else -> enterWakeup(interrupt = false)
+ }
+ }
+
+ private fun enterWakeup(interrupt: Boolean) {
+
+ if (interrupt) {
+ audioBuffer.clear()
+ vadManager.reset()
+ vadStarted = false
silenceStartMs = 0L
}
+
+ inKwsObserve = true
+ kwsObserveStartMs = System.currentTimeMillis()
+
+ state = VoiceState.PLAYING_PROMPT
+ onWakeup()
}
/* ================= VAD START ================= */
+
private fun onVadStart() {
if (state != VoiceState.WAIT_SPEECH) return
- Log.d(TAG, "🎤 VAD START")
+ Log.d(TAG, "🎤 REAL VAD START")
+
vadStarted = true
- state = VoiceState.RECORDING
- recordingStartTime = System.currentTimeMillis()
+ recordingStartMs = System.currentTimeMillis()
silenceStartMs = 0L
+ audioBuffer.clear()
audioBuffer.addAll(preBuffer)
- preBuffer.clear()
+
+ state = VoiceState.RECORDING
}
/* ================= Finish recording ================= */
+
private fun finishSentence() {
- val speakTime = System.currentTimeMillis() - recordingStartTime
- if (!vadStarted || speakTime < MIN_SPEECH_DURATION_MS) {
- Log.d(TAG, "⛔ Speech too short, ignore")
- resetToWaitSpeech(refreshIdle = false)
+ val duration = System.currentTimeMillis() - recordingStartMs
+ if (!vadStarted || duration < MIN_SPEECH_MS) {
+ resetToWaitSpeech()
return
}
- val rmsFrames = calcRmsFrames(audioBuffer.toFloatArray(), frameSize = 320)
- val validFrames = rmsFrames.count { it >= RMS_SILENCE_THRESHOLD }
- val ratio = if (rmsFrames.isEmpty()) 0f else validFrames.toFloat() / rmsFrames.size
- Log.d(TAG, "RMS ratio=$ratio")
- if (ratio < MIN_SPEECH_RATIO) {
- Log.d(TAG, "❌ Not enough human voice (ratio=$ratio)")
- resetToWaitSpeech(refreshIdle = false)
+ val vadRatio = vadManager.speechRatio()
+ Log.d(TAG, "🎙 VAD speech ratio=$vadRatio")
+
+ if (vadRatio < 0.25f) {
+ Log.d(TAG, "❌ VAD says NOT human speech")
+ resetToWaitSpeech()
return
}
+ // ✅ One success; clear the failure timer
+ waitSpeechFailStartMs = 0L
+
val finalAudio = audioBuffer.toFloatArray()
audioBuffer.clear()
state = VoiceState.UPLOADING
- Log.d(TAG, "⬆ Upload audio len=${finalAudio.size}")
onFinalAudio(finalAudio)
}
/* ================= Playback callbacks ================= */
- fun onPlayStartPrompt() { state = VoiceState.PLAYING_PROMPT }
- fun onPlayEndPrompt() { state = VoiceState.WAIT_SPEECH; idleTimer = System.currentTimeMillis() }
- fun onPlayStartBackend() { state = VoiceState.PLAYING_BACKEND }
- fun onPlayEndBackend() { state = VoiceState.WAIT_SPEECH; idleTimer = System.currentTimeMillis() }
- /* ================= Upload callback ================= */
- fun onUploadFinished(success: Boolean) {
- if (state != VoiceState.UPLOADING) return
- state = if (success) VoiceState.PLAYING_BACKEND else VoiceState.WAIT_SPEECH
- idleTimer = System.currentTimeMillis()
+ fun onPlayStartPrompt() {
+ state = VoiceState.PLAYING_PROMPT
}
- /* ================= Idle ================= */
+ fun onPlayEndPrompt() {
+ speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
+ state = VoiceState.WAIT_SPEECH_COOLDOWN
+ }
+
+ fun onPlayStartBackend() {
+ state = VoiceState.PLAYING_BACKEND
+ }
+
+ fun onPlayEndBackend() {
+ speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
+ state = VoiceState.WAIT_SPEECH_COOLDOWN
+ }
+
+ /* ================= Upload callback (kept public) ================= */
+
+ fun onUploadFinished(success: Boolean) {
+ if (state != VoiceState.UPLOADING) return
+
+ state = if (success) {
+ VoiceState.PLAYING_BACKEND
+ } else {
+ speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
+ VoiceState.WAIT_SPEECH_COOLDOWN
+ }
+ }
+
+ /* ================= Idle timeout (key fix) ================= */
+
fun checkIdleTimeout() {
- // Do not count idle time while uploading
if (state != VoiceState.WAIT_SPEECH) return
+ if (waitSpeechFailStartMs == 0L) return
+
val now = System.currentTimeMillis()
- if (now - idleTimer > idleTimeoutSeconds * 1000) {
- Log.d(TAG, "⏱ Idle timeout reached, resetAll")
+ if (now - waitSpeechFailStartMs > idleTimeoutSeconds * 1000) {
+ Log.d(TAG, "⏱ WAIT_SPEECH continuous fail timeout")
resetAll()
+ waitSpeechFailStartMs = 0L
}
}
/* ================= Reset ================= */
- private fun resetToWaitSpeech(refreshIdle: Boolean = true) {
+
+ private fun resetToWaitSpeech() {
audioBuffer.clear()
- preBuffer.clear()
vadManager.reset()
vadStarted = false
silenceStartMs = 0L
state = VoiceState.WAIT_SPEECH
- if (refreshIdle) idleTimer = System.currentTimeMillis()
+
+ // ⭐ Only record the time of the first failure
+ if (waitSpeechFailStartMs == 0L) {
+ waitSpeechFailStartMs = System.currentTimeMillis()
+ }
}
private fun resetAll() {
@@ -190,36 +290,24 @@ class VoiceController(
}
fun release() {
- vadManager.reset()
wakeupManager.release()
+ vadManager.reset()
}
- /* ================= Utils ================= */
+ /* ================= Utils ================= */
+
private fun cachePreBuffer(samples: FloatArray) {
for (s in samples) {
preBuffer.addLast(s)
- if (preBuffer.size > PRE_BUFFER_SIZE) preBuffer.removeFirst()
+ if (preBuffer.size > PRE_BUFFER_SIZE) {
+ preBuffer.removeFirst()
+ }
}
}
private fun calcRms(audio: FloatArray): Float {
- if (audio.isEmpty()) return 0f
var sum = 0f
for (v in audio) sum += v * v
return sqrt(sum / audio.size)
}
-
- private fun calcRmsFrames(audio: FloatArray, frameSize: Int = 320): FloatArray {
- val rmsList = mutableListOf<Float>()
- var i = 0
- while (i < audio.size) {
- val end = minOf(i + frameSize, audio.size)
- val frame = audio.sliceArray(i until end)
- var sum = 0f
- for (v in frame) sum += v * v
- rmsList.add(sqrt(sum / frame.size))
- i += frameSize
- }
- return rmsList.toFloatArray()
- }
}
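Reviewer note on the RECORDING branch: it now combines three endpointing signals, the max-recording cap, the RMS silence window, and the VAD speech ratio checked in finishSentence(). The RMS part is isolated below as a self-contained sketch; the class name is illustrative, the thresholds mirror the patch, and the empty-frame guard that the patch drops from calcRms() is kept here as a defensive assumption:

```kotlin
import kotlin.math.sqrt

// Illustrative, self-contained version of the RMS-based endpointing used
// while RECORDING: a sentence ends once per-frame RMS stays below the
// threshold for silenceEndMs. Not repo code.
class RmsEndpointer(
    private val rmsSilenceThreshold: Float = 0.005f,
    private val silenceEndMs: Long = 1200L
) {
    private var silenceStartMs = 0L

    /** Returns true when the current frame completes the end-of-sentence silence. */
    fun isSentenceEnd(frame: FloatArray, nowMs: Long): Boolean {
        val rms = calcRms(frame)
        if (rms < rmsSilenceThreshold) {
            if (silenceStartMs == 0L) silenceStartMs = nowMs
            return nowMs - silenceStartMs >= silenceEndMs
        }
        silenceStartMs = 0L // any loud frame restarts the silence window
        return false
    }

    private fun calcRms(audio: FloatArray): Float {
        if (audio.isEmpty()) return 0f // guard kept here, removed in the patch
        var sum = 0f
        for (v in audio) sum += v * v
        return sqrt(sum / audio.size)
    }
}
```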
diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceState.kt b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceState.kt
index 004035d..cd8c829 100644
--- a/app/src/main/java/com/zs/smarthuman/sherpa/VoiceState.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/VoiceState.kt
@@ -11,5 +11,6 @@ enum class VoiceState {
WAIT_SPEECH, // waiting for the user to speak
RECORDING, // user is speaking
UPLOADING, // audio upload in progress
+ WAIT_SPEECH_COOLDOWN, // ⭐ cooldown after wakeup
PLAYING_BACKEND // playing audio returned by the backend
}
\ No newline at end of file
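A short summary of the intended transitions with the new state, as implemented in the VoiceController changes above; resetAll() is not shown in this patch, so the final arrow back to WAIT_WAKEUP is an assumption:

```kotlin
// WAIT_WAKEUP          --KWS hit------------------------> PLAYING_PROMPT
// PLAYING_PROMPT       --onPlayEndPrompt()--------------> WAIT_SPEECH_COOLDOWN
// WAIT_SPEECH_COOLDOWN --SPEECH_COOLDOWN_MS elapsed-----> WAIT_SPEECH
// WAIT_SPEECH          --VAD start---------------------> RECORDING
// RECORDING            --silence / max length----------> UPLOADING
//                        (or back to WAIT_SPEECH if too short or low VAD ratio)
// UPLOADING            --success-----------------------> PLAYING_BACKEND
// UPLOADING            --failure-----------------------> WAIT_SPEECH_COOLDOWN
// PLAYING_BACKEND      --onPlayEndBackend()------------> WAIT_SPEECH_COOLDOWN
// WAIT_SPEECH          --failures for idleTimeoutSeconds-> resetAll(), presumably WAIT_WAKEUP
```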
diff --git a/app/src/main/java/com/zs/smarthuman/sherpa/WakeupManager.kt b/app/src/main/java/com/zs/smarthuman/sherpa/WakeupManager.kt
index c7fcbce..0c7a72c 100644
--- a/app/src/main/java/com/zs/smarthuman/sherpa/WakeupManager.kt
+++ b/app/src/main/java/com/zs/smarthuman/sherpa/WakeupManager.kt
@@ -1,18 +1,18 @@
package com.zs.smarthuman.sherpa
import android.content.res.AssetManager
+import android.util.Log
import com.k2fsa.sherpa.onnx.*
-class WakeupManager(
- assetManager: AssetManager,
- private val onWakeup: () -> Unit
-) {
+class WakeupManager(assetManager: AssetManager, function: () -> Unit) {
+ private val TAG = "WakeupManager"
private val sampleRate = 16000
+
private val kws: KeywordSpotter
private var stream: OnlineStream? = null
- /** ⭐ Just-woke-up flag, used to discard the wake-word audio */
+ /** ⭐ Wakeup flag (can only be consumed once) */
private var justWokeUp = false
init {
@@ -29,15 +29,16 @@ class WakeupManager(
)
kws = KeywordSpotter(assetManager, config)
+ Log.d(TAG, "✅ KeywordSpotter initialized")
+
stream = kws.createStream()
- ?: error("Failed to create KWS stream")
+ require(stream != null) { "Failed to create KWS stream" }
+ Log.d(TAG, "✅ KWS stream created")
}
-
- /** ⭐ XiaoAI-style strategy: always feed, whether playing or recording */
+ /** ⭐ Always feed the KWS */
fun acceptAudio(samples: FloatArray) {
val s = stream ?: return
- // ⭐ Far-field / playback compensation (very important)
for (i in samples.indices) {
samples[i] *= 2.5f
}
@@ -47,15 +48,15 @@ class WakeupManager(
kws.decode(s)
val keyword = kws.getResult(s).keyword
if (keyword.isNotBlank()) {
+ Log.d(TAG, "🔥 KWS hit: $keyword")
justWokeUp = true
- onWakeup()
- kws.reset(s) // reset immediately and start a new round
+ kws.reset(s)
break
}
}
}
- /** Consumed once by the VAD */
+ /** ⭐ The only wakeup exit point */
fun consumeWakeupFlag(): Boolean {
val r = justWokeUp
justWokeUp = false
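Two reviewer observations on WakeupManager: the constructor callback (now named `function`) is no longer invoked, so wakeups are reported only through consumeWakeupFlag(); and acceptAudio() applies the 2.5x gain in place, so the same FloatArray later handed to the VAD and the recording buffer is already boosted. If that side effect is unintended, a non-mutating variant could look like the sketch below; the helper name and the clipping are assumptions, not repo code:

```kotlin
// Hypothetical helper: boost a copy for KWS and leave the caller's buffer
// untouched for VAD / recording; clipping to [-1, 1] is an added assumption.
private fun boostedCopy(samples: FloatArray, gain: Float = 2.5f): FloatArray =
    FloatArray(samples.size) { i -> (samples[i] * gain).coerceIn(-1f, 1f) }
```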
diff --git a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt
index 3c3bb02..23d3316 100644
--- a/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt
+++ b/app/src/main/java/com/zs/smarthuman/ui/MainActivity.kt
@@ -77,7 +77,7 @@ class MainActivity : BaseViewModelActivity()
private var voiceController: VoiceController? = null
private var audioRecord: AudioRecord? = null
private var isRecording = false
- private val audioSource = MediaRecorder.AudioSource.VOICE_RECOGNITION
+ private val audioSource = MediaRecorder.AudioSource.VOICE_COMMUNICATION
private val sampleRateInHz = 16000
private val channelConfig = AudioFormat.CHANNEL_IN_MONO
private val audioFormat = AudioFormat.ENCODING_PCM_16BIT
@@ -169,17 +169,17 @@ class MainActivity : BaseViewModelActivity()
},
onFinalAudio = { audio ->
Log.d("lrs", "检测到语音,长度=${audio.size}")
- mViewModel?.uploadVoice(
- AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)),
- 1
- )
-// loadLocalJsonAndPlay()
-// val file = File(
-// getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
-// "xxx.wav"
+// mViewModel?.uploadVoice(
+// AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)),
+// 1
// )
-// AudioDebugUtil.saveFloatPcmAsWav(audio, file)
-// LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
+ loadLocalJsonAndPlay()
+ val file = File(
+ getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
+ "xxx.wav"
+ )
+ AudioDebugUtil.saveFloatPcmAsWav(audio, file)
+ LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
},
onStateChanged = { state ->
@@ -261,7 +261,7 @@ class MainActivity : BaseViewModelActivity()
if (audioRecord?.state != AudioRecord.STATE_INITIALIZED) {
Log.e("VoiceService", "Failed to initialize AudioRecord")
}
- enableSystemAec(audioRecord!!)
+// enableSystemAec(audioRecord!!)
}
private var aec: AcousticEchoCanceler? = null
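Context for the audio-source switch: VOICE_COMMUNICATION routes capture through the platform's voice-processing path (AEC/NS where the device supports it), which is presumably why the manual enableSystemAec() call is commented out. The read loop itself is outside this hunk; a hedged sketch of how such a loop would feed VoiceController, with frame size and names as assumptions and `isRecording` reusing the existing MainActivity flag:

```kotlin
// Assumed capture loop (not repo code): 16 kHz mono PCM16 from AudioRecord,
// converted to floats in [-1, 1] before VoiceController.acceptAudio().
private fun captureLoop(record: AudioRecord, controller: VoiceController) {
    val buf = ShortArray(320) // 20 ms @ 16 kHz, an illustrative frame size
    while (isRecording) {
        val read = record.read(buf, 0, buf.size)
        if (read <= 0) continue
        val frame = FloatArray(read) { i -> buf[i] / 32768f }
        controller.acceptAudio(frame)
    }
}
```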