优化后的代码

This commit is contained in:
林若思 2025-12-31 14:43:10 +08:00
parent 997bfe0539
commit f8812b6a48
8 changed files with 258 additions and 147 deletions

View File

@ -180,6 +180,6 @@ dependencies {
implementation libs.androidautosize implementation libs.androidautosize
implementation files('libs/sherpa19.aar') implementation files('libs/sherpa-onnx-1.12.20.aar')
} }

View File

@ -52,7 +52,7 @@
tools:targetApi="31"> tools:targetApi="31">
<activity <activity
android:name=".ui.SplashActivity" android:name=".ui.MainActivity"
android:exported="true" android:exported="true"
android:theme="@style/Theme.Splash" android:theme="@style/Theme.Splash"
android:screenOrientation="portrait"> android:screenOrientation="portrait">
@ -66,9 +66,9 @@
</intent-filter> </intent-filter>
</activity> </activity>
<activity <!--<activity
android:name="com.zs.smarthuman.ui.MainActivity" android:name="com.zs.smarthuman.ui.MainActivity"
android:screenOrientation="portrait"/> android:screenOrientation="portrait"/>-->
<activity <activity
android:name="com.zs.smarthuman.ui.ActivateActivity" android:name="com.zs.smarthuman.ui.ActivateActivity"
android:screenOrientation="portrait"/> android:screenOrientation="portrait"/>

View File

@ -10,42 +10,63 @@ import com.k2fsa.sherpa.onnx.getVadModelConfig
* @date: 2025/12/17 10:22 * @date: 2025/12/17 10:22
*/ */
/**
 * Wraps the sherpa-onnx [Vad] and converts its per-frame speech flag into start/end
 * callbacks, while counting frames so callers can query the share that contained speech.
 *
 * This is the post-commit (right-hand) column of the flattened side-by-side diff, with
 * interval timing moved to a monotonic clock so wall-clock adjustments cannot shorten or
 * stretch the end-of-speech silence window.
 *
 * @param assetManager handed to the [Vad] constructor; not stored by this class.
 * @param onSpeechStart called on the first frame flagged as speech after a non-speaking period.
 * @param onSpeechEnd called once no frame has been flagged as speech for [END_SILENCE_MS].
 * @throws IllegalStateException from the initializer when `getVadModelConfig(1)` returns null.
 */
class VadManager(
    assetManager: AssetManager,
    private val onSpeechStart: () -> Unit,
    private val onSpeechEnd: () -> Unit
) {
    private val vad: Vad
    private var isSpeaking = false

    /** Monotonic timestamp (ms) of the most recent frame flagged as speech. */
    private var lastSpeechTime = 0L

    // Frame counters backing speechRatio().
    private var speechFrameCount = 0
    private var totalFrameCount = 0

    init {
        val config = getVadModelConfig(1)
            ?: throw IllegalStateException("VAD config not found")
        vad = Vad(assetManager, config)
    }

    /**
     * Feeds one audio frame to the detector and fires the callbacks on state changes.
     * (The original comment describes the input as 16 kHz float PCM.)
     */
    fun accept(samples: FloatArray) {
        val now = monotonicMillis()

        vad.acceptWaveform(samples)
        val hasSpeech = vad.isSpeechDetected()

        totalFrameCount++
        if (hasSpeech) {
            speechFrameCount++
            lastSpeechTime = now
            if (!isSpeaking) {
                isSpeaking = true
                onSpeechStart()
            }
        } else if (isSpeaking && now - lastSpeechTime >= END_SILENCE_MS) {
            isSpeaking = false
            onSpeechEnd()
            // The detector is cleared only when an utterance ends.
            vad.clear()
        }
    }

    /** Fraction of frames flagged as speech since the last [reset]; 0 when no frame was fed. */
    fun speechRatio(): Float =
        if (totalFrameCount == 0) 0f else speechFrameCount.toFloat() / totalFrameCount

    /** Clears the speaking flag, the timestamp and both counters, then resets the detector. */
    fun reset() {
        isSpeaking = false
        lastSpeechTime = 0L
        speechFrameCount = 0
        totalFrameCount = 0
        vad.reset()
    }

    /** Milliseconds from a monotonic source; only differences between two readings are meaningful. */
    private fun monotonicMillis(): Long = System.nanoTime() / 1_000_000L

    private companion object {
        /** Silence (ms) after the last speech frame before the utterance is considered finished. */
        const val END_SILENCE_MS = 600L
    }
}

View File

@ -8,15 +8,17 @@ class VoiceController(
assetManager: AssetManager, assetManager: AssetManager,
private val onWakeup: () -> Unit, private val onWakeup: () -> Unit,
private val onFinalAudio: (FloatArray) -> Unit, private val onFinalAudio: (FloatArray) -> Unit,
private val idleTimeoutSeconds: Int = 15, private val idleTimeoutSeconds: Int = 5,
private val maxRecordingSeconds: Int = 10, private val maxRecordingSeconds: Int = 10,
private val onStateChanged: ((VoiceState) -> Unit)? = null, private val onStateChanged: ((VoiceState) -> Unit)? = null,
private val stopBackendAudio: (() -> Unit)? = null private val stopBackendAudio: (() -> Unit)? = null
) { ) {
private val TAG = "VoiceController" private val TAG = "VoiceController"
private val sampleRate = 16000 private val sampleRate = 16000
/* ================= 状态 ================= */ /* ================= 状态 ================= */
private var state: VoiceState = VoiceState.WAIT_WAKEUP private var state: VoiceState = VoiceState.WAIT_WAKEUP
set(value) { set(value) {
field = value field = value
@ -24,73 +26,105 @@ class VoiceController(
onStateChanged?.invoke(value) onStateChanged?.invoke(value)
} }
/* ================= 唤醒 ================= */ /* ================= KWS ================= */
private val wakeupManager = WakeupManager(assetManager) { private val wakeupManager = WakeupManager(assetManager) {
Log.d(TAG, "🔥 WakeWord detected") Log.d(TAG, "🔥 WakeWord detected")
stopBackendAudio?.invoke() handleWakeupEvent()
if (state != VoiceState.UPLOADING) { // 上传中不重置
resetAll()
state = VoiceState.PLAYING_PROMPT
}
onWakeup()
} }
/* ================= VAD只负责 START ================= */ /* ================= VAD ================= */
private val vadManager = VadManager( private val vadManager = VadManager(
assetManager, assetManager,
onSpeechStart = { onVadStart() }, onSpeechStart = { onVadStart() },
onSpeechEnd = { /* 不再用于结束 */ } onSpeechEnd = {}
) )
/* ================= 音频缓存 ================= */ /* ================= Buffer ================= */
private val audioBuffer = mutableListOf<Float>() private val audioBuffer = mutableListOf<Float>()
/** 前导音缓存2 秒) */
private val preBuffer = ArrayDeque<Float>() private val preBuffer = ArrayDeque<Float>()
private val PRE_BUFFER_SIZE = sampleRate // 1 秒预缓冲 private val PRE_BUFFER_SIZE = sampleRate * 2
/* ================= 时间 ================= */ /* ================= 时间 ================= */
private var idleTimer = 0L
private var recordingStartTime = 0L private var recordingStartMs = 0L
private var silenceStartMs = 0L
/** ⭐ WAIT_SPEECH 连续失败起点(关键) */
private var waitSpeechFailStartMs = 0L
/* ================= 控制 ================= */
private var vadStarted = false private var vadStarted = false
/* ================= RMS 静音判定 ================= */ /** 唤醒观察期 */
private var silenceStartMs = 0L private var inKwsObserve = false
private val SILENCE_END_MS = 1200L // 静音多久算一句结束 private var kwsObserveStartMs = 0L
private val RMS_SILENCE_THRESHOLD = 0.005f // 更灵敏 private val KWS_OBSERVE_MS = 500L
private val MIN_SPEECH_DURATION_MS = 300L // 最短有效语音
private val MIN_SPEECH_RATIO = 0.15f // 有效帧占比至少 15% /** 播放冷却 */
private var speechEnableAtMs = 0L
private val SPEECH_COOLDOWN_MS = 300L
/* ================= 阈值 ================= */
private val RMS_SILENCE_THRESHOLD = 0.005f
private val SILENCE_END_MS = 1200L
private val MIN_SPEECH_MS = 300L
/* ================= 音频入口 ================= */ /* ================= 音频入口 ================= */
fun acceptAudio(samples: FloatArray) { fun acceptAudio(samples: FloatArray) {
// 唤醒独立处理,始终喂
wakeupManager.acceptAudio(samples)
if (state == VoiceState.UPLOADING ||
state == VoiceState.PLAYING_PROMPT ||
state == VoiceState.PLAYING_BACKEND
) return
if (state == VoiceState.WAIT_SPEECH) {
cachePreBuffer(samples) cachePreBuffer(samples)
vadManager.accept(samples)
wakeupManager.acceptAudio(samples)
if (wakeupManager.consumeWakeupFlag()) {
handleWakeupEvent()
return return
} }
if (state != VoiceState.RECORDING) return val now = System.currentTimeMillis()
when (state) {
VoiceState.WAIT_WAKEUP,
VoiceState.PLAYING_PROMPT,
VoiceState.PLAYING_BACKEND,
VoiceState.UPLOADING -> return
VoiceState.WAIT_SPEECH_COOLDOWN -> {
if (now >= speechEnableAtMs) {
state = VoiceState.WAIT_SPEECH
}
return
}
VoiceState.WAIT_SPEECH -> {
if (inKwsObserve) {
if (now - kwsObserveStartMs < KWS_OBSERVE_MS) return
inKwsObserve = false
}
vadManager.accept(samples)
}
VoiceState.RECORDING -> {
// ===== RECORDING =====
audioBuffer.addAll(samples.asList()) audioBuffer.addAll(samples.asList())
vadManager.accept(samples) vadManager.accept(samples)
val now = System.currentTimeMillis() if (now - recordingStartMs > maxRecordingSeconds * 1000) {
// 1⃣ 最大录音兜底
if (now - recordingStartTime >= maxRecordingSeconds * 1000) {
Log.w(TAG, "⏱ Max recording reached") Log.w(TAG, "⏱ Max recording reached")
finishSentence() finishSentence()
return return
} }
// 2⃣ RMS 静音结束判定
val rms = calcRms(samples) val rms = calcRms(samples)
if (rms < RMS_SILENCE_THRESHOLD) { if (rms < RMS_SILENCE_THRESHOLD) {
if (silenceStartMs == 0L) silenceStartMs = now if (silenceStartMs == 0L) silenceStartMs = now
@ -102,82 +136,148 @@ class VoiceController(
silenceStartMs = 0L silenceStartMs = 0L
} }
} }
}
}
/* ================= 唤醒 ================= */
/**
 * Reacts to a detected wake word.
 *
 * Ignored while an upload is in flight. When the wake word arrives during recording or
 * backend playback, the backend audio is stopped first and the wake-up is treated as an
 * interruption; in every other state it is a plain wake-up.
 */
private fun handleWakeupEvent() {
    if (state == VoiceState.UPLOADING) return

    val interrupting = state == VoiceState.RECORDING || state == VoiceState.PLAYING_BACKEND
    if (interrupting) {
        stopBackendAudio?.invoke()
    }
    enterWakeup(interrupt = interrupting)
}
/**
 * Moves the controller into the prompt-playing state after a wake word.
 *
 * @param interrupt when true, the in-progress capture (buffer, VAD state, silence timer)
 *   is discarded before the prompt starts.
 */
private fun enterWakeup(interrupt: Boolean) {
    if (interrupt) {
        vadStarted = false
        silenceStartMs = 0L
        audioBuffer.clear()
        vadManager.reset()
    }

    // NOTE(review): the observe window is timed from this moment, but it is only checked
    // once the state is WAIT_SPEECH again — confirm it has not already elapsed by then.
    kwsObserveStartMs = System.currentTimeMillis()
    inKwsObserve = true

    // The state change is published before the wake-up callback runs.
    state = VoiceState.PLAYING_PROMPT
    onWakeup()
}
/* ================= VAD START ================= */ /* ================= VAD START ================= */
/**
 * VAD callback: speech has started. Only honoured while waiting for speech.
 *
 * Post-commit column of the flattened diff: seeds the recording buffer with the
 * pre-roll audio, empties the pre-roll, and switches to RECORDING last so the
 * state-change listener sees fully initialised recording state.
 */
private fun onVadStart() {
    if (state != VoiceState.WAIT_SPEECH) return

    Log.d(TAG, "🎤 REAL VAD START")
    vadStarted = true
    recordingStartMs = System.currentTimeMillis()
    silenceStartMs = 0L

    // Start from an empty buffer so samples from an earlier attempt cannot leak in.
    audioBuffer.clear()
    audioBuffer.addAll(preBuffer)
    preBuffer.clear()

    state = VoiceState.RECORDING
}
/* ================= 结束录音 ================= */ /* ================= 结束录音 ================= */
/**
 * Ends the current recording and decides whether it is worth uploading.
 *
 * Post-commit column of the flattened diff. The capture is discarded (back to
 * WAIT_SPEECH) when VAD never fired, when it is shorter than MIN_SPEECH_MS, or when the
 * VAD speech-frame ratio is below the threshold. Otherwise the buffered samples are
 * handed to onFinalAudio after the state has switched to UPLOADING.
 */
private fun finishSentence() {
    // Minimum share of VAD speech frames for the capture to count as human speech.
    val minVadSpeechRatio = 0.25f

    val duration = System.currentTimeMillis() - recordingStartMs
    if (!vadStarted || duration < MIN_SPEECH_MS) {
        resetToWaitSpeech()
        return
    }

    val vadRatio = vadManager.speechRatio()
    Log.d(TAG, "🎙 VAD speech ratio=$vadRatio")
    if (vadRatio < minVadSpeechRatio) {
        Log.d(TAG, "❌ VAD says NOT human speech")
        resetToWaitSpeech()
        return
    }

    // A successful capture clears the consecutive-failure timer.
    waitSpeechFailStartMs = 0L

    val finalAudio = audioBuffer.toFloatArray()
    audioBuffer.clear()
    state = VoiceState.UPLOADING
    Log.d(TAG, "⬆ Upload audio len=${finalAudio.size}")
    onFinalAudio(finalAudio)
}
/* ================= 播放回调 ================= */ /* ================= 播放回调 ================= */
/** Playback callback: the wake-up prompt has started playing. */
fun onPlayStartPrompt() {
    state = VoiceState.PLAYING_PROMPT
}

/**
 * Playback callback: the wake-up prompt has finished. Speech detection is re-enabled
 * only after SPEECH_COOLDOWN_MS, via the WAIT_SPEECH_COOLDOWN state.
 */
fun onPlayEndPrompt() {
    speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
    state = VoiceState.WAIT_SPEECH_COOLDOWN
}

/** Playback callback: the backend's audio response has started playing. */
fun onPlayStartBackend() {
    state = VoiceState.PLAYING_BACKEND
}

/**
 * Playback callback: the backend's audio response has finished. Uses the same cool-down
 * as the end of the prompt before listening for speech again.
 */
fun onPlayEndBackend() {
    speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
    state = VoiceState.WAIT_SPEECH_COOLDOWN
}
/* ================= 上传回调(保留 public ================= */
/**
 * Upload callback. Has no effect unless the controller is currently UPLOADING.
 *
 * @param success true moves straight to PLAYING_BACKEND; false arms the speech cool-down
 *   and moves to WAIT_SPEECH_COOLDOWN.
 */
fun onUploadFinished(success: Boolean) {
    if (state != VoiceState.UPLOADING) return

    if (success) {
        state = VoiceState.PLAYING_BACKEND
        return
    }

    // The cool-down deadline is set before the state change is published.
    speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
    state = VoiceState.WAIT_SPEECH_COOLDOWN
}
/* ================= Idle 超时(关键修复) ================= */
/**
 * Resets the whole controller when speech capture has kept failing for longer than
 * idleTimeoutSeconds.
 *
 * Post-commit column of the flattened diff. The timer only runs in WAIT_SPEECH and only
 * after at least one failed capture has set waitSpeechFailStartMs.
 * NOTE(review): with no failed capture at all the timeout never fires — confirm this is intended.
 */
fun checkIdleTimeout() {
    if (state != VoiceState.WAIT_SPEECH) return
    if (waitSpeechFailStartMs == 0L) return

    val now = System.currentTimeMillis()
    // Long arithmetic: an Int product would overflow for very large timeouts.
    val timeoutMs = idleTimeoutSeconds * 1000L
    if (now - waitSpeechFailStartMs > timeoutMs) {
        Log.d(TAG, "WAIT_SPEECH continuous fail timeout")
        resetAll()
        waitSpeechFailStartMs = 0L
    }
}
/* ================= Reset ================= */ /* ================= Reset ================= */
/**
 * Discards the current capture and returns to WAIT_SPEECH.
 *
 * Post-commit column of the flattened diff (no parameters). The failure timer is started
 * only if it is not already running, so consecutive failures are measured from the first one.
 */
private fun resetToWaitSpeech() {
    audioBuffer.clear()
    preBuffer.clear()
    vadManager.reset()
    vadStarted = false
    silenceStartMs = 0L
    state = VoiceState.WAIT_SPEECH

    // Record the start time on the first failure only.
    if (waitSpeechFailStartMs == 0L) {
        waitSpeechFailStartMs = System.currentTimeMillis()
    }
}
private fun resetAll() { private fun resetAll() {
@ -190,36 +290,24 @@ class VoiceController(
} }
/**
 * Shuts the controller down: releases the wake-word manager, then resets the VAD manager.
 * (Post-commit ordering from the flattened diff.)
 */
fun release() {
    wakeupManager.release()
    vadManager.reset()
}
/* ================= 工具 ================= */ /* ================= Utils ================= */
/**
 * Appends [samples] to the pre-roll buffer, evicting the oldest samples so that at most
 * PRE_BUFFER_SIZE of the most recent ones are retained.
 */
private fun cachePreBuffer(samples: FloatArray) {
    for (sample in samples) {
        preBuffer.addLast(sample)
        if (preBuffer.size > PRE_BUFFER_SIZE) {
            preBuffer.removeFirst()
        }
    }
}
/**
 * Root-mean-square amplitude of [audio].
 *
 * @return sqrt(mean(x^2)); 0 for an empty array, which would otherwise evaluate 0/0 and
 *   yield NaN (a NaN compares false against any threshold).
 */
private fun calcRms(audio: FloatArray): Float {
    if (audio.isEmpty()) return 0f

    var sumOfSquares = 0f
    for (sample in audio) sumOfSquares += sample * sample
    return sqrt(sumOfSquares / audio.size)
}
/**
 * Splits [audio] into consecutive frames of [frameSize] samples (the final frame may be
 * shorter) and returns the RMS amplitude of each frame, in order.
 *
 * @param audio samples to analyse; an empty array yields an empty result.
 * @param frameSize samples per frame; must be positive.
 * @return one RMS value per frame.
 * @throws IllegalArgumentException if [frameSize] is zero or negative (such a value would
 *   otherwise keep the frame cursor from ever reaching the end of the input).
 */
private fun calcRmsFrames(audio: FloatArray, frameSize: Int = 320): FloatArray {
    require(frameSize > 0) { "frameSize must be positive, was $frameSize" }

    val fullFrames = audio.size / frameSize
    val frameCount = if (audio.size % frameSize == 0) fullFrames else fullFrames + 1

    return FloatArray(frameCount) { frameIndex ->
        val start = frameIndex * frameSize
        // Computed as a length rather than an end index so a huge frameSize cannot overflow.
        val length = minOf(frameSize, audio.size - start)
        var sumOfSquares = 0f
        // Index straight into the source array instead of copying each frame.
        for (i in start until start + length) sumOfSquares += audio[i] * audio[i]
        sqrt(sumOfSquares / length)
    }
}
} }

View File

@ -11,5 +11,6 @@ enum class VoiceState {
WAIT_SPEECH, // 等待用户说话 WAIT_SPEECH, // 等待用户说话
RECORDING, // 用户正在说话 RECORDING, // 用户正在说话
UPLOADING, //音频上传中 UPLOADING, //音频上传中
WAIT_SPEECH_COOLDOWN, // ⭐ 唤醒后冷却
PLAYING_BACKEND // 播放后台返回音频 PLAYING_BACKEND // 播放后台返回音频
} }

View File

@ -1,18 +1,18 @@
package com.zs.smarthuman.sherpa package com.zs.smarthuman.sherpa
import android.content.res.AssetManager import android.content.res.AssetManager
import android.util.Log
import com.k2fsa.sherpa.onnx.* import com.k2fsa.sherpa.onnx.*
class WakeupManager( class WakeupManager(assetManager: AssetManager, function: () -> Unit) {
assetManager: AssetManager,
private val onWakeup: () -> Unit
) {
private val TAG = "WakeupManager"
private val sampleRate = 16000 private val sampleRate = 16000
private val kws: KeywordSpotter private val kws: KeywordSpotter
private var stream: OnlineStream? = null private var stream: OnlineStream? = null
/** ⭐ 刚唤醒标记,用来丢弃唤醒词音频 */ /** ⭐ 唤醒标记(只能消费一次) */
private var justWokeUp = false private var justWokeUp = false
init { init {
@ -29,15 +29,16 @@ class WakeupManager(
) )
kws = KeywordSpotter(assetManager, config) kws = KeywordSpotter(assetManager, config)
Log.d(TAG, "✅ KeywordSpotter initialized")
stream = kws.createStream() stream = kws.createStream()
?: error("Failed to create KWS stream") require(stream != null) { "Failed to create KWS stream" }
Log.d(TAG, "✅ KWS stream created")
} }
/** ⭐ 永远喂 KWS */
/** ⭐ 小爱同学策略:不管播放还是录音,永远喂 */
fun acceptAudio(samples: FloatArray) { fun acceptAudio(samples: FloatArray) {
val s = stream ?: return val s = stream ?: return
// ⭐ 远讲 / 播放补偿(非常关键)
for (i in samples.indices) { for (i in samples.indices) {
samples[i] *= 2.5f samples[i] *= 2.5f
} }
@ -47,15 +48,15 @@ class WakeupManager(
kws.decode(s) kws.decode(s)
val keyword = kws.getResult(s).keyword val keyword = kws.getResult(s).keyword
if (keyword.isNotBlank()) { if (keyword.isNotBlank()) {
Log.d(TAG, "🔥 KWS hit: $keyword")
justWokeUp = true justWokeUp = true
onWakeup() kws.reset(s)
kws.reset(s) // 立刻 reset进入新一轮
break break
} }
} }
} }
/** 被 VAD 消费一次 */ /** ⭐ 唯一唤醒出口 */
fun consumeWakeupFlag(): Boolean { fun consumeWakeupFlag(): Boolean {
val r = justWokeUp val r = justWokeUp
justWokeUp = false justWokeUp = false

View File

@ -77,7 +77,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
private var voiceController: VoiceController? = null private var voiceController: VoiceController? = null
private var audioRecord: AudioRecord? = null private var audioRecord: AudioRecord? = null
private var isRecording = false private var isRecording = false
private val audioSource = MediaRecorder.AudioSource.VOICE_RECOGNITION private val audioSource = MediaRecorder.AudioSource.VOICE_COMMUNICATION
private val sampleRateInHz = 16000 private val sampleRateInHz = 16000
private val channelConfig = AudioFormat.CHANNEL_IN_MONO private val channelConfig = AudioFormat.CHANNEL_IN_MONO
private val audioFormat = AudioFormat.ENCODING_PCM_16BIT private val audioFormat = AudioFormat.ENCODING_PCM_16BIT
@ -169,17 +169,17 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
}, },
onFinalAudio = { audio -> onFinalAudio = { audio ->
Log.d("lrs", "检测到语音,长度=${audio.size}") Log.d("lrs", "检测到语音,长度=${audio.size}")
mViewModel?.uploadVoice( // mViewModel?.uploadVoice(
AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)), // AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)),
1 // 1
)
// loadLocalJsonAndPlay()
// val file = File(
// getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
// "xxx.wav"
// ) // )
// AudioDebugUtil.saveFloatPcmAsWav(audio, file) loadLocalJsonAndPlay()
// LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}") val file = File(
getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
"xxx.wav"
)
AudioDebugUtil.saveFloatPcmAsWav(audio, file)
LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
}, },
onStateChanged = { state -> onStateChanged = { state ->
@ -261,7 +261,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
if (audioRecord?.state != AudioRecord.STATE_INITIALIZED) { if (audioRecord?.state != AudioRecord.STATE_INITIALIZED) {
Log.e("VoiceService", "Failed to initialize AudioRecord") Log.e("VoiceService", "Failed to initialize AudioRecord")
} }
enableSystemAec(audioRecord!!) // enableSystemAec(audioRecord!!)
} }
private var aec: AcousticEchoCanceler? = null private var aec: AcousticEchoCanceler? = null