Temporary commit

林若思 2026-01-09 15:39:54 +08:00
parent 724c5d51e0
commit d01e43cd56
8 changed files with 8952 additions and 167 deletions

View File: AndroidManifest.xml

@@ -52,7 +52,7 @@
         tools:targetApi="31">
         <activity
-            android:name=".ui.MainActivity"
+            android:name=".ui.SplashActivity"
             android:exported="true"
             android:theme="@style/Theme.Splash"
             android:screenOrientation="portrait">
@@ -66,9 +66,9 @@
             </intent-filter>
         </activity>
-        <!-- <activity
+        <activity
             android:name="com.zs.smarthuman.ui.MainActivity"
-            android:screenOrientation="portrait"/>-->
+            android:screenOrientation="portrait"/>
         <activity
             android:name="com.zs.smarthuman.ui.ActivateActivity"
             android:screenOrientation="portrait"/>

View File: README.md

@@ -0,0 +1,4 @@
# Introduction

The model in this directory was converted from
https://huggingface.co/ASLP-lab/WSYue-ASR/tree/main/u2pp_conformer_yue
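
For reference, the loader added in SimulateStreamingAsr.kt later in this commit resolves the converted files through relative asset paths, so the expected layout under the app's assets is (inferred from the model config below, not stated in this README):

sherpa-onnx-wenetspeech-yue-u2pp-conformer-ctc-zh-en-cantonese-int8-2025-09-10/
├── model.int8.onnx
└── tokens.txt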

View File: VoiceController.kt

@@ -2,10 +2,6 @@ package com.zs.smarthuman.sherpa

 import android.content.res.AssetManager
 import com.blankj.utilcode.util.LogUtils
-import kotlinx.coroutines.GlobalScope
-import kotlinx.coroutines.Job
-import kotlinx.coroutines.delay
-import kotlinx.coroutines.launch
 import java.util.ArrayDeque

 class VoiceController(
@@ -29,6 +25,22 @@ class VoiceController(
             onStateChanged?.invoke(value)
         }

+    // ========== Backfilled variables: realtime energy and frame statistics ==========
+    // Realtime energy statistics
+    private var realtimeEnergySum = 0f
+    private var realtimeEnergyCount = 0
+    private var realtimePeakRms = 0f
+    // Realtime frame statistics
+    private var realtimeTotalFrames = 0
+    private var realtimeSpeechFrames = 0
+    private var realtimeContinuousSpeechFrames = 0
+    private var realtimeLastFrameIsSpeech = false
+    // Multi-person dialogue detection flag
+    private var isMultiPersonDialogueDetected = false
+    // Debounced-reset bookkeeping
+    private var lastInvalidResetMs = 0L
+    private val INVALID_RESET_DEBOUNCE_MS = 1500L
+
     private val wakeupManager = WakeupManager(assetManager, onWakeup)
     private val vadManager = VadManager(
         assetManager,
@@ -55,69 +67,56 @@ class VoiceController(
     private val idleTimeoutMs = idleTimeoutSeconds * 1000L
     private val maxRecordingMs = maxRecordingSeconds * 1000L

-    // ================= Per-scene dynamic coefficients + forced fallback config =================
+    // ================= Per-scene dynamic coefficients + forced fallback config (near-field tuning) =================
     private val BASELINE_WINDOW_SIZE = 50
     private val envNoiseBuffer = ArrayDeque<Float>(BASELINE_WINDOW_SIZE)
     private var currentEnvBaseline = 0.001f

-    // Forced fallback: minimum bar for normal speech
-    private val MIN_NORMAL_VOICE_ENERGY = 0.06f
-    private val MIN_NORMAL_VOICE_VAD_RATIO = 0.3f
+    // Forced fallback: minimum bar for normal speech (greatly lowered for near-field use)
+    private val MIN_NORMAL_VOICE_ENERGY = 0.03f
+    private val MIN_NORMAL_VOICE_VAD_RATIO = 0.2f

-    // Per-scene dynamic coefficients (very low in quiet environments)
-    private val BASELINE_QUIET_THRESHOLD = 0.005f // quiet-environment baseline threshold
-    private val SHORT_SPEECH_ENERGY_COEFF_QUIET = 2.0f // short-speech coefficient, quiet environment
-    private val SHORT_SPEECH_ENERGY_COEFF_NOISY = 3.0f // short-speech coefficient, noisy environment
-    private val LONG_SPEECH_ENERGY_COEFF_QUIET = 4.0f // long-speech coefficient, quiet environment
-    private val LONG_SPEECH_ENERGY_COEFF_NOISY = 6.0f // long-speech coefficient, noisy environment
-    private val SHORT_SPEECH_VAD_COEFF = 0.08f
-    private val LONG_SPEECH_VAD_COEFF = 0.15f
+    // Per-scene dynamic coefficients (very low in quiet environments, for near-field soft speech)
+    private val BASELINE_QUIET_THRESHOLD = 0.005f
+    private val SHORT_SPEECH_ENERGY_COEFF_QUIET = 1.5f
+    private val SHORT_SPEECH_ENERGY_COEFF_NOISY = 2.0f
+    private val LONG_SPEECH_ENERGY_COEFF_QUIET = 2.5f
+    private val LONG_SPEECH_ENERGY_COEFF_NOISY = 3.5f
+    private val SHORT_SPEECH_VAD_COEFF = 0.05f
+    private val LONG_SPEECH_VAD_COEFF = 0.10f
     private val SHORT_SPEECH_MIN_SCORE = 1
-    private val LONG_SPEECH_MIN_SCORE = 2
+    private val LONG_SPEECH_MIN_SCORE = 1

-    // Other filter parameters
-    private val MAX_FAR_FIELD_ENERGY = 0.03f
-    private val MIN_VALID_PEAK_AVG_RATIO = 0.8f
-    private val MIN_CONTINUOUS_FRAME_RATIO = 0.2f
+    // Other filter parameters (relaxed for near-field use)
+    private val MAX_FAR_FIELD_ENERGY = 0.015f
+    private val MIN_VALID_PEAK_AVG_RATIO = 0.5f
+    private val MIN_CONTINUOUS_FRAME_RATIO = 0.1f
     private val MAX_PEAK_POSITION_RATIO = 0.95f
-    private val MIN_EFFECTIVE_SPEECH_FRAMES = 5
+    private val MIN_EFFECTIVE_SPEECH_FRAMES = 3
     private val SHORT_SPEECH_MIN = 500L
     private val SHORT_SPEECH_MAX = 2000L

-    // ========== Core change: multi-person dialogue filter config (for 2+ speakers) ==========
-    private val MULTI_DIALOGUE_MIN_DURATION = 2500L // minimum multi-person dialogue duration (2.5 s)
-    private val MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO = 2.5f // multi-person peak/average ratio bounds
+    // ========== Core change: multi-person dialogue filter config ==========
+    private val MULTI_DIALOGUE_MIN_DURATION = 2500L
+    private val MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO = 2.5f
     private val MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO = 0.4f
-    private val MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO = 0.3f // multi-person continuous-frame ratio
-    private val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f // multi-person active-frame ratio
+    private val MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO = 0.3f
+    private val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f

-    // ========== New: variables for realtime statistics during recording ==========
-    // Energy statistics
-    private var realtimeEnergySum = 0f
-    private var realtimeEnergyCount = 0
-    private var realtimePeakRms = 0f
-    // Frame statistics (accumulated in real time)
-    private var realtimeTotalFrames = 0
-    private var realtimeSpeechFrames = 0
-    private var realtimeContinuousSpeechFrames = 0
-    private var realtimeLastFrameIsSpeech = false
-    // Realtime multi-person dialogue flag
-    private var isMultiPersonDialogueDetected = false
-    // Debounce variables
-    private var lastInvalidResetMs = 0L
-    private val INVALID_RESET_DEBOUNCE_MS = 1500L // no repeated reset within 1.5 s
+    // ========== Core tuning: near-field weak-voice filter config (key optimization) ==========
+    private val MIN_EFFECTIVE_VOICE_DURATION = 400L
+    private val MIN_VOICE_FRAME_RATIO = 0.08f
+    private val MIN_PEAK_ENERGY_RATIO = 1.5f
+    private val NORMAL_VOICE_ENERGY_THRESHOLD = 0.008f
+    private val MIN_CONTINUOUS_VOICE_FRAMES = 1

-    // ========== Core addition: flags distinguishing timeout types ==========
-    private var hasInvalidSpeech = false // whether invalid speech has occurred
-    private var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT // current timeout type
+    // ========== Core addition: MIN_EFFECTIVE_SPEECH_RMS constant ==========
+    private val MIN_EFFECTIVE_SPEECH_RMS = 0.0005f

-    // ========== Added: MIN_EFFECTIVE_SPEECH_RMS constant (aligned with VadManager) ==========
-    private val MIN_EFFECTIVE_SPEECH_RMS = 0.001f
-
-    // Playback wait timeout
-    private val PLAY_WAIT_TIMEOUT_MS = 3000L
-    private var playWaitJob: Job? = null
+    // ========== Core addition: invalid-speech flag + timeout type ==========
+    private var hasInvalidSpeech = false
+    private var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT

     /* ================= Audio entry point ================= */
     fun acceptAudio(samples: FloatArray) {
@@ -153,14 +152,12 @@ class VoiceController(
         if ((waitSpeechStartMs > 0 && now - waitSpeechStartMs >= idleTimeoutMs) ||
             (waitSpeechFailStartMs > 0 && now - waitSpeechFailStartMs >= idleTimeoutMs)
         ) {
-            // Core change: determine the timeout type before timing out
             currentTimeoutType = if (hasInvalidSpeech) {
                 TimeoutType.INVALID_SPEECH_TIMEOUT
             } else {
                 TimeoutType.IDLE_TIMEOUT
             }
             LogUtils.d(TAG, "⏱ WAIT_SPEECH timeout → WAIT_WAKEUP | timeout type: $currentTimeoutType")
-            // Fire the timeout-tip callback
             onTimeoutTip?.invoke(currentTimeoutType)
             resetAll()
             return
@@ -177,20 +174,15 @@ class VoiceController(
         vadManager.accept(samples)

         // ========== Core optimization: realtime computation during recording ==========
-        // 1. Calibrate the environment baseline in real time (adapts to mid-recording changes)
         calibrateEnvBaseline(samples)
-        // 2. Compute energy/peak values in real time
         updateRealtimeEnergy(samples)
-        // 3. Update frame statistics in real time
         updateRealtimeFrameStats()
-        // 4. Check for multi-person dialogue in real time; if detected, stop recording immediately
         if (checkMultiPersonDialogueRealtime(now)) {
             LogUtils.w(TAG, "🚨 Multi-person dialogue detected mid-recording; terminating early")
             finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms)
             return
         }
-        // Original max-recording-duration check
         if (System.currentTimeMillis() - recordingStartMs > maxRecordingMs) {
             LogUtils.w(TAG, "⏱ Max recording reached | current env baseline: $currentEnvBaseline")
             finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms)
@@ -199,10 +191,10 @@ class VoiceController(
         }
     }

-    /* ================= New: update realtime energy statistics during recording ================= */
+    /* ================= New: update realtime energy statistics during recording (near-field tuned) ================= */
     private fun updateRealtimeEnergy(samples: FloatArray) {
         val rms = vadManager.calcRms(samples)
-        // Only count the energy of valid speech frames
+        // Only count the energy of valid speech frames (lowered threshold)
         if (rms >= MIN_EFFECTIVE_SPEECH_RMS) {
             realtimeEnergySum += rms
             realtimeEnergyCount++
@@ -212,12 +204,10 @@ class VoiceController(
     /* ================= New: update realtime frame statistics during recording ================= */
     private fun updateRealtimeFrameStats() {
-        // Fetch the latest frame state from VadManager
         realtimeTotalFrames = vadManager.getTotalFrames()
         realtimeSpeechFrames = vadManager.getSpeechFrames()
         realtimeContinuousSpeechFrames = vadManager.getContinuousSpeechFrames()
-        // Update the continuous-frame flag in real time
-        val currentFrameIsSpeech = vadManager.isSpeechDetected() // requires a new isSpeechDetected() method on VadManager
+        val currentFrameIsSpeech = vadManager.isSpeechDetected()
         if (currentFrameIsSpeech) {
             realtimeContinuousSpeechFrames = if (realtimeLastFrameIsSpeech) realtimeContinuousSpeechFrames + 1 else 1
         } else {
@@ -228,17 +218,14 @@ class VoiceController(
     /* ================= New: realtime multi-person dialogue detection during recording ================= */
     private fun checkMultiPersonDialogueRealtime(now: Long): Boolean {
-        // Below the minimum multi-person dialogue duration; skip the check
         val duration = now - recordingStartMs
         if (duration < MULTI_DIALOGUE_MIN_DURATION) return false

-        // Compute feature values in real time
         val avgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else 0f
         val peakAvgRatio = if (avgEnergy > 0) realtimePeakRms / avgEnergy else 0f
         val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f
         val vadRatio = vadManager.activeSpeechRatio()

-        // Multi-person dialogue decision (same logic as before, now run in real time)
         isMultiPersonDialogueDetected = duration >= MULTI_DIALOGUE_MIN_DURATION &&
                 peakAvgRatio in MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO..MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO &&
                 continuousRatio <= MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO &&
@@ -247,21 +234,21 @@ class VoiceController(
         return isMultiPersonDialogueDetected
     }

-    /* ================= Environment baseline calibration (kept; also called during recording) ================= */
+    /* ================= Environment baseline calibration (near-field tuned, less noise-sensitive) ================= */
     private fun calibrateEnvBaseline(samples: FloatArray) {
         val rms = vadManager.calcRms(samples)
-        // New: keep only values below baseline + margin to filter sudden noise
-        val validRms = if (rms < currentEnvBaseline + 0.005f) rms else currentEnvBaseline
-        if (rms < 0.03f) {
+        // Keep only values below baseline + margin to filter sudden noise (lowered margin)
+        val validRms = if (rms < currentEnvBaseline + 0.002f) rms else currentEnvBaseline
+        if (rms < 0.015f) {
             if (envNoiseBuffer.size >= BASELINE_WINDOW_SIZE) {
                 envNoiseBuffer.removeFirst()
             }
-            envNoiseBuffer.addLast(validRms) // update with the filtered valid value
+            envNoiseBuffer.addLast(validRms)
             currentEnvBaseline = envNoiseBuffer.maxOrNull() ?: 0.001f
         }
     }

-    /* ================= Wakeup ================= */
+    /* ================= Wakeup-related methods ================= */
     private fun handleWakeupEvent() {
         if (state == VoiceState.UPLOADING) return
         stopBackendAudio?.invoke()
@@ -272,7 +259,6 @@ class VoiceController(
         waitSpeechFailStartMs = System.currentTimeMillis()
         waitSpeechStartMs = System.currentTimeMillis()

-        // Core addition: reset the invalid-speech flag on wakeup (each wakeup starts a new session)
         hasInvalidSpeech = false
         currentTimeoutType = TimeoutType.IDLE_TIMEOUT

@@ -280,7 +266,6 @@ class VoiceController(
         audioBuffer.clear()
         vadManager.reset()
         vadStarted = false
-        // Reset the realtime statistics variables
         resetRealtimeStats()
     }

@@ -297,7 +282,6 @@ class VoiceController(
         recordingStartMs = System.currentTimeMillis()
         audioBuffer.clear()
         audioBuffer.addAll(preBuffer)
-        // Initialize the realtime statistics variables
         resetRealtimeStats()
         state = VoiceState.RECORDING
     }
@@ -305,20 +289,65 @@ class VoiceController(
     private fun onVadEnd(avgEnergy: Float, peakRms: Float) {
         if (state != VoiceState.RECORDING) return
         LogUtils.d(TAG, "🧠 VAD END | env baseline: $currentEnvBaseline")
-        // Prefer the realtime energy statistics to avoid recomputation
         val realAvgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else avgEnergy
         val realPeakRms = if (realtimePeakRms > 0) realtimePeakRms else peakRms
         finishSentence(realAvgEnergy, realPeakRms)
     }

-    /* ================= Finish recording (core: reuse realtime results) ================= */
+    /* ================= Core optimization: near-field weak-voice filter ================= */
+    private fun filterWeakVoice(duration: Long, avgEnergy: Float, peakRms: Float): Boolean {
+        // 1. Duration filter: only very short noise (< 400 ms) is dropped
+        if (duration < MIN_EFFECTIVE_VOICE_DURATION) {
+            LogUtils.w(TAG, "❌ Weak-voice filter: duration ${duration}ms < ${MIN_EFFECTIVE_VOICE_DURATION}ms")
+            return true
+        }
+        // 2. Frame-ratio filter: applies only to very-low-energy speech
+        val voiceFrameRatio = if (realtimeTotalFrames > 0) realtimeSpeechFrames.toFloat() / realtimeTotalFrames else 0f
+        if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && voiceFrameRatio < MIN_VOICE_FRAME_RATIO) {
+            LogUtils.w(TAG, "❌ Weak-voice filter: frame ratio $voiceFrameRatio < $MIN_VOICE_FRAME_RATIO (very low energy)")
+            return true
+        }
+        // 3. Peak-energy filter: applies only to very-low-energy speech, with a much lower threshold
+        val peakBaselineRatio = peakRms / currentEnvBaseline
+        if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && peakBaselineRatio < MIN_PEAK_ENERGY_RATIO) {
+            LogUtils.w(TAG, "❌ Weak-voice filter: peak/baseline $peakBaselineRatio < $MIN_PEAK_ENERGY_RATIO (very low energy)")
+            return true
+        }
+        // 4. Continuous-frame filter: applies only to very-low-energy speech; threshold lowered to 1
+        if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && realtimeContinuousSpeechFrames < MIN_CONTINUOUS_VOICE_FRAMES) {
+            LogUtils.w(TAG, "❌ Weak-voice filter: continuous frames $realtimeContinuousSpeechFrames < $MIN_CONTINUOUS_VOICE_FRAMES (very low energy)")
+            return true
+        }
+        // 5. Average-energy filter: applies only to extremely-low-energy speech
+        val energyBaselineRatio = avgEnergy / currentEnvBaseline
+        if (avgEnergy < 0.005f && energyBaselineRatio < 1.2f) {
+            LogUtils.w(TAG, "❌ Weak-voice filter: energy/baseline $energyBaselineRatio < 1.2 (extremely low energy)")
+            return true
+        }
+        // Normal speech (including near-field soft speech) passes straight through
+        return false
+    }
+
+    /* ================= Finish recording (core: tuned for near-field soft speech) ================= */
     private fun finishSentence(avgEnergy: Float = 0f, peakRms: Float = 0f) {
         val now = System.currentTimeMillis()
         val duration = now - recordingStartMs

+        // ========== Step 1: basic filter (speech too short) ==========
         if (!vadStarted || duration < MIN_SPEECH_MS) {
             LogUtils.d(TAG, "❌ Speech too short: $duration ms | baseline: $currentEnvBaseline")
-            // Core addition: flag invalid speech (too short); set hasInvalidSpeech to true
+            hasInvalidSpeech = true
+            resetToWaitSpeech()
+            return
+        }
+
+        // ========== Step 2: dedicated weak-voice filter (drops only extremely weak noise) ==========
+        if (filterWeakVoice(duration, avgEnergy, peakRms)) {
             hasInvalidSpeech = true
             resetToWaitSpeech()
             return
@@ -328,44 +357,40 @@ class VoiceController(
         val vadRatio = vadManager.activeSpeechRatio()
         val peakAvgRatio = if (avgEnergy > 0f) peakRms / avgEnergy else 0f
-        // Reuse the realtime frame statistics directly; no need to fetch them again
         LogUtils.d(TAG, "📊 Recording info | duration: $duration ms | energy: $avgEnergy | peak/avg ratio: $peakAvgRatio | baseline: $currentEnvBaseline")
         LogUtils.d(TAG, "📊 Realtime frame stats | total: $realtimeTotalFrames | speech: $realtimeSpeechFrames | continuous speech: $realtimeContinuousSpeechFrames")

-        // If multi-person dialogue was already detected mid-recording, filter it out directly
+        // Multi-person dialogue filter
         if (isMultiPersonDialogueDetected) {
             LogUtils.w(TAG, "❌ Filtered multi-person dialogue garbage speech (realtime detection) | duration: $duration ms")
-            // Core addition: flag invalid speech (multi-person dialogue); set hasInvalidSpeech to true
             hasInvalidSpeech = true
             resetToWaitSpeech()
             return
         }

-        // ========== 1. Forced fallback: normal speech passes directly ==========
+        // ========== 1. Forced fallback: normal speech passes directly (lowered thresholds) ==========
         val isNormalVoice = avgEnergy >= MIN_NORMAL_VOICE_ENERGY && vadRatio >= MIN_NORMAL_VOICE_VAD_RATIO
         if (isNormalVoice) {
             LogUtils.i(TAG, "✅ Normal speech force-passed | energy: $avgEnergy ≥ $MIN_NORMAL_VOICE_ENERGY | ratio: $vadRatio ≥ $MIN_NORMAL_VOICE_VAD_RATIO")
             audioBuffer.clear()
             state = VoiceState.UPLOADING
             onFinalAudio(audio)
-            resetRealtimeStats() // reset the realtime statistics
-            // Core addition: after valid speech passes, reset the invalid-speech flag (timeouts are re-evaluated afterwards)
+            resetRealtimeStats()
             hasInvalidSpeech = false
             return
         }

-        // ========== 2. Far-field filter: drops only very low energy ==========
+        // ========== 2. Far-field filter (rarely triggers at near field) ==========
         val isFarField = avgEnergy < MAX_FAR_FIELD_ENERGY
         val isInvalidPeakRatio = peakAvgRatio < MIN_VALID_PEAK_AVG_RATIO
         if (isFarField && isInvalidPeakRatio) {
             LogUtils.w(TAG, "❌ Far-field/invalid speech filtered | energy: $avgEnergy < $MAX_FAR_FIELD_ENERGY")
-            // Core addition: flag invalid speech (far field); set hasInvalidSpeech to true
             hasInvalidSpeech = true
             resetToWaitSpeech()
             return
         }

-        // ========== 3. Discontinuity check: extremely lenient ==========
+        // ========== 3. Discontinuity check (greatly relaxed) ==========
         val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f
         val peakPositionRatio = vadManager.getPeakPositionRatio()
         val isDiscontinuous = continuousRatio < MIN_CONTINUOUS_FRAME_RATIO &&
@@ -373,13 +398,12 @@ class VoiceController(
                 peakPositionRatio > MAX_PEAK_POSITION_RATIO
         if (isDiscontinuous) {
             LogUtils.w(TAG, "❌ Discontinuous-noise filter | continuous ratio: $continuousRatio < $MIN_CONTINUOUS_FRAME_RATIO")
-            // Core addition: flag invalid speech (discontinuous noise); set hasInvalidSpeech to true
             hasInvalidSpeech = true
             resetToWaitSpeech()
             return
         }

-        // ========== 4. Per-scene dynamic threshold computation (core logic kept) ==========
+        // ========== 4. Per-scene dynamic threshold computation (coefficients greatly lowered) ==========
         val isQuietEnv = currentEnvBaseline < BASELINE_QUIET_THRESHOLD
         val thresholdConfig = when {
             duration in SHORT_SPEECH_MIN..SHORT_SPEECH_MAX -> {
@@ -406,18 +430,17 @@ class VoiceController(
             }
         }

-        // ========== 5. Per-scene threshold filter ==========
+        // ========== 5. Per-scene threshold filter (lowered thresholds) ==========
         val energyPass = avgEnergy >= thresholdConfig.energyThreshold
         val vadRatioPass = vadRatio >= thresholdConfig.vadRatioThreshold
         if (!energyPass || !vadRatioPass) {
             LogUtils.w(TAG, "❌ Low-energy speech threshold filter | energy: $avgEnergy < ${thresholdConfig.energyThreshold} | ratio: $vadRatio < ${thresholdConfig.vadRatioThreshold} | scene: ${thresholdConfig.scene}")
-            // Core addition: flag invalid speech (low energy); set hasInvalidSpeech to true
             hasInvalidSpeech = true
             resetToWaitSpeech()
             return
         }

-        // ========== 6. Score check: extremely lenient ==========
+        // ========== 6. Score check (minimum lowered to 1) ==========
         var score = 0
         score += when {
             duration >= 4000 -> 3
@@ -430,7 +453,6 @@ class VoiceController(
         val pass = score >= thresholdConfig.minScore
         if (!pass) {
             LogUtils.w(TAG, "❌ Insufficient-score filter | total: $score < ${thresholdConfig.minScore} | scene: ${thresholdConfig.scene}")
-            // Core addition: flag invalid speech (insufficient score); set hasInvalidSpeech to true
             hasInvalidSpeech = true
             resetToWaitSpeech()
             return
@@ -440,13 +462,12 @@ class VoiceController(
         audioBuffer.clear()
         state = VoiceState.UPLOADING
         onFinalAudio(audio)
-        resetRealtimeStats() // reset the realtime statistics
-        // Core addition: after valid speech passes, reset the invalid-speech flag
+        resetRealtimeStats()
         hasInvalidSpeech = false
-        LogUtils.i(TAG, "Low-energy speech passed | duration: $duration ms | energy: $avgEnergy | scene: ${thresholdConfig.scene}")
+        LogUtils.i(TAG, "Near-field soft speech passed | duration: $duration ms | energy: $avgEnergy | scene: ${thresholdConfig.scene}")
     }

-    /* ================= New: reset the realtime statistics variables ================= */
+    /* ================= Reset the realtime statistics variables ================= */
     private fun resetRealtimeStats() {
         realtimeEnergySum = 0f
         realtimeEnergyCount = 0
@@ -458,6 +479,7 @@ class VoiceController(
         isMultiPersonDialogueDetected = false
     }
+
     /* ================= Playback/upload/reset callbacks ================= */
     fun onPlayStartPrompt() {
         LogUtils.d(TAG, "🎵 Playing prompt tone | baseline: $currentEnvBaseline")
@@ -471,7 +493,6 @@ class VoiceController(
     }

     fun onPlayStartBackend() {
-        // Only switch state when the upload has finished (successfully) and the state is UPLOADING
         if (state != VoiceState.UPLOADING) {
             LogUtils.w(TAG, "🎶 Not in upload-finished state; refusing to switch to PLAYING_BACKEND | current state: $state")
             return
@@ -490,42 +511,12 @@ class VoiceController(
         if (state != VoiceState.UPLOADING) return
         LogUtils.d(TAG, "📤 Upload finished | success: $success | baseline: $currentEnvBaseline")

-        if (success) {
-            // Upload succeeded: start the coroutine timeout task
-            startPlayWaitTimer()
-        } else {
-            // Upload failed: cancel the timeout task and reset state
-            cancelPlayWaitTimer()
+        if (!success) {
             speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
             state = VoiceState.WAIT_SPEECH_COOLDOWN
         }
     }

-    private fun startPlayWaitTimer() {
-        // Cancel any previous task first to avoid duplicates
-        cancelPlayWaitTimer()
-        // Start the coroutine timeout task (Dispatchers.Main keeps the state change on the main thread)
-        playWaitJob = GlobalScope.launch {
-            delay(PLAY_WAIT_TIMEOUT_MS) // suspend for 3 s; no thread is blocked
-            LogUtils.w(TAG, "⏱ Playback wait timed out (${PLAY_WAIT_TIMEOUT_MS}ms); resetting state automatically")
-            // Reset state after the timeout (synchronized to avoid multi-thread conflicts)
-            synchronized(this@VoiceController) {
-                speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
-                state = VoiceState.WAIT_SPEECH_COOLDOWN
-            }
-        }
-    }
-
-    // ================= Replacement: cancel the coroutine task =================
-    private fun cancelPlayWaitTimer() {
-        playWaitJob?.cancel() // cancelling the coroutine stops the suspended delay immediately
-        playWaitJob = null
-        LogUtils.d(TAG, "🔄 Playback-wait coroutine cancelled")
-    }
-
     private fun resetToWaitSpeech() {
         LogUtils.d(TAG, "🔄 Reset to waiting-for-speech | baseline: $currentEnvBaseline | invalid speech flagged: $hasInvalidSpeech")
         val now = System.currentTimeMillis()
@@ -537,7 +528,7 @@ class VoiceController(
         audioBuffer.clear()
         vadManager.reset()
         vadStarted = false
-        resetRealtimeStats() // reset the realtime statistics
+        resetRealtimeStats()
         state = VoiceState.WAIT_SPEECH
         if (waitSpeechFailStartMs == 0L) waitSpeechFailStartMs = System.currentTimeMillis()
     }
@@ -553,13 +544,11 @@ class VoiceController(
         waitSpeechFailStartMs = 0L
         envNoiseBuffer.clear()
         currentEnvBaseline = 0.001f
-        resetRealtimeStats() // reset the realtime statistics
-        // Core addition: when resetting all state, also reset the invalid-speech flag and the timeout type
+        resetRealtimeStats()
         hasInvalidSpeech = false
        currentTimeoutType = TimeoutType.IDLE_TIMEOUT
         LogUtils.d(TAG, "🔄 Environment baseline reset | new baseline: $currentEnvBaseline | invalid-speech flag reset")
         state = VoiceState.WAIT_WAKEUP
-        cancelPlayWaitTimer()
     }

     fun release() {
@@ -567,11 +556,9 @@ class VoiceController(
         wakeupManager.release()
         vadManager.reset()
         envNoiseBuffer.clear()
-        resetRealtimeStats() // reset the realtime statistics
-        // Core addition: reset the flags when releasing resources
+        resetRealtimeStats()
         hasInvalidSpeech = false
         currentTimeoutType = TimeoutType.IDLE_TIMEOUT
-        cancelPlayWaitTimer()
     }

     private fun cachePreBuffer(samples: FloatArray) {
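
The filterWeakVoice() cascade above is deliberately tiered: only the duration check applies unconditionally, while the frame-ratio, peak/baseline, and continuous-frame checks fire only when avgEnergy is already below NORMAL_VOICE_ENERGY_THRESHOLD, so near-field soft speech is never rejected on those grounds. A minimal, self-contained Kotlin sketch of that cascade for unit testing; ClipStats and isWeakVoice are hypothetical names, with thresholds copied from the config in this commit:

// Hypothetical, test-only mirror of the filterWeakVoice() cascade.
data class ClipStats(
    val durationMs: Long,
    val avgEnergy: Float,
    val peakRms: Float,
    val envBaseline: Float,
    val totalFrames: Int,
    val speechFrames: Int,
    val continuousSpeechFrames: Int,
)

fun isWeakVoice(s: ClipStats): Boolean {
    if (s.durationMs < 400L) return true                           // MIN_EFFECTIVE_VOICE_DURATION
    val lowEnergy = s.avgEnergy < 0.008f                           // NORMAL_VOICE_ENERGY_THRESHOLD
    val frameRatio = if (s.totalFrames > 0) s.speechFrames.toFloat() / s.totalFrames else 0f
    if (lowEnergy && frameRatio < 0.08f) return true               // MIN_VOICE_FRAME_RATIO
    if (lowEnergy && s.peakRms / s.envBaseline < 1.5f) return true // MIN_PEAK_ENERGY_RATIO
    if (lowEnergy && s.continuousSpeechFrames < 1) return true     // MIN_CONTINUOUS_VOICE_FRAMES
    if (s.avgEnergy < 0.005f && s.avgEnergy / s.envBaseline < 1.2f) return true
    return false
}

fun main() {
    // A 600 ms near-field soft utterance passes; a 200 ms click fails the duration check.
    println(isWeakVoice(ClipStats(600, 0.010f, 0.040f, 0.002f, 60, 30, 12))) // false → kept
    println(isWeakVoice(ClipStats(200, 0.020f, 0.090f, 0.002f, 20, 3, 1)))   // true → filtered
}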

View File: MainActivity.kt

@@ -54,6 +54,7 @@ import com.zs.smarthuman.utils.AudioDebugUtil
 import com.zs.smarthuman.utils.AudioPcmUtil
 import com.zs.smarthuman.utils.DangerousUtils
 import com.zs.smarthuman.utils.LogFileUtils
+import com.zs.smarthuman.utils.SimulateStreamingAsr
 import com.zs.smarthuman.utils.UnityPlayerHolder
 import com.zs.smarthuman.utils.ViewSlideAnimator
@@ -86,6 +87,9 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
     private var versionUpdateDialog: VersionUpdateDialog? = null

+    private val PLAY_WAIT_TIMEOUT_MS = 2000L // unified 2-second timeout threshold
+    private var startPlayTimeoutJob: Job? = null // single Job managing the timeout for every playback scenario
+
     override fun getViewBinding(): ActivityMainBinding = ActivityMainBinding.inflate(layoutInflater)

     override fun initView() {
         UnityPlayerHolder.getInstance().initialize(this)
@@ -97,6 +101,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
     }

     override fun initData() {
+        initAsrModel()
         PermissionUtils.permissionGroup(PermissionConstants.MICROPHONE)
             .callback(object : PermissionUtils.FullCallback {
                 override fun onGranted(granted: List<String?>) {
@@ -154,6 +159,11 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
                 }
                 Toaster.showShort(it)
                 voiceController?.onUploadFinished(true)
+                startPlayTimeoutJob?.cancel()
+                startPlayTimeoutJob = lifecycleScope.launch {
+                    delay(PLAY_WAIT_TIMEOUT_MS)
+                    voiceController?.onPlayEndBackend()
+                }
             }
         }
     }
@@ -181,38 +191,42 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
         voiceController = VoiceController(
             assetManager = assets,
             onWakeup = {
+                Log.d("lrs", "Current state: wakeup succeeded")
                 // Stop whatever audio is playing before each wakeup
-//                UnityPlayerHolder.getInstance().cancelPCM()
+                UnityPlayerHolder.getInstance().cancelPCM()
                 UnityPlayerHolder.getInstance()
                     .sendVoiceToUnity(
                         voiceInfo = mutableListOf<VoiceBeanResp>().apply {
                             add(
                                 VoiceBeanResp(
-                                    audioUrl = UserInfoManager.userInfo?.wakeUpAudioUrl
-                                        ?: "https://static.seerteach.net/aidialogue/systemVoice/aliyun-nv.mp3"
+                                    audioUrl = "https://static.seerteach.net/aidialogue/systemVoice/aliyun-nv.mp3"
                                 )
                             )
                         }
                     )
+                startPlayTimeoutJob?.cancel()
+                startPlayTimeoutJob = lifecycleScope.launch {
+                    delay(PLAY_WAIT_TIMEOUT_MS)
+                    voiceController?.onPlayEndPrompt()
+                }
             },
             onFinalAudio = { audio ->
                 Log.d("lrsxx", "Speech detected, length=${audio.size}")
-                mViewModel?.uploadVoice(
-                    AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)),
-                    1
-                )
-//                loadLocalJsonAndPlay()
+//                mViewModel?.uploadVoice(
+//                    AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)),
+//                    1
+//                )
+                loadLocalJsonAndPlay()
                 val file = File(
                     getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
                     "xxx.wav"
                 )
                 AudioDebugUtil.saveFloatPcmAsWav(audio, file)
                 LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
-//                lifecycleScope.launch(Dispatchers.Main) {
-//
-//                    mVerticalAnimator?.show()
-//                }
+                lifecycleScope.launch(Dispatchers.Main) {
+                    mVerticalAnimator?.show()
+                }
             },

             onStateChanged = { state ->
@@ -233,7 +247,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
                 voiceInfo = mutableListOf<VoiceBeanResp>().apply {
                     add(
                         VoiceBeanResp(
-                            audioUrl = "https://static.seerteach.net/aidialogue/userWakeUpAudio/ttsmaker-file-2025-12-31-16-2-51.mp3"
+                            audioUrl = UserInfoManager.userInfo?.endAudioUrl ?: ""
                         )
                     )
                 }
@@ -244,6 +258,11 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
         )
     }

+    private fun initAsrModel() {
+        lifecycleScope.launch(Dispatchers.IO) {
+            SimulateStreamingAsr.initOfflineRecognizer(App.getInstance())
+        }
+    }
+
     override fun receivedIMMsg(msg: SingleMessage) {
         when (msg.msgContentType) {
             MessageContentType.RECEIVE_VOICE_STREAM.msgContentType -> {
@@ -415,9 +434,6 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
     private var promptPlaying = false
     private var backPlaying = false

-    private var promptTimeoutJob: Job? = null
-    private val PROMPT_PLAY_TIMEOUT_MS = 3000L // 3 seconds
-
     fun onAudioProgressUpdated( // Unity calls this to report audio progress
         progress: Float,
@@ -427,7 +443,6 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
         audioUrl: String
     ) {
         val wakeupUrl = UserInfoManager.userInfo?.wakeUpAudioUrl
-            ?: "https://static.seerteach.net/aidialogue/systemVoice/aliyun-nv.mp3"

         if (audioUrl != wakeupUrl) return

@@ -436,13 +451,8 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
         if (!promptPlaying) {
             promptPlaying = true
             voiceController?.onPlayStartPrompt()
-
-            promptTimeoutJob = lifecycleScope.launch {
-                delay(PROMPT_PLAY_TIMEOUT_MS)
-                promptPlaying = false
-                voiceController?.onPlayEndPrompt()
-                promptTimeoutJob?.cancel()
-            }
+            startPlayTimeoutJob?.cancel()
+            LogUtils.eTag("MainActivity", "Wait timed out")
         }
     }

@@ -450,7 +460,6 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
         if (promptPlaying) {
             promptPlaying = false
             voiceController?.onPlayEndPrompt()
-            promptTimeoutJob?.cancel()
         }
     }
 }
@@ -464,14 +473,15 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
         isFinal: Boolean
     ) {
         when (state) {
-            1 -> { // play
+            1 -> {
                 if (!backPlaying) {
                     backPlaying = true
                     voiceController?.onPlayStartBackend()
+                    startPlayTimeoutJob?.cancel()
                 }
             }
-            3 -> { // complete
+            3 -> {
                 if (backPlaying) {
                     backPlaying = false
                     voiceController?.onPlayEndBackend()
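
The MainActivity changes above consolidate the playback watchdog that this commit removes from VoiceController (the GlobalScope playWaitJob) into a single lifecycle-scoped startPlayTimeoutJob: every arming cancels the previous Job, waits PLAY_WAIT_TIMEOUT_MS, then fires the matching onPlayEnd* callback, while Unity's real start/complete events cancel it. A minimal sketch of that single-Job pattern; PlaybackWatchdog is a hypothetical helper, not part of this commit:

import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Job
import kotlinx.coroutines.delay
import kotlinx.coroutines.launch

// Re-arming cancels the previous wait, so only the latest playback is guarded
// and a stale timeout can never fire for an earlier prompt.
class PlaybackWatchdog(
    private val scope: CoroutineScope,   // e.g. lifecycleScope in an Activity
    private val timeoutMs: Long = 2000L, // mirrors PLAY_WAIT_TIMEOUT_MS above
) {
    private var job: Job? = null

    fun arm(onTimeout: () -> Unit) {
        job?.cancel() // at most one pending timeout at any time
        job = scope.launch {
            delay(timeoutMs)
            onTimeout() // e.g. voiceController?.onPlayEndPrompt()
        }
    }

    fun disarm() { // call when playback actually starts or completes
        job?.cancel()
        job = null
    }
}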

View File: SimulateStreamingAsr.kt

@@ -0,0 +1,155 @@
package com.zs.smarthuman.utils

import android.content.Context
import android.content.res.AssetManager
import com.blankj.utilcode.util.LogUtils
import com.k2fsa.sherpa.onnx.OfflineModelConfig
import com.k2fsa.sherpa.onnx.OfflineRecognizer
import com.k2fsa.sherpa.onnx.OfflineRecognizerConfig
import com.k2fsa.sherpa.onnx.OfflineWenetCtcModelConfig
import com.k2fsa.sherpa.onnx.Vad
import com.k2fsa.sherpa.onnx.getVadModelConfig
import java.io.File
import java.io.FileOutputStream
import java.io.InputStream
import java.io.OutputStream

fun assetExists(assetManager: AssetManager, path: String): Boolean {
    val dir = path.substringBeforeLast('/', "")
    val fileName = path.substringAfterLast('/')
    val files = assetManager.list(dir) ?: return false
    return files.contains(fileName)
}

fun copyAssetToInternalStorage(path: String, context: Context): String {
    val targetRoot = context.filesDir
    val outFile = File(targetRoot, path)

    if (!assetExists(context.assets, path = path)) {
        // For the context binary: if it does not exist, we return a path
        // that can be written to
        outFile.parentFile?.mkdirs()
        LogUtils.i("VoiceController", "$path does not exist, return ${outFile.absolutePath}")
        return outFile.absolutePath
    }

    if (outFile.exists()) {
        val assetSize = context.assets.open(path).use { it.available() }
        if (outFile.length() == assetSize.toLong()) {
            LogUtils.i("VoiceController", "$targetRoot/$path already exists, skip copying, return $targetRoot/$path")
            return "$targetRoot/$path"
        }
    }

    outFile.parentFile?.mkdirs()
    context.assets.open(path).use { input: InputStream ->
        FileOutputStream(outFile).use { output: OutputStream ->
            input.copyTo(output)
        }
    }

    LogUtils.i("VoiceController", "Copied $path to $targetRoot/$path")
    return outFile.absolutePath
}

object SimulateStreamingAsr {
    private var _recognizer: OfflineRecognizer? = null

    val recognizer: OfflineRecognizer
        get() {
            return _recognizer!!
        }

    fun initOfflineRecognizer(context: Context) {
        synchronized(this) {
            if (_recognizer != null) {
                return
            }

            val wenetConfig = OfflineWenetCtcModelConfig(
                model = "sherpa-onnx-wenetspeech-yue-u2pp-conformer-ctc-zh-en-cantonese-int8-2025-09-10/model.int8.onnx",
            )

            val modelConfig = OfflineModelConfig(
                wenetCtc = wenetConfig,
                tokens = "sherpa-onnx-wenetspeech-yue-u2pp-conformer-ctc-zh-en-cantonese-int8-2025-09-10/tokens.txt"
            )

            val config = OfflineRecognizerConfig(
                modelConfig = modelConfig,
            )

            var assetManager: AssetManager? = context.assets
            if (config.modelConfig.provider == "qnn") {
                // We assume you have copied files like libQnnHtpV81Skel.so to jniLibs/arm64-v8a
                LogUtils.i("VoiceController", "nativelibdir: ${context.applicationInfo.nativeLibraryDir}")

                // If we don't set the environment variable ADSP_LIBRARY_PATH, we will see
                // error code 1008 from qnn_interface.deviceCreate().
                // See also
                // https://workbench.aihub.qualcomm.com/docs/hub/faq.html#why-am-i-seeing-error-1008-when-trying-to-use-htp
                OfflineRecognizer.prependAdspLibraryPath(context.applicationInfo.nativeLibraryDir)

                // For qnn, we need to copy *.so files from the assets folder to the SD card
                if (config.modelConfig.senseVoice.qnnConfig.backendLib.isEmpty() && config.modelConfig.zipformerCtc.qnnConfig.backendLib.isEmpty()) {
                    LogUtils.i("VoiceController", "You should provide libQnnHtp.so for qnn")
                    throw IllegalArgumentException("You should provide libQnnHtp.so for qnn")
                }

                config.modelConfig.tokens =
                    copyAssetToInternalStorage(config.modelConfig.tokens, context)

                if (config.modelConfig.senseVoice.model.isNotEmpty() || assetExists(
                        context.assets,
                        path = config.modelConfig.senseVoice.qnnConfig.contextBinary
                    )
                ) {
                    if (config.modelConfig.senseVoice.model.isNotEmpty()) {
                        config.modelConfig.senseVoice.model =
                            copyAssetToInternalStorage(config.modelConfig.senseVoice.model, context)
                    }
                    config.modelConfig.senseVoice.qnnConfig.contextBinary =
                        copyAssetToInternalStorage(
                            config.modelConfig.senseVoice.qnnConfig.contextBinary,
                            context
                        )
                } else if (config.modelConfig.zipformerCtc.model.isNotEmpty()) {
                    config.modelConfig.zipformerCtc.model =
                        copyAssetToInternalStorage(config.modelConfig.zipformerCtc.model, context)
                    config.modelConfig.zipformerCtc.qnnConfig.contextBinary =
                        copyAssetToInternalStorage(
                            config.modelConfig.zipformerCtc.qnnConfig.contextBinary,
                            context
                        )
                }

                if (config.hr.lexicon.isNotEmpty()) {
                    config.hr.lexicon = copyAssetToInternalStorage(config.hr.lexicon, context)
                }
                if (config.hr.ruleFsts.isNotEmpty()) {
                    // This assumes there is only one FST; otherwise, copy each FST separately
                    config.hr.ruleFsts = copyAssetToInternalStorage(config.hr.ruleFsts, context)
                }

                assetManager = null
            }

            _recognizer = OfflineRecognizer(
                assetManager = assetManager,
                config = config,
            )
            LogUtils.i("VoiceController", "sherpa-onnx offline recognizer initialized")
        }
    }
}
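
Once initOfflineRecognizer() has completed (it is launched on Dispatchers.IO from initAsrModel() in MainActivity), a finished utterance can be decoded in one shot. A usage sketch, assuming the stock sherpa-onnx Kotlin API (createStream / acceptWaveform / decode / getResult) and 16 kHz float PCM such as the array delivered by VoiceController's onFinalAudio callback; decodeUtterance is a hypothetical helper, not part of this commit:

// Decode one complete utterance with the recognizer initialized above.
fun decodeUtterance(samples: FloatArray, sampleRate: Int = 16000): String {
    val recognizer = SimulateStreamingAsr.recognizer // throws if init has not finished yet
    val stream = recognizer.createStream()           // one stream per utterance
    stream.acceptWaveform(samples, sampleRate)
    recognizer.decode(stream)
    val text = recognizer.getResult(stream).text
    stream.release()                                 // free the native stream
    return text
}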