Temporary commit
This commit is contained in:
parent 724c5d51e0
commit d01e43cd56
@@ -52,7 +52,7 @@
         tools:targetApi="31">

         <activity
-            android:name=".ui.MainActivity"
+            android:name=".ui.SplashActivity"
             android:exported="true"
             android:theme="@style/Theme.Splash"
             android:screenOrientation="portrait">
@@ -66,9 +66,9 @@
             </intent-filter>
         </activity>

-        <!-- <activity
+        <activity
             android:name="com.zs.smarthuman.ui.MainActivity"
-            android:screenOrientation="portrait"/>-->
+            android:screenOrientation="portrait"/>
         <activity
             android:name="com.zs.smarthuman.ui.ActivateActivity"
             android:screenOrientation="portrait"/>
@@ -0,0 +1,4 @@
+# Introduction
+
+The model in this directory is converted from
+https://huggingface.co/ASLP-lab/WSYue-ASR/tree/main/u2pp_conformer_yue
Binary file not shown.
File diff suppressed because it is too large.
@@ -2,10 +2,6 @@ package com.zs.smarthuman.sherpa

 import android.content.res.AssetManager
 import com.blankj.utilcode.util.LogUtils
-import kotlinx.coroutines.GlobalScope
-import kotlinx.coroutines.Job
-import kotlinx.coroutines.delay
-import kotlinx.coroutines.launch
 import java.util.ArrayDeque

 class VoiceController(
@@ -29,6 +25,22 @@ class VoiceController(
             onStateChanged?.invoke(value)
         }

+    // ========== Previously missing fields: real-time energy and frame statistics ==========
+    // Real-time energy statistics
+    private var realtimeEnergySum = 0f
+    private var realtimeEnergyCount = 0
+    private var realtimePeakRms = 0f
+    // Real-time frame statistics
+    private var realtimeTotalFrames = 0
+    private var realtimeSpeechFrames = 0
+    private var realtimeContinuousSpeechFrames = 0
+    private var realtimeLastFrameIsSpeech = false
+    // Multi-speaker dialogue detection flag
+    private var isMultiPersonDialogueDetected = false
+    // Debounced-reset bookkeeping
+    private var lastInvalidResetMs = 0L
+    private val INVALID_RESET_DEBOUNCE_MS = 1500L
+
     private val wakeupManager = WakeupManager(assetManager, onWakeup)
     private val vadManager = VadManager(
         assetManager,
@@ -55,69 +67,56 @@ class VoiceController(
     private val idleTimeoutMs = idleTimeoutSeconds * 1000L
     private val maxRecordingMs = maxRecordingSeconds * 1000L

-    // ================= Scene-specific dynamic coefficients + hard fallback config =================
+    // ================= Scene-specific dynamic coefficients + hard fallback config (close-range tuning) =================
     private val BASELINE_WINDOW_SIZE = 50
     private val envNoiseBuffer = ArrayDeque<Float>(BASELINE_WINDOW_SIZE)
     private var currentEnvBaseline = 0.001f

-    // Hard fallback: minimum bar for normal speech
-    private val MIN_NORMAL_VOICE_ENERGY = 0.06f
-    private val MIN_NORMAL_VOICE_VAD_RATIO = 0.3f
+    // Hard fallback: minimum bar for normal speech (much lower for close-range use)
+    private val MIN_NORMAL_VOICE_ENERGY = 0.03f
+    private val MIN_NORMAL_VOICE_VAD_RATIO = 0.2f

-    // Scene-specific dynamic coefficients (very low in quiet environments)
-    private val BASELINE_QUIET_THRESHOLD = 0.005f // baseline threshold for a quiet environment
-    private val SHORT_SPEECH_ENERGY_COEFF_QUIET = 2.0f // short-speech coefficient, quiet environment
-    private val SHORT_SPEECH_ENERGY_COEFF_NOISY = 3.0f // short-speech coefficient, noisy environment
-    private val LONG_SPEECH_ENERGY_COEFF_QUIET = 4.0f // long-speech coefficient, quiet environment
-    private val LONG_SPEECH_ENERGY_COEFF_NOISY = 6.0f // long-speech coefficient, noisy environment
-    private val SHORT_SPEECH_VAD_COEFF = 0.08f
-    private val LONG_SPEECH_VAD_COEFF = 0.15f
+    // Scene-specific dynamic coefficients (very low in quiet environments, adapted to quiet close-range speech)
+    private val BASELINE_QUIET_THRESHOLD = 0.005f
+    private val SHORT_SPEECH_ENERGY_COEFF_QUIET = 1.5f
+    private val SHORT_SPEECH_ENERGY_COEFF_NOISY = 2.0f
+    private val LONG_SPEECH_ENERGY_COEFF_QUIET = 2.5f
+    private val LONG_SPEECH_ENERGY_COEFF_NOISY = 3.5f
+    private val SHORT_SPEECH_VAD_COEFF = 0.05f
+    private val LONG_SPEECH_VAD_COEFF = 0.10f
     private val SHORT_SPEECH_MIN_SCORE = 1
-    private val LONG_SPEECH_MIN_SCORE = 2
+    private val LONG_SPEECH_MIN_SCORE = 1

-    // Other filter parameters
-    private val MAX_FAR_FIELD_ENERGY = 0.03f
-    private val MIN_VALID_PEAK_AVG_RATIO = 0.8f
-    private val MIN_CONTINUOUS_FRAME_RATIO = 0.2f
+    // Other filter parameters (relaxed for close-range use)
+    private val MAX_FAR_FIELD_ENERGY = 0.015f
+    private val MIN_VALID_PEAK_AVG_RATIO = 0.5f
+    private val MIN_CONTINUOUS_FRAME_RATIO = 0.1f
     private val MAX_PEAK_POSITION_RATIO = 0.95f
-    private val MIN_EFFECTIVE_SPEECH_FRAMES = 5
+    private val MIN_EFFECTIVE_SPEECH_FRAMES = 3
     private val SHORT_SPEECH_MIN = 500L
     private val SHORT_SPEECH_MAX = 2000L

-    // ========== Core change: multi-speaker dialogue filter config (for 2+ speakers) ==========
-    private val MULTI_DIALOGUE_MIN_DURATION = 2500L // minimum multi-speaker duration (2.5 s)
-    private val MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO = 2.5f // peak-to-average ratio range
+    // ========== Core change: multi-speaker dialogue filter config ==========
+    private val MULTI_DIALOGUE_MIN_DURATION = 2500L
+    private val MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO = 2.5f
     private val MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO = 0.4f
-    private val MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO = 0.3f // continuous-frame share in multi-speaker dialogue
-    private val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f // active-frame share in multi-speaker dialogue
+    private val MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO = 0.3f
+    private val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f

-    // ========== New: statistics updated in real time while recording ==========
-    // Energy statistics
-    private var realtimeEnergySum = 0f
-    private var realtimeEnergyCount = 0
-    private var realtimePeakRms = 0f
-    // Frame statistics (accumulated in real time)
-    private var realtimeTotalFrames = 0
-    private var realtimeSpeechFrames = 0
-    private var realtimeContinuousSpeechFrames = 0
-    private var realtimeLastFrameIsSpeech = false
-    // Real-time multi-speaker dialogue flag
-    private var isMultiPersonDialogueDetected = false
-    // Debounce variables
-    private var lastInvalidResetMs = 0L
-    private val INVALID_RESET_DEBOUNCE_MS = 1500L // no repeated reset within 1.5 s
-
-    // ========== Core addition: flags distinguishing timeout types ==========
-    private var hasInvalidSpeech = false // whether any invalid speech occurred
-    private var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT // current timeout type
-
-    // ========== Supplement: MIN_EFFECTIVE_SPEECH_RMS constant (aligned with VadManager) ==========
-    private val MIN_EFFECTIVE_SPEECH_RMS = 0.001f
-
-    // Playback wait timeout
-    private val PLAY_WAIT_TIMEOUT_MS = 3000L
-    private var playWaitJob: Job? = null
+    // ========== Core tuning: weak-voice filter config for close-range scenes (main optimization) ==========
+    private val MIN_EFFECTIVE_VOICE_DURATION = 400L
+    private val MIN_VOICE_FRAME_RATIO = 0.08f
+    private val MIN_PEAK_ENERGY_RATIO = 1.5f
+    private val NORMAL_VOICE_ENERGY_THRESHOLD = 0.008f
+    private val MIN_CONTINUOUS_VOICE_FRAMES = 1
+
+    // ========== Core addition: MIN_EFFECTIVE_SPEECH_RMS constant ==========
+    private val MIN_EFFECTIVE_SPEECH_RMS = 0.0005f
+
+    // ========== Core addition: invalid-speech flag + timeout type ==========
+    private var hasInvalidSpeech = false
+    private var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT

     /* ================= Audio entry point ================= */
     fun acceptAudio(samples: FloatArray) {
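How the scene-specific coefficients above combine is easiest to see in isolation. A minimal sketch (the helper below is illustrative and not part of this commit; the constants mirror the fields above):

    // Sketch: pick the energy coefficient by speech length and ambient noise.
    fun pickEnergyCoeff(durationMs: Long, baseline: Float): Float {
        val quiet = baseline < 0.005f          // BASELINE_QUIET_THRESHOLD
        val short = durationMs in 500L..2000L  // SHORT_SPEECH_MIN..SHORT_SPEECH_MAX
        return when {
            short && quiet -> 1.5f             // SHORT_SPEECH_ENERGY_COEFF_QUIET
            short          -> 2.0f             // SHORT_SPEECH_ENERGY_COEFF_NOISY
            quiet          -> 2.5f             // LONG_SPEECH_ENERGY_COEFF_QUIET
            else           -> 3.5f             // LONG_SPEECH_ENERGY_COEFF_NOISY
        }
    }
    // The pass bar is then roughly: avgEnergy >= currentEnvBaseline * pickEnergyCoeff(...)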
@@ -153,14 +152,12 @@
         if ((waitSpeechStartMs > 0 && now - waitSpeechStartMs >= idleTimeoutMs) ||
             (waitSpeechFailStartMs > 0 && now - waitSpeechFailStartMs >= idleTimeoutMs)
         ) {
-            // Core change: classify the timeout type before acting on it
             currentTimeoutType = if (hasInvalidSpeech) {
                 TimeoutType.INVALID_SPEECH_TIMEOUT
             } else {
                 TimeoutType.IDLE_TIMEOUT
             }
             LogUtils.d(TAG, "⏱ WAIT_SPEECH timeout → WAIT_WAKEUP | timeout type: $currentTimeoutType")
-            // Fire the timeout-prompt callback
             onTimeoutTip?.invoke(currentTimeoutType)
             resetAll()
             return
@ -177,20 +174,15 @@ class VoiceController(
|
|||||||
vadManager.accept(samples)
|
vadManager.accept(samples)
|
||||||
|
|
||||||
// ========== 核心优化:录音过程中实时计算 ==========
|
// ========== 核心优化:录音过程中实时计算 ==========
|
||||||
// 1. 实时校准环境基线(适配录音中环境变化)
|
|
||||||
calibrateEnvBaseline(samples)
|
calibrateEnvBaseline(samples)
|
||||||
// 2. 实时计算能量/峰值
|
|
||||||
updateRealtimeEnergy(samples)
|
updateRealtimeEnergy(samples)
|
||||||
// 3. 实时更新帧统计
|
|
||||||
updateRealtimeFrameStats()
|
updateRealtimeFrameStats()
|
||||||
// 4. 实时判定是否为多人对话,若是则立即终止录音
|
|
||||||
if (checkMultiPersonDialogueRealtime(now)) {
|
if (checkMultiPersonDialogueRealtime(now)) {
|
||||||
LogUtils.w(TAG, "🚨 录音中识别出多人对话,提前终止")
|
LogUtils.w(TAG, "🚨 录音中识别出多人对话,提前终止")
|
||||||
finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms)
|
finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// 原有最大录音时长判断
|
|
||||||
if (System.currentTimeMillis() - recordingStartMs > maxRecordingMs) {
|
if (System.currentTimeMillis() - recordingStartMs > maxRecordingMs) {
|
||||||
LogUtils.w(TAG, "⏱ Max recording reached | 当前环境基线: $currentEnvBaseline")
|
LogUtils.w(TAG, "⏱ Max recording reached | 当前环境基线: $currentEnvBaseline")
|
||||||
finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms)
|
finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms)
|
||||||
@@ -199,10 +191,10 @@
         }
     }

-    /* ================= New: update energy statistics in real time while recording ================= */
+    /* ================= New: update energy statistics in real time while recording (adapted to quiet close-range speech) ================= */
     private fun updateRealtimeEnergy(samples: FloatArray) {
         val rms = vadManager.calcRms(samples)
-        // Only count the energy of valid speech frames
+        // Only count the energy of valid speech frames (lowered threshold)
         if (rms >= MIN_EFFECTIVE_SPEECH_RMS) {
             realtimeEnergySum += rms
             realtimeEnergyCount++
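VadManager.calcRms() itself is not part of this diff; a conventional RMS over one float PCM frame, which is presumably what it computes, looks like this (sketch only, assuming samples normalized to [-1, 1]):

    import kotlin.math.sqrt

    // Sketch only: root-mean-square energy of one audio frame.
    fun calcRms(samples: FloatArray): Float {
        if (samples.isEmpty()) return 0f
        var sum = 0.0
        for (s in samples) sum += s.toDouble() * s
        return sqrt(sum / samples.size).toFloat()
    }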
@@ -212,12 +204,10 @@

     /* ================= New: update frame statistics in real time while recording ================= */
     private fun updateRealtimeFrameStats() {
-        // Fetch the latest frame state from VadManager
         realtimeTotalFrames = vadManager.getTotalFrames()
         realtimeSpeechFrames = vadManager.getSpeechFrames()
         realtimeContinuousSpeechFrames = vadManager.getContinuousSpeechFrames()
-        // Update the continuous-frame flag in real time
-        val currentFrameIsSpeech = vadManager.isSpeechDetected() // requires a new isSpeechDetected() method on VadManager
+        val currentFrameIsSpeech = vadManager.isSpeechDetected()
         if (currentFrameIsSpeech) {
             realtimeContinuousSpeechFrames = if (realtimeLastFrameIsSpeech) realtimeContinuousSpeechFrames + 1 else 1
         } else {
@@ -228,17 +218,14 @@

     /* ================= New: real-time multi-speaker dialogue check while recording ================= */
     private fun checkMultiPersonDialogueRealtime(now: Long): Boolean {
-        // Below the minimum multi-speaker duration, don't judge yet
         val duration = now - recordingStartMs
         if (duration < MULTI_DIALOGUE_MIN_DURATION) return false

-        // Compute the feature values in real time
         val avgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else 0f
         val peakAvgRatio = if (avgEnergy > 0) realtimePeakRms / avgEnergy else 0f
         val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f
         val vadRatio = vadManager.activeSpeechRatio()

-        // Multi-speaker decision logic (same as before, but evaluated in real time)
         isMultiPersonDialogueDetected = duration >= MULTI_DIALOGUE_MIN_DURATION &&
             peakAvgRatio in MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO..MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO &&
             continuousRatio <= MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO &&
@@ -247,21 +234,21 @@
         return isMultiPersonDialogueDetected
     }

-    /* ================= Environment baseline calibration (kept; also called while recording) ================= */
+    /* ================= Environment baseline calibration (adapted to close-range use, less noise-sensitive) ================= */
     private fun calibrateEnvBaseline(samples: FloatArray) {
         val rms = vadManager.calcRms(samples)
-        // New: only keep values below baseline + margin, filtering out noise bursts
-        val validRms = if (rms < currentEnvBaseline + 0.005f) rms else currentEnvBaseline
-        if (rms < 0.03f) {
+        // Only keep values below baseline + margin, filtering out noise bursts (lowered margin)
+        val validRms = if (rms < currentEnvBaseline + 0.002f) rms else currentEnvBaseline
+        if (rms < 0.015f) {
             if (envNoiseBuffer.size >= BASELINE_WINDOW_SIZE) {
                 envNoiseBuffer.removeFirst()
             }
-            envNoiseBuffer.addLast(validRms) // update with the filtered value
+            envNoiseBuffer.addLast(validRms)
             currentEnvBaseline = envNoiseBuffer.maxOrNull() ?: 0.001f
         }
     }

-    /* ================= Wakeup ================= */
+    /* ================= Wakeup-related methods ================= */
     private fun handleWakeupEvent() {
         if (state == VoiceState.UPLOADING) return
         stopBackendAudio?.invoke()
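Concrete numbers make the multi-speaker heuristic easier to audit (values invented purely for illustration):

    // Illustration only: a 3 s clip, avg RMS 0.02, peak RMS 0.04,
    // 40 speech frames with a longest continuous run of 10.
    val peakAvgRatio = 0.04f / 0.02f   // 2.0  → within 0.4..2.5
    val continuousRatio = 10f / 40f    // 0.25 → <= MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO (0.3)
    // If vadRatio >= 0.55 and duration >= 2500 ms as well, every condition holds
    // and the clip is treated as overlapping multi-speaker chatter: moderate
    // peaks, fragmented runs, but speech present most of the time.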
@@ -272,7 +259,6 @@
         waitSpeechFailStartMs = System.currentTimeMillis()
         waitSpeechStartMs = System.currentTimeMillis()

-        // Core addition: clear the invalid-speech flag on wakeup (each wakeup starts a fresh session)
         hasInvalidSpeech = false
         currentTimeoutType = TimeoutType.IDLE_TIMEOUT

@@ -280,7 +266,6 @@
         audioBuffer.clear()
         vadManager.reset()
         vadStarted = false
-        // Reset the real-time statistics
         resetRealtimeStats()
     }

@@ -297,7 +282,6 @@
         recordingStartMs = System.currentTimeMillis()
         audioBuffer.clear()
         audioBuffer.addAll(preBuffer)
-        // Initialize the real-time statistics
         resetRealtimeStats()
         state = VoiceState.RECORDING
     }
@@ -305,20 +289,65 @@
     private fun onVadEnd(avgEnergy: Float, peakRms: Float) {
         if (state != VoiceState.RECORDING) return
         LogUtils.d(TAG, "🧠 VAD END | env baseline: $currentEnvBaseline")
-        // Prefer the real-time energy statistics to avoid recomputing
         val realAvgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else avgEnergy
         val realPeakRms = if (realtimePeakRms > 0) realtimePeakRms else peakRms
         finishSentence(realAvgEnergy, realPeakRms)
     }

-    /* ================= Finish recording (core: reuse the real-time results) ================= */
+    /* ================= Core optimization: weak-voice filter for close-range scenes ================= */
+    private fun filterWeakVoice(duration: Long, avgEnergy: Float, peakRms: Float): Boolean {
+        // 1. Duration filter: only drop very short blips (< 400 ms)
+        if (duration < MIN_EFFECTIVE_VOICE_DURATION) {
+            LogUtils.w(TAG, "❌ Weak-voice filter: duration ${duration}ms < ${MIN_EFFECTIVE_VOICE_DURATION}ms")
+            return true
+        }
+
+        // 2. Frame-ratio filter: applies only to very-low-energy speech
+        val voiceFrameRatio = if (realtimeTotalFrames > 0) realtimeSpeechFrames.toFloat() / realtimeTotalFrames else 0f
+        if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && voiceFrameRatio < MIN_VOICE_FRAME_RATIO) {
+            LogUtils.w(TAG, "❌ Weak-voice filter: frame ratio $voiceFrameRatio < $MIN_VOICE_FRAME_RATIO (very low energy)")
+            return true
+        }
+
+        // 3. Peak-energy filter: applies only to very-low-energy speech, with a much lower threshold
+        val peakBaselineRatio = peakRms / currentEnvBaseline
+        if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && peakBaselineRatio < MIN_PEAK_ENERGY_RATIO) {
+            LogUtils.w(TAG, "❌ Weak-voice filter: peak/baseline $peakBaselineRatio < $MIN_PEAK_ENERGY_RATIO (very low energy)")
+            return true
+        }
+
+        // 4. Continuous-frame filter: applies only to very-low-energy speech; threshold lowered to 1
+        if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && realtimeContinuousSpeechFrames < MIN_CONTINUOUS_VOICE_FRAMES) {
+            LogUtils.w(TAG, "❌ Weak-voice filter: continuous frames $realtimeContinuousSpeechFrames < $MIN_CONTINUOUS_VOICE_FRAMES (very low energy)")
+            return true
+        }
+
+        // 5. Average-energy filter: applies only to extremely-low-energy speech
+        val energyBaselineRatio = avgEnergy / currentEnvBaseline
+        if (avgEnergy < 0.005f && energyBaselineRatio < 1.2f) {
+            LogUtils.w(TAG, "❌ Weak-voice filter: energy/baseline $energyBaselineRatio < 1.2 (extremely low energy)")
+            return true
+        }
+
+        // Normal speech (including quiet close-range speech) passes straight through
+        return false
+    }
+
+    /* ================= Finish recording (core: adapted to quiet close-range speech) ================= */
     private fun finishSentence(avgEnergy: Float = 0f, peakRms: Float = 0f) {
         val now = System.currentTimeMillis()
         val duration = now - recordingStartMs

+        // ========== Step 1: basic filter (speech too short) ==========
         if (!vadStarted || duration < MIN_SPEECH_MS) {
             LogUtils.d(TAG, "❌ Speech too short: $duration ms | baseline: $currentEnvBaseline")
-            // Core addition: invalid speech (too short), set hasInvalidSpeech to true
+            hasInvalidSpeech = true
+            resetToWaitSpeech()
+            return
+        }
+
+        // ========== Step 2: dedicated weak-voice filter (drops only extremely weak noise) ==========
+        if (filterWeakVoice(duration, avgEnergy, peakRms)) {
             hasInvalidSpeech = true
             resetToWaitSpeech()
             return
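A quick sanity pass over the new filter with invented numbers: a 600 ms clip with avg RMS 0.004, peak 0.006 and baseline 0.005 sits below NORMAL_VOICE_ENERGY_THRESHOLD (0.008), so the low-energy branches apply; peak/baseline = 1.2 < MIN_PEAK_ENERGY_RATIO (1.5), so the clip is dropped at step 3. Raising the peak to 0.01 (ratio 2.0) lets the same clip through to the later scoring stages, which is the intended behavior for quiet close-range speech.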
@@ -328,44 +357,40 @@
         val vadRatio = vadManager.activeSpeechRatio()
         val peakAvgRatio = if (avgEnergy > 0f) peakRms / avgEnergy else 0f

-        // Reuse the real-time frame statistics directly; no need to fetch them again
         LogUtils.d(TAG, "📊 Recording info | duration: $duration ms | energy: $avgEnergy | peak/avg: $peakAvgRatio | baseline: $currentEnvBaseline")
         LogUtils.d(TAG, "📊 Real-time frame stats | total: $realtimeTotalFrames | speech: $realtimeSpeechFrames | continuous speech: $realtimeContinuousSpeechFrames")

-        // If multi-speaker dialogue was already detected mid-recording, drop the clip
+        // Multi-speaker dialogue filter
         if (isMultiPersonDialogueDetected) {
             LogUtils.w(TAG, "❌ Dropping multi-speaker garbage audio (real-time detection) | duration: $duration ms")
-            // Core addition: invalid speech (multi-speaker), set hasInvalidSpeech to true
             hasInvalidSpeech = true
             resetToWaitSpeech()
             return
         }

-        // ========== 1. Hard fallback: normal speech passes straight through ==========
+        // ========== 1. Hard fallback: normal speech passes straight through (lowered thresholds) ==========
         val isNormalVoice = avgEnergy >= MIN_NORMAL_VOICE_ENERGY && vadRatio >= MIN_NORMAL_VOICE_VAD_RATIO
         if (isNormalVoice) {
             LogUtils.i(TAG, "✅ Normal speech force-passed | energy: $avgEnergy ≥ $MIN_NORMAL_VOICE_ENERGY | ratio: $vadRatio ≥ $MIN_NORMAL_VOICE_VAD_RATIO")
             audioBuffer.clear()
             state = VoiceState.UPLOADING
             onFinalAudio(audio)
-            resetRealtimeStats() // reset the real-time statistics
-            // Core addition: after valid speech passes, clear the invalid-speech flag (future timeouts re-classify)
+            resetRealtimeStats()
             hasInvalidSpeech = false
             return
         }

-        // ========== 2. Far-field filter: only drops very low energy ==========
+        // ========== 2. Far-field filter (rarely fires at close range) ==========
         val isFarField = avgEnergy < MAX_FAR_FIELD_ENERGY
         val isInvalidPeakRatio = peakAvgRatio < MIN_VALID_PEAK_AVG_RATIO
         if (isFarField && isInvalidPeakRatio) {
             LogUtils.w(TAG, "❌ Far-field/invalid speech filtered | energy: $avgEnergy < $MAX_FAR_FIELD_ENERGY")
-            // Core addition: invalid speech (far-field), set hasInvalidSpeech to true
             hasInvalidSpeech = true
             resetToWaitSpeech()
             return
         }

-        // ========== 3. Discontinuity check: extremely lenient ==========
+        // ========== 3. Discontinuity check (greatly relaxed) ==========
         val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f
         val peakPositionRatio = vadManager.getPeakPositionRatio()
         val isDiscontinuous = continuousRatio < MIN_CONTINUOUS_FRAME_RATIO &&
@@ -373,13 +398,12 @@
             peakPositionRatio > MAX_PEAK_POSITION_RATIO
         if (isDiscontinuous) {
             LogUtils.w(TAG, "❌ Discontinuous noise filtered | continuous ratio: $continuousRatio < $MIN_CONTINUOUS_FRAME_RATIO")
-            // Core addition: invalid speech (discontinuous noise), set hasInvalidSpeech to true
             hasInvalidSpeech = true
             resetToWaitSpeech()
             return
         }

-        // ========== 4. Scene-specific dynamic threshold computation (core logic kept) ==========
+        // ========== 4. Scene-specific dynamic threshold computation (coefficients greatly lowered) ==========
         val isQuietEnv = currentEnvBaseline < BASELINE_QUIET_THRESHOLD
         val thresholdConfig = when {
             duration in SHORT_SPEECH_MIN..SHORT_SPEECH_MAX -> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========== 5. 分场景阈值过滤 ==========
|
// ========== 5. 分场景阈值过滤(阈值降低) ==========
|
||||||
val energyPass = avgEnergy >= thresholdConfig.energyThreshold
|
val energyPass = avgEnergy >= thresholdConfig.energyThreshold
|
||||||
val vadRatioPass = vadRatio >= thresholdConfig.vadRatioThreshold
|
val vadRatioPass = vadRatio >= thresholdConfig.vadRatioThreshold
|
||||||
if (!energyPass || !vadRatioPass) {
|
if (!energyPass || !vadRatioPass) {
|
||||||
LogUtils.w(TAG, "❌ 低能量语音阈值过滤 | 能量: $avgEnergy < ${thresholdConfig.energyThreshold} | 占比: $vadRatio < ${thresholdConfig.vadRatioThreshold} | 场景: ${thresholdConfig.scene}")
|
LogUtils.w(TAG, "❌ 低能量语音阈值过滤 | 能量: $avgEnergy < ${thresholdConfig.energyThreshold} | 占比: $vadRatio < ${thresholdConfig.vadRatioThreshold} | 场景: ${thresholdConfig.scene}")
|
||||||
// 核心新增:无效说话(低能量),标记hasInvalidSpeech为true
|
|
||||||
hasInvalidSpeech = true
|
hasInvalidSpeech = true
|
||||||
resetToWaitSpeech()
|
resetToWaitSpeech()
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========== 6. 评分判定:极度宽松 ==========
|
// ========== 6. 评分判定(门槛降低到1) ==========
|
||||||
var score = 0
|
var score = 0
|
||||||
score += when {
|
score += when {
|
||||||
duration >= 4000 -> 3
|
duration >= 4000 -> 3
|
||||||
@ -430,7 +453,6 @@ class VoiceController(
|
|||||||
val pass = score >= thresholdConfig.minScore
|
val pass = score >= thresholdConfig.minScore
|
||||||
if (!pass) {
|
if (!pass) {
|
||||||
LogUtils.w(TAG, "❌ 评分不足过滤 | 总分: $score < ${thresholdConfig.minScore} | 场景: ${thresholdConfig.scene}")
|
LogUtils.w(TAG, "❌ 评分不足过滤 | 总分: $score < ${thresholdConfig.minScore} | 场景: ${thresholdConfig.scene}")
|
||||||
// 核心新增:无效说话(评分不足),标记hasInvalidSpeech为true
|
|
||||||
hasInvalidSpeech = true
|
hasInvalidSpeech = true
|
||||||
resetToWaitSpeech()
|
resetToWaitSpeech()
|
||||||
return
|
return
|
||||||
@ -440,13 +462,12 @@ class VoiceController(
|
|||||||
audioBuffer.clear()
|
audioBuffer.clear()
|
||||||
state = VoiceState.UPLOADING
|
state = VoiceState.UPLOADING
|
||||||
onFinalAudio(audio)
|
onFinalAudio(audio)
|
||||||
resetRealtimeStats() // 重置实时统计
|
resetRealtimeStats()
|
||||||
// 核心新增:有效语音通过后,重置无效说话标记
|
|
||||||
hasInvalidSpeech = false
|
hasInvalidSpeech = false
|
||||||
LogUtils.i(TAG, "✅ 低能量语音通过 | 时长: $duration ms | 能量: $avgEnergy | 场景: ${thresholdConfig.scene}")
|
LogUtils.i(TAG, "✅ 近距离轻声通过 | 时长: $duration ms | 能量: $avgEnergy | 场景: ${thresholdConfig.scene}")
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ================= 新增:重置实时统计变量 ================= */
|
/* ================= 重置实时统计变量 ================= */
|
||||||
private fun resetRealtimeStats() {
|
private fun resetRealtimeStats() {
|
||||||
realtimeEnergySum = 0f
|
realtimeEnergySum = 0f
|
||||||
realtimeEnergyCount = 0
|
realtimeEnergyCount = 0
|
||||||
@@ -458,6 +479,7 @@
         isMultiPersonDialogueDetected = false
     }

+
     /* ================= Playback/upload/reset callbacks ================= */
     fun onPlayStartPrompt() {
         LogUtils.d(TAG, "🎵 Playing prompt audio | baseline: $currentEnvBaseline")
@@ -471,7 +493,6 @@
     }

     fun onPlayStartBackend() {
-        // Only switch state when the upload finished (successfully) and we're still in UPLOADING
         if (state != VoiceState.UPLOADING) {
             LogUtils.w(TAG, "🎶 Not in the upload-finished state; refusing to switch to PLAYING_BACKEND | current state: $state")
             return
@@ -490,42 +511,12 @@
         if (state != VoiceState.UPLOADING) return
         LogUtils.d(TAG, "📤 Upload finished | success: $success | baseline: $currentEnvBaseline")

-        if (success) {
-            // Upload succeeded: start the coroutine timeout task
-            startPlayWaitTimer()
-        } else {
-            // Upload failed: cancel the timeout task and reset state
-            cancelPlayWaitTimer()
+        if (!success) {
             speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
             state = VoiceState.WAIT_SPEECH_COOLDOWN
         }
     }

-    private fun startPlayWaitTimer() {
-        // Cancel any previous task first to avoid duplicates
-        cancelPlayWaitTimer()
-
-        // Launch the coroutine timeout task
-        playWaitJob = GlobalScope.launch {
-            delay(PLAY_WAIT_TIMEOUT_MS) // suspends for 3 s without blocking a thread
-            LogUtils.w(TAG, "⏱ Play-wait timed out (${PLAY_WAIT_TIMEOUT_MS}ms), resetting state")
-
-            // Reset state after the timeout (synchronized to avoid races)
-            synchronized(this@VoiceController) {
-                speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
-                state = VoiceState.WAIT_SPEECH_COOLDOWN
-            }
-        }
-    }
-
-    // ================= Cancel the coroutine task =================
-    private fun cancelPlayWaitTimer() {
-        playWaitJob?.cancel() // cancelling stops the suspended delay immediately
-        playWaitJob = null
-        LogUtils.d(TAG, "🔄 Play-wait coroutine cancelled")
-    }
-
     private fun resetToWaitSpeech() {
         LogUtils.d(TAG, "🔄 Reset to wait-for-speech | baseline: $currentEnvBaseline | invalid speech flagged: $hasInvalidSpeech")
         val now = System.currentTimeMillis()
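The GlobalScope timer removed here ran on no particular lifecycle and was easy to leak; ownership of the play-wait timeout moves to MainActivity, which holds a single lifecycle-aware job (see the MainActivity hunks below). The replacement pattern in outline:

    // Caller-side sketch of the replacement (the concrete version is in the
    // MainActivity diff below): cancel-then-relaunch keeps one timer at a time,
    // and lifecycleScope cancels it automatically when the Activity is destroyed.
    startPlayTimeoutJob?.cancel()
    startPlayTimeoutJob = lifecycleScope.launch {
        delay(PLAY_WAIT_TIMEOUT_MS)
        voiceController?.onPlayEndBackend()   // force-finish if playback never reports back
    }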
@@ -537,7 +528,7 @@
         audioBuffer.clear()
         vadManager.reset()
         vadStarted = false
-        resetRealtimeStats() // reset the real-time statistics
+        resetRealtimeStats()
         state = VoiceState.WAIT_SPEECH
         if (waitSpeechFailStartMs == 0L) waitSpeechFailStartMs = System.currentTimeMillis()
     }
@@ -553,13 +544,11 @@
         waitSpeechFailStartMs = 0L
         envNoiseBuffer.clear()
         currentEnvBaseline = 0.001f
-        resetRealtimeStats() // reset the real-time statistics
-        // Core addition: when resetting all state, also clear the invalid-speech flag and timeout type
+        resetRealtimeStats()
         hasInvalidSpeech = false
         currentTimeoutType = TimeoutType.IDLE_TIMEOUT
         LogUtils.d(TAG, "🔄 Env baseline reset | new baseline: $currentEnvBaseline | invalid-speech flag cleared")
         state = VoiceState.WAIT_WAKEUP
-        cancelPlayWaitTimer()
     }

     fun release() {
@@ -567,11 +556,9 @@
         wakeupManager.release()
         vadManager.reset()
         envNoiseBuffer.clear()
-        resetRealtimeStats() // reset the real-time statistics
-        // Core addition: clear the flags when releasing resources
+        resetRealtimeStats()
         hasInvalidSpeech = false
         currentTimeoutType = TimeoutType.IDLE_TIMEOUT
-        cancelPlayWaitTimer()
     }

     private fun cachePreBuffer(samples: FloatArray) {
@@ -54,6 +54,7 @@ import com.zs.smarthuman.utils.AudioDebugUtil
 import com.zs.smarthuman.utils.AudioPcmUtil
 import com.zs.smarthuman.utils.DangerousUtils
 import com.zs.smarthuman.utils.LogFileUtils
+import com.zs.smarthuman.utils.SimulateStreamingAsr
 import com.zs.smarthuman.utils.UnityPlayerHolder
 import com.zs.smarthuman.utils.ViewSlideAnimator
@@ -86,6 +87,9 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()

     private var versionUpdateDialog: VersionUpdateDialog? = null

+    private val PLAY_WAIT_TIMEOUT_MS = 2000L // unified 2 s timeout threshold
+    private var startPlayTimeoutJob: Job? = null // one Job managing the timeout for every playback scenario
+
     override fun getViewBinding(): ActivityMainBinding = ActivityMainBinding.inflate(layoutInflater)
     override fun initView() {
         UnityPlayerHolder.getInstance().initialize(this)
@ -97,6 +101,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
|
|||||||
}
|
}
|
||||||
|
|
||||||
override fun initData() {
|
override fun initData() {
|
||||||
|
initAsrModel()
|
||||||
PermissionUtils.permissionGroup(PermissionConstants.MICROPHONE)
|
PermissionUtils.permissionGroup(PermissionConstants.MICROPHONE)
|
||||||
.callback(object : PermissionUtils.FullCallback {
|
.callback(object : PermissionUtils.FullCallback {
|
||||||
override fun onGranted(granted: List<String?>) {
|
override fun onGranted(granted: List<String?>) {
|
||||||
@ -154,6 +159,11 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
|
|||||||
}
|
}
|
||||||
Toaster.showShort(it)
|
Toaster.showShort(it)
|
||||||
voiceController?.onUploadFinished(true)
|
voiceController?.onUploadFinished(true)
|
||||||
|
startPlayTimeoutJob?.cancel()
|
||||||
|
startPlayTimeoutJob = lifecycleScope.launch {
|
||||||
|
delay(PLAY_WAIT_TIMEOUT_MS)
|
||||||
|
voiceController?.onPlayEndBackend()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -181,38 +191,42 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
|
|||||||
voiceController = VoiceController(
|
voiceController = VoiceController(
|
||||||
assetManager = assets,
|
assetManager = assets,
|
||||||
onWakeup = {
|
onWakeup = {
|
||||||
Log.d("lrs", "当前状态: 唤醒成功wakeup")
|
|
||||||
//每次唤醒前都要把前面的音频停掉
|
//每次唤醒前都要把前面的音频停掉
|
||||||
// UnityPlayerHolder.getInstance().cancelPCM()
|
UnityPlayerHolder.getInstance().cancelPCM()
|
||||||
UnityPlayerHolder.getInstance()
|
UnityPlayerHolder.getInstance()
|
||||||
.sendVoiceToUnity(
|
.sendVoiceToUnity(
|
||||||
voiceInfo = mutableListOf<VoiceBeanResp>().apply {
|
voiceInfo = mutableListOf<VoiceBeanResp>().apply {
|
||||||
add(
|
add(
|
||||||
VoiceBeanResp(
|
VoiceBeanResp(
|
||||||
audioUrl = UserInfoManager.userInfo?.wakeUpAudioUrl
|
audioUrl = "https://static.seerteach.net/aidialogue/systemVoice/aliyun-nv.mp3"
|
||||||
?: "https://static.seerteach.net/aidialogue/systemVoice/aliyun-nv.mp3"
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
startPlayTimeoutJob?.cancel()
|
||||||
|
startPlayTimeoutJob = lifecycleScope.launch {
|
||||||
|
delay(PLAY_WAIT_TIMEOUT_MS)
|
||||||
|
voiceController?.onPlayEndPrompt()
|
||||||
|
}
|
||||||
},
|
},
|
||||||
onFinalAudio = { audio ->
|
onFinalAudio = { audio ->
|
||||||
Log.d("lrsxx", "检测到语音,长度=${audio.size}")
|
Log.d("lrsxx", "检测到语音,长度=${audio.size}")
|
||||||
mViewModel?.uploadVoice(
|
// mViewModel?.uploadVoice(
|
||||||
AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)),
|
// AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)),
|
||||||
1
|
// 1
|
||||||
)
|
// )
|
||||||
// loadLocalJsonAndPlay()
|
loadLocalJsonAndPlay()
|
||||||
val file = File(
|
val file = File(
|
||||||
getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
|
getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
|
||||||
"xxx.wav"
|
"xxx.wav"
|
||||||
)
|
)
|
||||||
AudioDebugUtil.saveFloatPcmAsWav(audio, file)
|
AudioDebugUtil.saveFloatPcmAsWav(audio, file)
|
||||||
LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
|
LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
|
||||||
// lifecycleScope.launch(Dispatchers.Main) {
|
lifecycleScope.launch(Dispatchers.Main) {
|
||||||
//
|
|
||||||
// mVerticalAnimator?.show()
|
mVerticalAnimator?.show()
|
||||||
// }
|
}
|
||||||
},
|
},
|
||||||
onStateChanged = { state ->
|
onStateChanged = { state ->
|
||||||
|
|
||||||
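AudioPcmUtil is not shown in this diff; the float → 16-bit PCM → Base64 chain it provides for uploadVoice (commented out above) is conventionally implemented like this (sketch only, assuming float samples in [-1, 1] and little-endian PCM):

    import android.util.Base64

    // Sketch of the conversion chain; the real AudioPcmUtil is not in this diff.
    fun floatToPcm16(samples: FloatArray): ByteArray {
        val out = ByteArray(samples.size * 2)
        for (i in samples.indices) {
            val v = (samples[i].coerceIn(-1f, 1f) * 32767).toInt()
            out[2 * i] = (v and 0xFF).toByte()            // low byte first
            out[2 * i + 1] = ((v shr 8) and 0xFF).toByte()
        }
        return out
    }

    fun pcm16ToBase64(pcm: ByteArray): String =
        Base64.encodeToString(pcm, Base64.NO_WRAP)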
@@ -233,7 +247,7 @@
                 voiceInfo = mutableListOf<VoiceBeanResp>().apply {
                     add(
                         VoiceBeanResp(
-                            audioUrl = "https://static.seerteach.net/aidialogue/userWakeUpAudio/ttsmaker-file-2025-12-31-16-2-51.mp3"
+                            audioUrl = UserInfoManager.userInfo?.endAudioUrl ?: ""
                         )
                     )
                 }
@@ -244,6 +258,11 @@
             )
     }

+    private fun initAsrModel() {
+        lifecycleScope.launch(Dispatchers.IO) {
+            SimulateStreamingAsr.initOfflineRecognizer(App.getInstance())
+        }
+    }
     override fun receivedIMMsg(msg: SingleMessage) {
         when (msg.msgContentType) {
             MessageContentType.RECEIVE_VOICE_STREAM.msgContentType -> {
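initAsrModel() only loads the model off the main thread; nothing in this commit calls the recognizer yet. With sherpa-onnx's offline API, a one-shot decode would look roughly like this (sketch, assuming 16 kHz mono float samples):

    // Sketch: decode one utterance with the recognizer initialized above.
    fun recognize(samples: FloatArray, sampleRate: Int = 16000): String {
        val recognizer = SimulateStreamingAsr.recognizer
        val stream = recognizer.createStream()
        stream.acceptWaveform(samples, sampleRate)
        recognizer.decode(stream)
        val text = recognizer.getResult(stream).text
        stream.release()
        return text
    }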
@@ -415,9 +434,6 @@

     private var promptPlaying = false
     private var backPlaying = false
-    private var promptTimeoutJob: Job? = null
-    private val PROMPT_PLAY_TIMEOUT_MS = 3000L // 3 seconds


     fun onAudioProgressUpdated( // Unity calls this to report audio progress
         progress: Float,
@@ -427,7 +443,6 @@
         audioUrl: String
     ) {
         val wakeupUrl = UserInfoManager.userInfo?.wakeUpAudioUrl
-            ?: "https://static.seerteach.net/aidialogue/systemVoice/aliyun-nv.mp3"

         if (audioUrl != wakeupUrl) return

@@ -436,13 +451,8 @@
         if (!promptPlaying) {
             promptPlaying = true
             voiceController?.onPlayStartPrompt()
-            promptTimeoutJob = lifecycleScope.launch {
-                delay(PROMPT_PLAY_TIMEOUT_MS)
-                promptPlaying = false
-                voiceController?.onPlayEndPrompt()
-                promptTimeoutJob?.cancel()
-            }
+            startPlayTimeoutJob?.cancel()
+            LogUtils.eTag("MainActivity", "wait timed out")
         }
     }

@@ -450,7 +460,6 @@
         if (promptPlaying) {
             promptPlaying = false
             voiceController?.onPlayEndPrompt()
-            promptTimeoutJob?.cancel()
         }
     }
 }
@@ -464,14 +473,15 @@
         isFinal: Boolean
     ) {
         when (state) {
-            1 -> { // play
+            1 -> {
                 if (!backPlaying) {
                     backPlaying = true
                     voiceController?.onPlayStartBackend()
+                    startPlayTimeoutJob?.cancel()
                 }
             }

-            3 -> { // complete
+            3 -> {
                 if (backPlaying) {
                     backPlaying = false
                     voiceController?.onPlayEndBackend()
@@ -0,0 +1,155 @@
+package com.zs.smarthuman.utils
+
+import android.content.Context
+import android.content.res.AssetManager
+import com.blankj.utilcode.util.LogUtils
+
+import com.k2fsa.sherpa.onnx.OfflineModelConfig
+import com.k2fsa.sherpa.onnx.OfflineRecognizer
+import com.k2fsa.sherpa.onnx.OfflineRecognizerConfig
+import com.k2fsa.sherpa.onnx.OfflineWenetCtcModelConfig
+import com.k2fsa.sherpa.onnx.Vad
+import com.k2fsa.sherpa.onnx.getVadModelConfig
+import java.io.File
+import java.io.FileOutputStream
+import java.io.InputStream
+import java.io.OutputStream
+
+
+fun assetExists(assetManager: AssetManager, path: String): Boolean {
+    val dir = path.substringBeforeLast('/', "")
+    val fileName = path.substringAfterLast('/')
+
+    val files = assetManager.list(dir) ?: return false
+    return files.contains(fileName)
+}
+
+fun copyAssetToInternalStorage(path: String, context: Context): String {
+    val targetRoot = context.filesDir
+    val outFile = File(targetRoot, path)
+
+    if (!assetExists(context.assets, path = path)) {
+        // for a context binary, if it does not exist, we return a path
+        // that can be written to
+        outFile.parentFile?.mkdirs()
+        LogUtils.i("VoiceController", "$path does not exist, return ${outFile.absolutePath}")
+        return outFile.absolutePath
+    }
+
+    if (outFile.exists()) {
+        val assetSize = context.assets.open(path).use { it.available() }
+        if (outFile.length() == assetSize.toLong()) {
+            LogUtils.i("VoiceController", "$targetRoot/$path already exists, skip copying, return $targetRoot/$path")
+            return "$targetRoot/$path"
+        }
+    }
+
+    outFile.parentFile?.mkdirs()
+
+    context.assets.open(path).use { input: InputStream ->
+        FileOutputStream(outFile).use { output: OutputStream ->
+            input.copyTo(output)
+        }
+    }
+    LogUtils.i("VoiceController", "Copied $path to $targetRoot/$path")
+
+    return outFile.absolutePath
+}
+
+
+object SimulateStreamingAsr {
+    private var _recognizer: OfflineRecognizer? = null
+    val recognizer: OfflineRecognizer
+        get() {
+            return _recognizer!!
+        }
+
+    fun initOfflineRecognizer(context: Context) {
+        synchronized(this) {
+            if (_recognizer != null) {
+                return
+            }
+
+            val wenetConfig = OfflineWenetCtcModelConfig(
+                model = "sherpa-onnx-wenetspeech-yue-u2pp-conformer-ctc-zh-en-cantonese-int8-2025-09-10/model.int8.onnx",
+            )
+
+            val modelConfig = OfflineModelConfig(
+                wenetCtc = wenetConfig,
+                tokens = "sherpa-onnx-wenetspeech-yue-u2pp-conformer-ctc-zh-en-cantonese-int8-2025-09-10/tokens.txt"
+            )
+            val config = OfflineRecognizerConfig(
+                modelConfig = modelConfig,
+            )
+
+            var assetManager: AssetManager? = context.assets
+            if (config.modelConfig.provider == "qnn") {
+                // We assume you have copied files like libQnnHtpV81Skel.so to jniLibs/arm64-v8a
+                LogUtils.i("VoiceController", "nativelibdir: ${context.applicationInfo.nativeLibraryDir}")
+
+                // If we don't set the environment variable ADSP_LIBRARY_PATH, we will see
+                // error code 1008 from qnn_interface.deviceCreate().
+                // See also
+                // https://workbench.aihub.qualcomm.com/docs/hub/faq.html#why-am-i-seeing-error-1008-when-trying-to-use-htp
+                OfflineRecognizer.prependAdspLibraryPath(context.applicationInfo.nativeLibraryDir)
+
+                // for qnn, we need to copy the *.so files from the assets folder to storage
+                if (config.modelConfig.senseVoice.qnnConfig.backendLib.isEmpty() && config.modelConfig.zipformerCtc.qnnConfig.backendLib.isEmpty()) {
+                    LogUtils.i("VoiceController", "You should provide libQnnHtp.so for qnn")
+                    throw IllegalArgumentException("You should provide libQnnHtp.so for qnn")
+                }
+                config.modelConfig.tokens =
+                    copyAssetToInternalStorage(config.modelConfig.tokens, context)
+
+                if (config.modelConfig.senseVoice.model.isNotEmpty() || assetExists(
+                        context.assets,
+                        path = config.modelConfig.senseVoice.qnnConfig.contextBinary
+                    )
+                ) {
+                    if (config.modelConfig.senseVoice.model.isNotEmpty()) {
+                        config.modelConfig.senseVoice.model =
+                            copyAssetToInternalStorage(config.modelConfig.senseVoice.model, context)
+                    }
+
+                    config.modelConfig.senseVoice.qnnConfig.contextBinary =
+                        copyAssetToInternalStorage(
+                            config.modelConfig.senseVoice.qnnConfig.contextBinary,
+                            context
+                        )
+                } else if (config.modelConfig.zipformerCtc.model.isNotEmpty()) {
+                    config.modelConfig.zipformerCtc.model =
+                        copyAssetToInternalStorage(config.modelConfig.zipformerCtc.model, context)
+
+                    config.modelConfig.zipformerCtc.qnnConfig.contextBinary =
+                        copyAssetToInternalStorage(
+                            config.modelConfig.zipformerCtc.qnnConfig.contextBinary,
+                            context
+                        )
+                }
+
+                if (config.hr.lexicon.isNotEmpty()) {
+                    config.hr.lexicon = copyAssetToInternalStorage(config.hr.lexicon, context)
+                }
+
+                if (config.hr.ruleFsts.isNotEmpty()) {
+                    // this assumes there is only one fst; otherwise, copy each fst separately
+                    config.hr.ruleFsts = copyAssetToInternalStorage(config.hr.ruleFsts, context)
+                }
+
+                assetManager = null
+            }
+
+            _recognizer = OfflineRecognizer(
+                assetManager = assetManager,
+                config = config,
+            )
+            LogUtils.i("VoiceController", "sherpa-onnx offline recognizer initialized")
+        }
+    }
+}
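For non-qnn providers the recognizer reads the model straight from the AssetManager, so the copy helpers above are only exercised on the qnn path, where native code needs real file paths. A direct call looks like (sketch):

    // Sketch: materialize an asset on internal storage so native code can open it by path.
    val tokensPath = copyAssetToInternalStorage(
        "sherpa-onnx-wenetspeech-yue-u2pp-conformer-ctc-zh-en-cantonese-int8-2025-09-10/tokens.txt",
        context
    )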
BIN app/src/main/jniLibs/arm64-v8a/libandroidx.graphics.path.so (new file)
Binary file not shown.