稳定版参数
This commit is contained in:
parent
eda73af083
commit
22fc80dd07
162
app/src/main/java/com/zs/smarthuman/sherpa/VoiceConfig.kt
Normal file
162
app/src/main/java/com/zs/smarthuman/sherpa/VoiceConfig.kt
Normal file
@ -0,0 +1,162 @@
|
|||||||
|
package com.zs.smarthuman.sherpa
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @description: 语音控制器配置常量类(所有硬编码参数集中管理,无需修改业务逻辑即可适配场景)
|
||||||
|
* @author: lrs
|
||||||
|
* @date: 2026/1/12 17:33
|
||||||
|
* @note: 所有参数适配 16kHz 单通道语音场景,调整后需在目标场景(安静/嘈杂/远场)测试验证
|
||||||
|
*/
|
||||||
|
object VoiceConfig {
|
||||||
|
// ===================== 基础配置(通用核心,禁止随意修改) =====================
|
||||||
|
/** 日志打印统一标签,便于筛选语音相关日志 */
|
||||||
|
const val TAG = "VoiceController"
|
||||||
|
|
||||||
|
/** 语音采样率(固定16kHz),与VAD/声纹模型强绑定,禁止修改 */
|
||||||
|
const val SAMPLE_RATE = 16000
|
||||||
|
|
||||||
|
/** 预缓存音频大小(2秒),用于唤醒后补全唤醒前的语音片段,防止开头缺失 */
|
||||||
|
const val PRE_BUFFER_SIZE = SAMPLE_RATE * 2
|
||||||
|
|
||||||
|
/** 声纹验证的当前唤醒用户ID,业务层可动态替换 */
|
||||||
|
const val CURRENT_USER_ID = "current_wakeup_user"
|
||||||
|
|
||||||
|
/** 是否开启严格的声纹验证:测试阶段设为false,生产环境建议设为true */
|
||||||
|
const val ENABLE_STRICT_SPEAKER_VERIFY = true
|
||||||
|
|
||||||
|
// ===================== 时间阈值(ms,时序控制核心) =====================
|
||||||
|
/** 空闲超时默认秒数(实际超时=200*1000=200秒),建议调整为5-30秒 */
|
||||||
|
const val IDLE_TIMEOUT_DEFAULT_SECONDS = 200
|
||||||
|
|
||||||
|
/** 最大录音时长(10秒),防止录音过长占用内存:短指令3-5秒,长语音10-20秒 */
|
||||||
|
const val MAX_RECORDING_DEFAULT_SECONDS = 10
|
||||||
|
|
||||||
|
/** 短语音判定阈值(1秒),声纹验证分场景的临界值,无需调整 */
|
||||||
|
const val SHORT_AUDIO_DURATION_MS = 1000L
|
||||||
|
|
||||||
|
/** 无效语音重置防抖时间(1.5秒),避免1.5秒内重复重置状态,建议1-2秒 */
|
||||||
|
const val INVALID_RESET_DEBOUNCE_MS = 1500L
|
||||||
|
|
||||||
|
/** 有效语音最小时长(800ms),过滤极短的杂音/按键音,建议600-1000ms */
|
||||||
|
const val MIN_SPEECH_MS = 800L
|
||||||
|
|
||||||
|
/** 微弱人声过滤的基础时长阈值(400ms),建议为MIN_SPEECH_MS的一半 */
|
||||||
|
const val MIN_EFFECTIVE_VOICE_DURATION = 400L
|
||||||
|
|
||||||
|
/** 唤醒后观察期(500ms),期间不处理VAD防止误触发,建议300-800ms */
|
||||||
|
const val KWS_OBSERVE_MS = 500L
|
||||||
|
|
||||||
|
/** 说话冷却期(300ms),提示音/后台音频结束后延迟进入等待说话,建议200-400ms */
|
||||||
|
const val SPEECH_COOLDOWN_MS = 300L
|
||||||
|
|
||||||
|
/** 短语音时长范围(0.5-2秒),分场景阈值的判定依据,无需调整 */
|
||||||
|
const val SHORT_SPEECH_MIN = 500L
|
||||||
|
const val SHORT_SPEECH_MAX = 2000L
|
||||||
|
|
||||||
|
/** 多人对话检测的最小时长(2.5秒),太短易误判,太长漏判,建议2-3秒 */
|
||||||
|
const val MULTI_DIALOGUE_MIN_DURATION = 2500L
|
||||||
|
|
||||||
|
// ===================== 环境/噪音阈值(能量类,核心过滤参数) =====================
|
||||||
|
/** 嘈杂环境判定阈值:环境基线≥0.01f则为嘈杂环境,安静场景0.008f,嘈杂场景0.015f */
|
||||||
|
const val NOISE_BASELINE_THRESHOLD = 0.01f
|
||||||
|
|
||||||
|
/** 环境基线校准的滑动窗口大小(50帧),太小基线波动大,太大校准滞后,建议30-80 */
|
||||||
|
const val BASELINE_WINDOW_SIZE = 50
|
||||||
|
|
||||||
|
/** 安静环境判定阈值:环境基线<0.005f则为安静环境,建议为NOISE_BASELINE_THRESHOLD的一半 */
|
||||||
|
const val BASELINE_QUIET_THRESHOLD = 0.005f
|
||||||
|
|
||||||
|
/** 有效语音的最小RMS能量(0.0005f),太低统计背景噪,太高漏统计弱人声,建议0.0003-0.0008f */
|
||||||
|
const val MIN_EFFECTIVE_SPEECH_RMS = 0.0005f
|
||||||
|
|
||||||
|
/** 正常语音能量阈值(0.008f),联合过滤的临界值,嘈杂环境0.01f,安静环境0.006f */
|
||||||
|
const val NORMAL_VOICE_ENERGY_THRESHOLD = 0.008f
|
||||||
|
|
||||||
|
// ===================== 声纹验证阈值(分场景适配,核心体验参数) =====================
|
||||||
|
/** 安静环境声纹验证阈值(0.50f),越高越严格:准确率优先0.55f,通过率优先0.45f */
|
||||||
|
const val SPEAKER_THRESHOLD_QUIET = 0.50f
|
||||||
|
|
||||||
|
/** 嘈杂环境声纹验证阈值(0.43f),放宽以提高通过率,比安静环境低0.05-0.1f */
|
||||||
|
const val SPEAKER_THRESHOLD_NOISY = 0.43f
|
||||||
|
|
||||||
|
/** 短语音声纹验证阈值(0.40f),进一步放宽,比嘈杂环境低0.03-0.05f */
|
||||||
|
const val SPEAKER_THRESHOLD_SHORT = 0.40f
|
||||||
|
|
||||||
|
// ===================== 能量/占比阈值(过滤核心,分场景适配) =====================
|
||||||
|
/** 正常语音最低能量(0.03f),短语音适配0.01f,长语音保持0.03f */
|
||||||
|
const val MIN_NORMAL_VOICE_ENERGY = 0.03f
|
||||||
|
|
||||||
|
/** 正常语音VAD占比阈值(0.2f),短语音0.1f,嘈杂环境0.15f */
|
||||||
|
const val MIN_NORMAL_VOICE_VAD_RATIO = 0.2f
|
||||||
|
|
||||||
|
/** 远场语音最大能量阈值(0.015f),近场0.01f,远场0.02f */
|
||||||
|
const val MAX_FAR_FIELD_ENERGY = 0.015f
|
||||||
|
|
||||||
|
/** 有效语音最小峰均比(0.5f),过滤扁平背景噪,建议0.4-0.6f */
|
||||||
|
const val MIN_VALID_PEAK_AVG_RATIO = 0.5f
|
||||||
|
|
||||||
|
/** 有效语音最小连续帧占比(0.1f),非连续杂音过滤,建议0.08-0.12f */
|
||||||
|
const val MIN_CONTINUOUS_FRAME_RATIO = 0.1f
|
||||||
|
|
||||||
|
/** 语音峰值位置阈值(0.95f),过滤末尾突发杂音,建议0.9-0.98f */
|
||||||
|
const val MAX_PEAK_POSITION_RATIO = 0.95f
|
||||||
|
|
||||||
|
/** 有效语音最小帧数(3帧),过滤零星语音帧,建议2-5帧 */
|
||||||
|
const val MIN_EFFECTIVE_SPEECH_FRAMES = 3
|
||||||
|
|
||||||
|
// ===================== 多人对话过滤(多维度判定) =====================
|
||||||
|
/** 多人对话最大峰均比(2.5f),峰均比过高判定为多人对话,建议2.0-3.0f */
|
||||||
|
const val MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO = 2.5f
|
||||||
|
|
||||||
|
/** 多人对话最小峰均比(0.4f),峰均比过低判定为多人对话,建议0.3-0.5f */
|
||||||
|
const val MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO = 0.4f
|
||||||
|
|
||||||
|
/** 多人对话最大连续帧占比(0.3f),连续帧低判定为多人对话,建议0.2-0.4f */
|
||||||
|
const val MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO = 0.3f
|
||||||
|
|
||||||
|
/** 多人对话最小VAD占比(0.55f),VAD占比高判定为多人对话,建议0.5-0.6f */
|
||||||
|
const val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f
|
||||||
|
|
||||||
|
// ===================== 分场景动态系数(阈值计算) =====================
|
||||||
|
/** 短语音能量动态系数:安静环境1.5f,嘈杂环境2.0f(嘈杂环境系数更高) */
|
||||||
|
const val SHORT_SPEECH_ENERGY_COEFF_QUIET = 1.5f
|
||||||
|
const val SHORT_SPEECH_ENERGY_COEFF_NOISY = 2.0f
|
||||||
|
|
||||||
|
/** 长语音能量动态系数:安静环境2.5f,嘈杂环境3.5f(长语音系数更高) */
|
||||||
|
const val LONG_SPEECH_ENERGY_COEFF_QUIET = 2.5f
|
||||||
|
const val LONG_SPEECH_ENERGY_COEFF_NOISY = 3.5f
|
||||||
|
|
||||||
|
/** 短/长语音VAD占比动态系数:短语音0.05f,长语音0.10f(长语音要求更高) */
|
||||||
|
const val SHORT_SPEECH_VAD_COEFF = 0.05f
|
||||||
|
const val LONG_SPEECH_VAD_COEFF = 0.10f
|
||||||
|
|
||||||
|
/** 短/长语音最低评分:默认1分(宽松),严格场景可设为2分 */
|
||||||
|
const val SHORT_SPEECH_MIN_SCORE = 1
|
||||||
|
const val LONG_SPEECH_MIN_SCORE = 1
|
||||||
|
|
||||||
|
// ===================== 微弱人声过滤专用阈值(补充) =====================
|
||||||
|
/** 短/长语音临界时长(2000ms),filterWeakVoice中判定短语音的依据 */
|
||||||
|
const val SHORT_LONG_SPEECH_CUTOFF_MS = 2000L
|
||||||
|
|
||||||
|
/** 短语音动态能量阈值(0.01f),filterWeakVoice中短语音的能量判定值 */
|
||||||
|
const val SHORT_SPEECH_ENERGY_THRESHOLD = 0.01f
|
||||||
|
|
||||||
|
/** 短语音VAD占比阈值(0.10f),filterWeakVoice中短语音的VAD判定值 */
|
||||||
|
const val SHORT_SPEECH_VAD_RATIO = 0.10f
|
||||||
|
|
||||||
|
/** 嘈杂环境VAD占比阈值(0.15f),filterWeakVoice中嘈杂环境的VAD判定值 */
|
||||||
|
const val NOISY_ENV_VAD_RATIO = 0.15f
|
||||||
|
|
||||||
|
/** 纯底噪过滤的能量阈值(0.005f),filterWeakVoice中底噪判定的能量值 */
|
||||||
|
const val PURE_NOISE_ENERGY_THRESHOLD = 0.005f
|
||||||
|
|
||||||
|
/** 纯底噪过滤的能量基线比(1.2f),filterWeakVoice中底噪判定的比值 */
|
||||||
|
const val PURE_NOISE_BASELINE_RATIO = 1.2f
|
||||||
|
|
||||||
|
// ===================== 语音评分专用阈值(补充) =====================
|
||||||
|
/** 长语音评分临界时长(4000ms),calculateSpeechScore中评3分的依据 */
|
||||||
|
const val LONG_SPEECH_SCORE_CUTOFF_MS = 4000L
|
||||||
|
|
||||||
|
/** 中语音评分临界时长(2500ms),calculateSpeechScore中评2分的依据 */
|
||||||
|
const val MID_SPEECH_SCORE_CUTOFF_MS = 2500L
|
||||||
|
|
||||||
|
}
|
||||||
@ -2,7 +2,8 @@ package com.zs.smarthuman.sherpa
|
|||||||
|
|
||||||
import android.content.res.AssetManager
|
import android.content.res.AssetManager
|
||||||
import com.blankj.utilcode.util.LogUtils
|
import com.blankj.utilcode.util.LogUtils
|
||||||
import com.k2fsa.sherpa.onnx.OnlineStream
|
import com.k2fsa.sherpa.onnx.SpeakerEmbeddingExtractor
|
||||||
|
import com.k2fsa.sherpa.onnx.SpeakerEmbeddingManager
|
||||||
import com.k2fsa.sherpa.onnx.SpeakerRecognition
|
import com.k2fsa.sherpa.onnx.SpeakerRecognition
|
||||||
import kotlinx.coroutines.CoroutineScope
|
import kotlinx.coroutines.CoroutineScope
|
||||||
import kotlinx.coroutines.Dispatchers
|
import kotlinx.coroutines.Dispatchers
|
||||||
@ -15,45 +16,32 @@ class VoiceController(
|
|||||||
assetManager: AssetManager,
|
assetManager: AssetManager,
|
||||||
private val onWakeup: () -> Unit,
|
private val onWakeup: () -> Unit,
|
||||||
private val onFinalAudio: (FloatArray) -> Unit,
|
private val onFinalAudio: (FloatArray) -> Unit,
|
||||||
idleTimeoutSeconds: Int = 200,
|
idleTimeoutSeconds: Int = VoiceConfig.IDLE_TIMEOUT_DEFAULT_SECONDS,
|
||||||
maxRecordingSeconds: Int = 10,
|
maxRecordingSeconds: Int = VoiceConfig.MAX_RECORDING_DEFAULT_SECONDS,
|
||||||
private val onStateChanged: ((VoiceState) -> Unit)? = null,
|
private val onStateChanged: ((VoiceState) -> Unit)? = null,
|
||||||
private val stopBackendAudio: (() -> Unit)? = null,
|
private val stopBackendAudio: (() -> Unit)? = null,
|
||||||
private val onTimeoutTip: OnTimeoutTip? = null
|
private val onTimeoutTip: OnTimeoutTip? = null
|
||||||
) {
|
) {
|
||||||
|
// 依赖组件
|
||||||
|
private val wakeupManager = WakeupManager(assetManager, onWakeup)
|
||||||
|
private val vadManager = VadManager(
|
||||||
|
assetManager,
|
||||||
|
onSpeechStart = ::onVadStart,
|
||||||
|
onSpeechEnd = { avgEnergy, peakRms -> onVadEnd(avgEnergy, peakRms) }
|
||||||
|
)
|
||||||
|
private val stateManager = VoiceStateManager(
|
||||||
|
idleTimeoutSeconds = idleTimeoutSeconds,
|
||||||
|
maxRecordingSeconds = maxRecordingSeconds,
|
||||||
|
onStateChanged = onStateChanged,
|
||||||
|
onTimeoutTip = onTimeoutTip
|
||||||
|
)
|
||||||
|
|
||||||
companion object {
|
// 音频缓存
|
||||||
// 日志标签
|
private val audioBuffer = mutableListOf<Float>()
|
||||||
private const val TAG = "VoiceController"
|
private val preBuffer = ArrayDeque<Float>(VoiceConfig.PRE_BUFFER_SIZE)
|
||||||
// 采样率
|
private val envNoiseBuffer = ArrayDeque<Float>(VoiceConfig.BASELINE_WINDOW_SIZE)
|
||||||
private const val SAMPLE_RATE = 16000
|
|
||||||
// 预缓存大小(2秒)
|
|
||||||
private const val PRE_BUFFER_SIZE = SAMPLE_RATE * 2
|
|
||||||
|
|
||||||
// ========== 核心:分场景声纹阈值(极简版) ==========
|
// 实时统计
|
||||||
private const val SPEAKER_THRESHOLD_QUIET = 0.50f // 安静环境
|
|
||||||
private const val SPEAKER_THRESHOLD_NOISY = 0.43f // 嘈杂环境(匹配你的真实相似度)
|
|
||||||
private const val SPEAKER_THRESHOLD_SHORT = 0.40f // 短语音(<1秒)
|
|
||||||
|
|
||||||
// 短语音判定阈值
|
|
||||||
private const val SHORT_AUDIO_DURATION_MS = 1000L
|
|
||||||
private const val INVALID_RESET_DEBOUNCE_MS = 1500L
|
|
||||||
// 最小语音时长
|
|
||||||
private const val MIN_SPEECH_MS = 800L
|
|
||||||
private const val MIN_EFFECTIVE_VOICE_DURATION = 400L
|
|
||||||
|
|
||||||
// 噪音场景判定阈值
|
|
||||||
private const val NOISE_BASELINE_THRESHOLD = 0.01f
|
|
||||||
}
|
|
||||||
|
|
||||||
var state: VoiceState = VoiceState.WAIT_WAKEUP
|
|
||||||
private set(value) {
|
|
||||||
field = value
|
|
||||||
LogUtils.d(TAG, "➡ State = $value")
|
|
||||||
onStateChanged?.invoke(value)
|
|
||||||
}
|
|
||||||
|
|
||||||
// 实时能量与帧统计变量
|
|
||||||
private var realtimeEnergySum = 0f
|
private var realtimeEnergySum = 0f
|
||||||
private var realtimeEnergyCount = 0
|
private var realtimeEnergyCount = 0
|
||||||
private var realtimePeakRms = 0f
|
private var realtimePeakRms = 0f
|
||||||
@ -62,361 +50,222 @@ class VoiceController(
|
|||||||
private var realtimeContinuousSpeechFrames = 0
|
private var realtimeContinuousSpeechFrames = 0
|
||||||
private var realtimeLastFrameIsSpeech = false
|
private var realtimeLastFrameIsSpeech = false
|
||||||
private var isMultiPersonDialogueDetected = false
|
private var isMultiPersonDialogueDetected = false
|
||||||
private var lastInvalidResetMs = 0L
|
|
||||||
|
// 声纹识别相关
|
||||||
private val speakerManagerLock = ReentrantLock()
|
private val speakerManagerLock = ReentrantLock()
|
||||||
|
private lateinit var speakerExtractor: SpeakerEmbeddingExtractor
|
||||||
// 环境噪音状态标记
|
private lateinit var speakerManager: SpeakerEmbeddingManager
|
||||||
private var isNoisyEnvironment = false
|
|
||||||
|
|
||||||
private val wakeupManager = WakeupManager(assetManager, onWakeup)
|
|
||||||
private val vadManager = VadManager(
|
|
||||||
assetManager,
|
|
||||||
onSpeechStart = { onVadStart() },
|
|
||||||
onSpeechEnd = { avgEnergy, peakRms -> onVadEnd(avgEnergy, peakRms) }
|
|
||||||
)
|
|
||||||
|
|
||||||
private val audioBuffer = mutableListOf<Float>()
|
|
||||||
private val preBuffer = ArrayDeque<Float>(PRE_BUFFER_SIZE)
|
|
||||||
|
|
||||||
private var recordingStartMs = 0L
|
|
||||||
private var waitSpeechFailStartMs = 0L
|
|
||||||
private var waitSpeechStartMs = 0L
|
|
||||||
|
|
||||||
private var vadStarted = false
|
|
||||||
private var inKwsObserve = false
|
|
||||||
private var kwsObserveStartMs = 0L
|
|
||||||
private val KWS_OBSERVE_MS = 500L
|
|
||||||
private var speechEnableAtMs = 0L
|
|
||||||
private val SPEECH_COOLDOWN_MS = 300L
|
|
||||||
|
|
||||||
private val idleTimeoutMs = idleTimeoutSeconds * 1000L
|
|
||||||
private val maxRecordingMs = maxRecordingSeconds * 1000L
|
|
||||||
|
|
||||||
// 分场景动态系数(保留原有逻辑)
|
|
||||||
private val BASELINE_WINDOW_SIZE = 50
|
|
||||||
private val envNoiseBuffer = ArrayDeque<Float>(BASELINE_WINDOW_SIZE)
|
|
||||||
private var currentEnvBaseline = 0.001f
|
|
||||||
|
|
||||||
// 强制兜底:正常语音最低门槛
|
|
||||||
private val MIN_NORMAL_VOICE_ENERGY = 0.03f
|
|
||||||
private val MIN_NORMAL_VOICE_VAD_RATIO = 0.2f
|
|
||||||
|
|
||||||
// 分场景动态系数
|
|
||||||
private val BASELINE_QUIET_THRESHOLD = 0.005f
|
|
||||||
private val SHORT_SPEECH_ENERGY_COEFF_QUIET = 1.5f
|
|
||||||
private val SHORT_SPEECH_ENERGY_COEFF_NOISY = 2.0f
|
|
||||||
private val LONG_SPEECH_ENERGY_COEFF_QUIET = 2.5f
|
|
||||||
private val LONG_SPEECH_ENERGY_COEFF_NOISY = 3.5f
|
|
||||||
private val SHORT_SPEECH_VAD_COEFF = 0.05f
|
|
||||||
private val LONG_SPEECH_VAD_COEFF = 0.10f
|
|
||||||
private val SHORT_SPEECH_MIN_SCORE = 1
|
|
||||||
private val LONG_SPEECH_MIN_SCORE = 1
|
|
||||||
|
|
||||||
// 其他过滤参数
|
|
||||||
private val MAX_FAR_FIELD_ENERGY = 0.015f
|
|
||||||
private val MIN_VALID_PEAK_AVG_RATIO = 0.5f
|
|
||||||
private val MIN_CONTINUOUS_FRAME_RATIO = 0.1f
|
|
||||||
private val MAX_PEAK_POSITION_RATIO = 0.95f
|
|
||||||
private val MIN_EFFECTIVE_SPEECH_FRAMES = 3
|
|
||||||
private val SHORT_SPEECH_MIN = 500L
|
|
||||||
private val SHORT_SPEECH_MAX = 2000L
|
|
||||||
|
|
||||||
// 多人对话过滤配置
|
|
||||||
private val MULTI_DIALOGUE_MIN_DURATION = 2500L
|
|
||||||
private val MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO = 2.5f
|
|
||||||
private val MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO = 0.4f
|
|
||||||
private val MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO = 0.3f
|
|
||||||
private val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f
|
|
||||||
|
|
||||||
// 微弱人声过滤配置
|
|
||||||
private val MIN_VOICE_FRAME_RATIO = 0.08f
|
|
||||||
private val MIN_PEAK_ENERGY_RATIO = 1.5f
|
|
||||||
private val NORMAL_VOICE_ENERGY_THRESHOLD = 0.008f
|
|
||||||
private val MIN_CONTINUOUS_VOICE_FRAMES = 1
|
|
||||||
private val MIN_EFFECTIVE_SPEECH_RMS = 0.0005f
|
|
||||||
|
|
||||||
// 无效说话标记 + 超时类型
|
|
||||||
private var hasInvalidSpeech = false
|
|
||||||
private var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT
|
|
||||||
|
|
||||||
// 声纹验证相关
|
|
||||||
private val CURRENT_USER_ID = "current_wakeup_user"
|
|
||||||
private val ENABLE_STRICT_SPEAKER_VERIFY = true
|
|
||||||
|
|
||||||
init {
|
init {
|
||||||
|
initSpeakerRecognition(assetManager)
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 初始化声纹识别器
|
||||||
|
*/
|
||||||
|
private fun initSpeakerRecognition(assetManager: AssetManager) {
|
||||||
try {
|
try {
|
||||||
SpeakerRecognition.initExtractor(assetManager)
|
SpeakerRecognition.initExtractor(assetManager)
|
||||||
LogUtils.d(TAG, "✅ 声纹识别器初始化成功")
|
speakerExtractor = SpeakerRecognition.extractor
|
||||||
|
speakerManager = SpeakerRecognition.manager
|
||||||
|
LogUtils.d(VoiceConfig.TAG, "✅ 声纹识别器初始化成功")
|
||||||
} catch (e: Exception) {
|
} catch (e: Exception) {
|
||||||
LogUtils.e(TAG, "❌ 声纹识别器初始化失败", e)
|
LogUtils.e(VoiceConfig.TAG, "❌ 声纹识别器初始化失败", e)
|
||||||
throw RuntimeException("声纹识别初始化失败", e)
|
throw RuntimeException("声纹识别初始化失败", e)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ================= 音频入口 ================= */
|
/**
|
||||||
|
* 音频入口(对外API不变)
|
||||||
|
*/
|
||||||
fun acceptAudio(samples: FloatArray) {
|
fun acceptAudio(samples: FloatArray) {
|
||||||
cachePreBuffer(samples)
|
// 缓存预缓冲
|
||||||
|
VoiceUtils.cachePreBuffer(samples, preBuffer)
|
||||||
|
|
||||||
|
// 唤醒检测
|
||||||
wakeupManager.acceptAudio(samples)
|
wakeupManager.acceptAudio(samples)
|
||||||
if (wakeupManager.consumeWakeupFlag()) {
|
if (wakeupManager.consumeWakeupFlag()) {
|
||||||
handleWakeupEvent()
|
handleWakeupEvent()
|
||||||
// 注册唤醒用户特征
|
// 注册唤醒用户特征
|
||||||
CoroutineScope(Dispatchers.IO).launch {
|
CoroutineScope(Dispatchers.IO).launch {
|
||||||
var stream: OnlineStream? = null
|
VoiceUtils.registerWakeupUser(
|
||||||
runCatching {
|
preBuffer = preBuffer,
|
||||||
val wakeupAudio = preBuffer.toFloatArray()
|
extractor = speakerExtractor,
|
||||||
if (wakeupAudio.isEmpty()) {
|
manager = speakerManager
|
||||||
LogUtils.w(TAG, "❌ 唤醒音频缓存为空,无法注册用户特征")
|
)
|
||||||
return@launch
|
|
||||||
}
|
|
||||||
|
|
||||||
stream = SpeakerRecognition.extractor.createStream()
|
|
||||||
stream?.acceptWaveform(samples = wakeupAudio, sampleRate = SAMPLE_RATE)
|
|
||||||
stream?.inputFinished()
|
|
||||||
|
|
||||||
if (stream != null && SpeakerRecognition.extractor.isReady(stream)) {
|
|
||||||
val embedding = SpeakerRecognition.extractor.compute(stream)
|
|
||||||
speakerManagerLock.withLock {
|
|
||||||
SpeakerRecognition.manager.remove(CURRENT_USER_ID)
|
|
||||||
val embeddingList = mutableListOf(embedding)
|
|
||||||
val ok = SpeakerRecognition.manager.add(
|
|
||||||
name = CURRENT_USER_ID,
|
|
||||||
embedding = embeddingList.toTypedArray()
|
|
||||||
)
|
|
||||||
if (ok) {
|
|
||||||
LogUtils.d(TAG, "✅ 注册当前唤醒用户特征成功 | 特征长度: ${embedding.size}")
|
|
||||||
} else {
|
|
||||||
LogUtils.w(TAG, "❌ 注册当前唤醒用户特征失败")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LogUtils.w(TAG, "❌ 唤醒音频Stream未就绪,跳过用户注册")
|
|
||||||
}
|
|
||||||
}.onFailure {
|
|
||||||
LogUtils.e(TAG, "❌ 唤醒用户特征注册失败", it)
|
|
||||||
}.also {
|
|
||||||
stream?.release()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
val now = System.currentTimeMillis()
|
val now = System.currentTimeMillis()
|
||||||
|
|
||||||
if (state == VoiceState.WAIT_WAKEUP) {
|
// 环境基线校准(仅等待唤醒状态)
|
||||||
calibrateEnvBaseline(samples)
|
if (stateManager.state == VoiceState.WAIT_WAKEUP) {
|
||||||
isNoisyEnvironment = currentEnvBaseline >= NOISE_BASELINE_THRESHOLD
|
stateManager.currentEnvBaseline = VoiceUtils.calibrateEnvBaseline(
|
||||||
LogUtils.d(TAG, "📊 环境状态 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
samples = samples,
|
||||||
|
vadManager = vadManager,
|
||||||
|
envNoiseBuffer = envNoiseBuffer,
|
||||||
|
currentEnvBaseline = stateManager.currentEnvBaseline
|
||||||
|
)
|
||||||
|
stateManager.isNoisyEnvironment = stateManager.currentEnvBaseline >= VoiceConfig.NOISE_BASELINE_THRESHOLD
|
||||||
|
LogUtils.d(VoiceConfig.TAG, "📊 环境状态 | 基线: ${stateManager.currentEnvBaseline} | 嘈杂环境: ${stateManager.isNoisyEnvironment}")
|
||||||
}
|
}
|
||||||
|
|
||||||
when (state) {
|
// 状态分发
|
||||||
|
when (stateManager.state) {
|
||||||
VoiceState.WAIT_WAKEUP,
|
VoiceState.WAIT_WAKEUP,
|
||||||
VoiceState.PLAYING_PROMPT,
|
VoiceState.PLAYING_PROMPT,
|
||||||
VoiceState.PLAYING_BACKEND,
|
VoiceState.PLAYING_BACKEND,
|
||||||
VoiceState.UPLOADING -> return
|
VoiceState.UPLOADING -> return
|
||||||
|
|
||||||
VoiceState.WAIT_SPEECH_COOLDOWN -> {
|
VoiceState.WAIT_SPEECH_COOLDOWN -> {
|
||||||
if (now >= speechEnableAtMs) {
|
stateManager.handleWaitSpeechCooldown(now)
|
||||||
waitSpeechFailStartMs = now
|
|
||||||
state = VoiceState.WAIT_SPEECH
|
|
||||||
waitSpeechStartMs = now
|
|
||||||
}
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
VoiceState.WAIT_SPEECH -> {
|
VoiceState.WAIT_SPEECH -> {
|
||||||
if ((waitSpeechStartMs > 0 && now - waitSpeechStartMs >= idleTimeoutMs) ||
|
// 检查超时(修复点:超时后主动调用 resetAll() 并传参)
|
||||||
(waitSpeechFailStartMs > 0 && now - waitSpeechFailStartMs >= idleTimeoutMs)
|
if (stateManager.checkWaitSpeechTimeout(now)) {
|
||||||
) {
|
stateManager.resetAll(
|
||||||
currentTimeoutType = if (hasInvalidSpeech) {
|
resetRealtimeStats = ::resetRealtimeStats,
|
||||||
TimeoutType.INVALID_SPEECH_TIMEOUT
|
audioBuffer = audioBuffer,
|
||||||
} else {
|
preBuffer = preBuffer,
|
||||||
TimeoutType.IDLE_TIMEOUT
|
vadManager = vadManager,
|
||||||
}
|
wakeupManager = wakeupManager,
|
||||||
LogUtils.d(TAG, "⏱ WAIT_SPEECH timeout → WAIT_WAKEUP | 超时类型: $currentTimeoutType")
|
envNoiseBuffer = envNoiseBuffer
|
||||||
onTimeoutTip?.invoke(currentTimeoutType)
|
)
|
||||||
resetAll()
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if (inKwsObserve && now - kwsObserveStartMs < KWS_OBSERVE_MS) return
|
// 唤醒观察期
|
||||||
inKwsObserve = false
|
if (stateManager.inKwsObserve && now - stateManager.kwsObserveStartMs < VoiceConfig.KWS_OBSERVE_MS) return
|
||||||
|
stateManager.inKwsObserve = false
|
||||||
|
|
||||||
|
// VAD检测
|
||||||
vadManager.accept(samples)
|
vadManager.accept(samples)
|
||||||
}
|
}
|
||||||
|
|
||||||
VoiceState.RECORDING -> {
|
VoiceState.RECORDING -> {
|
||||||
|
// 音频缓存
|
||||||
audioBuffer.addAll(samples.asList())
|
audioBuffer.addAll(samples.asList())
|
||||||
vadManager.accept(samples)
|
vadManager.accept(samples)
|
||||||
|
|
||||||
calibrateEnvBaseline(samples)
|
// 环境校准
|
||||||
updateRealtimeEnergy(samples)
|
stateManager.currentEnvBaseline = VoiceUtils.calibrateEnvBaseline(
|
||||||
updateRealtimeFrameStats()
|
samples = samples,
|
||||||
isNoisyEnvironment = currentEnvBaseline >= NOISE_BASELINE_THRESHOLD
|
vadManager = vadManager,
|
||||||
|
envNoiseBuffer = envNoiseBuffer,
|
||||||
|
currentEnvBaseline = stateManager.currentEnvBaseline
|
||||||
|
)
|
||||||
|
stateManager.isNoisyEnvironment = stateManager.currentEnvBaseline >= VoiceConfig.NOISE_BASELINE_THRESHOLD
|
||||||
|
|
||||||
if (checkMultiPersonDialogueRealtime(now)) {
|
// 更新实时统计
|
||||||
LogUtils.w(TAG, "🚨 录音中识别出多人对话,提前终止")
|
val energyStats = VoiceUtils.updateRealtimeEnergy(
|
||||||
|
samples = samples,
|
||||||
|
vadManager = vadManager,
|
||||||
|
isNoisyEnvironment = stateManager.isNoisyEnvironment,
|
||||||
|
currentEnvBaseline = stateManager.currentEnvBaseline,
|
||||||
|
realtimeEnergySum = realtimeEnergySum,
|
||||||
|
realtimeEnergyCount = realtimeEnergyCount,
|
||||||
|
realtimePeakRms = realtimePeakRms
|
||||||
|
)
|
||||||
|
realtimeEnergySum = energyStats.first
|
||||||
|
realtimeEnergyCount = energyStats.second
|
||||||
|
realtimePeakRms = energyStats.third
|
||||||
|
|
||||||
|
val frameStats = VoiceUtils.updateRealtimeFrameStats(vadManager)
|
||||||
|
realtimeTotalFrames = frameStats.totalFrames
|
||||||
|
realtimeSpeechFrames = frameStats.speechFrames
|
||||||
|
realtimeContinuousSpeechFrames = frameStats.continuousSpeechFrames
|
||||||
|
realtimeLastFrameIsSpeech = frameStats.lastFrameIsSpeech
|
||||||
|
|
||||||
|
// 多人对话检测
|
||||||
|
isMultiPersonDialogueDetected = VoiceUtils.checkMultiPersonDialogue(
|
||||||
|
now = now,
|
||||||
|
recordingStartMs = stateManager.recordingStartMs,
|
||||||
|
realtimeEnergySum = realtimeEnergySum,
|
||||||
|
realtimeEnergyCount = realtimeEnergyCount,
|
||||||
|
realtimePeakRms = realtimePeakRms,
|
||||||
|
realtimeSpeechFrames = realtimeSpeechFrames,
|
||||||
|
realtimeContinuousSpeechFrames = realtimeContinuousSpeechFrames,
|
||||||
|
vadManager = vadManager
|
||||||
|
)
|
||||||
|
|
||||||
|
if (isMultiPersonDialogueDetected) {
|
||||||
|
LogUtils.w(VoiceConfig.TAG, "🚨 录音中识别出多人对话,提前终止")
|
||||||
finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms)
|
finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if (System.currentTimeMillis() - recordingStartMs > maxRecordingMs) {
|
// 最大录音时长检测
|
||||||
LogUtils.w(TAG, "⏱ Max recording reached | 当前环境基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
if (System.currentTimeMillis() - stateManager.recordingStartMs > stateManager.maxRecordingMs) {
|
||||||
|
LogUtils.w(VoiceConfig.TAG, "⏱ Max recording reached | 当前环境基线: ${stateManager.currentEnvBaseline} | 嘈杂环境: ${stateManager.isNoisyEnvironment}")
|
||||||
finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms)
|
finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ================= 实时能量更新 ================= */
|
/**
|
||||||
private fun updateRealtimeEnergy(samples: FloatArray) {
|
* 处理唤醒事件
|
||||||
val rms = vadManager.calcRms(samples)
|
*/
|
||||||
val effectiveThreshold = if (isNoisyEnvironment) currentEnvBaseline * 1.8f else MIN_EFFECTIVE_SPEECH_RMS
|
|
||||||
if (rms >= effectiveThreshold) {
|
|
||||||
realtimeEnergySum += rms
|
|
||||||
realtimeEnergyCount++
|
|
||||||
realtimePeakRms = maxOf(realtimePeakRms, rms)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ================= 实时帧统计 ================= */
|
|
||||||
private fun updateRealtimeFrameStats() {
|
|
||||||
realtimeTotalFrames = vadManager.getTotalFrames()
|
|
||||||
realtimeSpeechFrames = vadManager.getSpeechFrames()
|
|
||||||
realtimeContinuousSpeechFrames = vadManager.getContinuousSpeechFrames()
|
|
||||||
val currentFrameIsSpeech = vadManager.isSpeechDetected()
|
|
||||||
if (currentFrameIsSpeech) {
|
|
||||||
realtimeContinuousSpeechFrames = if (realtimeLastFrameIsSpeech) realtimeContinuousSpeechFrames + 1 else 1
|
|
||||||
} else {
|
|
||||||
realtimeContinuousSpeechFrames = 0
|
|
||||||
}
|
|
||||||
realtimeLastFrameIsSpeech = currentFrameIsSpeech
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ================= 多人对话检测 ================= */
|
|
||||||
private fun checkMultiPersonDialogueRealtime(now: Long): Boolean {
|
|
||||||
val duration = now - recordingStartMs
|
|
||||||
if (duration < MULTI_DIALOGUE_MIN_DURATION) return false
|
|
||||||
|
|
||||||
val avgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else 0f
|
|
||||||
val peakAvgRatio = if (avgEnergy > 0) realtimePeakRms / avgEnergy else 0f
|
|
||||||
val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f
|
|
||||||
val vadRatio = vadManager.activeSpeechRatio()
|
|
||||||
|
|
||||||
isMultiPersonDialogueDetected = duration >= MULTI_DIALOGUE_MIN_DURATION &&
|
|
||||||
peakAvgRatio in MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO..MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO &&
|
|
||||||
continuousRatio <= MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO &&
|
|
||||||
vadRatio >= MULTI_DIALOGUE_MIN_VAD_RATIO
|
|
||||||
|
|
||||||
return isMultiPersonDialogueDetected
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ================= 环境基线校准 ================= */
|
|
||||||
private fun calibrateEnvBaseline(samples: FloatArray) {
|
|
||||||
val rms = vadManager.calcRms(samples)
|
|
||||||
val validRms = if (rms < currentEnvBaseline + 0.002f) rms else currentEnvBaseline
|
|
||||||
if (rms < 0.015f) {
|
|
||||||
if (envNoiseBuffer.size >= BASELINE_WINDOW_SIZE) {
|
|
||||||
envNoiseBuffer.removeFirst()
|
|
||||||
}
|
|
||||||
envNoiseBuffer.addLast(validRms)
|
|
||||||
currentEnvBaseline = envNoiseBuffer.maxOrNull() ?: 0.001f
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ================= 唤醒处理 ================= */
|
|
||||||
private fun handleWakeupEvent() {
|
private fun handleWakeupEvent() {
|
||||||
if (state == VoiceState.UPLOADING) return
|
if (stateManager.state == VoiceState.UPLOADING) return
|
||||||
stopBackendAudio?.invoke()
|
stopBackendAudio?.invoke()
|
||||||
enterWakeup(interrupt = true)
|
stateManager.enterWakeup(interrupt = true, resetRealtimeStats = ::resetRealtimeStats)
|
||||||
}
|
|
||||||
|
|
||||||
private fun enterWakeup(interrupt: Boolean) {
|
|
||||||
waitSpeechFailStartMs = System.currentTimeMillis()
|
|
||||||
waitSpeechStartMs = System.currentTimeMillis()
|
|
||||||
|
|
||||||
hasInvalidSpeech = false
|
|
||||||
currentTimeoutType = TimeoutType.IDLE_TIMEOUT
|
|
||||||
|
|
||||||
if (interrupt) {
|
|
||||||
audioBuffer.clear()
|
|
||||||
vadManager.reset()
|
|
||||||
vadStarted = false
|
|
||||||
resetRealtimeStats()
|
|
||||||
}
|
|
||||||
|
|
||||||
inKwsObserve = true
|
|
||||||
kwsObserveStartMs = System.currentTimeMillis()
|
|
||||||
onWakeup()
|
onWakeup()
|
||||||
LogUtils.d(TAG, "🔔 唤醒成功 | 环境基线: $currentEnvBaseline")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* VAD开始回调
|
||||||
|
*/
|
||||||
private fun onVadStart() {
|
private fun onVadStart() {
|
||||||
if (state != VoiceState.WAIT_SPEECH) return
|
stateManager.onVadStart(
|
||||||
LogUtils.d(TAG, "🎤 REAL VAD START | 环境基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
audioBuffer = audioBuffer,
|
||||||
vadStarted = true
|
preBuffer = preBuffer,
|
||||||
recordingStartMs = System.currentTimeMillis()
|
resetRealtimeStats = ::resetRealtimeStats
|
||||||
audioBuffer.clear()
|
)
|
||||||
audioBuffer.addAll(preBuffer)
|
|
||||||
resetRealtimeStats()
|
|
||||||
state = VoiceState.RECORDING
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* VAD结束回调
|
||||||
|
*/
|
||||||
private fun onVadEnd(avgEnergy: Float, peakRms: Float) {
|
private fun onVadEnd(avgEnergy: Float, peakRms: Float) {
|
||||||
if (state != VoiceState.RECORDING) return
|
if (stateManager.state != VoiceState.RECORDING) return
|
||||||
LogUtils.d(TAG, "🧠 VAD END | 环境基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
LogUtils.d(VoiceConfig.TAG, "🧠 VAD END | 环境基线: ${stateManager.currentEnvBaseline} | 嘈杂环境: ${stateManager.isNoisyEnvironment}")
|
||||||
val realAvgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else avgEnergy
|
val realAvgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else avgEnergy
|
||||||
val realPeakRms = if (realtimePeakRms > 0) realtimePeakRms else peakRms
|
val realPeakRms = if (realtimePeakRms > 0) realtimePeakRms else peakRms
|
||||||
finishSentence(realAvgEnergy, realPeakRms)
|
finishSentence(realAvgEnergy, realPeakRms)
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ================= 微弱人声过滤 ================= */
|
/**
|
||||||
private fun filterWeakVoice(duration: Long, avgEnergy: Float, peakRms: Float): Boolean {
|
* 结束录音处理
|
||||||
if (duration < MIN_EFFECTIVE_VOICE_DURATION) {
|
*/
|
||||||
LogUtils.w(TAG, "❌ 微弱人声过滤:时长${duration}ms < ${MIN_EFFECTIVE_VOICE_DURATION}ms")
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
|
|
||||||
val voiceFrameRatio = if (realtimeTotalFrames > 0) realtimeSpeechFrames.toFloat() / realtimeTotalFrames else 0f
|
|
||||||
if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && voiceFrameRatio < MIN_VOICE_FRAME_RATIO) {
|
|
||||||
LogUtils.w(TAG, "❌ 微弱人声过滤:帧占比${voiceFrameRatio} < ${MIN_VOICE_FRAME_RATIO}")
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
|
|
||||||
val peakBaselineRatio = peakRms / currentEnvBaseline
|
|
||||||
if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && peakBaselineRatio < MIN_PEAK_ENERGY_RATIO) {
|
|
||||||
LogUtils.w(TAG, "❌ 微弱人声过滤:峰值/基线${peakBaselineRatio} < ${MIN_PEAK_ENERGY_RATIO}")
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
|
|
||||||
if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && realtimeContinuousSpeechFrames < MIN_CONTINUOUS_VOICE_FRAMES) {
|
|
||||||
LogUtils.w(TAG, "❌ 微弱人声过滤:连续帧${realtimeContinuousSpeechFrames} < ${MIN_CONTINUOUS_VOICE_FRAMES}")
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
|
|
||||||
val energyBaselineRatio = avgEnergy / currentEnvBaseline
|
|
||||||
if (avgEnergy < 0.005f && energyBaselineRatio < 1.2f) {
|
|
||||||
LogUtils.w(TAG, "❌ 微弱人声过滤:能量/基线${energyBaselineRatio} < 1.2")
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ================= 结束录音 ================= */
|
|
||||||
private fun finishSentence(avgEnergy: Float = 0f, peakRms: Float = 0f) {
|
private fun finishSentence(avgEnergy: Float = 0f, peakRms: Float = 0f) {
|
||||||
val now = System.currentTimeMillis()
|
val now = System.currentTimeMillis()
|
||||||
val duration = now - recordingStartMs
|
val duration = now - stateManager.recordingStartMs
|
||||||
|
|
||||||
if (!vadStarted || duration < MIN_SPEECH_MS) {
|
// 基础过滤:语音过短
|
||||||
LogUtils.d(TAG, "❌ 语音过短: $duration ms | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
if (!stateManager.vadStarted || duration < VoiceConfig.MIN_SPEECH_MS) {
|
||||||
hasInvalidSpeech = true
|
LogUtils.d(VoiceConfig.TAG, "❌ 语音过短: $duration ms | 基线: ${stateManager.currentEnvBaseline} | 嘈杂环境: ${stateManager.isNoisyEnvironment}")
|
||||||
resetToWaitSpeech()
|
stateManager.hasInvalidSpeech = true
|
||||||
|
stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if (filterWeakVoice(duration, avgEnergy, peakRms)) {
|
// 微弱人声过滤
|
||||||
hasInvalidSpeech = true
|
if (VoiceUtils.filterWeakVoice(
|
||||||
resetToWaitSpeech()
|
duration = duration,
|
||||||
|
avgEnergy = avgEnergy,
|
||||||
|
peakRms = peakRms,
|
||||||
|
currentEnvBaseline = stateManager.currentEnvBaseline,
|
||||||
|
realtimeTotalFrames = realtimeTotalFrames,
|
||||||
|
realtimeSpeechFrames = realtimeSpeechFrames,
|
||||||
|
realtimeContinuousSpeechFrames = realtimeContinuousSpeechFrames
|
||||||
|
)
|
||||||
|
) {
|
||||||
|
stateManager.hasInvalidSpeech = true
|
||||||
|
stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -424,104 +273,95 @@ class VoiceController(
|
|||||||
val vadRatio = vadManager.activeSpeechRatio()
|
val vadRatio = vadManager.activeSpeechRatio()
|
||||||
val peakAvgRatio = if (avgEnergy > 0f) peakRms / avgEnergy else 0f
|
val peakAvgRatio = if (avgEnergy > 0f) peakRms / avgEnergy else 0f
|
||||||
|
|
||||||
LogUtils.d(TAG, "📊 录音信息 | 时长: $duration ms | 能量: $avgEnergy | 峰均比: $peakAvgRatio | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
LogUtils.d(VoiceConfig.TAG, "📊 录音信息 | 时长: $duration ms | 能量: $avgEnergy | 峰均比: $peakAvgRatio | 基线: ${stateManager.currentEnvBaseline} | 嘈杂环境: ${stateManager.isNoisyEnvironment}")
|
||||||
LogUtils.d(TAG, "📊 实时帧统计 | 总帧: $realtimeTotalFrames | 语音帧: $realtimeSpeechFrames | 连续语音帧: $realtimeContinuousSpeechFrames")
|
LogUtils.d(VoiceConfig.TAG, "📊 实时帧统计 | 总帧: $realtimeTotalFrames | 语音帧: $realtimeSpeechFrames | 连续语音帧: $realtimeContinuousSpeechFrames")
|
||||||
|
|
||||||
|
// 多人对话过滤
|
||||||
if (isMultiPersonDialogueDetected) {
|
if (isMultiPersonDialogueDetected) {
|
||||||
LogUtils.w(TAG, "❌ 过滤多人对话垃圾语音 | 时长: $duration ms")
|
LogUtils.w(VoiceConfig.TAG, "❌ 过滤多人对话垃圾语音 | 时长: $duration ms")
|
||||||
hasInvalidSpeech = true
|
stateManager.hasInvalidSpeech = true
|
||||||
resetToWaitSpeech()
|
stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// 声纹验证(核心极简版)
|
// 声纹验证
|
||||||
if (ENABLE_STRICT_SPEAKER_VERIFY) {
|
if (VoiceConfig.ENABLE_STRICT_SPEAKER_VERIFY) {
|
||||||
val isCurrentUser = verifySpeaker(audio)
|
val isCurrentUser = VoiceUtils.verifySpeaker(
|
||||||
|
audio = audio,
|
||||||
|
isNoisyEnvironment = stateManager.isNoisyEnvironment,
|
||||||
|
extractor = speakerExtractor,
|
||||||
|
manager = speakerManager
|
||||||
|
)
|
||||||
if (!isCurrentUser) {
|
if (!isCurrentUser) {
|
||||||
LogUtils.w(TAG, "❌ 非当前唤醒用户,拒绝语音 | 录音时长: $duration ms | 嘈杂环境: $isNoisyEnvironment")
|
LogUtils.w(VoiceConfig.TAG, "❌ 非当前唤醒用户,拒绝语音 | 录音时长: $duration ms | 嘈杂环境: ${stateManager.isNoisyEnvironment}")
|
||||||
hasInvalidSpeech = true
|
stateManager.hasInvalidSpeech = true
|
||||||
resetToWaitSpeech()
|
stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
LogUtils.d(TAG, "✅ 当前用户语音,继续处理 | 录音时长: $duration ms | 嘈杂环境: $isNoisyEnvironment")
|
LogUtils.d(VoiceConfig.TAG, "✅ 当前用户语音,继续处理 | 录音时长: $duration ms | 嘈杂环境: ${stateManager.isNoisyEnvironment}")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// 远场过滤
|
// 远场过滤
|
||||||
val isFarField = avgEnergy < MAX_FAR_FIELD_ENERGY
|
val isFarField = avgEnergy < VoiceConfig.MAX_FAR_FIELD_ENERGY
|
||||||
val isInvalidPeakRatio = peakAvgRatio < MIN_VALID_PEAK_AVG_RATIO
|
val isInvalidPeakRatio = peakAvgRatio < VoiceConfig.MIN_VALID_PEAK_AVG_RATIO
|
||||||
if (isFarField && isInvalidPeakRatio) {
|
if (isFarField && isInvalidPeakRatio) {
|
||||||
LogUtils.w(TAG, "❌ 远场/无效语音过滤 | 能量: $avgEnergy < $MAX_FAR_FIELD_ENERGY")
|
LogUtils.w(VoiceConfig.TAG, "❌ 远场/无效语音过滤 | 能量: $avgEnergy < ${VoiceConfig.MAX_FAR_FIELD_ENERGY}")
|
||||||
hasInvalidSpeech = true
|
stateManager.hasInvalidSpeech = true
|
||||||
resetToWaitSpeech()
|
stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// 非连续判定
|
// 非连续判定
|
||||||
val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f
|
val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f
|
||||||
val peakPositionRatio = vadManager.getPeakPositionRatio()
|
val peakPositionRatio = vadManager.getPeakPositionRatio()
|
||||||
val isDiscontinuous = continuousRatio < MIN_CONTINUOUS_FRAME_RATIO &&
|
val isDiscontinuous = continuousRatio < VoiceConfig.MIN_CONTINUOUS_FRAME_RATIO &&
|
||||||
realtimeSpeechFrames < MIN_EFFECTIVE_SPEECH_FRAMES &&
|
realtimeSpeechFrames < VoiceConfig.MIN_EFFECTIVE_SPEECH_FRAMES &&
|
||||||
peakPositionRatio > MAX_PEAK_POSITION_RATIO
|
peakPositionRatio > VoiceConfig.MAX_PEAK_POSITION_RATIO
|
||||||
if (isDiscontinuous) {
|
if (isDiscontinuous) {
|
||||||
LogUtils.w(TAG, "❌ 非连续杂音过滤 | 连续占比: $continuousRatio < $MIN_CONTINUOUS_FRAME_RATIO")
|
LogUtils.w(VoiceConfig.TAG, "❌ 非连续杂音过滤 | 连续占比: $continuousRatio < ${VoiceConfig.MIN_CONTINUOUS_FRAME_RATIO}")
|
||||||
hasInvalidSpeech = true
|
stateManager.hasInvalidSpeech = true
|
||||||
resetToWaitSpeech()
|
stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// 分场景阈值过滤
|
// 分场景阈值过滤
|
||||||
val isQuietEnv = currentEnvBaseline < BASELINE_QUIET_THRESHOLD
|
val thresholdConfig = VoiceUtils.getThresholdConfig(duration, stateManager.currentEnvBaseline)
|
||||||
val thresholdConfig = when {
|
|
||||||
duration in SHORT_SPEECH_MIN..SHORT_SPEECH_MAX -> {
|
|
||||||
val coeff = if (isQuietEnv) SHORT_SPEECH_ENERGY_COEFF_QUIET else SHORT_SPEECH_ENERGY_COEFF_NOISY
|
|
||||||
val energyThreshold = currentEnvBaseline * coeff
|
|
||||||
ThresholdConfig(energyThreshold, SHORT_SPEECH_VAD_COEFF, SHORT_SPEECH_MIN_SCORE, "短语音")
|
|
||||||
}
|
|
||||||
else -> {
|
|
||||||
val coeff = if (isQuietEnv) LONG_SPEECH_ENERGY_COEFF_QUIET else LONG_SPEECH_ENERGY_COEFF_NOISY
|
|
||||||
val energyThreshold = currentEnvBaseline * coeff
|
|
||||||
ThresholdConfig(energyThreshold, LONG_SPEECH_VAD_COEFF, LONG_SPEECH_MIN_SCORE, "长语音")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
val energyPass = avgEnergy >= thresholdConfig.energyThreshold
|
val energyPass = avgEnergy >= thresholdConfig.energyThreshold
|
||||||
val vadRatioPass = vadRatio >= thresholdConfig.vadRatioThreshold
|
val vadRatioPass = vadRatio >= thresholdConfig.vadRatioThreshold
|
||||||
if (!energyPass || !vadRatioPass) {
|
if (!energyPass || !vadRatioPass) {
|
||||||
LogUtils.w(TAG, "❌ 低能量语音阈值过滤 | 能量: $avgEnergy < ${thresholdConfig.energyThreshold} | 占比: $vadRatio < ${thresholdConfig.vadRatioThreshold} | 场景: ${thresholdConfig.scene}")
|
LogUtils.w(VoiceConfig.TAG, "❌ 低能量语音阈值过滤 | 能量: $avgEnergy < ${thresholdConfig.energyThreshold} | 占比: $vadRatio < ${thresholdConfig.vadRatioThreshold} | 场景: ${thresholdConfig.scene}")
|
||||||
hasInvalidSpeech = true
|
stateManager.hasInvalidSpeech = true
|
||||||
resetToWaitSpeech()
|
stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// 评分判定
|
// 评分判定
|
||||||
var score = 0
|
val score = VoiceUtils.calculateSpeechScore(
|
||||||
score += when {
|
duration = duration,
|
||||||
duration >= 4000 -> 3
|
avgEnergy = avgEnergy,
|
||||||
duration >= 2500 -> 2
|
continuousRatio = continuousRatio,
|
||||||
else -> 1
|
thresholdConfig = thresholdConfig
|
||||||
}
|
)
|
||||||
score += if (avgEnergy >= thresholdConfig.energyThreshold) 1 else 0
|
|
||||||
score += if (continuousRatio >= MIN_CONTINUOUS_FRAME_RATIO) 1 else 0
|
|
||||||
|
|
||||||
val pass = score >= thresholdConfig.minScore
|
val pass = score >= thresholdConfig.minScore
|
||||||
if (!pass) {
|
if (!pass) {
|
||||||
LogUtils.w(TAG, "❌ 评分不足过滤 | 总分: $score < ${thresholdConfig.minScore} | 场景: ${thresholdConfig.scene}")
|
LogUtils.w(VoiceConfig.TAG, "❌ 评分不足过滤 | 总分: $score < ${thresholdConfig.minScore} | 场景: ${thresholdConfig.scene}")
|
||||||
hasInvalidSpeech = true
|
stateManager.hasInvalidSpeech = true
|
||||||
resetToWaitSpeech()
|
stateManager.resetToWaitSpeech(::resetRealtimeStats, audioBuffer, vadManager)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// 最终通过
|
// 最终通过
|
||||||
audioBuffer.clear()
|
audioBuffer.clear()
|
||||||
state = VoiceState.UPLOADING
|
stateManager.state = VoiceState.UPLOADING
|
||||||
onFinalAudio(audio)
|
onFinalAudio(audio)
|
||||||
resetRealtimeStats()
|
resetRealtimeStats()
|
||||||
hasInvalidSpeech = false
|
stateManager.hasInvalidSpeech = false
|
||||||
LogUtils.i(TAG, "✅ 语音通过 | 时长: $duration ms | 能量: $avgEnergy | 场景: ${thresholdConfig.scene} | 嘈杂环境: $isNoisyEnvironment")
|
LogUtils.i(VoiceConfig.TAG, "✅ 语音通过 | 时长: $duration ms | 能量: $avgEnergy | 场景: ${thresholdConfig.scene} | 嘈杂环境: ${stateManager.isNoisyEnvironment}")
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ================= 重置实时统计 ================= */
|
/**
|
||||||
|
* 重置实时统计
|
||||||
|
*/
|
||||||
private fun resetRealtimeStats() {
|
private fun resetRealtimeStats() {
|
||||||
realtimeEnergySum = 0f
|
realtimeEnergySum = 0f
|
||||||
realtimeEnergyCount = 0
|
realtimeEnergyCount = 0
|
||||||
@ -533,95 +373,34 @@ class VoiceController(
|
|||||||
isMultiPersonDialogueDetected = false
|
isMultiPersonDialogueDetected = false
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ================= 播放/上传回调 ================= */
|
// ================= 对外API(完全不变) =================
|
||||||
fun onPlayStartPrompt() {
|
fun onPlayStartPrompt() = stateManager.onPlayStartPrompt()
|
||||||
LogUtils.d(TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
fun onPlayEndPrompt() = stateManager.onPlayEndPrompt()
|
||||||
state = VoiceState.PLAYING_PROMPT
|
fun onPlayStartBackend() = stateManager.onPlayStartBackend()
|
||||||
}
|
fun onPlayEndBackend() = stateManager.onPlayEndBackend()
|
||||||
|
fun onUploadFinished(success: Boolean) = stateManager.onUploadFinished(success)
|
||||||
fun onPlayEndPrompt() {
|
|
||||||
speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
|
|
||||||
LogUtils.d(TAG, "🎵 提示音结束 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
|
||||||
state = VoiceState.WAIT_SPEECH_COOLDOWN
|
|
||||||
}
|
|
||||||
|
|
||||||
fun onPlayStartBackend() {
|
|
||||||
if (state != VoiceState.UPLOADING) {
|
|
||||||
LogUtils.w(TAG, "🎶 非上传完成状态,禁止切换到 PLAYING_BACKEND | 当前状态: $state")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
LogUtils.d(TAG, "🎶 开始播放后台音频 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
|
||||||
state = VoiceState.PLAYING_BACKEND
|
|
||||||
}
|
|
||||||
|
|
||||||
fun onPlayEndBackend() {
|
|
||||||
speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
|
|
||||||
LogUtils.d(TAG, "🎶 后台音频结束 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
|
||||||
state = VoiceState.WAIT_SPEECH_COOLDOWN
|
|
||||||
}
|
|
||||||
|
|
||||||
fun onUploadFinished(success: Boolean) {
|
|
||||||
if (state != VoiceState.UPLOADING) return
|
|
||||||
LogUtils.d(TAG, "📤 上传完成 | 成功: $success | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
|
||||||
|
|
||||||
if (!success) {
|
|
||||||
speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
|
|
||||||
state = VoiceState.WAIT_SPEECH_COOLDOWN
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private fun resetToWaitSpeech() {
|
|
||||||
LogUtils.d(TAG, "🔄 重置到等待说话 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment | 已标记无效说话: $hasInvalidSpeech")
|
|
||||||
val now = System.currentTimeMillis()
|
|
||||||
if (now - lastInvalidResetMs < INVALID_RESET_DEBOUNCE_MS) {
|
|
||||||
LogUtils.d(TAG, "🛡 防抖:1.5秒内重复无效语音,跳过重置")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
lastInvalidResetMs = now
|
|
||||||
audioBuffer.clear()
|
|
||||||
vadManager.reset()
|
|
||||||
vadStarted = false
|
|
||||||
resetRealtimeStats()
|
|
||||||
state = VoiceState.WAIT_SPEECH
|
|
||||||
if (waitSpeechFailStartMs == 0L) waitSpeechFailStartMs = System.currentTimeMillis()
|
|
||||||
}
|
|
||||||
|
|
||||||
private fun resetAll() {
|
|
||||||
LogUtils.d(TAG, "🔄 重置所有状态 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment | 本次超时类型: $currentTimeoutType")
|
|
||||||
audioBuffer.clear()
|
|
||||||
preBuffer.clear()
|
|
||||||
vadManager.reset()
|
|
||||||
wakeupManager.reset()
|
|
||||||
vadStarted = false
|
|
||||||
waitSpeechStartMs = 0L
|
|
||||||
waitSpeechFailStartMs = 0L
|
|
||||||
envNoiseBuffer.clear()
|
|
||||||
currentEnvBaseline = 0.001f
|
|
||||||
isNoisyEnvironment = false
|
|
||||||
resetRealtimeStats()
|
|
||||||
hasInvalidSpeech = false
|
|
||||||
currentTimeoutType = TimeoutType.IDLE_TIMEOUT
|
|
||||||
state = VoiceState.WAIT_WAKEUP
|
|
||||||
}
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 资源释放
|
||||||
|
*/
|
||||||
fun release() {
|
fun release() {
|
||||||
LogUtils.d(TAG, "🔌 释放资源 | 最终基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
LogUtils.d(VoiceConfig.TAG, "🔌 释放资源 | 最终基线: ${stateManager.currentEnvBaseline} | 嘈杂环境: ${stateManager.isNoisyEnvironment}")
|
||||||
wakeupManager.release()
|
wakeupManager.release()
|
||||||
vadManager.reset()
|
vadManager.reset()
|
||||||
envNoiseBuffer.clear()
|
envNoiseBuffer.clear()
|
||||||
resetRealtimeStats()
|
resetRealtimeStats()
|
||||||
hasInvalidSpeech = false
|
stateManager.hasInvalidSpeech = false
|
||||||
currentTimeoutType = TimeoutType.IDLE_TIMEOUT
|
stateManager.currentTimeoutType = TimeoutType.IDLE_TIMEOUT
|
||||||
isNoisyEnvironment = false
|
stateManager.isNoisyEnvironment = false
|
||||||
|
|
||||||
runCatching {
|
runCatching {
|
||||||
SpeakerRecognition.extractor.release()
|
speakerExtractor.release()
|
||||||
speakerManagerLock.withLock {
|
speakerManagerLock.withLock {
|
||||||
SpeakerRecognition.manager.release()
|
speakerManager.release()
|
||||||
}
|
}
|
||||||
LogUtils.d(TAG, "✅ 声纹识别器资源已释放")
|
LogUtils.d(VoiceConfig.TAG, "✅ 声纹识别器资源已释放")
|
||||||
}.onFailure {
|
}.onFailure {
|
||||||
LogUtils.e(TAG, "❌ 释放声纹识别器资源失败", it)
|
LogUtils.e(VoiceConfig.TAG, "❌ 释放声纹识别器资源失败", it)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -629,85 +408,7 @@ class VoiceController(
|
|||||||
runCatching {
|
runCatching {
|
||||||
release()
|
release()
|
||||||
}.onFailure {
|
}.onFailure {
|
||||||
LogUtils.e(TAG, "❌ finalize 释放资源失败", it)
|
LogUtils.e(VoiceConfig.TAG, "❌ finalize 释放资源失败", it)
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private fun cachePreBuffer(samples: FloatArray) {
|
|
||||||
for (s in samples) {
|
|
||||||
preBuffer.addLast(s)
|
|
||||||
if (preBuffer.size > PRE_BUFFER_SIZE) preBuffer.removeFirst()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 阈值配置数据类
|
|
||||||
private data class ThresholdConfig(
|
|
||||||
val energyThreshold: Float,
|
|
||||||
val vadRatioThreshold: Float,
|
|
||||||
val minScore: Int,
|
|
||||||
val scene: String
|
|
||||||
)
|
|
||||||
|
|
||||||
/* ================= 核心:极简版声纹验证 ================= */
|
|
||||||
private fun verifySpeaker(audio: FloatArray): Boolean {
|
|
||||||
if (audio.isEmpty()) {
|
|
||||||
LogUtils.w(TAG, "❌ 待验证音频为空,声纹验证失败")
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
// 1. 裁剪音频:只保留本次录音的有效部分(解决时长不匹配问题)
|
|
||||||
val audioDurationMs = (audio.size.toFloat() / SAMPLE_RATE * 1000).toLong()
|
|
||||||
// 只保留最后 N 毫秒的音频(N = 实际录音时长),避免缓存旧音频
|
|
||||||
val validAudio = if (audioDurationMs > 0) {
|
|
||||||
val validSampleCount = (audioDurationMs * SAMPLE_RATE / 1000).toInt()
|
|
||||||
if (validSampleCount < audio.size) {
|
|
||||||
audio.copyOfRange(audio.size - validSampleCount, audio.size)
|
|
||||||
} else {
|
|
||||||
audio
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
audio
|
|
||||||
}
|
|
||||||
|
|
||||||
// 2. 分场景选阈值(无容错,只调阈值)
|
|
||||||
val finalThreshold = when {
|
|
||||||
audioDurationMs < SHORT_AUDIO_DURATION_MS -> SPEAKER_THRESHOLD_SHORT
|
|
||||||
isNoisyEnvironment -> SPEAKER_THRESHOLD_NOISY
|
|
||||||
else -> SPEAKER_THRESHOLD_QUIET
|
|
||||||
}
|
|
||||||
|
|
||||||
var stream: OnlineStream? = null
|
|
||||||
return try {
|
|
||||||
stream = SpeakerRecognition.extractor.createStream()
|
|
||||||
stream.acceptWaveform(samples = validAudio, sampleRate = SAMPLE_RATE) // 用裁剪后的音频验证
|
|
||||||
stream.inputFinished()
|
|
||||||
|
|
||||||
if (!SpeakerRecognition.extractor.isReady(stream)) {
|
|
||||||
LogUtils.w(TAG, "❌ 音频Stream未就绪,验证失败")
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
val embedding = SpeakerRecognition.extractor.compute(stream)
|
|
||||||
|
|
||||||
// 3. 纯验证逻辑:过就过,不过就拒绝
|
|
||||||
speakerManagerLock.withLock {
|
|
||||||
val verifyPass = SpeakerRecognition.manager.verify(
|
|
||||||
name = CURRENT_USER_ID,
|
|
||||||
embedding = embedding,
|
|
||||||
threshold = finalThreshold
|
|
||||||
)
|
|
||||||
|
|
||||||
// 打印关键信息(补充裁剪后时长)
|
|
||||||
LogUtils.d(TAG, "📊 声纹验证 | 阈值: $finalThreshold | 通过: $verifyPass | 嘈杂环境: $isNoisyEnvironment | 原始时长: ${audioDurationMs}ms | 验证时长: ${(validAudio.size.toFloat()/SAMPLE_RATE*1000).toLong()}ms")
|
|
||||||
|
|
||||||
// 无任何容错:验证结果就是最终结果
|
|
||||||
return verifyPass
|
|
||||||
}
|
|
||||||
} catch (e: Exception) {
|
|
||||||
LogUtils.e(TAG, "❌ 声纹验证异常,拒绝", e)
|
|
||||||
return false
|
|
||||||
} finally {
|
|
||||||
stream?.release()
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
211
app/src/main/java/com/zs/smarthuman/sherpa/VoiceStateManager.kt
Normal file
211
app/src/main/java/com/zs/smarthuman/sherpa/VoiceStateManager.kt
Normal file
@ -0,0 +1,211 @@
|
|||||||
|
package com.zs.smarthuman.sherpa
|
||||||
|
|
||||||
|
import com.blankj.utilcode.util.LogUtils
|
||||||
|
import java.util.ArrayDeque
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 语音控制器状态管理类
|
||||||
|
*/
|
||||||
|
class VoiceStateManager(
|
||||||
|
idleTimeoutSeconds: Int,
|
||||||
|
maxRecordingSeconds: Int,
|
||||||
|
private val onStateChanged: ((VoiceState) -> Unit)?,
|
||||||
|
private val onTimeoutTip: OnTimeoutTip?
|
||||||
|
) {
|
||||||
|
var state: VoiceState = VoiceState.WAIT_WAKEUP
|
||||||
|
set(value) {
|
||||||
|
field = value
|
||||||
|
LogUtils.d(VoiceConfig.TAG, "➡ State = $value")
|
||||||
|
onStateChanged?.invoke(value)
|
||||||
|
}
|
||||||
|
|
||||||
|
// 超时相关
|
||||||
|
val idleTimeoutMs = idleTimeoutSeconds * 1000L
|
||||||
|
val maxRecordingMs = maxRecordingSeconds * 1000L
|
||||||
|
var waitSpeechFailStartMs = 0L
|
||||||
|
var waitSpeechStartMs = 0L
|
||||||
|
var speechEnableAtMs = 0L
|
||||||
|
var lastInvalidResetMs = 0L
|
||||||
|
|
||||||
|
// 无效说话标记 + 超时类型
|
||||||
|
var hasInvalidSpeech = false
|
||||||
|
var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT
|
||||||
|
|
||||||
|
// 唤醒观察标记
|
||||||
|
var inKwsObserve = false
|
||||||
|
var kwsObserveStartMs = 0L
|
||||||
|
|
||||||
|
// 环境状态
|
||||||
|
var isNoisyEnvironment = false
|
||||||
|
var currentEnvBaseline = 0.001f
|
||||||
|
|
||||||
|
// 录音状态
|
||||||
|
var recordingStartMs = 0L
|
||||||
|
var vadStarted = false
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 检查等待说话超时
|
||||||
|
* 修复点:返回是否超时,由外部调用 resetAll()(避免内部依赖外部对象)
|
||||||
|
*/
|
||||||
|
fun checkWaitSpeechTimeout(now: Long): Boolean {
|
||||||
|
val isTimeout = (waitSpeechStartMs > 0 && now - waitSpeechStartMs >= idleTimeoutMs) ||
|
||||||
|
(waitSpeechFailStartMs > 0 && now - waitSpeechFailStartMs >= idleTimeoutMs)
|
||||||
|
|
||||||
|
if (isTimeout) {
|
||||||
|
currentTimeoutType = if (hasInvalidSpeech) {
|
||||||
|
TimeoutType.INVALID_SPEECH_TIMEOUT
|
||||||
|
} else {
|
||||||
|
TimeoutType.IDLE_TIMEOUT
|
||||||
|
}
|
||||||
|
LogUtils.d(VoiceConfig.TAG, "⏱ WAIT_SPEECH timeout → WAIT_WAKEUP | 超时类型: $currentTimeoutType")
|
||||||
|
onTimeoutTip?.invoke(currentTimeoutType)
|
||||||
|
// 修复点:不再内部调用 resetAll(),改为返回超时状态,由外部处理
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 处理等待说话冷却状态
|
||||||
|
*/
|
||||||
|
fun handleWaitSpeechCooldown(now: Long): Boolean {
|
||||||
|
if (now >= speechEnableAtMs) {
|
||||||
|
waitSpeechFailStartMs = now
|
||||||
|
state = VoiceState.WAIT_SPEECH
|
||||||
|
waitSpeechStartMs = now
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 进入唤醒状态
|
||||||
|
*/
|
||||||
|
fun enterWakeup(interrupt: Boolean, resetRealtimeStats: () -> Unit) {
|
||||||
|
val now = System.currentTimeMillis()
|
||||||
|
waitSpeechFailStartMs = now
|
||||||
|
waitSpeechStartMs = now
|
||||||
|
hasInvalidSpeech = false
|
||||||
|
currentTimeoutType = TimeoutType.IDLE_TIMEOUT
|
||||||
|
|
||||||
|
if (interrupt) {
|
||||||
|
resetRealtimeStats()
|
||||||
|
vadStarted = false
|
||||||
|
}
|
||||||
|
|
||||||
|
inKwsObserve = true
|
||||||
|
kwsObserveStartMs = now
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 重置到等待说话状态
|
||||||
|
*/
|
||||||
|
fun resetToWaitSpeech(resetRealtimeStats: () -> Unit, audioBuffer: MutableList<Float>, vadManager: VadManager) {
|
||||||
|
LogUtils.d(VoiceConfig.TAG, "🔄 重置到等待说话 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment | 已标记无效说话: $hasInvalidSpeech")
|
||||||
|
val now = System.currentTimeMillis()
|
||||||
|
if (now - lastInvalidResetMs < VoiceConfig.INVALID_RESET_DEBOUNCE_MS) {
|
||||||
|
LogUtils.d(VoiceConfig.TAG, "🛡 防抖:1.5秒内重复无效语音,跳过重置")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
lastInvalidResetMs = now
|
||||||
|
audioBuffer.clear()
|
||||||
|
vadManager.reset()
|
||||||
|
vadStarted = false
|
||||||
|
resetRealtimeStats()
|
||||||
|
state = VoiceState.WAIT_SPEECH
|
||||||
|
if (waitSpeechFailStartMs == 0L) waitSpeechFailStartMs = System.currentTimeMillis()
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 重置所有状态
|
||||||
|
*/
|
||||||
|
fun resetAll(
|
||||||
|
resetRealtimeStats: () -> Unit,
|
||||||
|
audioBuffer: MutableList<Float>,
|
||||||
|
preBuffer: ArrayDeque<Float>,
|
||||||
|
vadManager: VadManager,
|
||||||
|
wakeupManager: WakeupManager,
|
||||||
|
envNoiseBuffer: ArrayDeque<Float>
|
||||||
|
) {
|
||||||
|
LogUtils.d(VoiceConfig.TAG, "🔄 重置所有状态 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment | 本次超时类型: $currentTimeoutType")
|
||||||
|
audioBuffer.clear()
|
||||||
|
preBuffer.clear()
|
||||||
|
vadManager.reset()
|
||||||
|
wakeupManager.reset()
|
||||||
|
vadStarted = false
|
||||||
|
waitSpeechStartMs = 0L
|
||||||
|
waitSpeechFailStartMs = 0L
|
||||||
|
envNoiseBuffer.clear()
|
||||||
|
currentEnvBaseline = 0.001f
|
||||||
|
isNoisyEnvironment = false
|
||||||
|
resetRealtimeStats()
|
||||||
|
hasInvalidSpeech = false
|
||||||
|
currentTimeoutType = TimeoutType.IDLE_TIMEOUT
|
||||||
|
state = VoiceState.WAIT_WAKEUP
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 播放提示音开始
|
||||||
|
*/
|
||||||
|
fun onPlayStartPrompt() {
|
||||||
|
LogUtils.d(VoiceConfig.TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
||||||
|
state = VoiceState.PLAYING_PROMPT
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 播放提示音结束
|
||||||
|
*/
|
||||||
|
fun onPlayEndPrompt() {
|
||||||
|
speechEnableAtMs = System.currentTimeMillis() + VoiceConfig.SPEECH_COOLDOWN_MS
|
||||||
|
LogUtils.d(VoiceConfig.TAG, "🎵 提示音结束 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
||||||
|
state = VoiceState.WAIT_SPEECH_COOLDOWN
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 播放后台音频开始
|
||||||
|
*/
|
||||||
|
fun onPlayStartBackend() {
|
||||||
|
if (state != VoiceState.UPLOADING) {
|
||||||
|
LogUtils.w(VoiceConfig.TAG, "🎶 非上传完成状态,禁止切换到 PLAYING_BACKEND | 当前状态: $state")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
LogUtils.d(VoiceConfig.TAG, "🎶 开始播放后台音频 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
||||||
|
state = VoiceState.PLAYING_BACKEND
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 播放后台音频结束
|
||||||
|
*/
|
||||||
|
fun onPlayEndBackend() {
|
||||||
|
speechEnableAtMs = System.currentTimeMillis() + VoiceConfig.SPEECH_COOLDOWN_MS
|
||||||
|
LogUtils.d(VoiceConfig.TAG, "🎶 后台音频结束 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
||||||
|
state = VoiceState.WAIT_SPEECH_COOLDOWN
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 上传完成
|
||||||
|
*/
|
||||||
|
fun onUploadFinished(success: Boolean) {
|
||||||
|
if (state != VoiceState.UPLOADING) return
|
||||||
|
LogUtils.d(VoiceConfig.TAG, "📤 上传完成 | 成功: $success | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
||||||
|
|
||||||
|
if (!success) {
|
||||||
|
speechEnableAtMs = System.currentTimeMillis() + VoiceConfig.SPEECH_COOLDOWN_MS
|
||||||
|
state = VoiceState.WAIT_SPEECH_COOLDOWN
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* VAD开始回调
|
||||||
|
*/
|
||||||
|
fun onVadStart(audioBuffer: MutableList<Float>, preBuffer: ArrayDeque<Float>, resetRealtimeStats: () -> Unit) {
|
||||||
|
if (state != VoiceState.WAIT_SPEECH) return
|
||||||
|
LogUtils.d(VoiceConfig.TAG, "🎤 REAL VAD START | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
||||||
|
vadStarted = true
|
||||||
|
recordingStartMs = System.currentTimeMillis()
|
||||||
|
audioBuffer.clear()
|
||||||
|
audioBuffer.addAll(preBuffer)
|
||||||
|
resetRealtimeStats()
|
||||||
|
state = VoiceState.RECORDING
|
||||||
|
}
|
||||||
|
}
|
||||||
355
app/src/main/java/com/zs/smarthuman/sherpa/VoiceUtils.kt
Normal file
355
app/src/main/java/com/zs/smarthuman/sherpa/VoiceUtils.kt
Normal file
@ -0,0 +1,355 @@
|
|||||||
|
package com.zs.smarthuman.sherpa
|
||||||
|
|
||||||
|
import com.blankj.utilcode.util.LogUtils
|
||||||
|
import com.k2fsa.sherpa.onnx.OnlineStream
|
||||||
|
import com.k2fsa.sherpa.onnx.SpeakerEmbeddingExtractor
|
||||||
|
import com.k2fsa.sherpa.onnx.SpeakerEmbeddingManager
|
||||||
|
import java.util.ArrayDeque
|
||||||
|
import java.util.concurrent.locks.ReentrantLock
|
||||||
|
import kotlin.concurrent.withLock
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 语音处理通用工具类(优化微弱人声过滤逻辑,适配正常语音)
|
||||||
|
*/
|
||||||
|
object VoiceUtils {
|
||||||
|
private val speakerManagerLock = ReentrantLock()
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 环境基线校准
|
||||||
|
*/
|
||||||
|
fun calibrateEnvBaseline(
|
||||||
|
samples: FloatArray,
|
||||||
|
vadManager: VadManager,
|
||||||
|
envNoiseBuffer: ArrayDeque<Float>,
|
||||||
|
currentEnvBaseline: Float
|
||||||
|
): Float {
|
||||||
|
val rms = vadManager.calcRms(samples)
|
||||||
|
val validRms = if (rms < currentEnvBaseline + 0.002f) rms else currentEnvBaseline
|
||||||
|
if (rms < 0.015f) {
|
||||||
|
if (envNoiseBuffer.size >= VoiceConfig.BASELINE_WINDOW_SIZE) {
|
||||||
|
envNoiseBuffer.removeFirst()
|
||||||
|
}
|
||||||
|
envNoiseBuffer.addLast(validRms)
|
||||||
|
return envNoiseBuffer.maxOrNull() ?: 0.001f
|
||||||
|
}
|
||||||
|
return currentEnvBaseline
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 更新实时能量统计
|
||||||
|
*/
|
||||||
|
fun updateRealtimeEnergy(
|
||||||
|
samples: FloatArray,
|
||||||
|
vadManager: VadManager,
|
||||||
|
isNoisyEnvironment: Boolean,
|
||||||
|
currentEnvBaseline: Float,
|
||||||
|
realtimeEnergySum: Float,
|
||||||
|
realtimeEnergyCount: Int,
|
||||||
|
realtimePeakRms: Float
|
||||||
|
): Triple<Float, Int, Float> {
|
||||||
|
val rms = vadManager.calcRms(samples)
|
||||||
|
val effectiveThreshold = if (isNoisyEnvironment) currentEnvBaseline * 1.8f else VoiceConfig.MIN_EFFECTIVE_SPEECH_RMS
|
||||||
|
var newSum = realtimeEnergySum
|
||||||
|
var newCount = realtimeEnergyCount
|
||||||
|
var newPeak = realtimePeakRms
|
||||||
|
if (rms >= effectiveThreshold) {
|
||||||
|
newSum += rms
|
||||||
|
newCount++
|
||||||
|
newPeak = maxOf(newPeak, rms)
|
||||||
|
}
|
||||||
|
return Triple(newSum, newCount, newPeak)
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 更新实时帧统计
|
||||||
|
*/
|
||||||
|
fun updateRealtimeFrameStats(vadManager: VadManager): FrameStats {
|
||||||
|
val totalFrames = vadManager.getTotalFrames()
|
||||||
|
val speechFrames = vadManager.getSpeechFrames()
|
||||||
|
val continuousSpeechFrames = vadManager.getContinuousSpeechFrames()
|
||||||
|
val currentFrameIsSpeech = vadManager.isSpeechDetected()
|
||||||
|
val newContinuousFrames = if (currentFrameIsSpeech) {
|
||||||
|
if (vadManager.getContinuousSpeechFrames() > 0) continuousSpeechFrames + 1 else 1
|
||||||
|
} else {
|
||||||
|
0
|
||||||
|
}
|
||||||
|
return FrameStats(
|
||||||
|
totalFrames = totalFrames,
|
||||||
|
speechFrames = speechFrames,
|
||||||
|
continuousSpeechFrames = newContinuousFrames,
|
||||||
|
lastFrameIsSpeech = currentFrameIsSpeech
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 多人对话实时检测
|
||||||
|
*/
|
||||||
|
fun checkMultiPersonDialogue(
|
||||||
|
now: Long,
|
||||||
|
recordingStartMs: Long,
|
||||||
|
realtimeEnergySum: Float,
|
||||||
|
realtimeEnergyCount: Int,
|
||||||
|
realtimePeakRms: Float,
|
||||||
|
realtimeSpeechFrames: Int,
|
||||||
|
realtimeContinuousSpeechFrames: Int,
|
||||||
|
vadManager: VadManager
|
||||||
|
): Boolean {
|
||||||
|
val duration = now - recordingStartMs
|
||||||
|
if (duration < VoiceConfig.MULTI_DIALOGUE_MIN_DURATION) return false
|
||||||
|
|
||||||
|
val avgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else 0f
|
||||||
|
val peakAvgRatio = if (avgEnergy > 0) realtimePeakRms / avgEnergy else 0f
|
||||||
|
val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f
|
||||||
|
val vadRatio = vadManager.activeSpeechRatio()
|
||||||
|
|
||||||
|
return duration >= VoiceConfig.MULTI_DIALOGUE_MIN_DURATION &&
|
||||||
|
peakAvgRatio in VoiceConfig.MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO..VoiceConfig.MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO &&
|
||||||
|
continuousRatio <= VoiceConfig.MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO &&
|
||||||
|
vadRatio >= VoiceConfig.MULTI_DIALOGUE_MIN_VAD_RATIO
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Weak-voice filter (trimmed version: core layers only, to avoid over-filtering).
 *
 * Applies, in order: minimum-duration filter, scene-dependent energy filter,
 * combined VAD-ratio + energy filter, and pure background-noise filter.
 *
 * @param duration length of the captured speech in ms
 * @param avgEnergy average RMS energy of the capture
 * @param peakRms peak RMS energy (only used by the optional, commented-out layer)
 * @param currentEnvBaseline current environment-noise energy baseline
 * @param realtimeTotalFrames total frames observed in real time
 * @param realtimeSpeechFrames frames VAD classified as speech
 * @param realtimeContinuousSpeechFrames longest run of consecutive speech frames
 *        (kept for interface compatibility; not used by the trimmed filter)
 * @return true when the audio should be discarded as weak/noise, false when it passes
 */
fun filterWeakVoice(
    duration: Long,
    avgEnergy: Float,
    peakRms: Float,
    currentEnvBaseline: Float,
    realtimeTotalFrames: Int,
    realtimeSpeechFrames: Int,
    realtimeContinuousSpeechFrames: Int
): Boolean {
    // 1. Basic duration filter (required: drops extremely short clicks/noises).
    if (duration < VoiceConfig.MIN_EFFECTIVE_VOICE_DURATION) {
        LogUtils.w(VoiceConfig.TAG, "❌ 微弱人声过滤:时长${duration}ms < ${VoiceConfig.MIN_EFFECTIVE_VOICE_DURATION}ms")
        return true
    }

    // 2. Dynamic energy filter (core: relaxed threshold for short utterances).
    val dynamicEnergyThreshold = if (duration < VoiceConfig.SHORT_LONG_SPEECH_CUTOFF_MS)
        VoiceConfig.SHORT_SPEECH_ENERGY_THRESHOLD
    else
        VoiceConfig.MIN_NORMAL_VOICE_ENERGY

    if (avgEnergy < dynamicEnergyThreshold) {
        // FIX: the log previously hard-coded `duration < 2000`, which mislabels the
        // threshold whenever SHORT_LONG_SPEECH_CUTOFF_MS != 2000; use the same cutoff
        // as the threshold selection above.
        LogUtils.w(VoiceConfig.TAG, "❌ 微弱人声过滤:平均能量${avgEnergy} < ${if (duration < VoiceConfig.SHORT_LONG_SPEECH_CUTOFF_MS) "短语音能量阈值${dynamicEnergyThreshold}" else "正常语音能量阈值${dynamicEnergyThreshold}"}")
        return true
    }

    // 3. VAD speech-frame ratio (helper value for the combined filter below).
    val voiceFrameRatio = if (realtimeTotalFrames > 0) realtimeSpeechFrames.toFloat() / realtimeTotalFrames else 0f

    // 4. Combined VAD-ratio + energy filter (core: scene-dependent threshold avoids
    //    rejecting on a single weak dimension).
    val dynamicVadRatioThreshold = when {
        duration < VoiceConfig.SHORT_LONG_SPEECH_CUTOFF_MS -> VoiceConfig.SHORT_SPEECH_VAD_RATIO
        currentEnvBaseline >= VoiceConfig.NOISE_BASELINE_THRESHOLD -> VoiceConfig.NOISY_ENV_VAD_RATIO
        else -> VoiceConfig.MIN_NORMAL_VOICE_VAD_RATIO
    }

    if (voiceFrameRatio < dynamicVadRatioThreshold && avgEnergy < VoiceConfig.NORMAL_VOICE_ENERGY_THRESHOLD) {
        LogUtils.w(VoiceConfig.TAG, "❌ 微弱人声过滤:语音帧占比${voiceFrameRatio} < ${dynamicVadRatioThreshold} | 平均能量${avgEnergy}")
        return true
    }

    // 5. Pure background-noise filter (required: drops captures with no speech).
    // FIX: guard against a zero/invalid baseline — the previous unguarded division
    // produced Infinity/NaN; MAX_VALUE keeps the same "do not filter" outcome explicitly.
    val energyBaselineRatio = if (currentEnvBaseline > 0f) avgEnergy / currentEnvBaseline else Float.MAX_VALUE
    if (avgEnergy < VoiceConfig.PURE_NOISE_ENERGY_THRESHOLD && energyBaselineRatio < VoiceConfig.PURE_NOISE_BASELINE_RATIO) {
        // FIX: the log previously hard-coded "1.2"; report the actual configured ratio.
        LogUtils.w(VoiceConfig.TAG, "❌ 微弱人声过滤:能量/基线${energyBaselineRatio} < ${VoiceConfig.PURE_NOISE_BASELINE_RATIO}(纯底噪)")
        return true
    }

    // (Optional) peak/baseline filter: only effective against flat background sounds;
    // enable per deployment scenario if needed.
    // val peakBaselineRatio = peakRms / currentEnvBaseline
    // if (avgEnergy < VoiceConfig.NORMAL_VOICE_ENERGY_THRESHOLD && peakBaselineRatio < VoiceConfig.MIN_PEAK_ENERGY_RATIO) {
    //     LogUtils.w(VoiceConfig.TAG, "❌ 微弱人声过滤:峰值/基线${peakBaselineRatio} < ${VoiceConfig.MIN_PEAK_ENERGY_RATIO}")
    //     return true
    // }

    // Normal speech passed every core filter.
    LogUtils.d(VoiceConfig.TAG, "✅ 正常语音通过微弱人声过滤 | 时长${duration}ms | 能量${avgEnergy} | VAD占比${voiceFrameRatio} | 基线${currentEnvBaseline}")
    return false
}
|
||||||
|
|
||||||
|
/**
 * Core speaker-verification logic.
 *
 * Computes an embedding for [audio] and compares it against the registered
 * wake-up user's embedding under a scene-dependent threshold
 * (short audio / noisy environment / quiet environment).
 *
 * @param audio 16kHz mono samples to verify (must be non-empty)
 * @param isNoisyEnvironment whether the current environment is classified as noisy
 * @param extractor embedding extractor used to compute the voice print
 * @param manager registry holding the enrolled wake-up user's embedding
 * @return true when the speaker matches the enrolled wake-up user
 */
fun verifySpeaker(
    audio: FloatArray,
    isNoisyEnvironment: Boolean,
    extractor: SpeakerEmbeddingExtractor,
    manager: SpeakerEmbeddingManager
): Boolean {
    if (audio.isEmpty()) {
        LogUtils.w(VoiceConfig.TAG, "❌ 待验证音频为空,声纹验证失败")
        return false
    }

    val audioDurationMs = (audio.size.toFloat() / VoiceConfig.SAMPLE_RATE * 1000).toLong()
    // FIX: the previous "crop" recomputed a sample count from audioDurationMs, which
    // was itself derived from audio.size — a circular no-op that could only shave a
    // handful of samples lost to integer truncation. Use the buffer as-is.
    val validAudio = audio

    // Scene-dependent threshold: short clips and noisy rooms get relaxed thresholds.
    val finalThreshold = when {
        audioDurationMs < VoiceConfig.SHORT_AUDIO_DURATION_MS -> VoiceConfig.SPEAKER_THRESHOLD_SHORT
        isNoisyEnvironment -> VoiceConfig.SPEAKER_THRESHOLD_NOISY
        else -> VoiceConfig.SPEAKER_THRESHOLD_QUIET
    }

    var stream: OnlineStream? = null
    return try {
        stream = extractor.createStream()
        stream.acceptWaveform(samples = validAudio, sampleRate = VoiceConfig.SAMPLE_RATE)
        stream.inputFinished()

        if (!extractor.isReady(stream)) {
            LogUtils.w(VoiceConfig.TAG, "❌ 音频Stream未就绪,验证失败")
            return false
        }

        val embedding = extractor.compute(stream)
        // Lock guards the shared embedding manager against concurrent register/verify.
        speakerManagerLock.withLock {
            val verifyPass = manager.verify(
                name = VoiceConfig.CURRENT_USER_ID,
                embedding = embedding,
                threshold = finalThreshold
            )
            LogUtils.d(VoiceConfig.TAG, "📊 声纹验证 | 阈值: $finalThreshold | 通过: $verifyPass | 嘈杂环境: $isNoisyEnvironment | 原始时长: ${audioDurationMs}ms | 验证时长: ${(validAudio.size.toFloat()/VoiceConfig.SAMPLE_RATE*1000).toLong()}ms")
            return verifyPass
        }
    } catch (e: Exception) {
        // Fail closed: any extraction/verification error rejects the speaker.
        LogUtils.e(VoiceConfig.TAG, "❌ 声纹验证异常,拒绝", e)
        return false
    } finally {
        // Native stream must always be released, including on early returns.
        stream?.release()
    }
}
|
||||||
|
|
||||||
|
/**
 * Registers the current wake-up user's voice-print embedding.
 *
 * Computes an embedding from the pre-wake-up audio buffer and replaces any
 * previously registered embedding for [VoiceConfig.CURRENT_USER_ID].
 * All failures are logged and swallowed (best-effort registration).
 *
 * @param preBuffer rolling buffer holding the audio captured around wake-up
 * @param extractor embedding extractor used to compute the voice print
 * @param manager registry the embedding is stored in
 */
fun registerWakeupUser(
    preBuffer: ArrayDeque<Float>,
    extractor: SpeakerEmbeddingExtractor,
    manager: SpeakerEmbeddingManager
) {
    var stream: OnlineStream? = null
    try {
        val wakeupAudio = preBuffer.toFloatArray()
        if (wakeupAudio.isEmpty()) {
            LogUtils.w("${VoiceConfig.TAG}", "❌ 唤醒音频缓存为空,无法注册用户特征")
            return
        }

        val s = extractor.createStream()
        stream = s
        s.acceptWaveform(samples = wakeupAudio, sampleRate = VoiceConfig.SAMPLE_RATE)
        s.inputFinished()

        if (!extractor.isReady(s)) {
            LogUtils.w("${VoiceConfig.TAG}", "❌ 唤醒音频Stream未就绪,跳过用户注册")
            return
        }

        val embedding = extractor.compute(s)
        // Lock guards the shared embedding manager against concurrent register/verify.
        speakerManagerLock.withLock {
            // Replace any stale embedding for this user before adding the new one.
            manager.remove(VoiceConfig.CURRENT_USER_ID)
            val registered = manager.add(
                name = VoiceConfig.CURRENT_USER_ID,
                embedding = arrayOf(embedding)
            )
            if (registered) {
                LogUtils.d("${VoiceConfig.TAG}", "✅ 注册当前唤醒用户特征成功 | 特征长度: ${embedding.size}")
            } else {
                LogUtils.w("${VoiceConfig.TAG}", "❌ 注册当前唤醒用户特征失败")
            }
        }
    } catch (t: Throwable) {
        // Best-effort: log and continue without a registered voice print.
        LogUtils.e("${VoiceConfig.TAG}", "❌ 唤醒用户特征注册失败", t)
    } finally {
        // Native stream must always be released.
        stream?.release()
    }
}
|
||||||
|
|
||||||
|
/**
 * Appends [samples] to the rolling pre-wake-up buffer, keeping at most
 * [VoiceConfig.PRE_BUFFER_SIZE] samples (oldest samples are evicted first).
 *
 * @param samples newly captured 16kHz samples
 * @param preBuffer rolling buffer to append into (mutated in place)
 */
fun cachePreBuffer(samples: FloatArray, preBuffer: ArrayDeque<Float>) {
    samples.forEach { sample ->
        preBuffer.addLast(sample)
        // Evict the oldest sample once the window exceeds its capacity.
        if (preBuffer.size > VoiceConfig.PRE_BUFFER_SIZE) preBuffer.removeFirst()
    }
}
|
||||||
|
|
||||||
|
/**
 * Snapshot of per-frame VAD statistics for one capture.
 *
 * @property totalFrames total number of frames observed
 * @property speechFrames number of frames VAD classified as speech
 * @property continuousSpeechFrames longest run of consecutive speech frames
 * @property lastFrameIsSpeech whether the most recent frame was speech
 */
data class FrameStats(
    val totalFrames: Int,
    val speechFrames: Int,
    val continuousSpeechFrames: Int,
    val lastFrameIsSpeech: Boolean
)
|
||||||
|
|
||||||
|
/**
 * Scene-specific threshold bundle used when scoring a capture.
 *
 * @property energyThreshold minimum average energy for the scene
 * @property vadRatioThreshold minimum VAD speech-frame ratio for the scene
 * @property minScore minimum speech score required to accept the capture
 * @property scene human-readable scene label (used in logs)
 */
data class ThresholdConfig(
    val energyThreshold: Float,
    val vadRatioThreshold: Float,
    val minScore: Int,
    val scene: String
)
|
||||||
|
|
||||||
|
/**
 * Builds the scene-specific [ThresholdConfig] for a capture.
 *
 * The scene is picked on two axes: short vs. long speech (by [duration]) and
 * quiet vs. noisy environment (by [currentEnvBaseline]); the energy threshold
 * scales the baseline by a per-scene coefficient.
 *
 * @param duration capture length in ms
 * @param currentEnvBaseline current environment-noise energy baseline
 */
fun getThresholdConfig(duration: Long, currentEnvBaseline: Float): ThresholdConfig {
    val quietEnv = currentEnvBaseline < VoiceConfig.BASELINE_QUIET_THRESHOLD
    val shortSpeech = duration in VoiceConfig.SHORT_SPEECH_MIN..VoiceConfig.SHORT_SPEECH_MAX

    // Pick the baseline multiplier for the (length, environment) combination.
    val energyCoeff = when {
        shortSpeech && quietEnv -> VoiceConfig.SHORT_SPEECH_ENERGY_COEFF_QUIET
        shortSpeech -> VoiceConfig.SHORT_SPEECH_ENERGY_COEFF_NOISY
        quietEnv -> VoiceConfig.LONG_SPEECH_ENERGY_COEFF_QUIET
        else -> VoiceConfig.LONG_SPEECH_ENERGY_COEFF_NOISY
    }

    return if (shortSpeech) {
        ThresholdConfig(
            energyThreshold = currentEnvBaseline * energyCoeff,
            vadRatioThreshold = VoiceConfig.SHORT_SPEECH_VAD_COEFF,
            minScore = VoiceConfig.SHORT_SPEECH_MIN_SCORE,
            scene = "短语音"
        )
    } else {
        ThresholdConfig(
            energyThreshold = currentEnvBaseline * energyCoeff,
            vadRatioThreshold = VoiceConfig.LONG_SPEECH_VAD_COEFF,
            minScore = VoiceConfig.LONG_SPEECH_MIN_SCORE,
            scene = "长语音"
        )
    }
}
|
||||||
|
|
||||||
|
/**
 * Computes a heuristic speech score for a capture.
 *
 * Duration contributes 1-3 points depending on which cutoff it clears; one
 * bonus point each is added for clearing the scene energy threshold and the
 * continuous-frame-ratio threshold (total range 1..5).
 *
 * @param duration capture length in ms
 * @param avgEnergy average RMS energy of the capture
 * @param continuousRatio ratio of consecutive speech frames
 * @param thresholdConfig scene thresholds from [getThresholdConfig]
 */
fun calculateSpeechScore(
    duration: Long,
    avgEnergy: Float,
    continuousRatio: Float,
    thresholdConfig: ThresholdConfig
): Int {
    val durationPoints = when {
        duration >= VoiceConfig.LONG_SPEECH_SCORE_CUTOFF_MS -> 3
        duration >= VoiceConfig.MID_SPEECH_SCORE_CUTOFF_MS -> 2
        else -> 1
    }
    val energyPoint = if (avgEnergy >= thresholdConfig.energyThreshold) 1 else 0
    val continuityPoint = if (continuousRatio >= VoiceConfig.MIN_CONTINUOUS_FRAME_RATIO) 1 else 0
    return durationPoints + energyPoint + continuityPoint
}
|
||||||
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user