稳定版参数
This commit is contained in:
parent
8f7b02f18b
commit
eda73af083
@ -1,7 +1,6 @@
|
|||||||
package com.zs.smarthuman.sherpa
|
package com.zs.smarthuman.sherpa
|
||||||
|
|
||||||
import android.content.res.AssetManager
|
import android.content.res.AssetManager
|
||||||
import android.text.TextUtils
|
|
||||||
import com.blankj.utilcode.util.LogUtils
|
import com.blankj.utilcode.util.LogUtils
|
||||||
import com.k2fsa.sherpa.onnx.OnlineStream
|
import com.k2fsa.sherpa.onnx.OnlineStream
|
||||||
import com.k2fsa.sherpa.onnx.SpeakerRecognition
|
import com.k2fsa.sherpa.onnx.SpeakerRecognition
|
||||||
@ -16,7 +15,7 @@ class VoiceController(
|
|||||||
assetManager: AssetManager,
|
assetManager: AssetManager,
|
||||||
private val onWakeup: () -> Unit,
|
private val onWakeup: () -> Unit,
|
||||||
private val onFinalAudio: (FloatArray) -> Unit,
|
private val onFinalAudio: (FloatArray) -> Unit,
|
||||||
idleTimeoutSeconds: Int = 10,
|
idleTimeoutSeconds: Int = 200,
|
||||||
maxRecordingSeconds: Int = 10,
|
maxRecordingSeconds: Int = 10,
|
||||||
private val onStateChanged: ((VoiceState) -> Unit)? = null,
|
private val onStateChanged: ((VoiceState) -> Unit)? = null,
|
||||||
private val stopBackendAudio: (() -> Unit)? = null,
|
private val stopBackendAudio: (() -> Unit)? = null,
|
||||||
@ -30,15 +29,21 @@ class VoiceController(
|
|||||||
private const val SAMPLE_RATE = 16000
|
private const val SAMPLE_RATE = 16000
|
||||||
// 预缓存大小(2秒)
|
// 预缓存大小(2秒)
|
||||||
private const val PRE_BUFFER_SIZE = SAMPLE_RATE * 2
|
private const val PRE_BUFFER_SIZE = SAMPLE_RATE * 2
|
||||||
// 声纹验证阈值
|
|
||||||
private const val SPEAKER_VERIFY_THRESHOLD_NORMAL = 0.25f
|
// ========== 核心:分场景声纹阈值(极简版) ==========
|
||||||
private const val SPEAKER_VERIFY_THRESHOLD_SHORT = 0.20f
|
private const val SPEAKER_THRESHOLD_QUIET = 0.50f // 安静环境
|
||||||
private const val SHORT_AUDIO_THRESHOLD = SAMPLE_RATE * 0.5f // 0.5秒音频长度
|
private const val SPEAKER_THRESHOLD_NOISY = 0.43f // 嘈杂环境(匹配你的真实相似度)
|
||||||
// 防抖时间
|
private const val SPEAKER_THRESHOLD_SHORT = 0.40f // 短语音(<1秒)
|
||||||
|
|
||||||
|
// 短语音判定阈值
|
||||||
|
private const val SHORT_AUDIO_DURATION_MS = 1000L
|
||||||
private const val INVALID_RESET_DEBOUNCE_MS = 1500L
|
private const val INVALID_RESET_DEBOUNCE_MS = 1500L
|
||||||
// 最小语音时长
|
// 最小语音时长
|
||||||
private const val MIN_SPEECH_MS = 800L
|
private const val MIN_SPEECH_MS = 800L
|
||||||
private const val MIN_EFFECTIVE_VOICE_DURATION = 400L
|
private const val MIN_EFFECTIVE_VOICE_DURATION = 400L
|
||||||
|
|
||||||
|
// 噪音场景判定阈值
|
||||||
|
private const val NOISE_BASELINE_THRESHOLD = 0.01f
|
||||||
}
|
}
|
||||||
|
|
||||||
var state: VoiceState = VoiceState.WAIT_WAKEUP
|
var state: VoiceState = VoiceState.WAIT_WAKEUP
|
||||||
@ -48,23 +53,21 @@ class VoiceController(
|
|||||||
onStateChanged?.invoke(value)
|
onStateChanged?.invoke(value)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========== 缺失变量补充:实时能量与帧统计变量 ==========
|
// 实时能量与帧统计变量
|
||||||
// 实时能量统计
|
|
||||||
private var realtimeEnergySum = 0f
|
private var realtimeEnergySum = 0f
|
||||||
private var realtimeEnergyCount = 0
|
private var realtimeEnergyCount = 0
|
||||||
private var realtimePeakRms = 0f
|
private var realtimePeakRms = 0f
|
||||||
// 实时帧统计
|
|
||||||
private var realtimeTotalFrames = 0
|
private var realtimeTotalFrames = 0
|
||||||
private var realtimeSpeechFrames = 0
|
private var realtimeSpeechFrames = 0
|
||||||
private var realtimeContinuousSpeechFrames = 0
|
private var realtimeContinuousSpeechFrames = 0
|
||||||
private var realtimeLastFrameIsSpeech = false
|
private var realtimeLastFrameIsSpeech = false
|
||||||
// 多人对话检测标记
|
|
||||||
private var isMultiPersonDialogueDetected = false
|
private var isMultiPersonDialogueDetected = false
|
||||||
// 防抖重置标记
|
|
||||||
private var lastInvalidResetMs = 0L
|
private var lastInvalidResetMs = 0L
|
||||||
// 声纹管理器锁(解决并发问题)
|
|
||||||
private val speakerManagerLock = ReentrantLock()
|
private val speakerManagerLock = ReentrantLock()
|
||||||
|
|
||||||
|
// 环境噪音状态标记
|
||||||
|
private var isNoisyEnvironment = false
|
||||||
|
|
||||||
private val wakeupManager = WakeupManager(assetManager, onWakeup)
|
private val wakeupManager = WakeupManager(assetManager, onWakeup)
|
||||||
private val vadManager = VadManager(
|
private val vadManager = VadManager(
|
||||||
assetManager,
|
assetManager,
|
||||||
@ -89,16 +92,16 @@ class VoiceController(
|
|||||||
private val idleTimeoutMs = idleTimeoutSeconds * 1000L
|
private val idleTimeoutMs = idleTimeoutSeconds * 1000L
|
||||||
private val maxRecordingMs = maxRecordingSeconds * 1000L
|
private val maxRecordingMs = maxRecordingSeconds * 1000L
|
||||||
|
|
||||||
// ================= 保留分场景动态系数 + 强制兜底配置(近距离优化版) =================
|
// 分场景动态系数(保留原有逻辑)
|
||||||
private val BASELINE_WINDOW_SIZE = 50
|
private val BASELINE_WINDOW_SIZE = 50
|
||||||
private val envNoiseBuffer = ArrayDeque<Float>(BASELINE_WINDOW_SIZE)
|
private val envNoiseBuffer = ArrayDeque<Float>(BASELINE_WINDOW_SIZE)
|
||||||
private var currentEnvBaseline = 0.001f
|
private var currentEnvBaseline = 0.001f
|
||||||
|
|
||||||
// 强制兜底:正常语音最低门槛(近距离场景大幅降低)
|
// 强制兜底:正常语音最低门槛
|
||||||
private val MIN_NORMAL_VOICE_ENERGY = 0.03f
|
private val MIN_NORMAL_VOICE_ENERGY = 0.03f
|
||||||
private val MIN_NORMAL_VOICE_VAD_RATIO = 0.2f
|
private val MIN_NORMAL_VOICE_VAD_RATIO = 0.2f
|
||||||
|
|
||||||
// 分场景动态系数(安静环境系数极低,适配近距离轻声)
|
// 分场景动态系数
|
||||||
private val BASELINE_QUIET_THRESHOLD = 0.005f
|
private val BASELINE_QUIET_THRESHOLD = 0.005f
|
||||||
private val SHORT_SPEECH_ENERGY_COEFF_QUIET = 1.5f
|
private val SHORT_SPEECH_ENERGY_COEFF_QUIET = 1.5f
|
||||||
private val SHORT_SPEECH_ENERGY_COEFF_NOISY = 2.0f
|
private val SHORT_SPEECH_ENERGY_COEFF_NOISY = 2.0f
|
||||||
@ -109,7 +112,7 @@ class VoiceController(
|
|||||||
private val SHORT_SPEECH_MIN_SCORE = 1
|
private val SHORT_SPEECH_MIN_SCORE = 1
|
||||||
private val LONG_SPEECH_MIN_SCORE = 1
|
private val LONG_SPEECH_MIN_SCORE = 1
|
||||||
|
|
||||||
// 其他过滤参数(近距离场景放宽)
|
// 其他过滤参数
|
||||||
private val MAX_FAR_FIELD_ENERGY = 0.015f
|
private val MAX_FAR_FIELD_ENERGY = 0.015f
|
||||||
private val MIN_VALID_PEAK_AVG_RATIO = 0.5f
|
private val MIN_VALID_PEAK_AVG_RATIO = 0.5f
|
||||||
private val MIN_CONTINUOUS_FRAME_RATIO = 0.1f
|
private val MIN_CONTINUOUS_FRAME_RATIO = 0.1f
|
||||||
@ -118,40 +121,32 @@ class VoiceController(
|
|||||||
private val SHORT_SPEECH_MIN = 500L
|
private val SHORT_SPEECH_MIN = 500L
|
||||||
private val SHORT_SPEECH_MAX = 2000L
|
private val SHORT_SPEECH_MAX = 2000L
|
||||||
|
|
||||||
// ========== 核心修改:多人对话过滤配置 ==========
|
// 多人对话过滤配置
|
||||||
private val MULTI_DIALOGUE_MIN_DURATION = 2500L
|
private val MULTI_DIALOGUE_MIN_DURATION = 2500L
|
||||||
private val MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO = 2.5f
|
private val MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO = 2.5f
|
||||||
private val MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO = 0.4f
|
private val MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO = 0.4f
|
||||||
private val MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO = 0.3f
|
private val MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO = 0.3f
|
||||||
private val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f
|
private val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f
|
||||||
|
|
||||||
// ========== 核心调整:近距离场景 微弱人声过滤配置(重点优化) ==========
|
// 微弱人声过滤配置
|
||||||
private val MIN_VOICE_FRAME_RATIO = 0.08f
|
private val MIN_VOICE_FRAME_RATIO = 0.08f
|
||||||
private val MIN_PEAK_ENERGY_RATIO = 1.5f
|
private val MIN_PEAK_ENERGY_RATIO = 1.5f
|
||||||
private val NORMAL_VOICE_ENERGY_THRESHOLD = 0.008f
|
private val NORMAL_VOICE_ENERGY_THRESHOLD = 0.008f
|
||||||
private val MIN_CONTINUOUS_VOICE_FRAMES = 1
|
private val MIN_CONTINUOUS_VOICE_FRAMES = 1
|
||||||
|
|
||||||
// ========== 核心新增:MIN_EFFECTIVE_SPEECH_RMS 常量 ==========
|
|
||||||
private val MIN_EFFECTIVE_SPEECH_RMS = 0.0005f
|
private val MIN_EFFECTIVE_SPEECH_RMS = 0.0005f
|
||||||
|
|
||||||
// ========== 核心新增:无效说话标记 + 超时类型 ==========
|
// 无效说话标记 + 超时类型
|
||||||
private var hasInvalidSpeech = false
|
private var hasInvalidSpeech = false
|
||||||
private var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT
|
private var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT
|
||||||
|
|
||||||
// ========== 核心配置:声纹验证相关 ==========
|
// 声纹验证相关
|
||||||
private val CURRENT_USER_ID = "current_wakeup_user" // 当前唤醒用户唯一标识
|
private val CURRENT_USER_ID = "current_wakeup_user"
|
||||||
private val ENABLE_STRICT_SPEAKER_VERIFY = true // 严格验证开关
|
private val ENABLE_STRICT_SPEAKER_VERIFY = true
|
||||||
|
|
||||||
init {
|
init {
|
||||||
// 参数校验
|
|
||||||
require(idleTimeoutSeconds > 0) { "idleTimeoutSeconds 必须大于0" }
|
|
||||||
require(maxRecordingSeconds > 0) { "maxRecordingSeconds 必须大于0" }
|
|
||||||
require(maxRecordingSeconds >= idleTimeoutSeconds) { "maxRecordingSeconds 必须大于等于 idleTimeoutSeconds" }
|
|
||||||
|
|
||||||
// 初始化声纹识别器(适配你提供的API)
|
|
||||||
try {
|
try {
|
||||||
SpeakerRecognition.initExtractor(assetManager) // 对齐原生API
|
SpeakerRecognition.initExtractor(assetManager)
|
||||||
LogUtils.d(TAG, "✅ 声纹识别器初始化成功(原生Stream版本)")
|
LogUtils.d(TAG, "✅ 声纹识别器初始化成功")
|
||||||
} catch (e: Exception) {
|
} catch (e: Exception) {
|
||||||
LogUtils.e(TAG, "❌ 声纹识别器初始化失败", e)
|
LogUtils.e(TAG, "❌ 声纹识别器初始化失败", e)
|
||||||
throw RuntimeException("声纹识别初始化失败", e)
|
throw RuntimeException("声纹识别初始化失败", e)
|
||||||
@ -163,8 +158,8 @@ class VoiceController(
|
|||||||
cachePreBuffer(samples)
|
cachePreBuffer(samples)
|
||||||
wakeupManager.acceptAudio(samples)
|
wakeupManager.acceptAudio(samples)
|
||||||
if (wakeupManager.consumeWakeupFlag()) {
|
if (wakeupManager.consumeWakeupFlag()) {
|
||||||
handleWakeupEvent() // 仅调用一次
|
handleWakeupEvent()
|
||||||
// 注册唤醒用户特征(异步执行)
|
// 注册唤醒用户特征
|
||||||
CoroutineScope(Dispatchers.IO).launch {
|
CoroutineScope(Dispatchers.IO).launch {
|
||||||
var stream: OnlineStream? = null
|
var stream: OnlineStream? = null
|
||||||
runCatching {
|
runCatching {
|
||||||
@ -174,18 +169,14 @@ class VoiceController(
|
|||||||
return@launch
|
return@launch
|
||||||
}
|
}
|
||||||
|
|
||||||
// 创建原生Stream
|
|
||||||
stream = SpeakerRecognition.extractor.createStream()
|
stream = SpeakerRecognition.extractor.createStream()
|
||||||
stream.acceptWaveform(samples = wakeupAudio, sampleRate = SAMPLE_RATE)
|
stream?.acceptWaveform(samples = wakeupAudio, sampleRate = SAMPLE_RATE)
|
||||||
stream.inputFinished()
|
stream?.inputFinished()
|
||||||
|
|
||||||
// 计算特征并注册(仅当前用户)
|
if (stream != null && SpeakerRecognition.extractor.isReady(stream)) {
|
||||||
if (SpeakerRecognition.extractor.isReady(stream)) {
|
|
||||||
val embedding = SpeakerRecognition.extractor.compute(stream)
|
val embedding = SpeakerRecognition.extractor.compute(stream)
|
||||||
// 加锁保护 manager 操作
|
|
||||||
speakerManagerLock.withLock {
|
speakerManagerLock.withLock {
|
||||||
SpeakerRecognition.manager.remove(CURRENT_USER_ID)
|
SpeakerRecognition.manager.remove(CURRENT_USER_ID)
|
||||||
// 注册当前唤醒用户
|
|
||||||
val embeddingList = mutableListOf(embedding)
|
val embeddingList = mutableListOf(embedding)
|
||||||
val ok = SpeakerRecognition.manager.add(
|
val ok = SpeakerRecognition.manager.add(
|
||||||
name = CURRENT_USER_ID,
|
name = CURRENT_USER_ID,
|
||||||
@ -194,7 +185,7 @@ class VoiceController(
|
|||||||
if (ok) {
|
if (ok) {
|
||||||
LogUtils.d(TAG, "✅ 注册当前唤醒用户特征成功 | 特征长度: ${embedding.size}")
|
LogUtils.d(TAG, "✅ 注册当前唤醒用户特征成功 | 特征长度: ${embedding.size}")
|
||||||
} else {
|
} else {
|
||||||
LogUtils.w(TAG, "❌ 注册当前唤醒用户特征失败(manager.add返回false)")
|
LogUtils.w(TAG, "❌ 注册当前唤醒用户特征失败")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -203,9 +194,7 @@ class VoiceController(
|
|||||||
}.onFailure {
|
}.onFailure {
|
||||||
LogUtils.e(TAG, "❌ 唤醒用户特征注册失败", it)
|
LogUtils.e(TAG, "❌ 唤醒用户特征注册失败", it)
|
||||||
}.also {
|
}.also {
|
||||||
// 释放Stream
|
|
||||||
stream?.release()
|
stream?.release()
|
||||||
LogUtils.d(TAG, "🔄 唤醒注册Stream已释放")
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return
|
return
|
||||||
@ -215,6 +204,8 @@ class VoiceController(
|
|||||||
|
|
||||||
if (state == VoiceState.WAIT_WAKEUP) {
|
if (state == VoiceState.WAIT_WAKEUP) {
|
||||||
calibrateEnvBaseline(samples)
|
calibrateEnvBaseline(samples)
|
||||||
|
isNoisyEnvironment = currentEnvBaseline >= NOISE_BASELINE_THRESHOLD
|
||||||
|
LogUtils.d(TAG, "📊 环境状态 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
||||||
}
|
}
|
||||||
|
|
||||||
when (state) {
|
when (state) {
|
||||||
@ -257,10 +248,11 @@ class VoiceController(
|
|||||||
audioBuffer.addAll(samples.asList())
|
audioBuffer.addAll(samples.asList())
|
||||||
vadManager.accept(samples)
|
vadManager.accept(samples)
|
||||||
|
|
||||||
// ========== 核心优化:录音过程中实时计算 ==========
|
|
||||||
calibrateEnvBaseline(samples)
|
calibrateEnvBaseline(samples)
|
||||||
updateRealtimeEnergy(samples)
|
updateRealtimeEnergy(samples)
|
||||||
updateRealtimeFrameStats()
|
updateRealtimeFrameStats()
|
||||||
|
isNoisyEnvironment = currentEnvBaseline >= NOISE_BASELINE_THRESHOLD
|
||||||
|
|
||||||
if (checkMultiPersonDialogueRealtime(now)) {
|
if (checkMultiPersonDialogueRealtime(now)) {
|
||||||
LogUtils.w(TAG, "🚨 录音中识别出多人对话,提前终止")
|
LogUtils.w(TAG, "🚨 录音中识别出多人对话,提前终止")
|
||||||
finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms)
|
finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms)
|
||||||
@ -268,25 +260,25 @@ class VoiceController(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (System.currentTimeMillis() - recordingStartMs > maxRecordingMs) {
|
if (System.currentTimeMillis() - recordingStartMs > maxRecordingMs) {
|
||||||
LogUtils.w(TAG, "⏱ Max recording reached | 当前环境基线: $currentEnvBaseline")
|
LogUtils.w(TAG, "⏱ Max recording reached | 当前环境基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
||||||
finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms)
|
finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ================= 新增:录音中实时更新能量统计(适配近距离轻声) ================= */
|
/* ================= 实时能量更新 ================= */
|
||||||
private fun updateRealtimeEnergy(samples: FloatArray) {
|
private fun updateRealtimeEnergy(samples: FloatArray) {
|
||||||
val rms = vadManager.calcRms(samples)
|
val rms = vadManager.calcRms(samples)
|
||||||
// 仅统计有效语音帧的能量(阈值降低)
|
val effectiveThreshold = if (isNoisyEnvironment) currentEnvBaseline * 1.8f else MIN_EFFECTIVE_SPEECH_RMS
|
||||||
if (rms >= MIN_EFFECTIVE_SPEECH_RMS) {
|
if (rms >= effectiveThreshold) {
|
||||||
realtimeEnergySum += rms
|
realtimeEnergySum += rms
|
||||||
realtimeEnergyCount++
|
realtimeEnergyCount++
|
||||||
realtimePeakRms = maxOf(realtimePeakRms, rms)
|
realtimePeakRms = maxOf(realtimePeakRms, rms)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ================= 新增:录音中实时更新帧统计 ================= */
|
/* ================= 实时帧统计 ================= */
|
||||||
private fun updateRealtimeFrameStats() {
|
private fun updateRealtimeFrameStats() {
|
||||||
realtimeTotalFrames = vadManager.getTotalFrames()
|
realtimeTotalFrames = vadManager.getTotalFrames()
|
||||||
realtimeSpeechFrames = vadManager.getSpeechFrames()
|
realtimeSpeechFrames = vadManager.getSpeechFrames()
|
||||||
@ -300,7 +292,7 @@ class VoiceController(
|
|||||||
realtimeLastFrameIsSpeech = currentFrameIsSpeech
|
realtimeLastFrameIsSpeech = currentFrameIsSpeech
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ================= 新增:录音中实时判定多人对话 ================= */
|
/* ================= 多人对话检测 ================= */
|
||||||
private fun checkMultiPersonDialogueRealtime(now: Long): Boolean {
|
private fun checkMultiPersonDialogueRealtime(now: Long): Boolean {
|
||||||
val duration = now - recordingStartMs
|
val duration = now - recordingStartMs
|
||||||
if (duration < MULTI_DIALOGUE_MIN_DURATION) return false
|
if (duration < MULTI_DIALOGUE_MIN_DURATION) return false
|
||||||
@ -318,10 +310,9 @@ class VoiceController(
|
|||||||
return isMultiPersonDialogueDetected
|
return isMultiPersonDialogueDetected
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ================= 环境基线校准(适配近距离场景,降低噪音敏感度) ================= */
|
/* ================= 环境基线校准 ================= */
|
||||||
private fun calibrateEnvBaseline(samples: FloatArray) {
|
private fun calibrateEnvBaseline(samples: FloatArray) {
|
||||||
val rms = vadManager.calcRms(samples)
|
val rms = vadManager.calcRms(samples)
|
||||||
// 只保留低于基线+阈值的有效值,过滤突发噪音(阈值降低)
|
|
||||||
val validRms = if (rms < currentEnvBaseline + 0.002f) rms else currentEnvBaseline
|
val validRms = if (rms < currentEnvBaseline + 0.002f) rms else currentEnvBaseline
|
||||||
if (rms < 0.015f) {
|
if (rms < 0.015f) {
|
||||||
if (envNoiseBuffer.size >= BASELINE_WINDOW_SIZE) {
|
if (envNoiseBuffer.size >= BASELINE_WINDOW_SIZE) {
|
||||||
@ -332,7 +323,7 @@ class VoiceController(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ================= 唤醒相关方法 ================= */
|
/* ================= 唤醒处理 ================= */
|
||||||
private fun handleWakeupEvent() {
|
private fun handleWakeupEvent() {
|
||||||
if (state == VoiceState.UPLOADING) return
|
if (state == VoiceState.UPLOADING) return
|
||||||
stopBackendAudio?.invoke()
|
stopBackendAudio?.invoke()
|
||||||
@ -361,7 +352,7 @@ class VoiceController(
|
|||||||
|
|
||||||
private fun onVadStart() {
|
private fun onVadStart() {
|
||||||
if (state != VoiceState.WAIT_SPEECH) return
|
if (state != VoiceState.WAIT_SPEECH) return
|
||||||
LogUtils.d(TAG, "🎤 REAL VAD START | 环境基线: $currentEnvBaseline")
|
LogUtils.d(TAG, "🎤 REAL VAD START | 环境基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
||||||
vadStarted = true
|
vadStarted = true
|
||||||
recordingStartMs = System.currentTimeMillis()
|
recordingStartMs = System.currentTimeMillis()
|
||||||
audioBuffer.clear()
|
audioBuffer.clear()
|
||||||
@ -372,64 +363,57 @@ class VoiceController(
|
|||||||
|
|
||||||
private fun onVadEnd(avgEnergy: Float, peakRms: Float) {
|
private fun onVadEnd(avgEnergy: Float, peakRms: Float) {
|
||||||
if (state != VoiceState.RECORDING) return
|
if (state != VoiceState.RECORDING) return
|
||||||
LogUtils.d(TAG, "🧠 VAD END | 环境基线: $currentEnvBaseline")
|
LogUtils.d(TAG, "🧠 VAD END | 环境基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
||||||
val realAvgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else avgEnergy
|
val realAvgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else avgEnergy
|
||||||
val realPeakRms = if (realtimePeakRms > 0) realtimePeakRms else peakRms
|
val realPeakRms = if (realtimePeakRms > 0) realtimePeakRms else peakRms
|
||||||
finishSentence(realAvgEnergy, realPeakRms)
|
finishSentence(realAvgEnergy, realPeakRms)
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ================= 核心优化:近距离场景 微弱人声过滤方法 ================= */
|
/* ================= 微弱人声过滤 ================= */
|
||||||
private fun filterWeakVoice(duration: Long, avgEnergy: Float, peakRms: Float): Boolean {
|
private fun filterWeakVoice(duration: Long, avgEnergy: Float, peakRms: Float): Boolean {
|
||||||
// 1. 时长过滤:<400ms的极短杂音才过滤
|
|
||||||
if (duration < MIN_EFFECTIVE_VOICE_DURATION) {
|
if (duration < MIN_EFFECTIVE_VOICE_DURATION) {
|
||||||
LogUtils.w(TAG, "❌ 微弱人声过滤:时长${duration}ms < ${MIN_EFFECTIVE_VOICE_DURATION}ms")
|
LogUtils.w(TAG, "❌ 微弱人声过滤:时长${duration}ms < ${MIN_EFFECTIVE_VOICE_DURATION}ms")
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2. 帧占比过滤:仅对极低能量语音生效
|
|
||||||
val voiceFrameRatio = if (realtimeTotalFrames > 0) realtimeSpeechFrames.toFloat() / realtimeTotalFrames else 0f
|
val voiceFrameRatio = if (realtimeTotalFrames > 0) realtimeSpeechFrames.toFloat() / realtimeTotalFrames else 0f
|
||||||
if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && voiceFrameRatio < MIN_VOICE_FRAME_RATIO) {
|
if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && voiceFrameRatio < MIN_VOICE_FRAME_RATIO) {
|
||||||
LogUtils.w(TAG, "❌ 微弱人声过滤:帧占比${voiceFrameRatio} < ${MIN_VOICE_FRAME_RATIO}(极低能量)")
|
LogUtils.w(TAG, "❌ 微弱人声过滤:帧占比${voiceFrameRatio} < ${MIN_VOICE_FRAME_RATIO}")
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
// 3. 峰值能量过滤:仅对极低能量语音生效,且阈值大幅降低
|
|
||||||
val peakBaselineRatio = peakRms / currentEnvBaseline
|
val peakBaselineRatio = peakRms / currentEnvBaseline
|
||||||
if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && peakBaselineRatio < MIN_PEAK_ENERGY_RATIO) {
|
if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && peakBaselineRatio < MIN_PEAK_ENERGY_RATIO) {
|
||||||
LogUtils.w(TAG, "❌ 微弱人声过滤:峰值/基线${peakBaselineRatio} < ${MIN_PEAK_ENERGY_RATIO}(极低能量)")
|
LogUtils.w(TAG, "❌ 微弱人声过滤:峰值/基线${peakBaselineRatio} < ${MIN_PEAK_ENERGY_RATIO}")
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
// 4. 连续帧过滤:仅对极低能量语音生效,且阈值降到1
|
|
||||||
if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && realtimeContinuousSpeechFrames < MIN_CONTINUOUS_VOICE_FRAMES) {
|
if (avgEnergy < NORMAL_VOICE_ENERGY_THRESHOLD && realtimeContinuousSpeechFrames < MIN_CONTINUOUS_VOICE_FRAMES) {
|
||||||
LogUtils.w(TAG, "❌ 微弱人声过滤:连续帧${realtimeContinuousSpeechFrames} < ${MIN_CONTINUOUS_VOICE_FRAMES}(极低能量)")
|
LogUtils.w(TAG, "❌ 微弱人声过滤:连续帧${realtimeContinuousSpeechFrames} < ${MIN_CONTINUOUS_VOICE_FRAMES}")
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
// 5. 平均能量过滤:仅对极极低能量语音生效
|
|
||||||
val energyBaselineRatio = avgEnergy / currentEnvBaseline
|
val energyBaselineRatio = avgEnergy / currentEnvBaseline
|
||||||
if (avgEnergy < 0.005f && energyBaselineRatio < 1.2f) {
|
if (avgEnergy < 0.005f && energyBaselineRatio < 1.2f) {
|
||||||
LogUtils.w(TAG, "❌ 微弱人声过滤:能量/基线${energyBaselineRatio} < 1.2(极极低能量)")
|
LogUtils.w(TAG, "❌ 微弱人声过滤:能量/基线${energyBaselineRatio} < 1.2")
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
// 正常语音(包括近距离轻声)直接通过
|
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ================= 结束录音(核心:适配近距离轻声) ================= */
|
/* ================= 结束录音 ================= */
|
||||||
private fun finishSentence(avgEnergy: Float = 0f, peakRms: Float = 0f) {
|
private fun finishSentence(avgEnergy: Float = 0f, peakRms: Float = 0f) {
|
||||||
val now = System.currentTimeMillis()
|
val now = System.currentTimeMillis()
|
||||||
val duration = now - recordingStartMs
|
val duration = now - recordingStartMs
|
||||||
|
|
||||||
if (!vadStarted || duration < MIN_SPEECH_MS) {
|
if (!vadStarted || duration < MIN_SPEECH_MS) {
|
||||||
LogUtils.d(TAG, "❌ 语音过短: $duration ms | 基线: $currentEnvBaseline")
|
LogUtils.d(TAG, "❌ 语音过短: $duration ms | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
||||||
hasInvalidSpeech = true
|
hasInvalidSpeech = true
|
||||||
resetToWaitSpeech()
|
resetToWaitSpeech()
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========== 第二步:微弱人声专项过滤(仅过滤极微弱杂音) ==========
|
|
||||||
if (filterWeakVoice(duration, avgEnergy, peakRms)) {
|
if (filterWeakVoice(duration, avgEnergy, peakRms)) {
|
||||||
hasInvalidSpeech = true
|
hasInvalidSpeech = true
|
||||||
resetToWaitSpeech()
|
resetToWaitSpeech()
|
||||||
@ -440,42 +424,30 @@ class VoiceController(
|
|||||||
val vadRatio = vadManager.activeSpeechRatio()
|
val vadRatio = vadManager.activeSpeechRatio()
|
||||||
val peakAvgRatio = if (avgEnergy > 0f) peakRms / avgEnergy else 0f
|
val peakAvgRatio = if (avgEnergy > 0f) peakRms / avgEnergy else 0f
|
||||||
|
|
||||||
LogUtils.d(TAG, "📊 录音信息 | 时长: $duration ms | 能量: $avgEnergy | 峰均比: $peakAvgRatio | 基线: $currentEnvBaseline")
|
LogUtils.d(TAG, "📊 录音信息 | 时长: $duration ms | 能量: $avgEnergy | 峰均比: $peakAvgRatio | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
||||||
LogUtils.d(TAG, "📊 实时帧统计 | 总帧: $realtimeTotalFrames | 语音帧: $realtimeSpeechFrames | 连续语音帧: $realtimeContinuousSpeechFrames")
|
LogUtils.d(TAG, "📊 实时帧统计 | 总帧: $realtimeTotalFrames | 语音帧: $realtimeSpeechFrames | 连续语音帧: $realtimeContinuousSpeechFrames")
|
||||||
|
|
||||||
// 多人对话过滤
|
|
||||||
if (isMultiPersonDialogueDetected) {
|
if (isMultiPersonDialogueDetected) {
|
||||||
LogUtils.w(TAG, "❌ 过滤多人对话垃圾语音(实时识别) | 时长: $duration ms")
|
LogUtils.w(TAG, "❌ 过滤多人对话垃圾语音 | 时长: $duration ms")
|
||||||
hasInvalidSpeech = true
|
hasInvalidSpeech = true
|
||||||
resetToWaitSpeech()
|
resetToWaitSpeech()
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========== 步骤1:优先声纹验证(核心!仅当前用户可通过) ==========
|
// 声纹验证(核心极简版)
|
||||||
if (ENABLE_STRICT_SPEAKER_VERIFY) {
|
if (ENABLE_STRICT_SPEAKER_VERIFY) {
|
||||||
val isCurrentUser = verifySpeaker(audioBuffer.toFloatArray())
|
val isCurrentUser = verifySpeaker(audio)
|
||||||
if (!isCurrentUser) {
|
if (!isCurrentUser) {
|
||||||
LogUtils.w(TAG, "❌ 非当前唤醒用户,直接拒绝语音 | 录音时长: $duration ms")
|
LogUtils.w(TAG, "❌ 非当前唤醒用户,拒绝语音 | 录音时长: $duration ms | 嘈杂环境: $isNoisyEnvironment")
|
||||||
hasInvalidSpeech = true
|
hasInvalidSpeech = true
|
||||||
resetToWaitSpeech()
|
resetToWaitSpeech()
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
LogUtils.d(TAG, "✅ 当前用户语音,继续处理 | 录音时长: $duration ms")
|
LogUtils.d(TAG, "✅ 当前用户语音,继续处理 | 录音时长: $duration ms | 嘈杂环境: $isNoisyEnvironment")
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========== 1. 强制兜底:正常语音直接通过(阈值降低) ==========
|
|
||||||
val isNormalVoice = avgEnergy >= MIN_NORMAL_VOICE_ENERGY && vadRatio >= MIN_NORMAL_VOICE_VAD_RATIO
|
|
||||||
if (isNormalVoice) {
|
|
||||||
LogUtils.i(TAG, "✅ 正常语音强制通过 | 能量: $avgEnergy ≥ $MIN_NORMAL_VOICE_ENERGY | 占比: $vadRatio ≥ $MIN_NORMAL_VOICE_VAD_RATIO")
|
|
||||||
audioBuffer.clear()
|
|
||||||
state = VoiceState.UPLOADING
|
|
||||||
onFinalAudio(audio)
|
|
||||||
resetRealtimeStats()
|
|
||||||
hasInvalidSpeech = false
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
// ========== 2. 远场过滤(近距离场景几乎不生效) ==========
|
// 远场过滤
|
||||||
val isFarField = avgEnergy < MAX_FAR_FIELD_ENERGY
|
val isFarField = avgEnergy < MAX_FAR_FIELD_ENERGY
|
||||||
val isInvalidPeakRatio = peakAvgRatio < MIN_VALID_PEAK_AVG_RATIO
|
val isInvalidPeakRatio = peakAvgRatio < MIN_VALID_PEAK_AVG_RATIO
|
||||||
if (isFarField && isInvalidPeakRatio) {
|
if (isFarField && isInvalidPeakRatio) {
|
||||||
@ -485,7 +457,7 @@ class VoiceController(
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========== 3. 非连续判定(大幅放宽) ==========
|
// 非连续判定
|
||||||
val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f
|
val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f
|
||||||
val peakPositionRatio = vadManager.getPeakPositionRatio()
|
val peakPositionRatio = vadManager.getPeakPositionRatio()
|
||||||
val isDiscontinuous = continuousRatio < MIN_CONTINUOUS_FRAME_RATIO &&
|
val isDiscontinuous = continuousRatio < MIN_CONTINUOUS_FRAME_RATIO &&
|
||||||
@ -498,34 +470,21 @@ class VoiceController(
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========== 4. 分场景动态阈值计算(系数大幅降低) ==========
|
// 分场景阈值过滤
|
||||||
val isQuietEnv = currentEnvBaseline < BASELINE_QUIET_THRESHOLD
|
val isQuietEnv = currentEnvBaseline < BASELINE_QUIET_THRESHOLD
|
||||||
val thresholdConfig = when {
|
val thresholdConfig = when {
|
||||||
duration in SHORT_SPEECH_MIN..SHORT_SPEECH_MAX -> {
|
duration in SHORT_SPEECH_MIN..SHORT_SPEECH_MAX -> {
|
||||||
val coeff = if (isQuietEnv) SHORT_SPEECH_ENERGY_COEFF_QUIET else SHORT_SPEECH_ENERGY_COEFF_NOISY
|
val coeff = if (isQuietEnv) SHORT_SPEECH_ENERGY_COEFF_QUIET else SHORT_SPEECH_ENERGY_COEFF_NOISY
|
||||||
val energyThreshold = currentEnvBaseline * coeff
|
val energyThreshold = currentEnvBaseline * coeff
|
||||||
LogUtils.d(TAG, "📏 短语音阈值 | 场景: ${if (isQuietEnv) "安静" else "嘈杂"} | 系数: $coeff | 阈值: $energyThreshold")
|
ThresholdConfig(energyThreshold, SHORT_SPEECH_VAD_COEFF, SHORT_SPEECH_MIN_SCORE, "短语音")
|
||||||
ThresholdConfig(
|
|
||||||
energyThreshold = energyThreshold,
|
|
||||||
vadRatioThreshold = SHORT_SPEECH_VAD_COEFF,
|
|
||||||
minScore = SHORT_SPEECH_MIN_SCORE,
|
|
||||||
scene = "短语音"
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
else -> {
|
else -> {
|
||||||
val coeff = if (isQuietEnv) LONG_SPEECH_ENERGY_COEFF_QUIET else LONG_SPEECH_ENERGY_COEFF_NOISY
|
val coeff = if (isQuietEnv) LONG_SPEECH_ENERGY_COEFF_QUIET else LONG_SPEECH_ENERGY_COEFF_NOISY
|
||||||
val energyThreshold = currentEnvBaseline * coeff
|
val energyThreshold = currentEnvBaseline * coeff
|
||||||
LogUtils.d(TAG, "📏 长语音阈值 | 场景: ${if (isQuietEnv) "安静" else "嘈杂"} | 系数: $coeff | 阈值: $energyThreshold")
|
ThresholdConfig(energyThreshold, LONG_SPEECH_VAD_COEFF, LONG_SPEECH_MIN_SCORE, "长语音")
|
||||||
ThresholdConfig(
|
|
||||||
energyThreshold = energyThreshold,
|
|
||||||
vadRatioThreshold = LONG_SPEECH_VAD_COEFF,
|
|
||||||
minScore = LONG_SPEECH_MIN_SCORE,
|
|
||||||
scene = "长语音"
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========== 5. 分场景阈值过滤(阈值降低) ==========
|
|
||||||
val energyPass = avgEnergy >= thresholdConfig.energyThreshold
|
val energyPass = avgEnergy >= thresholdConfig.energyThreshold
|
||||||
val vadRatioPass = vadRatio >= thresholdConfig.vadRatioThreshold
|
val vadRatioPass = vadRatio >= thresholdConfig.vadRatioThreshold
|
||||||
if (!energyPass || !vadRatioPass) {
|
if (!energyPass || !vadRatioPass) {
|
||||||
@ -535,7 +494,7 @@ class VoiceController(
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========== 6. 评分判定(门槛降低到1) ==========
|
// 评分判定
|
||||||
var score = 0
|
var score = 0
|
||||||
score += when {
|
score += when {
|
||||||
duration >= 4000 -> 3
|
duration >= 4000 -> 3
|
||||||
@ -553,16 +512,16 @@ class VoiceController(
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========== 最终通过 ==========
|
// 最终通过
|
||||||
audioBuffer.clear()
|
audioBuffer.clear()
|
||||||
state = VoiceState.UPLOADING
|
state = VoiceState.UPLOADING
|
||||||
onFinalAudio(audio)
|
onFinalAudio(audio)
|
||||||
resetRealtimeStats()
|
resetRealtimeStats()
|
||||||
hasInvalidSpeech = false
|
hasInvalidSpeech = false
|
||||||
LogUtils.i(TAG, "✅ 近距离轻声通过 | 时长: $duration ms | 能量: $avgEnergy | 场景: ${thresholdConfig.scene}")
|
LogUtils.i(TAG, "✅ 语音通过 | 时长: $duration ms | 能量: $avgEnergy | 场景: ${thresholdConfig.scene} | 嘈杂环境: $isNoisyEnvironment")
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ================= 重置实时统计变量 ================= */
|
/* ================= 重置实时统计 ================= */
|
||||||
private fun resetRealtimeStats() {
|
private fun resetRealtimeStats() {
|
||||||
realtimeEnergySum = 0f
|
realtimeEnergySum = 0f
|
||||||
realtimeEnergyCount = 0
|
realtimeEnergyCount = 0
|
||||||
@ -574,15 +533,15 @@ class VoiceController(
|
|||||||
isMultiPersonDialogueDetected = false
|
isMultiPersonDialogueDetected = false
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ================= 播放/上传/Reset 回调 ================= */
|
/* ================= 播放/上传回调 ================= */
|
||||||
fun onPlayStartPrompt() {
|
fun onPlayStartPrompt() {
|
||||||
LogUtils.d(TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline")
|
LogUtils.d(TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
||||||
state = VoiceState.PLAYING_PROMPT
|
state = VoiceState.PLAYING_PROMPT
|
||||||
}
|
}
|
||||||
|
|
||||||
fun onPlayEndPrompt() {
|
fun onPlayEndPrompt() {
|
||||||
speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
|
speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
|
||||||
LogUtils.d(TAG, "🎵 提示音结束 | 基线: $currentEnvBaseline")
|
LogUtils.d(TAG, "🎵 提示音结束 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
||||||
state = VoiceState.WAIT_SPEECH_COOLDOWN
|
state = VoiceState.WAIT_SPEECH_COOLDOWN
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -591,19 +550,19 @@ class VoiceController(
|
|||||||
LogUtils.w(TAG, "🎶 非上传完成状态,禁止切换到 PLAYING_BACKEND | 当前状态: $state")
|
LogUtils.w(TAG, "🎶 非上传完成状态,禁止切换到 PLAYING_BACKEND | 当前状态: $state")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
LogUtils.d(TAG, "🎶 开始播放后台音频 | 基线: $currentEnvBaseline")
|
LogUtils.d(TAG, "🎶 开始播放后台音频 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
||||||
state = VoiceState.PLAYING_BACKEND
|
state = VoiceState.PLAYING_BACKEND
|
||||||
}
|
}
|
||||||
|
|
||||||
fun onPlayEndBackend() {
|
fun onPlayEndBackend() {
|
||||||
speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
|
speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
|
||||||
LogUtils.d(TAG, "🎶 后台音频结束 | 基线: $currentEnvBaseline")
|
LogUtils.d(TAG, "🎶 后台音频结束 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
||||||
state = VoiceState.WAIT_SPEECH_COOLDOWN
|
state = VoiceState.WAIT_SPEECH_COOLDOWN
|
||||||
}
|
}
|
||||||
|
|
||||||
fun onUploadFinished(success: Boolean) {
|
fun onUploadFinished(success: Boolean) {
|
||||||
if (state != VoiceState.UPLOADING) return
|
if (state != VoiceState.UPLOADING) return
|
||||||
LogUtils.d(TAG, "📤 上传完成 | 成功: $success | 基线: $currentEnvBaseline")
|
LogUtils.d(TAG, "📤 上传完成 | 成功: $success | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
||||||
|
|
||||||
if (!success) {
|
if (!success) {
|
||||||
speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
|
speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
|
||||||
@ -612,7 +571,7 @@ class VoiceController(
|
|||||||
}
|
}
|
||||||
|
|
||||||
private fun resetToWaitSpeech() {
|
private fun resetToWaitSpeech() {
|
||||||
LogUtils.d(TAG, "🔄 重置到等待说话 | 基线: $currentEnvBaseline | 已标记无效说话: $hasInvalidSpeech")
|
LogUtils.d(TAG, "🔄 重置到等待说话 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment | 已标记无效说话: $hasInvalidSpeech")
|
||||||
val now = System.currentTimeMillis()
|
val now = System.currentTimeMillis()
|
||||||
if (now - lastInvalidResetMs < INVALID_RESET_DEBOUNCE_MS) {
|
if (now - lastInvalidResetMs < INVALID_RESET_DEBOUNCE_MS) {
|
||||||
LogUtils.d(TAG, "🛡 防抖:1.5秒内重复无效语音,跳过重置")
|
LogUtils.d(TAG, "🛡 防抖:1.5秒内重复无效语音,跳过重置")
|
||||||
@ -628,7 +587,7 @@ class VoiceController(
|
|||||||
}
|
}
|
||||||
|
|
||||||
private fun resetAll() {
|
private fun resetAll() {
|
||||||
LogUtils.d(TAG, "🔄 重置所有状态 | 基线: $currentEnvBaseline | 本次超时类型: $currentTimeoutType")
|
LogUtils.d(TAG, "🔄 重置所有状态 | 基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment | 本次超时类型: $currentTimeoutType")
|
||||||
audioBuffer.clear()
|
audioBuffer.clear()
|
||||||
preBuffer.clear()
|
preBuffer.clear()
|
||||||
vadManager.reset()
|
vadManager.reset()
|
||||||
@ -638,23 +597,23 @@ class VoiceController(
|
|||||||
waitSpeechFailStartMs = 0L
|
waitSpeechFailStartMs = 0L
|
||||||
envNoiseBuffer.clear()
|
envNoiseBuffer.clear()
|
||||||
currentEnvBaseline = 0.001f
|
currentEnvBaseline = 0.001f
|
||||||
|
isNoisyEnvironment = false
|
||||||
resetRealtimeStats()
|
resetRealtimeStats()
|
||||||
hasInvalidSpeech = false
|
hasInvalidSpeech = false
|
||||||
currentTimeoutType = TimeoutType.IDLE_TIMEOUT
|
currentTimeoutType = TimeoutType.IDLE_TIMEOUT
|
||||||
LogUtils.d(TAG, "🔄 环境基线已重置 | 新基线: $currentEnvBaseline | 无效说话标记已重置")
|
|
||||||
state = VoiceState.WAIT_WAKEUP
|
state = VoiceState.WAIT_WAKEUP
|
||||||
}
|
}
|
||||||
|
|
||||||
fun release() {
|
fun release() {
|
||||||
LogUtils.d(TAG, "🔌 释放资源 | 最终基线: $currentEnvBaseline")
|
LogUtils.d(TAG, "🔌 释放资源 | 最终基线: $currentEnvBaseline | 嘈杂环境: $isNoisyEnvironment")
|
||||||
wakeupManager.release()
|
wakeupManager.release()
|
||||||
vadManager.reset()
|
vadManager.reset()
|
||||||
envNoiseBuffer.clear()
|
envNoiseBuffer.clear()
|
||||||
resetRealtimeStats()
|
resetRealtimeStats()
|
||||||
hasInvalidSpeech = false
|
hasInvalidSpeech = false
|
||||||
currentTimeoutType = TimeoutType.IDLE_TIMEOUT
|
currentTimeoutType = TimeoutType.IDLE_TIMEOUT
|
||||||
|
isNoisyEnvironment = false
|
||||||
|
|
||||||
// 释放声纹识别器资源
|
|
||||||
runCatching {
|
runCatching {
|
||||||
SpeakerRecognition.extractor.release()
|
SpeakerRecognition.extractor.release()
|
||||||
speakerManagerLock.withLock {
|
speakerManagerLock.withLock {
|
||||||
@ -666,7 +625,6 @@ class VoiceController(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 兜底释放(防止未调用release)
|
|
||||||
protected fun finalize() {
|
protected fun finalize() {
|
||||||
runCatching {
|
runCatching {
|
||||||
release()
|
release()
|
||||||
@ -690,60 +648,66 @@ class VoiceController(
|
|||||||
val scene: String
|
val scene: String
|
||||||
)
|
)
|
||||||
|
|
||||||
/* ================= 核心:原生Stream声纹验证(仅当前用户有效) ================= */
|
/* ================= 核心:极简版声纹验证 ================= */
|
||||||
/**
|
|
||||||
* 验证语音是否属于当前唤醒用户(完全适配你提供的API)
|
|
||||||
* @param audio 待验证的语音数据
|
|
||||||
* @return true=是当前用户,false=非当前用户
|
|
||||||
*/
|
|
||||||
private fun verifySpeaker(audio: FloatArray): Boolean {
|
private fun verifySpeaker(audio: FloatArray): Boolean {
|
||||||
if (audio.isEmpty()) {
|
if (audio.isEmpty()) {
|
||||||
LogUtils.w(TAG, "❌ 待验证音频为空,声纹验证失败")
|
LogUtils.w(TAG, "❌ 待验证音频为空,声纹验证失败")
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 1. 裁剪音频:只保留本次录音的有效部分(解决时长不匹配问题)
|
||||||
|
val audioDurationMs = (audio.size.toFloat() / SAMPLE_RATE * 1000).toLong()
|
||||||
|
// 只保留最后 N 毫秒的音频(N = 实际录音时长),避免缓存旧音频
|
||||||
|
val validAudio = if (audioDurationMs > 0) {
|
||||||
|
val validSampleCount = (audioDurationMs * SAMPLE_RATE / 1000).toInt()
|
||||||
|
if (validSampleCount < audio.size) {
|
||||||
|
audio.copyOfRange(audio.size - validSampleCount, audio.size)
|
||||||
|
} else {
|
||||||
|
audio
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
audio
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2. 分场景选阈值(无容错,只调阈值)
|
||||||
|
val finalThreshold = when {
|
||||||
|
audioDurationMs < SHORT_AUDIO_DURATION_MS -> SPEAKER_THRESHOLD_SHORT
|
||||||
|
isNoisyEnvironment -> SPEAKER_THRESHOLD_NOISY
|
||||||
|
else -> SPEAKER_THRESHOLD_QUIET
|
||||||
|
}
|
||||||
|
|
||||||
var stream: OnlineStream? = null
|
var stream: OnlineStream? = null
|
||||||
return try {
|
return try {
|
||||||
stream = SpeakerRecognition.extractor.createStream()
|
stream = SpeakerRecognition.extractor.createStream()
|
||||||
stream.acceptWaveform(samples = audio, sampleRate = SAMPLE_RATE)
|
stream.acceptWaveform(samples = validAudio, sampleRate = SAMPLE_RATE) // 用裁剪后的音频验证
|
||||||
stream.inputFinished()
|
stream.inputFinished()
|
||||||
|
|
||||||
if (!SpeakerRecognition.extractor.isReady(stream)) {
|
if (!SpeakerRecognition.extractor.isReady(stream)) {
|
||||||
LogUtils.w(TAG, "❌ 验证音频Stream未就绪,验证失败")
|
LogUtils.w(TAG, "❌ 音频Stream未就绪,验证失败")
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
val embedding = SpeakerRecognition.extractor.compute(stream)
|
val embedding = SpeakerRecognition.extractor.compute(stream)
|
||||||
// 动态选择阈值
|
|
||||||
val threshold = if (audio.size < SHORT_AUDIO_THRESHOLD) {
|
|
||||||
LogUtils.d(TAG, "📢 检测到短速语音,使用放宽阈值: $SPEAKER_VERIFY_THRESHOLD_SHORT")
|
|
||||||
SPEAKER_VERIFY_THRESHOLD_SHORT
|
|
||||||
} else {
|
|
||||||
SPEAKER_VERIFY_THRESHOLD_NORMAL
|
|
||||||
}
|
|
||||||
|
|
||||||
// 加锁验证
|
// 3. 纯验证逻辑:过就过,不过就拒绝
|
||||||
speakerManagerLock.withLock {
|
speakerManagerLock.withLock {
|
||||||
val verifyPass = SpeakerRecognition.manager.verify(
|
val verifyPass = SpeakerRecognition.manager.verify(
|
||||||
name = CURRENT_USER_ID,
|
name = CURRENT_USER_ID,
|
||||||
embedding = embedding,
|
embedding = embedding,
|
||||||
threshold = threshold
|
threshold = finalThreshold
|
||||||
)
|
)
|
||||||
if (verifyPass) {
|
|
||||||
LogUtils.d(TAG, "✅ 声纹验证通过 | 阈值: $threshold")
|
// 打印关键信息(补充裁剪后时长)
|
||||||
} else {
|
LogUtils.d(TAG, "📊 声纹验证 | 阈值: $finalThreshold | 通过: $verifyPass | 嘈杂环境: $isNoisyEnvironment | 原始时长: ${audioDurationMs}ms | 验证时长: ${(validAudio.size.toFloat()/SAMPLE_RATE*1000).toLong()}ms")
|
||||||
LogUtils.w(TAG, "❌ 声纹验证失败 | 阈值: $threshold")
|
|
||||||
}
|
// 无任何容错:验证结果就是最终结果
|
||||||
return verifyPass
|
return verifyPass
|
||||||
}
|
}
|
||||||
} catch (e: Exception) {
|
} catch (e: Exception) {
|
||||||
LogUtils.e(TAG, "❌ 声纹验证异常", e)
|
LogUtils.e(TAG, "❌ 声纹验证异常,拒绝", e)
|
||||||
false
|
return false
|
||||||
} finally {
|
} finally {
|
||||||
// 释放Stream
|
|
||||||
stream?.release()
|
stream?.release()
|
||||||
LogUtils.d(TAG, "🔄 验证Stream已释放")
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -49,6 +49,7 @@ import com.zs.smarthuman.kt.releaseIM
|
|||||||
import com.zs.smarthuman.sherpa.TimeoutType
|
import com.zs.smarthuman.sherpa.TimeoutType
|
||||||
import com.zs.smarthuman.sherpa.VoiceController
|
import com.zs.smarthuman.sherpa.VoiceController
|
||||||
import com.zs.smarthuman.toast.Toaster
|
import com.zs.smarthuman.toast.Toaster
|
||||||
|
import com.zs.smarthuman.utils.AudioDebugUtil
|
||||||
import com.zs.smarthuman.utils.AudioPcmUtil
|
import com.zs.smarthuman.utils.AudioPcmUtil
|
||||||
import com.zs.smarthuman.utils.DangerousUtils
|
import com.zs.smarthuman.utils.DangerousUtils
|
||||||
import com.zs.smarthuman.utils.LogFileUtils
|
import com.zs.smarthuman.utils.LogFileUtils
|
||||||
@ -213,12 +214,12 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
|
|||||||
1
|
1
|
||||||
)
|
)
|
||||||
// loadLocalJsonAndPlay()
|
// loadLocalJsonAndPlay()
|
||||||
// val file = File(
|
val file = File(
|
||||||
// getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
|
getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
|
||||||
// "xxx.wav"
|
"xxx.wav"
|
||||||
// )
|
)
|
||||||
// AudioDebugUtil.saveFloatPcmAsWav(audio, file)
|
AudioDebugUtil.saveFloatPcmAsWav(audio, file)
|
||||||
// LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
|
LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
|
||||||
lifecycleScope.launch(Dispatchers.Main) {
|
lifecycleScope.launch(Dispatchers.Main) {
|
||||||
|
|
||||||
mVerticalAnimator?.show()
|
mVerticalAnimator?.show()
|
||||||
@ -291,7 +292,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
|
|||||||
|
|
||||||
override fun onPause() {
|
override fun onPause() {
|
||||||
super.onPause()
|
super.onPause()
|
||||||
stopRecording()
|
// stopRecording()
|
||||||
UnityPlayerHolder.getInstance().pause()
|
UnityPlayerHolder.getInstance().pause()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -41,6 +41,9 @@ class MainViewModel: BaseViewModel() {
|
|||||||
RxHttp.postJson(ApiService.UPLOAD_RECORD_VOICE_URL)
|
RxHttp.postJson(ApiService.UPLOAD_RECORD_VOICE_URL)
|
||||||
.add("sessionCode",sessionCode)
|
.add("sessionCode",sessionCode)
|
||||||
.add("audio", audioVoice)
|
.add("audio", audioVoice)
|
||||||
|
.readTimeout(3000L)
|
||||||
|
.writeTimeout(3000L)
|
||||||
|
.connectTimeout(3000L)
|
||||||
.toAwaitResponse<String>()
|
.toAwaitResponse<String>()
|
||||||
.awaitResult()
|
.awaitResult()
|
||||||
.getOrThrow()
|
.getOrThrow()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user