判断提前做
This commit is contained in:
parent
58e2bc3e06
commit
725f1a9019
@ -16,39 +16,41 @@ class VadManager(
|
|||||||
private var isSpeaking = false
|
private var isSpeaking = false
|
||||||
private var lastSpeechTime = 0L
|
private var lastSpeechTime = 0L
|
||||||
|
|
||||||
// ========== 核心调整:适配人类正常说话停顿 ==========
|
// ========== 核心调整:区分活跃期/收尾期阈值 ==========
|
||||||
private val END_SILENCE_MS = 1500L // 基础静默阈值(1.5秒)
|
// 说话活跃期(容忍停顿)
|
||||||
private val MAX_SILENCE_AFTER_SPEECH_MS = 3000L// 兜底静默阈值(3秒)
|
private val ACTIVE_END_SILENCE_MS = 1500L // 活跃期基础静默(保留停顿容忍)
|
||||||
private val SPEECH_ACTIVE_DURATION = 5000L // 语音活跃期(5秒内容忍更长停顿)
|
private val ACTIVE_CONSECUTIVE_FRAMES = 10 // 活跃期连续静音帧
|
||||||
|
// 说话收尾期(快速结束)
|
||||||
|
private val FINAL_END_SILENCE_MS = 800L // 收尾期基础静默(缩短到800ms)
|
||||||
|
private val FINAL_CONSECUTIVE_FRAMES = 5 // 收尾期连续静音帧(5帧=100ms)
|
||||||
|
// 收尾期触发条件:最后一次有效语音后超过X秒,判定为进入收尾期
|
||||||
|
private val FINAL_PHASE_TRIGGER_MS = 1000L // 1秒无有效语音,进入收尾期
|
||||||
|
|
||||||
// ========== 核心调整:降低有效语音阈值 ==========
|
private val MAX_SILENCE_AFTER_SPEECH_MS = 2000L // 兜底阈值从3秒降到2秒
|
||||||
private val MIN_EFFECTIVE_SPEECH_RMS = 0.001f // 有效语音最小RMS
|
|
||||||
private val ENV_BASELINE_FACTOR = 1.2f // 环境基线倍数
|
|
||||||
private var envBaselineRms = 0.0005f // 初始环境基线
|
|
||||||
private var lastEffectiveSpeechTime = 0L // 最后一次有效语音时间戳
|
|
||||||
|
|
||||||
// ========== 新增:语音活跃期变量 ==========
|
// 原有基础配置
|
||||||
private var isSpeechActive = false // 语音活跃期标记
|
private val MIN_EFFECTIVE_SPEECH_RMS = 0.001f
|
||||||
private var speechActiveStartMs = 0L // 活跃期开始时间
|
private val ENV_BASELINE_FACTOR = 1.2f
|
||||||
|
private var envBaselineRms = 0.0005f
|
||||||
|
private var lastEffectiveSpeechTime = 0L
|
||||||
|
|
||||||
// ========== 连续静音帧校验(放宽阈值) ==========
|
|
||||||
private var consecutiveSilenceFrames = 0
|
private var consecutiveSilenceFrames = 0
|
||||||
private val CONSECUTIVE_SILENCE_FRAME_THRESHOLD = 10 // 连续10帧静音(200ms)
|
|
||||||
|
|
||||||
// 基础统计变量
|
// 新增:收尾期标记
|
||||||
private var activeFrameCount = 0
|
private var isInFinalPhase = false
|
||||||
private var activeSpeechFrameCount = 0
|
|
||||||
|
// 统计变量
|
||||||
private var speechEnergySum = 0f
|
private var speechEnergySum = 0f
|
||||||
private var speechFrameCount = 0
|
private var speechFrameCount = 0
|
||||||
private var peakRms = 0f
|
private var peakRms = 0f
|
||||||
|
|
||||||
// 连续性检测核心变量
|
|
||||||
private var totalFrames = 0
|
private var totalFrames = 0
|
||||||
private var speechFrames = 0
|
private var speechFrames = 0
|
||||||
private var continuousSpeechFrames = 0
|
private var continuousSpeechFrames = 0
|
||||||
private var lastFrameIsSpeech = false
|
private var lastFrameIsSpeech = false
|
||||||
private var peakPosition = 0
|
private var peakPosition = 0
|
||||||
private var frameIndex = 0
|
private var frameIndex = 0
|
||||||
|
private var activeFrameCount = 0
|
||||||
|
private var activeSpeechFrameCount = 0
|
||||||
|
|
||||||
init {
|
init {
|
||||||
val config = getVadModelConfig(0) ?: throw IllegalStateException("[$TAG] VAD config not found")
|
val config = getVadModelConfig(0) ?: throw IllegalStateException("[$TAG] VAD config not found")
|
||||||
@ -62,42 +64,39 @@ class VadManager(
|
|||||||
val vadHasSpeech = vad.isSpeechDetected()
|
val vadHasSpeech = vad.isSpeechDetected()
|
||||||
val rms = calcRms(samples)
|
val rms = calcRms(samples)
|
||||||
|
|
||||||
// 环境基线更新(滑动平均,适配背景噪音)
|
// 环境基线更新
|
||||||
if (!vadHasSpeech || rms < MIN_EFFECTIVE_SPEECH_RMS) {
|
if (!vadHasSpeech || rms < MIN_EFFECTIVE_SPEECH_RMS) {
|
||||||
envBaselineRms = (envBaselineRms * 0.9f) + (rms * 0.1f)
|
envBaselineRms = (envBaselineRms * 0.9f) + (rms * 0.1f)
|
||||||
}
|
}
|
||||||
// 有效语音判定:阈值更低,更容易触发
|
|
||||||
val effectiveSpeechThreshold = maxOf(MIN_EFFECTIVE_SPEECH_RMS, envBaselineRms * ENV_BASELINE_FACTOR)
|
val effectiveSpeechThreshold = maxOf(MIN_EFFECTIVE_SPEECH_RMS, envBaselineRms * ENV_BASELINE_FACTOR)
|
||||||
val isEffectiveSpeech = vadHasSpeech && rms >= effectiveSpeechThreshold
|
val isEffectiveSpeech = vadHasSpeech && rms >= effectiveSpeechThreshold
|
||||||
|
|
||||||
// ========== 核心优化:语音活跃期逻辑 ==========
|
// ========== 核心优化:动态判定收尾期 ==========
|
||||||
if (isEffectiveSpeech) {
|
if (isEffectiveSpeech) {
|
||||||
isSpeechActive = true
|
lastEffectiveSpeechTime = now
|
||||||
speechActiveStartMs = now // 重置活跃期
|
isInFinalPhase = false // 有有效语音,退出收尾期
|
||||||
}
|
|
||||||
// 活跃期内(5秒)用2秒静默阈值,活跃期外用1.5秒
|
|
||||||
val dynamicEndSilenceMs = if (isSpeechActive && (now - speechActiveStartMs) < SPEECH_ACTIVE_DURATION) {
|
|
||||||
2000L
|
|
||||||
} else {
|
} else {
|
||||||
END_SILENCE_MS
|
// 最后一次有效语音后超过1秒,进入收尾期
|
||||||
|
if (now - lastEffectiveSpeechTime >= FINAL_PHASE_TRIGGER_MS) {
|
||||||
|
isInFinalPhase = true
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 语音能量统计(仅有效语音)
|
// 语音能量统计
|
||||||
if (isEffectiveSpeech) {
|
if (isEffectiveSpeech) {
|
||||||
speechEnergySum += rms
|
speechEnergySum += rms
|
||||||
speechFrameCount++
|
speechFrameCount++
|
||||||
peakRms = maxOf(peakRms, rms)
|
peakRms = maxOf(peakRms, rms)
|
||||||
lastEffectiveSpeechTime = now
|
|
||||||
lastSpeechTime = now
|
lastSpeechTime = now
|
||||||
consecutiveSilenceFrames = 0 // 重置连续静音帧
|
consecutiveSilenceFrames = 0
|
||||||
LogUtils.v(TAG, "🔊 有效语音帧 | RMS: $rms | 阈值: $effectiveSpeechThreshold")
|
LogUtils.v(TAG, "🔊 有效语音帧 | RMS: $rms | 阈值: $effectiveSpeechThreshold | 收尾期: $isInFinalPhase")
|
||||||
} else {
|
} else {
|
||||||
consecutiveSilenceFrames++ // 累计连续静音帧
|
consecutiveSilenceFrames++
|
||||||
LogUtils.v(TAG, if (vadHasSpeech) "⚠ 低能量语音帧 | RMS: $rms | 阈值: $effectiveSpeechThreshold"
|
LogUtils.v(TAG, if (vadHasSpeech) "⚠ 低能量语音帧 | RMS: $rms | 阈值: $effectiveSpeechThreshold"
|
||||||
else "🔇 静音帧 | 连续静音帧: $consecutiveSilenceFrames")
|
else "🔇 静音帧 | 连续静音帧: $consecutiveSilenceFrames | 收尾期: $isInFinalPhase")
|
||||||
}
|
}
|
||||||
|
|
||||||
// 帧统计与连续性计算
|
// 帧统计
|
||||||
totalFrames++
|
totalFrames++
|
||||||
frameIndex++
|
frameIndex++
|
||||||
if (isEffectiveSpeech) {
|
if (isEffectiveSpeech) {
|
||||||
@ -109,7 +108,14 @@ class VadManager(
|
|||||||
lastFrameIsSpeech = false
|
lastFrameIsSpeech = false
|
||||||
}
|
}
|
||||||
|
|
||||||
// VAD核心状态流转(使用动态静默阈值)
|
// ========== 核心优化:根据收尾期选择不同阈值 ==========
|
||||||
|
val (endSilenceMs, consecutiveFrames) = if (isInFinalPhase) {
|
||||||
|
Pair(FINAL_END_SILENCE_MS, FINAL_CONSECUTIVE_FRAMES)
|
||||||
|
} else {
|
||||||
|
Pair(ACTIVE_END_SILENCE_MS, ACTIVE_CONSECUTIVE_FRAMES)
|
||||||
|
}
|
||||||
|
|
||||||
|
// VAD状态流转
|
||||||
if (isEffectiveSpeech) {
|
if (isEffectiveSpeech) {
|
||||||
if (!isSpeaking) {
|
if (!isSpeaking) {
|
||||||
isSpeaking = true
|
isSpeaking = true
|
||||||
@ -124,32 +130,36 @@ class VadManager(
|
|||||||
val vadSilenceDuration = now - lastSpeechTime
|
val vadSilenceDuration = now - lastSpeechTime
|
||||||
val effectiveSilenceDuration = now - lastEffectiveSpeechTime
|
val effectiveSilenceDuration = now - lastEffectiveSpeechTime
|
||||||
|
|
||||||
// 触发条件:动态静默时长 + 连续静音帧达标
|
// 触发结束条件:适配当前阶段的阈值
|
||||||
val isSilenceTimeout = (vadSilenceDuration >= dynamicEndSilenceMs ||
|
val isSilenceTimeout = (vadSilenceDuration >= endSilenceMs ||
|
||||||
effectiveSilenceDuration >= MAX_SILENCE_AFTER_SPEECH_MS) &&
|
effectiveSilenceDuration >= MAX_SILENCE_AFTER_SPEECH_MS) &&
|
||||||
consecutiveSilenceFrames >= CONSECUTIVE_SILENCE_FRAME_THRESHOLD
|
consecutiveSilenceFrames >= consecutiveFrames
|
||||||
|
|
||||||
if (isSilenceTimeout) {
|
if (isSilenceTimeout) {
|
||||||
isSpeaking = false
|
isSpeaking = false
|
||||||
isSpeechActive = false // 结束活跃期
|
isInFinalPhase = false // 重置收尾期
|
||||||
val avgEnergy = if (speechFrameCount > 0) speechEnergySum / speechFrameCount else 0f
|
val avgEnergy = if (speechFrameCount > 0) speechEnergySum / speechFrameCount else 0f
|
||||||
LogUtils.d(TAG, """
|
LogUtils.d(TAG, """
|
||||||
🛑 语音结束
|
🛑 语音结束
|
||||||
- 有效静默时长: ${effectiveSilenceDuration}ms
|
- 有效静默时长: ${effectiveSilenceDuration}ms
|
||||||
- 连续静音帧: $consecutiveSilenceFrames
|
- 连续静音帧: $consecutiveSilenceFrames
|
||||||
- 平均能量: $avgEnergy | 峰值: $peakRms
|
- 平均能量: $avgEnergy | 峰值: $peakRms
|
||||||
- 活跃期: ${if (isSpeechActive) "是" else "否"}
|
- 收尾期: $isInFinalPhase | 所用阈值: $endSilenceMs ms
|
||||||
""".trimIndent())
|
""".trimIndent())
|
||||||
onSpeechEnd(avgEnergy, peakRms)
|
onSpeechEnd(avgEnergy, peakRms)
|
||||||
resetStats()
|
resetStats()
|
||||||
} else {
|
} else {
|
||||||
LogUtils.v(TAG, "⏳ 静默中(停顿容忍) | 连续静音帧: $consecutiveSilenceFrames | 静默时长: ${effectiveSilenceDuration}ms | 动态阈值: $dynamicEndSilenceMs")
|
LogUtils.v(TAG, "⏳ 静默中 | 连续静音帧: $consecutiveSilenceFrames | 静默时长: ${effectiveSilenceDuration}ms | 所用阈值: $endSilenceMs ms")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========== 保留原有方法 ==========
|
// 保留原有方法...
|
||||||
|
/** Returns the result of `vad.isSpeechDetected()` unchanged. */
fun isSpeechDetected(): Boolean = vad.isSpeechDetected()
|
||||||
|
|
||||||
fun activeSpeechRatio(): Float {
|
fun activeSpeechRatio(): Float {
|
||||||
val ratio = if (activeFrameCount == 0) 0f else activeSpeechFrameCount.toFloat() / activeFrameCount
|
val ratio = if (activeFrameCount == 0) 0f else activeSpeechFrameCount.toFloat() / activeFrameCount
|
||||||
LogUtils.d(TAG, "📊 语音占比: $ratio | 有效语音帧: $activeSpeechFrameCount | 总帧: $activeFrameCount")
|
LogUtils.d(TAG, "📊 语音占比: $ratio | 有效语音帧: $activeSpeechFrameCount | 总帧: $activeFrameCount")
|
||||||
@ -169,10 +179,7 @@ class VadManager(
|
|||||||
lastEffectiveSpeechTime = 0L
|
lastEffectiveSpeechTime = 0L
|
||||||
envBaselineRms = 0.0005f
|
envBaselineRms = 0.0005f
|
||||||
consecutiveSilenceFrames = 0
|
consecutiveSilenceFrames = 0
|
||||||
// 重置活跃期
|
isInFinalPhase = false // 重置收尾期
|
||||||
isSpeechActive = false
|
|
||||||
speechActiveStartMs = 0L
|
|
||||||
// 重置统计
|
|
||||||
resetStats()
|
resetStats()
|
||||||
vad.reset()
|
vad.reset()
|
||||||
|
|
||||||
|
|||||||
@ -80,11 +80,27 @@ class VoiceController(
|
|||||||
private val SHORT_SPEECH_MAX = 2000L
|
private val SHORT_SPEECH_MAX = 2000L
|
||||||
|
|
||||||
// ========== 核心修改:多人对话过滤配置(适配2人以上场景) ==========
|
// ========== 核心修改:多人对话过滤配置(适配2人以上场景) ==========
|
||||||
private val MULTI_DIALOGUE_MIN_DURATION = 2500L // 多人对话最小时长(2.5秒,比两人更短也能判定)
|
private val MULTI_DIALOGUE_MIN_DURATION = 2500L // 多人对话最小时长(2.5秒)
|
||||||
private val MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO = 2.5f // 多人对话峰均比范围更大(多人音量差异更大)
|
private val MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO = 2.5f // 多人对话峰均比范围
|
||||||
private val MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO = 0.4f
|
private val MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO = 0.4f
|
||||||
private val MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO = 0.3f // 多人对话连续帧占比更低(轮流说话,断层更多)
|
private val MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO = 0.3f // 多人对话连续帧占比
|
||||||
private val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f // 多人对话有效帧占比要求稍低(避免漏过滤)
|
private val MULTI_DIALOGUE_MIN_VAD_RATIO = 0.55f // 多人对话有效帧占比
|
||||||
|
|
||||||
|
// ========== 新增:录音过程中实时统计的变量 ==========
|
||||||
|
// 能量统计
|
||||||
|
private var realtimeEnergySum = 0f
|
||||||
|
private var realtimeEnergyCount = 0
|
||||||
|
private var realtimePeakRms = 0f
|
||||||
|
// 帧统计(实时累加)
|
||||||
|
private var realtimeTotalFrames = 0
|
||||||
|
private var realtimeSpeechFrames = 0
|
||||||
|
private var realtimeContinuousSpeechFrames = 0
|
||||||
|
private var realtimeLastFrameIsSpeech = false
|
||||||
|
// 多人对话实时判定标记
|
||||||
|
private var isMultiPersonDialogueDetected = false
|
||||||
|
// 防抖变量
|
||||||
|
private var lastInvalidResetMs = 0L
|
||||||
|
private val INVALID_RESET_DEBOUNCE_MS = 1500L // 1.5秒内不重复重置
|
||||||
|
|
||||||
// 阈值配置数据类
|
// 阈值配置数据类
|
||||||
private data class ThresholdConfig(
|
private data class ThresholdConfig(
|
||||||
@ -143,15 +159,78 @@ class VoiceController(
|
|||||||
audioBuffer.addAll(samples.asList())
|
audioBuffer.addAll(samples.asList())
|
||||||
vadManager.accept(samples)
|
vadManager.accept(samples)
|
||||||
|
|
||||||
|
// ========== 核心优化:录音过程中实时计算 ==========
|
||||||
|
// 1. 实时校准环境基线(适配录音中环境变化)
|
||||||
|
calibrateEnvBaseline(samples)
|
||||||
|
// 2. 实时计算能量/峰值
|
||||||
|
updateRealtimeEnergy(samples)
|
||||||
|
// 3. 实时更新帧统计
|
||||||
|
updateRealtimeFrameStats()
|
||||||
|
// 4. 实时判定是否为多人对话,若是则立即终止录音
|
||||||
|
if (checkMultiPersonDialogueRealtime(now)) {
|
||||||
|
LogUtils.w(TAG, "🚨 录音中识别出多人对话,提前终止")
|
||||||
|
finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// 原有最大录音时长判断
|
||||||
if (System.currentTimeMillis() - recordingStartMs > maxRecordingMs) {
|
if (System.currentTimeMillis() - recordingStartMs > maxRecordingMs) {
|
||||||
LogUtils.w(TAG, "⏱ Max recording reached | 当前环境基线: $currentEnvBaseline")
|
LogUtils.w(TAG, "⏱ Max recording reached | 当前环境基线: $currentEnvBaseline")
|
||||||
finishSentence()
|
finishSentence(realtimeEnergySum / realtimeEnergyCount, realtimePeakRms)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ================= 环境基线校准 ================= */
|
/* ================= 新增:录音中实时更新能量统计 ================= */
|
||||||
|
/**
 * Folds one chunk of audio samples into the running real-time energy statistics.
 *
 * The chunk's RMS comes from `vadManager.calcRms`. Chunks whose RMS does not reach
 * `MIN_EFFECTIVE_SPEECH_RMS` leave the statistics untouched.
 *
 * @param samples the audio chunk to measure.
 */
private fun updateRealtimeEnergy(samples: FloatArray) {
    val chunkRms = vadManager.calcRms(samples)

    // Only chunks at or above the effective-speech floor contribute.
    val reachesFloor = chunkRms >= MIN_EFFECTIVE_SPEECH_RMS
    if (!reachesFloor) return

    realtimeEnergySum += chunkRms
    realtimeEnergyCount++
    realtimePeakRms = maxOf(realtimePeakRms, chunkRms)
}
|
||||||
|
|
||||||
|
/* ================= 新增:录音中实时更新帧统计 ================= */
|
||||||
|
/**
 * Refreshes the real-time frame counters from `vadManager` and updates the
 * continuous-speech run tracking for the current frame.
 */
private fun updateRealtimeFrameStats() {
    // Pull the latest frame counters from the VAD manager.
    realtimeTotalFrames = vadManager.getTotalFrames()
    realtimeSpeechFrames = vadManager.getSpeechFrames()
    val vadContinuousFrames = vadManager.getContinuousSpeechFrames()

    // NOTE(review): the value fetched above only survives when both this frame and the
    // previous one are speech (it is then incremented by one); otherwise it is replaced
    // by 1 or 0. Confirm whether mixing the manager's counter with local run tracking
    // is intended.
    val speechNow = vadManager.isSpeechDetected()
    realtimeContinuousSpeechFrames = when {
        !speechNow -> 0
        realtimeLastFrameIsSpeech -> vadContinuousFrames + 1
        else -> 1
    }
    realtimeLastFrameIsSpeech = speechNow
}
|
||||||
|
|
||||||
|
/* ================= 新增:录音中实时判定多人对话 ================= */
|
||||||
|
/**
 * Evaluates, while recording, whether the audio captured so far matches the
 * multi-person-dialogue profile.
 *
 * Returns `false` without touching `isMultiPersonDialogueDetected` while the recording
 * is shorter than `MULTI_DIALOGUE_MIN_DURATION`. Otherwise the verdict is stored in
 * `isMultiPersonDialogueDetected` and returned.
 *
 * @param now current timestamp in milliseconds, compared against `recordingStartMs`.
 * @return `true` when the peak-to-average ratio, continuous-frame ratio and VAD speech
 *         ratio all fall inside the configured multi-dialogue thresholds.
 */
private fun checkMultiPersonDialogueRealtime(now: Long): Boolean {
    // Too early to judge: the recording has not reached the minimum duration yet.
    val elapsedMs = now - recordingStartMs
    if (elapsedMs < MULTI_DIALOGUE_MIN_DURATION) return false

    // Feature values derived from the real-time statistics.
    val meanEnergy = when {
        realtimeEnergyCount > 0 -> realtimeEnergySum / realtimeEnergyCount
        else -> 0f
    }
    val peakToMean = when {
        meanEnergy > 0f -> realtimePeakRms / meanEnergy
        else -> 0f
    }
    val runFraction = when {
        realtimeSpeechFrames > 0 -> realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames
        else -> 0f
    }
    val speechShare = vadManager.activeSpeechRatio()

    // The duration condition is already guaranteed by the guard above.
    val detected = peakToMean in MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO..MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO &&
        runFraction <= MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO &&
        speechShare >= MULTI_DIALOGUE_MIN_VAD_RATIO

    isMultiPersonDialogueDetected = detected
    return detected
}
|
||||||
|
|
||||||
|
/* ================= 环境基线校准(保留,录音中也会调用) ================= */
|
||||||
private fun calibrateEnvBaseline(samples: FloatArray) {
|
private fun calibrateEnvBaseline(samples: FloatArray) {
|
||||||
val rms = vadManager.calcRms(samples)
|
val rms = vadManager.calcRms(samples)
|
||||||
// 新增:只保留低于基线+阈值的有效值,过滤突发噪音
|
// 新增:只保留低于基线+阈值的有效值,过滤突发噪音
|
||||||
@ -160,7 +239,7 @@ class VoiceController(
|
|||||||
if (envNoiseBuffer.size >= BASELINE_WINDOW_SIZE) {
|
if (envNoiseBuffer.size >= BASELINE_WINDOW_SIZE) {
|
||||||
envNoiseBuffer.removeFirst()
|
envNoiseBuffer.removeFirst()
|
||||||
}
|
}
|
||||||
envNoiseBuffer.addLast(rms)
|
envNoiseBuffer.addLast(validRms) // 用过滤后的有效值更新
|
||||||
currentEnvBaseline = envNoiseBuffer.maxOrNull() ?: 0.001f
|
currentEnvBaseline = envNoiseBuffer.maxOrNull() ?: 0.001f
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -180,6 +259,8 @@ class VoiceController(
|
|||||||
audioBuffer.clear()
|
audioBuffer.clear()
|
||||||
vadManager.reset()
|
vadManager.reset()
|
||||||
vadStarted = false
|
vadStarted = false
|
||||||
|
// 重置实时统计变量
|
||||||
|
resetRealtimeStats()
|
||||||
}
|
}
|
||||||
|
|
||||||
inKwsObserve = true
|
inKwsObserve = true
|
||||||
@ -195,16 +276,21 @@ class VoiceController(
|
|||||||
recordingStartMs = System.currentTimeMillis()
|
recordingStartMs = System.currentTimeMillis()
|
||||||
audioBuffer.clear()
|
audioBuffer.clear()
|
||||||
audioBuffer.addAll(preBuffer)
|
audioBuffer.addAll(preBuffer)
|
||||||
|
// 初始化实时统计变量
|
||||||
|
resetRealtimeStats()
|
||||||
state = VoiceState.RECORDING
|
state = VoiceState.RECORDING
|
||||||
}
|
}
|
||||||
|
|
||||||
private fun onVadEnd(avgEnergy: Float, peakRms: Float) {
|
private fun onVadEnd(avgEnergy: Float, peakRms: Float) {
|
||||||
if (state != VoiceState.RECORDING) return
|
if (state != VoiceState.RECORDING) return
|
||||||
LogUtils.d(TAG, "🧠 VAD END | 环境基线: $currentEnvBaseline")
|
LogUtils.d(TAG, "🧠 VAD END | 环境基线: $currentEnvBaseline")
|
||||||
finishSentence(avgEnergy, peakRms)
|
// 优先使用实时统计的能量值,避免重复计算
|
||||||
|
val realAvgEnergy = if (realtimeEnergyCount > 0) realtimeEnergySum / realtimeEnergyCount else avgEnergy
|
||||||
|
val realPeakRms = if (realtimePeakRms > 0) realtimePeakRms else peakRms
|
||||||
|
finishSentence(realAvgEnergy, realPeakRms)
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ================= 结束录音(核心:多人对话过滤) ================= */
|
/* ================= 结束录音(核心:复用实时计算结果) ================= */
|
||||||
private fun finishSentence(avgEnergy: Float = 0f, peakRms: Float = 0f) {
|
private fun finishSentence(avgEnergy: Float = 0f, peakRms: Float = 0f) {
|
||||||
val now = System.currentTimeMillis()
|
val now = System.currentTimeMillis()
|
||||||
val duration = now - recordingStartMs
|
val duration = now - recordingStartMs
|
||||||
@ -219,24 +305,13 @@ class VoiceController(
|
|||||||
val vadRatio = vadManager.activeSpeechRatio()
|
val vadRatio = vadManager.activeSpeechRatio()
|
||||||
val peakAvgRatio = if (avgEnergy > 0f) peakRms / avgEnergy else 0f
|
val peakAvgRatio = if (avgEnergy > 0f) peakRms / avgEnergy else 0f
|
||||||
|
|
||||||
// 获取VAD帧统计
|
// 直接复用实时统计的帧数据,无需重新获取
|
||||||
val totalFrames = vadManager.getTotalFrames()
|
|
||||||
val speechFrames = vadManager.getSpeechFrames()
|
|
||||||
val continuousSpeechFrames = vadManager.getContinuousSpeechFrames()
|
|
||||||
val peakPositionRatio = vadManager.getPeakPositionRatio()
|
|
||||||
|
|
||||||
LogUtils.d(TAG, "📊 录音信息 | 时长: $duration ms | 能量: $avgEnergy | 峰均比: $peakAvgRatio | 基线: $currentEnvBaseline")
|
LogUtils.d(TAG, "📊 录音信息 | 时长: $duration ms | 能量: $avgEnergy | 峰均比: $peakAvgRatio | 基线: $currentEnvBaseline")
|
||||||
LogUtils.d(TAG, "📊 帧统计 | 总帧: $totalFrames | 语音帧: $speechFrames | 连续语音帧: $continuousSpeechFrames | 峰值位置占比: $peakPositionRatio")
|
LogUtils.d(TAG, "📊 实时帧统计 | 总帧: $realtimeTotalFrames | 语音帧: $realtimeSpeechFrames | 连续语音帧: $realtimeContinuousSpeechFrames")
|
||||||
|
|
||||||
// ========== 核心修改:第一步过滤多人对话垃圾语音 ==========
|
// 若录音中已识别出多人对话,直接过滤
|
||||||
val continuousRatio = if (speechFrames > 0) continuousSpeechFrames.toFloat() / speechFrames else 0f
|
if (isMultiPersonDialogueDetected) {
|
||||||
val isMultiPersonDialogue = duration >= MULTI_DIALOGUE_MIN_DURATION && // 时长≥2.5秒
|
LogUtils.w(TAG, "❌ 过滤多人对话垃圾语音(实时识别) | 时长: $duration ms")
|
||||||
peakAvgRatio in MULTI_DIALOGUE_MIN_PEAK_AVG_RATIO..MULTI_DIALOGUE_MAX_PEAK_AVG_RATIO && // 峰均比0.4~2.5
|
|
||||||
continuousRatio <= MULTI_DIALOGUE_MAX_CONTINUOUS_RATIO && // 连续帧占比≤0.3(多人轮流说,断层多)
|
|
||||||
vadRatio >= MULTI_DIALOGUE_MIN_VAD_RATIO // 有效帧占比≥0.55(整体语音占比高)
|
|
||||||
|
|
||||||
if (isMultiPersonDialogue) {
|
|
||||||
LogUtils.w(TAG, "❌ 过滤多人对话垃圾语音 | 时长: $duration ms | 连续占比: $continuousRatio | 有效占比: $vadRatio | 峰均比: $peakAvgRatio")
|
|
||||||
resetToWaitSpeech()
|
resetToWaitSpeech()
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@ -248,6 +323,7 @@ class VoiceController(
|
|||||||
audioBuffer.clear()
|
audioBuffer.clear()
|
||||||
state = VoiceState.UPLOADING
|
state = VoiceState.UPLOADING
|
||||||
onFinalAudio(audio)
|
onFinalAudio(audio)
|
||||||
|
resetRealtimeStats() // 重置实时统计
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -261,8 +337,10 @@ class VoiceController(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ========== 3. 非连续判定:极度宽松 ==========
|
// ========== 3. 非连续判定:极度宽松 ==========
|
||||||
|
val continuousRatio = if (realtimeSpeechFrames > 0) realtimeContinuousSpeechFrames.toFloat() / realtimeSpeechFrames else 0f
|
||||||
|
val peakPositionRatio = vadManager.getPeakPositionRatio()
|
||||||
val isDiscontinuous = continuousRatio < MIN_CONTINUOUS_FRAME_RATIO &&
|
val isDiscontinuous = continuousRatio < MIN_CONTINUOUS_FRAME_RATIO &&
|
||||||
speechFrames < MIN_EFFECTIVE_SPEECH_FRAMES &&
|
realtimeSpeechFrames < MIN_EFFECTIVE_SPEECH_FRAMES &&
|
||||||
peakPositionRatio > MAX_PEAK_POSITION_RATIO
|
peakPositionRatio > MAX_PEAK_POSITION_RATIO
|
||||||
if (isDiscontinuous) {
|
if (isDiscontinuous) {
|
||||||
LogUtils.w(TAG, "❌ 非连续杂音过滤 | 连续占比: $continuousRatio < $MIN_CONTINUOUS_FRAME_RATIO")
|
LogUtils.w(TAG, "❌ 非连续杂音过滤 | 连续占比: $continuousRatio < $MIN_CONTINUOUS_FRAME_RATIO")
|
||||||
@ -327,9 +405,22 @@ class VoiceController(
|
|||||||
audioBuffer.clear()
|
audioBuffer.clear()
|
||||||
state = VoiceState.UPLOADING
|
state = VoiceState.UPLOADING
|
||||||
onFinalAudio(audio)
|
onFinalAudio(audio)
|
||||||
|
resetRealtimeStats() // 重置实时统计
|
||||||
LogUtils.i(TAG, "✅ 低能量语音通过 | 时长: $duration ms | 能量: $avgEnergy | 场景: ${thresholdConfig.scene}")
|
LogUtils.i(TAG, "✅ 低能量语音通过 | 时长: $duration ms | 能量: $avgEnergy | 场景: ${thresholdConfig.scene}")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ================= 新增:重置实时统计变量 ================= */
|
||||||
|
/**
 * Restores every real-time statistic to its initial value: energy accumulators,
 * frame counters, the last-frame speech flag and the multi-person-dialogue flag.
 */
private fun resetRealtimeStats() {
    // Detection flags.
    isMultiPersonDialogueDetected = false
    realtimeLastFrameIsSpeech = false

    // Frame counters.
    realtimeContinuousSpeechFrames = 0
    realtimeSpeechFrames = 0
    realtimeTotalFrames = 0

    // Energy accumulators.
    realtimePeakRms = 0f
    realtimeEnergyCount = 0
    realtimeEnergySum = 0f
}
|
||||||
|
|
||||||
/* ================= 播放/上传/Reset 回调 ================= */
|
/* ================= 播放/上传/Reset 回调 ================= */
|
||||||
fun onPlayStartPrompt() {
|
fun onPlayStartPrompt() {
|
||||||
LogUtils.d(TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline")
|
LogUtils.d(TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline")
|
||||||
@ -362,8 +453,7 @@ class VoiceController(
|
|||||||
VoiceState.WAIT_SPEECH_COOLDOWN
|
VoiceState.WAIT_SPEECH_COOLDOWN
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
private var lastInvalidResetMs = 0L
|
|
||||||
private val INVALID_RESET_DEBOUNCE_MS = 1500L // 1.5秒内不重复重置
|
|
||||||
private fun resetToWaitSpeech() {
|
private fun resetToWaitSpeech() {
|
||||||
LogUtils.d(TAG, "🔄 重置到等待说话 | 基线: $currentEnvBaseline")
|
LogUtils.d(TAG, "🔄 重置到等待说话 | 基线: $currentEnvBaseline")
|
||||||
val now = System.currentTimeMillis()
|
val now = System.currentTimeMillis()
|
||||||
@ -375,6 +465,7 @@ class VoiceController(
|
|||||||
audioBuffer.clear()
|
audioBuffer.clear()
|
||||||
vadManager.reset()
|
vadManager.reset()
|
||||||
vadStarted = false
|
vadStarted = false
|
||||||
|
resetRealtimeStats() // 重置实时统计
|
||||||
state = VoiceState.WAIT_SPEECH
|
state = VoiceState.WAIT_SPEECH
|
||||||
if (waitSpeechFailStartMs == 0L) waitSpeechFailStartMs = System.currentTimeMillis()
|
if (waitSpeechFailStartMs == 0L) waitSpeechFailStartMs = System.currentTimeMillis()
|
||||||
}
|
}
|
||||||
@ -390,6 +481,7 @@ class VoiceController(
|
|||||||
waitSpeechFailStartMs = 0L
|
waitSpeechFailStartMs = 0L
|
||||||
envNoiseBuffer.clear()
|
envNoiseBuffer.clear()
|
||||||
currentEnvBaseline = 0.001f
|
currentEnvBaseline = 0.001f
|
||||||
|
resetRealtimeStats() // 重置实时统计
|
||||||
LogUtils.d(TAG, "🔄 环境基线已重置 | 新基线: $currentEnvBaseline")
|
LogUtils.d(TAG, "🔄 环境基线已重置 | 新基线: $currentEnvBaseline")
|
||||||
state = VoiceState.WAIT_WAKEUP
|
state = VoiceState.WAIT_WAKEUP
|
||||||
}
|
}
|
||||||
@ -399,6 +491,7 @@ class VoiceController(
|
|||||||
wakeupManager.release()
|
wakeupManager.release()
|
||||||
vadManager.reset()
|
vadManager.reset()
|
||||||
envNoiseBuffer.clear()
|
envNoiseBuffer.clear()
|
||||||
|
resetRealtimeStats() // 重置实时统计
|
||||||
}
|
}
|
||||||
|
|
||||||
private fun cachePreBuffer(samples: FloatArray) {
|
private fun cachePreBuffer(samples: FloatArray) {
|
||||||
@ -407,4 +500,7 @@ class VoiceController(
|
|||||||
if (preBuffer.size > PRE_BUFFER_SIZE) preBuffer.removeFirst()
|
if (preBuffer.size > PRE_BUFFER_SIZE) preBuffer.removeFirst()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ========== 补充:MIN_EFFECTIVE_SPEECH_RMS 常量(和VadManager对齐) ==========
|
||||||
|
private val MIN_EFFECTIVE_SPEECH_RMS = 0.001f
|
||||||
}
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user