修改合适阈值

This commit is contained in:
林若思 2026-01-08 14:53:22 +08:00
parent e21283b73b
commit b15621985d

View File

@ -50,27 +50,32 @@ class VoiceController(
private val idleTimeoutMs = idleTimeoutSeconds * 1000L
private val maxRecordingMs = maxRecordingSeconds * 1000L
// ================= 动态阈值核心配置(修复+调整) =================
// ================= 保留分场景动态系数 + 强制兜底配置 =================
private val BASELINE_WINDOW_SIZE = 50
private val envNoiseBuffer = ArrayDeque<Float>(BASELINE_WINDOW_SIZE)
private var currentEnvBaseline = 0.001f
// ========== 修复:远场过滤配置(关键调整) ==========
private val MAX_FAR_FIELD_ENERGY = 0.05f // 远场能量上限(正常语音>0.05,远场<0.05)
private val MIN_VALID_PEAK_AVG_RATIO = 1.5f // 有效峰均比下限(正常语音>1.5,咳嗽<1.5)
private val BASELINE_QUIET_THRESHOLD = 0.002f
private val MIN_CONTINUOUS_FRAME_RATIO = 0.4f // 连续帧占比下限(从0.6调低,兼容正常短语音)
private val MAX_PEAK_POSITION_RATIO = 0.5f // 峰值位置上限(从0.3调高,兼容正常语音)
private val MIN_EFFECTIVE_SPEECH_FRAMES = 3 // 最低有效帧数(从5调低)
// 强制兜底:正常语音最低门槛(你的0.0809≥0.06直接通过)
private val MIN_NORMAL_VOICE_ENERGY = 0.06f
private val MIN_NORMAL_VOICE_VAD_RATIO = 0.3f
// 分场景动态系数(调整:降低安静环境系数)
private val SHORT_SPEECH_ENERGY_COEFF_QUIET = 6.0f // 从8.0调低
private val SHORT_SPEECH_ENERGY_COEFF_NOISY = 4.0f // 从5.0调低
private val LONG_SPEECH_ENERGY_COEFF = 15.0f // 从20.0调低
private val SHORT_SPEECH_VAD_COEFF = 0.15f // 从0.2调低
private val LONG_SPEECH_VAD_COEFF = 0.3f // 从0.4调低
private val SHORT_SPEECH_MIN_SCORE = 2 // 调回2分
private val LONG_SPEECH_MIN_SCORE = 4 // 从5调低
// 分场景动态系数(安静环境系数极低)
private val BASELINE_QUIET_THRESHOLD = 0.005f // 安静环境基线阈值
private val SHORT_SPEECH_ENERGY_COEFF_QUIET = 2.0f // 安静环境短语音系数
private val SHORT_SPEECH_ENERGY_COEFF_NOISY = 3.0f // 嘈杂环境短语音系数
private val LONG_SPEECH_ENERGY_COEFF_QUIET = 4.0f // 安静环境长语音系数
private val LONG_SPEECH_ENERGY_COEFF_NOISY = 6.0f // 嘈杂环境长语音系数
private val SHORT_SPEECH_VAD_COEFF = 0.08f
private val LONG_SPEECH_VAD_COEFF = 0.15f
private val SHORT_SPEECH_MIN_SCORE = 1
private val LONG_SPEECH_MIN_SCORE = 2
// 其他过滤参数
private val MAX_FAR_FIELD_ENERGY = 0.03f
private val MIN_VALID_PEAK_AVG_RATIO = 0.8f
private val MIN_CONTINUOUS_FRAME_RATIO = 0.2f
private val MAX_PEAK_POSITION_RATIO = 0.95f
private val MIN_EFFECTIVE_SPEECH_FRAMES = 5
private val SHORT_SPEECH_MIN = 500L
private val SHORT_SPEECH_MAX = 2000L
@ -142,15 +147,15 @@ class VoiceController(
/* ================= 环境基线校准 ================= */
private fun calibrateEnvBaseline(samples: FloatArray) {
val rms = vadManager.calcRms(samples)
if (rms < 0.01f) {
if (rms < 0.03f) {
if (envNoiseBuffer.size >= BASELINE_WINDOW_SIZE) {
envNoiseBuffer.removeFirst()
}
envNoiseBuffer.addLast(rms)
currentEnvBaseline = envNoiseBuffer.maxOrNull() ?: 0.001f
LogUtils.d(TAG, "🌡 环境基线校准 | RMS: $rms | 基线: $currentEnvBaseline | 缓存数: ${envNoiseBuffer.size}")
// LogUtils.d(TAG, "🌡 环境基线校准 | RMS: $rms | 基线: $currentEnvBaseline | 缓存数: ${envNoiseBuffer.size}")
} else {
LogUtils.v(TAG, "🔊 高能量音频跳过校准 | RMS: $rms | 基线: $currentEnvBaseline")
// LogUtils.v(TAG, "🔊 高能量音频跳过校准 | RMS: $rms | 基线: $currentEnvBaseline")
}
}
@ -193,7 +198,7 @@ class VoiceController(
finishSentence(avgEnergy, peakRms)
}
/* ================= 结束录音(修复远场过滤逻辑) ================= */
/* ================= 结束录音(分场景系数+强制兜底) ================= */
private fun finishSentence(avgEnergy: Float = 0f, peakRms: Float = 0f) {
val now = System.currentTimeMillis()
val duration = now - recordingStartMs
@ -217,52 +222,56 @@ class VoiceController(
LogUtils.d(TAG, "📊 录音信息 | 时长: $duration ms | 能量: $avgEnergy | 峰均比: $peakAvgRatio | 基线: $currentEnvBaseline")
LogUtils.d(TAG, "📊 帧统计 | 总帧: $totalFrames | 语音帧: $speechFrames | 连续语音帧: $continuousSpeechFrames | 峰值位置占比: $peakPositionRatio")
// ========== 修复:远场语音过滤逻辑(核心) ==========
// 正确逻辑:能量 < MAX_FAR_FIELD_ENERGY 才是远场;峰均比 < MIN_VALID_PEAK_AVG_RATIO 才是无效语音
val isFarField = avgEnergy < MAX_FAR_FIELD_ENERGY // 修复:从 < MIN 改为 < MAX
val isInvalidPeakRatio = peakAvgRatio < MIN_VALID_PEAK_AVG_RATIO // 降低阈值
// ========== 1. 强制兜底:正常语音直接通过 ==========
val isNormalVoice = avgEnergy >= MIN_NORMAL_VOICE_ENERGY && vadRatio >= MIN_NORMAL_VOICE_VAD_RATIO
if (isNormalVoice) {
LogUtils.i(TAG, "✅ 正常语音强制通过 | 能量: $avgEnergy$MIN_NORMAL_VOICE_ENERGY | 占比: $vadRatio$MIN_NORMAL_VOICE_VAD_RATIO")
audioBuffer.clear()
state = VoiceState.UPLOADING
onFinalAudio(audio)
return
}
// 非连续特征(调整阈值后更宽松)
// ========== 2. 远场过滤:只过滤极低能量 ==========
val isFarField = avgEnergy < MAX_FAR_FIELD_ENERGY
val isInvalidPeakRatio = peakAvgRatio < MIN_VALID_PEAK_AVG_RATIO
if (isFarField && isInvalidPeakRatio) {
LogUtils.w(TAG, "❌ 远场/无效语音过滤 | 能量: $avgEnergy < $MAX_FAR_FIELD_ENERGY")
resetToWaitSpeech()
return
}
// ========== 3. 非连续判定:极度宽松 ==========
val continuousRatio = if (speechFrames > 0) continuousSpeechFrames.toFloat() / speechFrames else 0f
val isDiscontinuous = continuousRatio < MIN_CONTINUOUS_FRAME_RATIO ||
speechFrames < MIN_EFFECTIVE_SPEECH_FRAMES ||
peakPositionRatio < MAX_PEAK_POSITION_RATIO
// 远场+无效语音过滤(仅过滤真正的远场/杂音)
if (isFarField && isInvalidPeakRatio) { // 修复:同时满足才过滤
LogUtils.w(TAG, "❌ 远场/无效语音过滤 | 能量: $avgEnergy < 远场上限: $MAX_FAR_FIELD_ENERGY | 峰均比: $peakAvgRatio < 有效下限: $MIN_VALID_PEAK_AVG_RATIO")
val isDiscontinuous = continuousRatio < MIN_CONTINUOUS_FRAME_RATIO &&
speechFrames < MIN_EFFECTIVE_SPEECH_FRAMES &&
peakPositionRatio > MAX_PEAK_POSITION_RATIO
if (isDiscontinuous) {
LogUtils.w(TAG, "❌ 非连续杂音过滤 | 连续占比: $continuousRatio < $MIN_CONTINUOUS_FRAME_RATIO")
resetToWaitSpeech()
return
}
// 非连续语音过滤(仅过滤真正的零散杂音)
if (isDiscontinuous && isFarField) { // 修复:结合远场特征,避免过滤正常语音
LogUtils.w(TAG, "❌ 非连续杂音过滤 | 连续占比: $continuousRatio < $MIN_CONTINUOUS_FRAME_RATIO | 语音帧: $speechFrames < $MIN_EFFECTIVE_SPEECH_FRAMES | 峰值位置: $peakPositionRatio < $MAX_PEAK_POSITION_RATIO")
resetToWaitSpeech()
return
}
// ========== 动态阈值计算(调整后更宽松) ==========
// ========== 4. 分场景动态阈值计算(保留核心逻辑) ==========
val isQuietEnv = currentEnvBaseline < BASELINE_QUIET_THRESHOLD
val thresholdConfig = when {
duration in SHORT_SPEECH_MIN..SHORT_SPEECH_MAX -> {
val coeff = if (currentEnvBaseline < BASELINE_QUIET_THRESHOLD) {
SHORT_SPEECH_ENERGY_COEFF_QUIET
} else {
SHORT_SPEECH_ENERGY_COEFF_NOISY
}
val threshold = currentEnvBaseline * coeff
LogUtils.d(TAG, "📏 短语音阈值 | 场景: ${if (currentEnvBaseline < BASELINE_QUIET_THRESHOLD) "安静" else "嘈杂"} | 系数: $coeff | 阈值: $threshold")
val coeff = if (isQuietEnv) SHORT_SPEECH_ENERGY_COEFF_QUIET else SHORT_SPEECH_ENERGY_COEFF_NOISY
val energyThreshold = currentEnvBaseline * coeff
LogUtils.d(TAG, "📏 短语音阈值 | 场景: ${if (isQuietEnv) "安静" else "嘈杂"} | 系数: $coeff | 阈值: $energyThreshold")
ThresholdConfig(
energyThreshold = threshold,
energyThreshold = energyThreshold,
vadRatioThreshold = SHORT_SPEECH_VAD_COEFF,
minScore = SHORT_SPEECH_MIN_SCORE,
scene = "短语音"
)
}
else -> {
val threshold = currentEnvBaseline * LONG_SPEECH_ENERGY_COEFF
val coeff = if (isQuietEnv) LONG_SPEECH_ENERGY_COEFF_QUIET else LONG_SPEECH_ENERGY_COEFF_NOISY
val energyThreshold = currentEnvBaseline * coeff
LogUtils.d(TAG, "📏 长语音阈值 | 场景: ${if (isQuietEnv) "安静" else "嘈杂"} | 系数: $coeff | 阈值: $energyThreshold")
ThresholdConfig(
energyThreshold = threshold,
energyThreshold = energyThreshold,
vadRatioThreshold = LONG_SPEECH_VAD_COEFF,
minScore = LONG_SPEECH_MIN_SCORE,
scene = "长语音"
@ -270,61 +279,40 @@ class VoiceController(
}
}
LogUtils.d(TAG, "📊 动态阈值 | ${thresholdConfig.scene} | 能量阈值: ${thresholdConfig.energyThreshold} | 占比阈值: ${thresholdConfig.vadRatioThreshold} | 最低分: ${thresholdConfig.minScore}")
// 基础阈值过滤(调整后更宽松)
if (avgEnergy < thresholdConfig.energyThreshold || vadRatio < thresholdConfig.vadRatioThreshold) {
LogUtils.w(TAG, "阈值过滤 | 能量: $avgEnergy < ${thresholdConfig.energyThreshold} | 占比: $vadRatio < ${thresholdConfig.vadRatioThreshold}")
// ========== 5. 分场景阈值过滤 ==========
val energyPass = avgEnergy >= thresholdConfig.energyThreshold
val vadRatioPass = vadRatio >= thresholdConfig.vadRatioThreshold
if (!energyPass || !vadRatioPass) {
LogUtils.w(TAG, "低能量语音阈值过滤 | 能量: $avgEnergy < ${thresholdConfig.energyThreshold} | 占比: $vadRatio < ${thresholdConfig.vadRatioThreshold} | 场景: ${thresholdConfig.scene}")
resetToWaitSpeech()
return
}
// 评分逻辑(恢复短语音保底分)
// ========== 6. 评分判定:极度宽松 ==========
var score = 0
// 1. 时长评分(恢复短语音保底1分)
score += when {
duration >= 4000 -> 3
duration >= 2500 -> 2
duration >= 1500 -> 1
duration >= SHORT_SPEECH_MIN -> 1 // 恢复保底分
else -> 0
}
// 2. 能量评分
score += when {
avgEnergy >= thresholdConfig.energyThreshold * 10 -> 3
avgEnergy >= thresholdConfig.energyThreshold * 5 -> 2
avgEnergy >= thresholdConfig.energyThreshold -> 1
else -> 0
}
// 3. 占比+连续性评分(调整阈值)
score += when {
continuousRatio >= 0.7 -> 2 // 从0.8调低
continuousRatio >= MIN_CONTINUOUS_FRAME_RATIO -> 1
else -> 0
}
LogUtils.d(TAG, "🏆 评分结果 | 总分: $score | 最低分: ${thresholdConfig.minScore} | 连续占比: $continuousRatio")
// 分场景判定(调整后更宽松)
val pass = if (duration in SHORT_SPEECH_MIN..SHORT_SPEECH_MAX) {
score >= thresholdConfig.minScore && continuousRatio >= 0.5 // 从0.7调低
} else {
score >= thresholdConfig.minScore || (score >= 2 && avgEnergy >= currentEnvBaseline * 4) // 从3→2,6→4
else -> 1
}
score += if (avgEnergy >= thresholdConfig.energyThreshold) 1 else 0
score += if (continuousRatio >= MIN_CONTINUOUS_FRAME_RATIO) 1 else 0
val pass = score >= thresholdConfig.minScore
if (!pass) {
LogUtils.w(TAG, "❌ 评分/连续性不足过滤 | 总分: $score < ${thresholdConfig.minScore} | 连续占比: $continuousRatio")
LogUtils.w(TAG, "❌ 评分不足过滤 | 总分: $score < ${thresholdConfig.minScore} | 场景: ${thresholdConfig.scene}")
resetToWaitSpeech()
return
}
// ========== 最终通过 ==========
audioBuffer.clear()
state = VoiceState.UPLOADING
onFinalAudio(audio)
LogUtils.i(TAG, "录音通过 | 时长: $duration ms | 能量: $avgEnergy | 连续占比: $continuousRatio | 准备上传")
LogUtils.i(TAG, "✅ 低能量语音通过 | 时长: $duration ms | 能量: $avgEnergy | 场景: ${thresholdConfig.scene}")
}
/* ================= 播放/上传/Reset 回调(无修改) ================= */
/* ================= 播放/上传/Reset 回调 ================= */
fun onPlayStartPrompt() {
LogUtils.d(TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline")
state = VoiceState.PLAYING_PROMPT