修改合适阈值
This commit is contained in:
parent
e21283b73b
commit
b15621985d
@ -50,27 +50,32 @@ class VoiceController(
|
||||
private val idleTimeoutMs = idleTimeoutSeconds * 1000L
|
||||
private val maxRecordingMs = maxRecordingSeconds * 1000L
|
||||
|
||||
// ================= 动态阈值核心配置(修复+调整) =================
|
||||
// ================= 保留分场景动态系数 + 强制兜底配置 =================
|
||||
private val BASELINE_WINDOW_SIZE = 50
|
||||
private val envNoiseBuffer = ArrayDeque<Float>(BASELINE_WINDOW_SIZE)
|
||||
private var currentEnvBaseline = 0.001f
|
||||
|
||||
// ========== 修复:远场过滤配置(关键调整) ==========
|
||||
private val MAX_FAR_FIELD_ENERGY = 0.05f // 远场能量上限(正常语音>0.05,远场<0.05)
|
||||
private val MIN_VALID_PEAK_AVG_RATIO = 1.5f // 有效峰均比下限(正常语音>1.5,咳嗽<1.5)
|
||||
private val BASELINE_QUIET_THRESHOLD = 0.002f
|
||||
private val MIN_CONTINUOUS_FRAME_RATIO = 0.4f // 连续帧占比下限(从0.6调低,兼容正常短语音)
|
||||
private val MAX_PEAK_POSITION_RATIO = 0.5f // 峰值位置上限(从0.3调高,兼容正常语音)
|
||||
private val MIN_EFFECTIVE_SPEECH_FRAMES = 3 // 最低有效帧数(从5调低)
|
||||
// 强制兜底:正常语音最低门槛(你的0.0809≥0.06直接通过)
|
||||
private val MIN_NORMAL_VOICE_ENERGY = 0.06f
|
||||
private val MIN_NORMAL_VOICE_VAD_RATIO = 0.3f
|
||||
|
||||
// 分场景动态系数(调整:降低安静环境系数)
|
||||
private val SHORT_SPEECH_ENERGY_COEFF_QUIET = 6.0f // 从8.0调低
|
||||
private val SHORT_SPEECH_ENERGY_COEFF_NOISY = 4.0f // 从5.0调低
|
||||
private val LONG_SPEECH_ENERGY_COEFF = 15.0f // 从20.0调低
|
||||
private val SHORT_SPEECH_VAD_COEFF = 0.15f // 从0.2调低
|
||||
private val LONG_SPEECH_VAD_COEFF = 0.3f // 从0.4调低
|
||||
private val SHORT_SPEECH_MIN_SCORE = 2 // 调回2分
|
||||
private val LONG_SPEECH_MIN_SCORE = 4 // 从5调低
|
||||
// 分场景动态系数(安静环境系数极低)
|
||||
private val BASELINE_QUIET_THRESHOLD = 0.005f // 安静环境基线阈值
|
||||
private val SHORT_SPEECH_ENERGY_COEFF_QUIET = 2.0f // 安静环境短语音系数
|
||||
private val SHORT_SPEECH_ENERGY_COEFF_NOISY = 3.0f // 嘈杂环境短语音系数
|
||||
private val LONG_SPEECH_ENERGY_COEFF_QUIET = 4.0f // 安静环境长语音系数
|
||||
private val LONG_SPEECH_ENERGY_COEFF_NOISY = 6.0f // 嘈杂环境长语音系数
|
||||
private val SHORT_SPEECH_VAD_COEFF = 0.08f
|
||||
private val LONG_SPEECH_VAD_COEFF = 0.15f
|
||||
private val SHORT_SPEECH_MIN_SCORE = 1
|
||||
private val LONG_SPEECH_MIN_SCORE = 2
|
||||
|
||||
// 其他过滤参数
|
||||
private val MAX_FAR_FIELD_ENERGY = 0.03f
|
||||
private val MIN_VALID_PEAK_AVG_RATIO = 0.8f
|
||||
private val MIN_CONTINUOUS_FRAME_RATIO = 0.2f
|
||||
private val MAX_PEAK_POSITION_RATIO = 0.95f
|
||||
private val MIN_EFFECTIVE_SPEECH_FRAMES = 5
|
||||
private val SHORT_SPEECH_MIN = 500L
|
||||
private val SHORT_SPEECH_MAX = 2000L
|
||||
|
||||
@ -142,15 +147,15 @@ class VoiceController(
|
||||
/* ================= 环境基线校准 ================= */
|
||||
private fun calibrateEnvBaseline(samples: FloatArray) {
|
||||
val rms = vadManager.calcRms(samples)
|
||||
if (rms < 0.01f) {
|
||||
if (rms < 0.03f) {
|
||||
if (envNoiseBuffer.size >= BASELINE_WINDOW_SIZE) {
|
||||
envNoiseBuffer.removeFirst()
|
||||
}
|
||||
envNoiseBuffer.addLast(rms)
|
||||
currentEnvBaseline = envNoiseBuffer.maxOrNull() ?: 0.001f
|
||||
LogUtils.d(TAG, "🌡 环境基线校准 | RMS: $rms | 基线: $currentEnvBaseline | 缓存数: ${envNoiseBuffer.size}")
|
||||
// LogUtils.d(TAG, "🌡 环境基线校准 | RMS: $rms | 基线: $currentEnvBaseline | 缓存数: ${envNoiseBuffer.size}")
|
||||
} else {
|
||||
LogUtils.v(TAG, "🔊 高能量音频跳过校准 | RMS: $rms | 基线: $currentEnvBaseline")
|
||||
// LogUtils.v(TAG, "🔊 高能量音频跳过校准 | RMS: $rms | 基线: $currentEnvBaseline")
|
||||
}
|
||||
}
|
||||
|
||||
@ -193,7 +198,7 @@ class VoiceController(
|
||||
finishSentence(avgEnergy, peakRms)
|
||||
}
|
||||
|
||||
/* ================= 结束录音(修复远场过滤逻辑) ================= */
|
||||
/* ================= 结束录音(分场景系数+强制兜底) ================= */
|
||||
private fun finishSentence(avgEnergy: Float = 0f, peakRms: Float = 0f) {
|
||||
val now = System.currentTimeMillis()
|
||||
val duration = now - recordingStartMs
|
||||
@ -217,52 +222,56 @@ class VoiceController(
|
||||
LogUtils.d(TAG, "📊 录音信息 | 时长: $duration ms | 能量: $avgEnergy | 峰均比: $peakAvgRatio | 基线: $currentEnvBaseline")
|
||||
LogUtils.d(TAG, "📊 帧统计 | 总帧: $totalFrames | 语音帧: $speechFrames | 连续语音帧: $continuousSpeechFrames | 峰值位置占比: $peakPositionRatio")
|
||||
|
||||
// ========== 修复:远场语音过滤逻辑(核心) ==========
|
||||
// 正确逻辑:能量 < MAX_FAR_FIELD_ENERGY 才是远场;峰均比 < MIN_VALID_PEAK_AVG_RATIO 才是无效语音
|
||||
val isFarField = avgEnergy < MAX_FAR_FIELD_ENERGY // 修复:从 < MIN 改为 < MAX
|
||||
val isInvalidPeakRatio = peakAvgRatio < MIN_VALID_PEAK_AVG_RATIO // 降低阈值
|
||||
// ========== 1. 强制兜底:正常语音直接通过 ==========
|
||||
val isNormalVoice = avgEnergy >= MIN_NORMAL_VOICE_ENERGY && vadRatio >= MIN_NORMAL_VOICE_VAD_RATIO
|
||||
if (isNormalVoice) {
|
||||
LogUtils.i(TAG, "✅ 正常语音强制通过 | 能量: $avgEnergy ≥ $MIN_NORMAL_VOICE_ENERGY | 占比: $vadRatio ≥ $MIN_NORMAL_VOICE_VAD_RATIO")
|
||||
audioBuffer.clear()
|
||||
state = VoiceState.UPLOADING
|
||||
onFinalAudio(audio)
|
||||
return
|
||||
}
|
||||
|
||||
// 非连续特征(调整阈值后更宽松)
|
||||
// ========== 2. 远场过滤:只过滤极低能量 ==========
|
||||
val isFarField = avgEnergy < MAX_FAR_FIELD_ENERGY
|
||||
val isInvalidPeakRatio = peakAvgRatio < MIN_VALID_PEAK_AVG_RATIO
|
||||
if (isFarField && isInvalidPeakRatio) {
|
||||
LogUtils.w(TAG, "❌ 远场/无效语音过滤 | 能量: $avgEnergy < $MAX_FAR_FIELD_ENERGY")
|
||||
resetToWaitSpeech()
|
||||
return
|
||||
}
|
||||
|
||||
// ========== 3. 非连续判定:极度宽松 ==========
|
||||
val continuousRatio = if (speechFrames > 0) continuousSpeechFrames.toFloat() / speechFrames else 0f
|
||||
val isDiscontinuous = continuousRatio < MIN_CONTINUOUS_FRAME_RATIO ||
|
||||
speechFrames < MIN_EFFECTIVE_SPEECH_FRAMES ||
|
||||
peakPositionRatio < MAX_PEAK_POSITION_RATIO
|
||||
|
||||
// 远场+无效语音过滤(仅过滤真正的远场/杂音)
|
||||
if (isFarField && isInvalidPeakRatio) { // 修复:同时满足才过滤
|
||||
LogUtils.w(TAG, "❌ 远场/无效语音过滤 | 能量: $avgEnergy < 远场上限: $MAX_FAR_FIELD_ENERGY | 峰均比: $peakAvgRatio < 有效下限: $MIN_VALID_PEAK_AVG_RATIO")
|
||||
val isDiscontinuous = continuousRatio < MIN_CONTINUOUS_FRAME_RATIO &&
|
||||
speechFrames < MIN_EFFECTIVE_SPEECH_FRAMES &&
|
||||
peakPositionRatio > MAX_PEAK_POSITION_RATIO
|
||||
if (isDiscontinuous) {
|
||||
LogUtils.w(TAG, "❌ 非连续杂音过滤 | 连续占比: $continuousRatio < $MIN_CONTINUOUS_FRAME_RATIO")
|
||||
resetToWaitSpeech()
|
||||
return
|
||||
}
|
||||
|
||||
// 非连续语音过滤(仅过滤真正的零散杂音)
|
||||
if (isDiscontinuous && isFarField) { // 修复:结合远场特征,避免过滤正常语音
|
||||
LogUtils.w(TAG, "❌ 非连续杂音过滤 | 连续占比: $continuousRatio < $MIN_CONTINUOUS_FRAME_RATIO | 语音帧: $speechFrames < $MIN_EFFECTIVE_SPEECH_FRAMES | 峰值位置: $peakPositionRatio < $MAX_PEAK_POSITION_RATIO")
|
||||
resetToWaitSpeech()
|
||||
return
|
||||
}
|
||||
|
||||
// ========== 动态阈值计算(调整后更宽松) ==========
|
||||
// ========== 4. 分场景动态阈值计算(保留核心逻辑) ==========
|
||||
val isQuietEnv = currentEnvBaseline < BASELINE_QUIET_THRESHOLD
|
||||
val thresholdConfig = when {
|
||||
duration in SHORT_SPEECH_MIN..SHORT_SPEECH_MAX -> {
|
||||
val coeff = if (currentEnvBaseline < BASELINE_QUIET_THRESHOLD) {
|
||||
SHORT_SPEECH_ENERGY_COEFF_QUIET
|
||||
} else {
|
||||
SHORT_SPEECH_ENERGY_COEFF_NOISY
|
||||
}
|
||||
val threshold = currentEnvBaseline * coeff
|
||||
LogUtils.d(TAG, "📏 短语音阈值 | 场景: ${if (currentEnvBaseline < BASELINE_QUIET_THRESHOLD) "安静" else "嘈杂"} | 系数: $coeff | 阈值: $threshold")
|
||||
val coeff = if (isQuietEnv) SHORT_SPEECH_ENERGY_COEFF_QUIET else SHORT_SPEECH_ENERGY_COEFF_NOISY
|
||||
val energyThreshold = currentEnvBaseline * coeff
|
||||
LogUtils.d(TAG, "📏 短语音阈值 | 场景: ${if (isQuietEnv) "安静" else "嘈杂"} | 系数: $coeff | 阈值: $energyThreshold")
|
||||
ThresholdConfig(
|
||||
energyThreshold = threshold,
|
||||
energyThreshold = energyThreshold,
|
||||
vadRatioThreshold = SHORT_SPEECH_VAD_COEFF,
|
||||
minScore = SHORT_SPEECH_MIN_SCORE,
|
||||
scene = "短语音"
|
||||
)
|
||||
}
|
||||
else -> {
|
||||
val threshold = currentEnvBaseline * LONG_SPEECH_ENERGY_COEFF
|
||||
val coeff = if (isQuietEnv) LONG_SPEECH_ENERGY_COEFF_QUIET else LONG_SPEECH_ENERGY_COEFF_NOISY
|
||||
val energyThreshold = currentEnvBaseline * coeff
|
||||
LogUtils.d(TAG, "📏 长语音阈值 | 场景: ${if (isQuietEnv) "安静" else "嘈杂"} | 系数: $coeff | 阈值: $energyThreshold")
|
||||
ThresholdConfig(
|
||||
energyThreshold = threshold,
|
||||
energyThreshold = energyThreshold,
|
||||
vadRatioThreshold = LONG_SPEECH_VAD_COEFF,
|
||||
minScore = LONG_SPEECH_MIN_SCORE,
|
||||
scene = "长语音"
|
||||
@ -270,61 +279,40 @@ class VoiceController(
|
||||
}
|
||||
}
|
||||
|
||||
LogUtils.d(TAG, "📊 动态阈值 | ${thresholdConfig.scene} | 能量阈值: ${thresholdConfig.energyThreshold} | 占比阈值: ${thresholdConfig.vadRatioThreshold} | 最低分: ${thresholdConfig.minScore}")
|
||||
|
||||
// 基础阈值过滤(调整后更宽松)
|
||||
if (avgEnergy < thresholdConfig.energyThreshold || vadRatio < thresholdConfig.vadRatioThreshold) {
|
||||
LogUtils.w(TAG, "❌ 阈值过滤 | 能量: $avgEnergy < ${thresholdConfig.energyThreshold} | 占比: $vadRatio < ${thresholdConfig.vadRatioThreshold}")
|
||||
// ========== 5. 分场景阈值过滤 ==========
|
||||
val energyPass = avgEnergy >= thresholdConfig.energyThreshold
|
||||
val vadRatioPass = vadRatio >= thresholdConfig.vadRatioThreshold
|
||||
if (!energyPass || !vadRatioPass) {
|
||||
LogUtils.w(TAG, "❌ 低能量语音阈值过滤 | 能量: $avgEnergy < ${thresholdConfig.energyThreshold} | 占比: $vadRatio < ${thresholdConfig.vadRatioThreshold} | 场景: ${thresholdConfig.scene}")
|
||||
resetToWaitSpeech()
|
||||
return
|
||||
}
|
||||
|
||||
// 评分逻辑(恢复短语音保底分)
|
||||
// ========== 6. 评分判定:极度宽松 ==========
|
||||
var score = 0
|
||||
// 1. 时长评分(恢复短语音保底1分)
|
||||
score += when {
|
||||
duration >= 4000 -> 3
|
||||
duration >= 2500 -> 2
|
||||
duration >= 1500 -> 1
|
||||
duration >= SHORT_SPEECH_MIN -> 1 // 恢复保底分
|
||||
else -> 0
|
||||
}
|
||||
// 2. 能量评分
|
||||
score += when {
|
||||
avgEnergy >= thresholdConfig.energyThreshold * 10 -> 3
|
||||
avgEnergy >= thresholdConfig.energyThreshold * 5 -> 2
|
||||
avgEnergy >= thresholdConfig.energyThreshold -> 1
|
||||
else -> 0
|
||||
}
|
||||
// 3. 占比+连续性评分(调整阈值)
|
||||
score += when {
|
||||
continuousRatio >= 0.7 -> 2 // 从0.8调低
|
||||
continuousRatio >= MIN_CONTINUOUS_FRAME_RATIO -> 1
|
||||
else -> 0
|
||||
}
|
||||
|
||||
LogUtils.d(TAG, "🏆 评分结果 | 总分: $score | 最低分: ${thresholdConfig.minScore} | 连续占比: $continuousRatio")
|
||||
|
||||
// 分场景判定(调整后更宽松)
|
||||
val pass = if (duration in SHORT_SPEECH_MIN..SHORT_SPEECH_MAX) {
|
||||
score >= thresholdConfig.minScore && continuousRatio >= 0.5 // 从0.7调低
|
||||
} else {
|
||||
score >= thresholdConfig.minScore || (score >= 2 && avgEnergy >= currentEnvBaseline * 4) // 从3→2,6→4
|
||||
else -> 1
|
||||
}
|
||||
score += if (avgEnergy >= thresholdConfig.energyThreshold) 1 else 0
|
||||
score += if (continuousRatio >= MIN_CONTINUOUS_FRAME_RATIO) 1 else 0
|
||||
|
||||
val pass = score >= thresholdConfig.minScore
|
||||
if (!pass) {
|
||||
LogUtils.w(TAG, "❌ 评分/连续性不足过滤 | 总分: $score < ${thresholdConfig.minScore} | 连续占比: $continuousRatio")
|
||||
LogUtils.w(TAG, "❌ 评分不足过滤 | 总分: $score < ${thresholdConfig.minScore} | 场景: ${thresholdConfig.scene}")
|
||||
resetToWaitSpeech()
|
||||
return
|
||||
}
|
||||
|
||||
// ========== 最终通过 ==========
|
||||
audioBuffer.clear()
|
||||
state = VoiceState.UPLOADING
|
||||
onFinalAudio(audio)
|
||||
LogUtils.i(TAG, "✅ 录音通过 | 时长: $duration ms | 能量: $avgEnergy | 连续占比: $continuousRatio | 准备上传")
|
||||
LogUtils.i(TAG, "✅ 低能量语音通过 | 时长: $duration ms | 能量: $avgEnergy | 场景: ${thresholdConfig.scene}")
|
||||
}
|
||||
|
||||
/* ================= 播放/上传/Reset 回调(无修改) ================= */
|
||||
/* ================= 播放/上传/Reset 回调 ================= */
|
||||
fun onPlayStartPrompt() {
|
||||
LogUtils.d(TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline")
|
||||
state = VoiceState.PLAYING_PROMPT
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user