添加动态阈值
This commit is contained in:
parent
8dd5e68ded
commit
e21283b73b
@ -1,6 +1,7 @@
|
||||
package com.zs.smarthuman.sherpa
|
||||
|
||||
import android.content.res.AssetManager
|
||||
import com.blankj.utilcode.util.LogUtils
|
||||
import com.k2fsa.sherpa.onnx.Vad
|
||||
import com.k2fsa.sherpa.onnx.getVadModelConfig
|
||||
import kotlin.math.sqrt
|
||||
@ -10,44 +11,81 @@ class VadManager(
|
||||
private val onSpeechStart: () -> Unit,
|
||||
private val onSpeechEnd: (avgEnergy: Float, peakRms: Float) -> Unit
|
||||
) {
|
||||
|
||||
private val TAG = "SmartHuman-VadManager"
|
||||
private val vad: Vad
|
||||
private var isSpeaking = false
|
||||
private var lastSpeechTime = 0L
|
||||
|
||||
private val END_SILENCE_MS = 800L
|
||||
|
||||
// 基础统计变量
|
||||
private var activeFrameCount = 0
|
||||
private var activeSpeechFrameCount = 0
|
||||
|
||||
private var speechEnergySum = 0f
|
||||
private var speechFrameCount = 0
|
||||
private var peakRms = 0f
|
||||
|
||||
// ========== 新增:连续性检测核心变量 ==========
|
||||
private var totalFrames = 0 // 总处理帧数
|
||||
private var speechFrames = 0 // 语音帧总数
|
||||
private var continuousSpeechFrames = 0 // 连续语音帧数
|
||||
private var lastFrameIsSpeech = false // 上一帧是否为语音
|
||||
private var peakPosition = 0 // 峰值所在帧位置
|
||||
private var frameIndex = 0 // 当前帧索引
|
||||
|
||||
// Builds the VAD engine once, at construction time.
// Fails fast with IllegalStateException when getVadModelConfig(0) yields null.
init {
    // Only one declaration of `config` may exist in this scope; the message carries the log tag.
    val config = getVadModelConfig(0) ?: throw IllegalStateException("[$TAG] VAD config not found")
    vad = Vad(assetManager, config)
    LogUtils.i(TAG, "✅ VAD 初始化成功")
}
|
||||
|
||||
/**
|
||||
* 接收音频数据并进行VAD检测
|
||||
* @param samples 音频采样数据(float数组)
|
||||
*/
|
||||
fun accept(samples: FloatArray) {
|
||||
val now = System.currentTimeMillis()
|
||||
|
||||
vad.acceptWaveform(samples)
|
||||
val hasSpeech = vad.isSpeechDetected()
|
||||
|
||||
// RMS & peak 统计
|
||||
val rms = calcRms(samples)
|
||||
|
||||
// ========== 1. 语音能量统计 ==========
|
||||
if (hasSpeech) {
|
||||
speechEnergySum += rms
|
||||
speechFrameCount++
|
||||
peakRms = maxOf(peakRms, rms)
|
||||
LogUtils.v(TAG, "🔊 检测到语音帧 | RMS: $rms | 累计峰值: $peakRms")
|
||||
} else {
|
||||
LogUtils.v(TAG, "🔇 检测到静音帧 | RMS: $rms")
|
||||
}
|
||||
|
||||
// VAD逻辑
|
||||
// ========== 2. 新增:帧统计与连续性计算 ==========
|
||||
totalFrames++
|
||||
frameIndex++
|
||||
|
||||
if (hasSpeech) {
|
||||
speechFrames++
|
||||
// 连续语音帧计数
|
||||
continuousSpeechFrames = if (lastFrameIsSpeech) {
|
||||
continuousSpeechFrames + 1
|
||||
} else {
|
||||
1 // 重置连续计数
|
||||
}
|
||||
lastFrameIsSpeech = true
|
||||
|
||||
// 更新峰值位置(仅当当前RMS为新峰值时)
|
||||
if (rms == peakRms) {
|
||||
peakPosition = frameIndex
|
||||
}
|
||||
} else {
|
||||
lastFrameIsSpeech = false
|
||||
}
|
||||
|
||||
// ========== 3. VAD核心状态流转 ==========
|
||||
if (hasSpeech) {
|
||||
lastSpeechTime = now
|
||||
if (!isSpeaking) {
|
||||
isSpeaking = true
|
||||
LogUtils.d(TAG, "🎤 语音开始")
|
||||
onSpeechStart()
|
||||
}
|
||||
activeFrameCount++
|
||||
@ -55,36 +93,85 @@ class VadManager(
|
||||
} else {
|
||||
if (isSpeaking) {
|
||||
activeFrameCount++
|
||||
if (now - lastSpeechTime >= END_SILENCE_MS) {
|
||||
val silenceDuration = now - lastSpeechTime
|
||||
if (silenceDuration >= END_SILENCE_MS) {
|
||||
isSpeaking = false
|
||||
val avgEnergy = if (speechFrameCount > 0) speechEnergySum / speechFrameCount else 0f
|
||||
val peak = peakRms
|
||||
onSpeechEnd(avgEnergy, peak)
|
||||
LogUtils.d(TAG, "🛑 语音结束 | 静音时长: ${silenceDuration}ms | 平均能量: $avgEnergy | 峰值: $peakRms")
|
||||
onSpeechEnd(avgEnergy, peakRms)
|
||||
resetStats() // 重置基础统计
|
||||
} else {
|
||||
LogUtils.v(TAG, "⏳ 静音中,时长: ${silenceDuration}ms (阈值: ${END_SILENCE_MS}ms)")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Returns the share of active frames that were counted as speech.
 *
 * Computed as `activeSpeechFrameCount / activeFrameCount`. The value is also written
 * to the debug log together with both counters.
 *
 * @return the speech ratio, or `0f` when no active frame has been counted
 */
fun activeSpeechRatio(): Float {
    // Guard the division: an empty recording has no meaningful ratio.
    val ratio = if (activeFrameCount == 0) 0f else activeSpeechFrameCount.toFloat() / activeFrameCount
    LogUtils.d(TAG, "📊 语音占比: $ratio | 语音帧: $activeSpeechFrameCount | 总帧: $activeFrameCount")
    return ratio
}
|
||||
|
||||
// ========== 新增:帧统计获取方法(给VoiceController调用) ==========
|
||||
/** Returns the number of frames processed since the last full reset. */
fun getTotalFrames(): Int {
    return totalFrames
}
|
||||
|
||||
/** Returns the number of frames classified as speech since the last full reset. */
fun getSpeechFrames(): Int {
    return speechFrames
}
|
||||
|
||||
/** Returns the length of the most recent run of consecutive speech frames. */
fun getContinuousSpeechFrames(): Int {
    return continuousSpeechFrames
}
|
||||
|
||||
/**
 * Returns the peak frame index divided by the total frame count.
 *
 * @return the ratio, or `0f` when no frame has been processed yet
 */
fun getPeakPositionRatio(): Float =
    if (totalFrames == 0) 0f else peakPosition.toFloat() / totalFrames
|
||||
|
||||
/**
 * Returns the manager to its initial state: the VAD object itself is kept,
 * while the speaking state and every statistic are cleared.
 */
fun reset() {
    // Speaking state and per-utterance statistics.
    isSpeaking = false
    lastSpeechTime = 0L
    resetStats()
    vad.reset()

    // Continuity-tracking counters.
    frameIndex = 0
    peakPosition = 0
    lastFrameIsSpeech = false
    continuousSpeechFrames = 0
    speechFrames = 0
    totalFrames = 0

    LogUtils.d(TAG, "🔄 VAD 状态已完全重置")
}
|
||||
|
||||
/**
 * Clears the per-utterance statistics and resets the VAD engine (internal use).
 */
private fun resetStats() {
    // Energy accumulators.
    peakRms = 0f
    speechFrameCount = 0
    speechEnergySum = 0f

    // Active-frame counters.
    activeSpeechFrameCount = 0
    activeFrameCount = 0

    vad.reset()
}
|
||||
|
||||
/**
 * Computes the RMS (root mean square) energy of an audio chunk.
 *
 * @param samples audio samples of the chunk
 * @return the RMS value, or `0f` for an empty chunk (avoids the NaN produced by 0 / 0)
 */
fun calcRms(samples: FloatArray): Float {
    if (samples.isEmpty()) return 0f

    // Accumulate the squared samples exactly once.
    var sum = 0f
    for (v in samples) sum += v * v
    return sqrt(sum / samples.size)
}
|
||||
}
|
||||
}
|
||||
@ -1,13 +1,13 @@
|
||||
package com.zs.smarthuman.sherpa
|
||||
|
||||
import android.content.res.AssetManager
|
||||
import android.util.Log
|
||||
import com.blankj.utilcode.util.LogUtils
|
||||
import java.util.ArrayDeque
|
||||
|
||||
class VoiceController(
|
||||
assetManager: AssetManager,
|
||||
private val onWakeup: () -> Unit,
|
||||
private val onFinalAudio: (FloatArray) -> Unit, // 修改:传回识别结果文本
|
||||
private val onFinalAudio: (FloatArray) -> Unit,
|
||||
private val idleTimeoutSeconds: Int = 10,
|
||||
private val maxRecordingSeconds: Int = 10,
|
||||
private val onStateChanged: ((VoiceState) -> Unit)? = null,
|
||||
@ -20,14 +20,16 @@ class VoiceController(
|
||||
var state: VoiceState = VoiceState.WAIT_WAKEUP
|
||||
private set(value) {
|
||||
field = value
|
||||
Log.d(TAG, "➡ State = $value")
|
||||
LogUtils.d(TAG, "➡ State = $value")
|
||||
onStateChanged?.invoke(value)
|
||||
}
|
||||
|
||||
private val wakeupManager = WakeupManager(assetManager) {
|
||||
Log.d(TAG, "🔥 WakeWord detected")
|
||||
handleWakeupEvent()
|
||||
}
|
||||
private val wakeupManager = WakeupManager(assetManager, onWakeup)
|
||||
private val vadManager = VadManager(
|
||||
assetManager,
|
||||
onSpeechStart = { onVadStart() },
|
||||
onSpeechEnd = { avgEnergy, peakRms -> onVadEnd(avgEnergy, peakRms) }
|
||||
)
|
||||
|
||||
private val audioBuffer = mutableListOf<Float>()
|
||||
private val preBuffer = ArrayDeque<Float>()
|
||||
@ -48,10 +50,36 @@ class VoiceController(
|
||||
private val idleTimeoutMs = idleTimeoutSeconds * 1000L
|
||||
private val maxRecordingMs = maxRecordingSeconds * 1000L
|
||||
|
||||
private val vadManager = VadManager(
|
||||
assetManager,
|
||||
onSpeechStart = { onVadStart() },
|
||||
onSpeechEnd = { avgEnergy, peakRms -> onVadEnd(avgEnergy, peakRms) }
|
||||
// ================= 动态阈值核心配置(修复+调整) =================
|
||||
private val BASELINE_WINDOW_SIZE = 50
|
||||
private val envNoiseBuffer = ArrayDeque<Float>(BASELINE_WINDOW_SIZE)
|
||||
private var currentEnvBaseline = 0.001f
|
||||
|
||||
// ========== 修复:远场过滤配置(关键调整) ==========
|
||||
private val MAX_FAR_FIELD_ENERGY = 0.05f // 远场能量上限(正常语音>0.05,远场<0.05)
|
||||
private val MIN_VALID_PEAK_AVG_RATIO = 1.5f // 有效峰均比下限(正常语音>1.5,咳嗽<1.5)
|
||||
private val BASELINE_QUIET_THRESHOLD = 0.002f
|
||||
private val MIN_CONTINUOUS_FRAME_RATIO = 0.4f // 连续帧占比下限(从0.6调低,兼容正常短语音)
|
||||
private val MAX_PEAK_POSITION_RATIO = 0.5f // 峰值位置上限(从0.3调高,兼容正常语音)
|
||||
private val MIN_EFFECTIVE_SPEECH_FRAMES = 3 // 最低有效帧数(从5调低)
|
||||
|
||||
// 分场景动态系数(调整:降低安静环境系数)
|
||||
private val SHORT_SPEECH_ENERGY_COEFF_QUIET = 6.0f // 从8.0调低
|
||||
private val SHORT_SPEECH_ENERGY_COEFF_NOISY = 4.0f // 从5.0调低
|
||||
private val LONG_SPEECH_ENERGY_COEFF = 15.0f // 从20.0调低
|
||||
private val SHORT_SPEECH_VAD_COEFF = 0.15f // 从0.2调低
|
||||
private val LONG_SPEECH_VAD_COEFF = 0.3f // 从0.4调低
|
||||
private val SHORT_SPEECH_MIN_SCORE = 2 // 调回2分
|
||||
private val LONG_SPEECH_MIN_SCORE = 4 // 从5调低
|
||||
private val SHORT_SPEECH_MIN = 500L
|
||||
private val SHORT_SPEECH_MAX = 2000L
|
||||
|
||||
/**
 * Bundle of acceptance thresholds selected for one finished recording.
 *
 * @property energyThreshold minimum average energy required
 * @property vadRatioThreshold minimum speech ratio required
 * @property minScore minimum total score required
 * @property scene label of the selected scenario, used in log output
 */
private data class ThresholdConfig(
    val energyThreshold: Float,
    val vadRatioThreshold: Float,
    val minScore: Int,
    val scene: String,
)
|
||||
|
||||
/* ================= 音频入口 ================= */
|
||||
@ -65,6 +93,10 @@ class VoiceController(
|
||||
|
||||
val now = System.currentTimeMillis()
|
||||
|
||||
if (state == VoiceState.WAIT_WAKEUP) {
|
||||
calibrateEnvBaseline(samples)
|
||||
}
|
||||
|
||||
when (state) {
|
||||
VoiceState.WAIT_WAKEUP,
|
||||
VoiceState.PLAYING_PROMPT,
|
||||
@ -84,7 +116,7 @@ class VoiceController(
|
||||
if ((waitSpeechStartMs > 0 && now - waitSpeechStartMs >= idleTimeoutMs) ||
|
||||
(waitSpeechFailStartMs > 0 && now - waitSpeechFailStartMs >= idleTimeoutMs)
|
||||
) {
|
||||
Log.d(TAG, "⏱ WAIT_SPEECH idle timeout → WAIT_WAKEUP")
|
||||
LogUtils.d(TAG, "⏱ WAIT_SPEECH idle timeout → WAIT_WAKEUP")
|
||||
resetAll()
|
||||
return
|
||||
}
|
||||
@ -100,13 +132,28 @@ class VoiceController(
|
||||
vadManager.accept(samples)
|
||||
|
||||
if (System.currentTimeMillis() - recordingStartMs > maxRecordingMs) {
|
||||
Log.w(TAG, "⏱ Max recording reached")
|
||||
finishSentence() // 超时也触发 finish
|
||||
LogUtils.w(TAG, "⏱ Max recording reached | 当前环境基线: $currentEnvBaseline")
|
||||
finishSentence()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ================= 环境基线校准 ================= */
|
||||
/**
 * Feeds one audio chunk into the ambient-noise baseline estimate.
 *
 * A chunk whose RMS is below 0.01 is pushed into a sliding window holding at most
 * [BASELINE_WINDOW_SIZE] values, and the baseline becomes the largest value in that
 * window. Any other chunk is only logged and leaves the baseline untouched.
 *
 * @param samples audio samples of the current chunk
 */
private fun calibrateEnvBaseline(samples: FloatArray) {
    val rms = vadManager.calcRms(samples)

    // Written as a negated "<" so a NaN RMS still takes the skip path.
    val isQuiet = rms < 0.01f
    if (!isQuiet) {
        LogUtils.v(TAG, "🔊 高能量音频跳过校准 | RMS: $rms | 基线: $currentEnvBaseline")
        return
    }

    // Drop the oldest value once the window is full, then append the new one.
    if (envNoiseBuffer.size >= BASELINE_WINDOW_SIZE) {
        envNoiseBuffer.removeFirst()
    }
    envNoiseBuffer.addLast(rms)

    val windowMax = envNoiseBuffer.maxOrNull()
    currentEnvBaseline = windowMax ?: 0.001f
    LogUtils.d(TAG, "🌡 环境基线校准 | RMS: $rms | 基线: $currentEnvBaseline | 缓存数: ${envNoiseBuffer.size}")
}
|
||||
|
||||
/* ================= 唤醒 ================= */
|
||||
private fun handleWakeupEvent() {
|
||||
if (state == VoiceState.UPLOADING) return
|
||||
@ -127,11 +174,12 @@ class VoiceController(
|
||||
inKwsObserve = true
|
||||
kwsObserveStartMs = System.currentTimeMillis()
|
||||
onWakeup()
|
||||
LogUtils.d(TAG, "🔔 唤醒成功 | 环境基线: $currentEnvBaseline")
|
||||
}
|
||||
|
||||
private fun onVadStart() {
|
||||
if (state != VoiceState.WAIT_SPEECH) return
|
||||
Log.d(TAG, "🎤 REAL VAD START")
|
||||
LogUtils.d(TAG, "🎤 REAL VAD START | 环境基线: $currentEnvBaseline")
|
||||
vadStarted = true
|
||||
recordingStartMs = System.currentTimeMillis()
|
||||
audioBuffer.clear()
|
||||
@ -141,17 +189,17 @@ class VoiceController(
|
||||
|
||||
/**
 * VAD callback for the end of a speech segment.
 *
 * Ignored unless the controller is in [VoiceState.RECORDING]; otherwise the energy
 * statistics are forwarded to [finishSentence].
 *
 * @param avgEnergy average energy reported by the VAD for the segment
 * @param peakRms peak RMS reported by the VAD for the segment
 */
private fun onVadEnd(avgEnergy: Float, peakRms: Float) {
    if (state != VoiceState.RECORDING) return
    // Single log line through LogUtils; the earlier android.util.Log call was superseded.
    LogUtils.d(TAG, "🧠 VAD END | 环境基线: $currentEnvBaseline")
    finishSentence(avgEnergy, peakRms)
}
|
||||
|
||||
/* ================= 结束录音 ================= */
|
||||
/* ================= 结束录音(修复远场过滤逻辑) ================= */
|
||||
private fun finishSentence(avgEnergy: Float = 0f, peakRms: Float = 0f) {
|
||||
val now = System.currentTimeMillis()
|
||||
val duration = now - recordingStartMs
|
||||
|
||||
if (!vadStarted || duration < MIN_SPEECH_MS) {
|
||||
Log.d(TAG, "❌ Too short or no VAD start: $duration ms")
|
||||
LogUtils.d(TAG, "❌ 语音过短: $duration ms | 基线: $currentEnvBaseline")
|
||||
resetToWaitSpeech()
|
||||
return
|
||||
}
|
||||
@ -160,73 +208,148 @@ class VoiceController(
|
||||
val vadRatio = vadManager.activeSpeechRatio()
|
||||
val peakAvgRatio = if (avgEnergy > 0f) peakRms / avgEnergy else 0f
|
||||
|
||||
Log.d(TAG, "📊 Finish Sentence - duration: $duration ms, vadEnded: true")
|
||||
Log.d(
|
||||
TAG,
|
||||
"📊 vadRatio=$vadRatio, avgEnergy=$avgEnergy, peakRms=$peakRms, peakAvgRatio=$peakAvgRatio"
|
||||
)
|
||||
// 获取VAD帧统计
|
||||
val totalFrames = vadManager.getTotalFrames()
|
||||
val speechFrames = vadManager.getSpeechFrames()
|
||||
val continuousSpeechFrames = vadManager.getContinuousSpeechFrames()
|
||||
val peakPositionRatio = vadManager.getPeakPositionRatio()
|
||||
|
||||
if (avgEnergy < 0.02f || peakAvgRatio < 1.2f || vadRatio < 0.4f) {
|
||||
Log.d(TAG, "❌ Sentence rejected")
|
||||
LogUtils.d(TAG, "📊 录音信息 | 时长: $duration ms | 能量: $avgEnergy | 峰均比: $peakAvgRatio | 基线: $currentEnvBaseline")
|
||||
LogUtils.d(TAG, "📊 帧统计 | 总帧: $totalFrames | 语音帧: $speechFrames | 连续语音帧: $continuousSpeechFrames | 峰值位置占比: $peakPositionRatio")
|
||||
|
||||
// ========== 修复:远场语音过滤逻辑(核心) ==========
|
||||
// 正确逻辑:能量 < MAX_FAR_FIELD_ENERGY 才是远场;峰均比 < MIN_VALID_PEAK_AVG_RATIO 才是无效语音
|
||||
val isFarField = avgEnergy < MAX_FAR_FIELD_ENERGY // 修复:从 < MIN 改为 < MAX
|
||||
val isInvalidPeakRatio = peakAvgRatio < MIN_VALID_PEAK_AVG_RATIO // 降低阈值
|
||||
|
||||
// 非连续特征(调整阈值后更宽松)
|
||||
val continuousRatio = if (speechFrames > 0) continuousSpeechFrames.toFloat() / speechFrames else 0f
|
||||
val isDiscontinuous = continuousRatio < MIN_CONTINUOUS_FRAME_RATIO ||
|
||||
speechFrames < MIN_EFFECTIVE_SPEECH_FRAMES ||
|
||||
peakPositionRatio < MAX_PEAK_POSITION_RATIO
|
||||
|
||||
// 远场+无效语音过滤(仅过滤真正的远场/杂音)
|
||||
if (isFarField && isInvalidPeakRatio) { // 修复:同时满足才过滤
|
||||
LogUtils.w(TAG, "❌ 远场/无效语音过滤 | 能量: $avgEnergy < 远场上限: $MAX_FAR_FIELD_ENERGY | 峰均比: $peakAvgRatio < 有效下限: $MIN_VALID_PEAK_AVG_RATIO")
|
||||
resetToWaitSpeech()
|
||||
return
|
||||
}
|
||||
|
||||
// 评分逻辑
|
||||
var score = 0
|
||||
when {
|
||||
duration >= 4000 -> score += 3
|
||||
duration >= 2500 -> score += 2
|
||||
duration >= 1500 -> score += 1
|
||||
// 非连续语音过滤(仅过滤真正的零散杂音)
|
||||
if (isDiscontinuous && isFarField) { // 修复:结合远场特征,避免过滤正常语音
|
||||
LogUtils.w(TAG, "❌ 非连续杂音过滤 | 连续占比: $continuousRatio < $MIN_CONTINUOUS_FRAME_RATIO | 语音帧: $speechFrames < $MIN_EFFECTIVE_SPEECH_FRAMES | 峰值位置: $peakPositionRatio < $MAX_PEAK_POSITION_RATIO")
|
||||
resetToWaitSpeech()
|
||||
return
|
||||
}
|
||||
|
||||
// ========== 动态阈值计算(调整后更宽松) ==========
|
||||
val thresholdConfig = when {
|
||||
duration in SHORT_SPEECH_MIN..SHORT_SPEECH_MAX -> {
|
||||
val coeff = if (currentEnvBaseline < BASELINE_QUIET_THRESHOLD) {
|
||||
SHORT_SPEECH_ENERGY_COEFF_QUIET
|
||||
} else {
|
||||
SHORT_SPEECH_ENERGY_COEFF_NOISY
|
||||
}
|
||||
val threshold = currentEnvBaseline * coeff
|
||||
LogUtils.d(TAG, "📏 短语音阈值 | 场景: ${if (currentEnvBaseline < BASELINE_QUIET_THRESHOLD) "安静" else "嘈杂"} | 系数: $coeff | 阈值: $threshold")
|
||||
ThresholdConfig(
|
||||
energyThreshold = threshold,
|
||||
vadRatioThreshold = SHORT_SPEECH_VAD_COEFF,
|
||||
minScore = SHORT_SPEECH_MIN_SCORE,
|
||||
scene = "短语音"
|
||||
)
|
||||
}
|
||||
else -> {
|
||||
val threshold = currentEnvBaseline * LONG_SPEECH_ENERGY_COEFF
|
||||
ThresholdConfig(
|
||||
energyThreshold = threshold,
|
||||
vadRatioThreshold = LONG_SPEECH_VAD_COEFF,
|
||||
minScore = LONG_SPEECH_MIN_SCORE,
|
||||
scene = "长语音"
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
LogUtils.d(TAG, "📊 动态阈值 | ${thresholdConfig.scene} | 能量阈值: ${thresholdConfig.energyThreshold} | 占比阈值: ${thresholdConfig.vadRatioThreshold} | 最低分: ${thresholdConfig.minScore}")
|
||||
|
||||
// 基础阈值过滤(调整后更宽松)
|
||||
if (avgEnergy < thresholdConfig.energyThreshold || vadRatio < thresholdConfig.vadRatioThreshold) {
|
||||
LogUtils.w(TAG, "❌ 阈值过滤 | 能量: $avgEnergy < ${thresholdConfig.energyThreshold} | 占比: $vadRatio < ${thresholdConfig.vadRatioThreshold}")
|
||||
resetToWaitSpeech()
|
||||
return
|
||||
}
|
||||
|
||||
// 评分逻辑(恢复短语音保底分)
|
||||
var score = 0
|
||||
// 1. 时长评分(恢复短语音保底1分)
|
||||
score += when {
|
||||
duration >= 4000 -> 3
|
||||
duration >= 2500 -> 2
|
||||
duration >= 1500 -> 1
|
||||
duration >= SHORT_SPEECH_MIN -> 1 // 恢复保底分
|
||||
else -> 0
|
||||
}
|
||||
// 2. 能量评分
|
||||
score += when {
|
||||
avgEnergy >= thresholdConfig.energyThreshold * 10 -> 3
|
||||
avgEnergy >= thresholdConfig.energyThreshold * 5 -> 2
|
||||
avgEnergy >= thresholdConfig.energyThreshold -> 1
|
||||
else -> 0
|
||||
}
|
||||
// 3. 占比+连续性评分(调整阈值)
|
||||
score += when {
|
||||
continuousRatio >= 0.7 -> 2 // 从0.8调低
|
||||
continuousRatio >= MIN_CONTINUOUS_FRAME_RATIO -> 1
|
||||
else -> 0
|
||||
}
|
||||
|
||||
LogUtils.d(TAG, "🏆 评分结果 | 总分: $score | 最低分: ${thresholdConfig.minScore} | 连续占比: $continuousRatio")
|
||||
|
||||
// 分场景判定(调整后更宽松)
|
||||
val pass = if (duration in SHORT_SPEECH_MIN..SHORT_SPEECH_MAX) {
|
||||
score >= thresholdConfig.minScore && continuousRatio >= 0.5 // 从0.7调低
|
||||
} else {
|
||||
score >= thresholdConfig.minScore || (score >= 2 && avgEnergy >= currentEnvBaseline * 4) // 从3→2,6→4
|
||||
}
|
||||
when {
|
||||
avgEnergy >= 0.10f -> score += 3
|
||||
avgEnergy >= 0.06f -> score += 2
|
||||
avgEnergy >= 0.02f -> score += 1
|
||||
}
|
||||
when {
|
||||
vadRatio >= 0.55f -> score += 2
|
||||
vadRatio >= 0.40f -> score += 1
|
||||
}
|
||||
Log.d(
|
||||
TAG,
|
||||
"📊 duration=$duration ms, vadRatio=$vadRatio, avgEnergy=$avgEnergy, score=$score"
|
||||
)
|
||||
|
||||
val pass = score >= 5 || (score == 3 && avgEnergy >= 0.06f)
|
||||
if (!pass) {
|
||||
Log.d(TAG, "❌ Sentence rejected (score=$score)")
|
||||
LogUtils.w(TAG, "❌ 评分/连续性不足过滤 | 总分: $score < ${thresholdConfig.minScore} | 连续占比: $continuousRatio")
|
||||
resetToWaitSpeech()
|
||||
return
|
||||
}
|
||||
|
||||
audioBuffer.clear()
|
||||
state = VoiceState.UPLOADING
|
||||
onFinalAudio(audio) // 传递音频和识别文本
|
||||
onFinalAudio(audio)
|
||||
LogUtils.i(TAG, "✅ 录音通过 | 时长: $duration ms | 能量: $avgEnergy | 连续占比: $continuousRatio | 准备上传")
|
||||
}
|
||||
|
||||
/* ================= 播放回调 ================= */
|
||||
/* ================= 播放/上传/Reset 回调(无修改) ================= */
|
||||
/** Signals that the prompt sound started playing: logs it and moves to [VoiceState.PLAYING_PROMPT]. */
fun onPlayStartPrompt() {
    LogUtils.d(TAG, "🎵 播放提示音 | 基线: $currentEnvBaseline")
    state = VoiceState.PLAYING_PROMPT
}
|
||||
|
||||
/**
 * Signals that the prompt sound finished: schedules the moment speech is accepted again
 * ([SPEECH_COOLDOWN_MS] from now) and moves to [VoiceState.WAIT_SPEECH_COOLDOWN].
 */
fun onPlayEndPrompt() {
    val cooldownEndsAt = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
    speechEnableAtMs = cooldownEndsAt
    LogUtils.d(TAG, "🎵 提示音结束 | 基线: $currentEnvBaseline")
    state = VoiceState.WAIT_SPEECH_COOLDOWN
}
|
||||
|
||||
/** Signals that backend audio started playing: logs it and moves to [VoiceState.PLAYING_BACKEND]. */
fun onPlayStartBackend() {
    LogUtils.d(TAG, "🎶 播放后台音频 | 基线: $currentEnvBaseline")
    state = VoiceState.PLAYING_BACKEND
}
|
||||
|
||||
/**
 * Signals that backend audio finished: schedules the moment speech is accepted again
 * ([SPEECH_COOLDOWN_MS] from now) and moves to [VoiceState.WAIT_SPEECH_COOLDOWN].
 */
fun onPlayEndBackend() {
    val cooldownEndsAt = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
    speechEnableAtMs = cooldownEndsAt
    LogUtils.d(TAG, "🎶 后台音频结束 | 基线: $currentEnvBaseline")
    state = VoiceState.WAIT_SPEECH_COOLDOWN
}
|
||||
|
||||
/* ================= 上传回调 ================= */
|
||||
fun onUploadFinished(success: Boolean) {
|
||||
if (state != VoiceState.UPLOADING) return
|
||||
LogUtils.d(TAG, "📤 上传完成 | 成功: $success | 基线: $currentEnvBaseline")
|
||||
state = if (success) VoiceState.PLAYING_BACKEND
|
||||
else {
|
||||
speechEnableAtMs = System.currentTimeMillis() + SPEECH_COOLDOWN_MS
|
||||
@ -234,8 +357,8 @@ class VoiceController(
|
||||
}
|
||||
}
|
||||
|
||||
/* ================= Reset ================= */
|
||||
private fun resetToWaitSpeech() {
|
||||
LogUtils.d(TAG, "🔄 重置到等待说话 | 基线: $currentEnvBaseline")
|
||||
audioBuffer.clear()
|
||||
vadManager.reset()
|
||||
vadStarted = false
|
||||
@ -244,25 +367,31 @@ class VoiceController(
|
||||
}
|
||||
|
||||
/**
 * Full reset: clears the audio buffers, resets the VAD and wake-word managers, discards
 * the learned noise baseline and returns to [VoiceState.WAIT_WAKEUP].
 */
private fun resetAll() {
    LogUtils.d(TAG, "🔄 重置所有状态 | 基线: $currentEnvBaseline")

    // Buffered audio.
    audioBuffer.clear()
    preBuffer.clear()

    // Detection engines.
    vadManager.reset()
    wakeupManager.reset()

    // Session flags and timers.
    waitSpeechFailStartMs = 0L
    waitSpeechStartMs = 0L
    vadStarted = false

    // Noise baseline goes back to its initial value.
    envNoiseBuffer.clear()
    currentEnvBaseline = 0.001f
    LogUtils.d(TAG, "🔄 环境基线已重置 | 新基线: $currentEnvBaseline")

    state = VoiceState.WAIT_WAKEUP
}
|
||||
|
||||
/**
 * Shuts the controller down: releases the wake-word manager, resets the VAD manager
 * and drops the collected noise samples.
 */
fun release() {
    LogUtils.d(TAG, "🔌 释放资源 | 最终基线: $currentEnvBaseline")

    wakeupManager.release()
    vadManager.reset()

    envNoiseBuffer.clear()
}
|
||||
|
||||
/* ================= Utils ================= */
|
||||
/**
 * Appends the samples to the pre-roll buffer, evicting the oldest sample whenever the
 * buffer grows beyond [PRE_BUFFER_SIZE].
 *
 * @param samples audio samples to append, in order
 */
private fun cachePreBuffer(samples: FloatArray) {
    samples.forEach { sample ->
        preBuffer.addLast(sample)
        val overflowed = preBuffer.size > PRE_BUFFER_SIZE
        if (overflowed) {
            preBuffer.removeFirst()
        }
    }
}
|
||||
}
|
||||
}
|
||||
@ -2,6 +2,7 @@ package com.zs.smarthuman.sherpa
|
||||
|
||||
import android.content.res.AssetManager
|
||||
import android.util.Log
|
||||
import com.blankj.utilcode.util.LogUtils
|
||||
import com.k2fsa.sherpa.onnx.*
|
||||
|
||||
class WakeupManager(assetManager: AssetManager, function: () -> Unit) {
|
||||
@ -29,11 +30,11 @@ class WakeupManager(assetManager: AssetManager, function: () -> Unit) {
|
||||
)
|
||||
|
||||
kws = KeywordSpotter(assetManager, config)
|
||||
Log.d(TAG, "✅ KeywordSpotter initialized")
|
||||
LogUtils.d(TAG, "✅ KeywordSpotter initialized")
|
||||
|
||||
stream = kws.createStream()
|
||||
require(stream != null) { "Failed to create KWS stream" }
|
||||
Log.d(TAG, "✅ KWS stream created")
|
||||
LogUtils.d(TAG, "✅ KWS stream created")
|
||||
}
|
||||
|
||||
/** ⭐ 永远喂 KWS */
|
||||
@ -48,7 +49,7 @@ class WakeupManager(assetManager: AssetManager, function: () -> Unit) {
|
||||
kws.decode(s)
|
||||
val keyword = kws.getResult(s).keyword
|
||||
if (keyword.isNotBlank()) {
|
||||
Log.d(TAG, "🔥 KWS hit: $keyword")
|
||||
LogUtils.d(TAG, "🔥 KWS hit: $keyword")
|
||||
justWokeUp = true
|
||||
kws.reset(s)
|
||||
break
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user