Add speaker verification

林若思 2026-01-10 17:30:32 +08:00
parent d01e43cd56
commit b7fc6d4ee0
8 changed files with 138 additions and 8812 deletions

View File

@ -1,3 +1,2 @@
 x iǎo z ì t óng x ué @小智同学
-x iǎo z ì @小智
-x iǎo z ì @小志
+x iǎo z ì t óng x ué @小志同学
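Note on the hunk above: this is presumably a sherpa-onnx keyword-spotting keywords file, where each line lists a wake phrase as space-separated subword/pinyin tokens followed by @ and its display form; the commit drops the short wake words 小智/小志 and keeps only the full phrases. A minimal Kotlin sketch of parsing this layout (hypothetical helper, assuming the standard tokens-@display format):

data class Keyword(val tokens: List<String>, val display: String)

// Parse lines like "x iǎo z ì t óng x ué @小智同学" into tokens + display form.
fun parseKeywords(lines: List<String>): List<Keyword> =
    lines.filter { it.isNotBlank() }.map { line ->
        val at = line.lastIndexOf('@')
        require(at > 0) { "missing @display form: $line" }
        Keyword(
            tokens = line.substring(0, at).trim().split(Regex("\\s+")),
            display = line.substring(at + 1).trim()
        )
    }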

View File

@ -1,4 +0,0 @@
# Introduction
Model in this directory is converted from
https://huggingface.co/ASLP-lab/WSYue-ASR/tree/main/u2pp_conformer_yue

View File

@ -1,8 +1,16 @@
 package com.zs.smarthuman.sherpa

 import android.content.res.AssetManager
+import android.text.TextUtils
 import com.blankj.utilcode.util.LogUtils
+import com.k2fsa.sherpa.onnx.OnlineStream
+import com.k2fsa.sherpa.onnx.SpeakerRecognition
+import kotlinx.coroutines.CoroutineScope
+import kotlinx.coroutines.Dispatchers
+import kotlinx.coroutines.launch
 import java.util.ArrayDeque
+import java.util.concurrent.locks.ReentrantLock
+import kotlin.concurrent.withLock

 class VoiceController(
     assetManager: AssetManager,
@ -118,11 +126,75 @@ class VoiceController(
     private var hasInvalidSpeech = false
     private var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT

+    // ========== Core configuration: speaker verification ==========
+    private val CURRENT_USER_ID = "current_wakeup_user"  // unique id for the current wake-up user
+    private val ENABLE_STRICT_SPEAKER_VERIFY = true      // strict-verification switch
+    private val SPEAKER_VERIFY_THRESHOLD = 0.5f          // similarity threshold for verification
+
+    init {
+        // Initialize the speaker-embedding extractor
+        try {
+            SpeakerRecognition.initExtractor(assetManager)
+            LogUtils.d(TAG, "✅ Speaker recognizer initialized (native Stream version)")
+        } catch (e: Exception) {
+            LogUtils.e(TAG, "❌ Speaker recognizer initialization failed", e)
+            throw RuntimeException("Speaker recognition initialization failed", e)
+        }
+    }
+
     /* ================= Audio entry point ================= */
     fun acceptAudio(samples: FloatArray) {
         cachePreBuffer(samples)
         wakeupManager.acceptAudio(samples)

         if (wakeupManager.consumeWakeupFlag()) {
-            handleWakeupEvent()
+            // Register the wake-up user's voiceprint (runs asynchronously)
+            CoroutineScope(Dispatchers.IO).launch {
+                var stream: OnlineStream? = null
+                runCatching {
+                    val wakeupAudio = preBuffer.toFloatArray()
+                    if (wakeupAudio.isEmpty()) {
+                        LogUtils.w(TAG, "❌ Wake-up audio buffer is empty, cannot register the user's voiceprint")
+                        return@launch
+                    }
+                    // Create a native stream and feed it the buffered wake-up audio
+                    val regStream = SpeakerRecognition.extractor.createStream()
+                    stream = regStream
+                    regStream.acceptWaveform(samples = wakeupAudio, sampleRate = sampleRate)
+                    regStream.inputFinished()
+                    // Compute the embedding and register it (current user only)
+                    if (SpeakerRecognition.extractor.isReady(regStream)) {
+                        val embedding = SpeakerRecognition.extractor.compute(regStream)
+                        // Drop any previous embedding so the current user is the only one registered
+                        SpeakerRecognition.manager.remove(CURRENT_USER_ID)
+                        val embeddingList: MutableList<FloatArray> = mutableListOf()
+                        embeddingList.add(embedding)
+                        val ok = SpeakerRecognition.manager.add(
+                            name = CURRENT_USER_ID,
+                            embedding = embeddingList.toTypedArray()
+                        )
+                        if (ok) {
+                            LogUtils.d(TAG, "✅ Registered current wake-up user | embedding size: ${embedding.size}")
+                        } else {
+                            LogUtils.w(TAG, "❌ Failed to register current wake-up user (manager.add returned false)")
+                        }
+                    } else {
+                        LogUtils.w(TAG, "❌ Wake-up audio stream not ready, skipping user registration")
+                    }
+                }.onFailure {
+                    LogUtils.e(TAG, "❌ Wake-up user voiceprint registration failed", it)
+                }.also {
+                    // Release the native stream as soon as it is done
+                    stream?.release()
+                    LogUtils.d(TAG, "🔄 Wake-up registration stream released")
+                }
+            }
+            handleWakeupEvent()
             return
         }
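A design note on the registration block above: CoroutineScope(Dispatchers.IO).launch is a fire-and-forget scope tied to no lifecycle, so a pending registration can outlive the controller. A minimal sketch of a controller-owned scope that would make the work cancellable (hypothetical class and method names, not part of this commit):

import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.SupervisorJob
import kotlinx.coroutines.cancel
import kotlinx.coroutines.launch

// Hypothetical wrapper: a controller-owned scope so an in-flight voiceprint
// registration is cancelled when the controller itself is released.
class ScopedRegistration {
    private val scope = CoroutineScope(SupervisorJob() + Dispatchers.IO)

    fun registerAsync(work: suspend () -> Unit) {
        scope.launch { work() }
    }

    fun release() {
        scope.cancel() // cancels any pending registration
    }
}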
@ -338,7 +410,7 @@ class VoiceController(
         val now = System.currentTimeMillis()
         val duration = now - recordingStartMs

+        // ========== Step 0: basic filtering (speech too short) ==========
         if (!vadStarted || duration < MIN_SPEECH_MS) {
             LogUtils.d(TAG, "❌ Speech too short: $duration ms | baseline: $currentEnvBaseline")
             hasInvalidSpeech = true
@ -368,6 +440,18 @@ class VoiceController(
             return
         }

+        // ========== Step 1: speaker verification first (core: only the current user passes) ==========
+        if (ENABLE_STRICT_SPEAKER_VERIFY) {
+            val isCurrentUser = verifySpeaker(audioBuffer.toFloatArray())
+            if (!isCurrentUser) {
+                LogUtils.w(TAG, "❌ Not the current wake-up user, rejecting speech | duration: $duration ms")
+                hasInvalidSpeech = true
+                resetToWaitSpeech()
+                return
+            }
+            LogUtils.d(TAG, "✅ Current user's speech, continuing | duration: $duration ms")
+        }
+
         // ========== 1. Forced fallback: normal speech passes directly (lowered thresholds) ==========
         val isNormalVoice = avgEnergy >= MIN_NORMAL_VOICE_ENERGY && vadRatio >= MIN_NORMAL_VOICE_VAD_RATIO
         if (isNormalVoice) {
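The fallback above gates on avgEnergy and vadRatio, whose definitions sit outside this diff. For intuition, a hypothetical sketch of how such features are commonly computed (RMS energy and voiced-frame ratio; assumed, not this project's actual code):

import kotlin.math.sqrt

// Hypothetical illustration of the two gating features used above.
fun rmsEnergy(samples: FloatArray): Float {
    if (samples.isEmpty()) return 0f
    var sum = 0.0
    for (s in samples) sum += s * s
    return sqrt(sum / samples.size).toFloat()
}

// Fraction of frames the VAD marked as speech.
fun vadRatio(vadFlags: BooleanArray): Float =
    if (vadFlags.isEmpty()) 0f else vadFlags.count { it }.toFloat() / vadFlags.size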
@ -575,4 +659,44 @@ class VoiceController(
         val minScore: Int,
         val scene: String
     )

+    /* ================= Core: native-Stream speaker verification (current user only) ================= */
+    /**
+     * Verify whether the given audio belongs to the current wake-up user.
+     * @param audio audio samples to verify
+     * @return true if it is the current user, false otherwise
+     */
+    private fun verifySpeaker(audio: FloatArray): Boolean {
+        var stream: OnlineStream? = null
+        return try {
+            val s = SpeakerRecognition.extractor.createStream()
+            stream = s
+            s.acceptWaveform(samples = audio, sampleRate = sampleRate)
+            s.inputFinished()
+            // Compute the embedding and verify it against the registered user
+            if (!SpeakerRecognition.extractor.isReady(s)) {
+                LogUtils.w(TAG, "❌ Verification stream not ready, verification failed")
+                return false
+            }
+            val embedding = SpeakerRecognition.extractor.compute(s)
+            val verifyPass = SpeakerRecognition.manager.verify(
+                name = CURRENT_USER_ID,
+                embedding = embedding,
+                threshold = SPEAKER_VERIFY_THRESHOLD
+            )
+            if (verifyPass) {
+                LogUtils.d(TAG, "✅ Speaker verification passed")
+            } else {
+                LogUtils.w(TAG, "❌ Speaker verification failed")
+            }
+            verifyPass
+        } catch (e: Exception) {
+            LogUtils.e(TAG, "❌ Speaker verification exception", e)
+            false
+        } finally {
+            // Release the native stream as soon as it is done
+            stream?.release()
+            LogUtils.d(TAG, "🔄 Verification stream released")
+        }
+    }
 }
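manager.verify above scores the fresh embedding against the enrolled one and applies SPEAKER_VERIFY_THRESHOLD. For intuition, a minimal sketch of the cosine-similarity check that speaker-verification backends typically perform (hypothetical standalone helpers, not the sherpa-onnx internals):

import kotlin.math.sqrt

// Cosine similarity between two speaker embeddings of equal dimension.
fun cosineSimilarity(a: FloatArray, b: FloatArray): Float {
    require(a.size == b.size) { "embedding dimensions must match" }
    var dot = 0f; var na = 0f; var nb = 0f
    for (i in a.indices) {
        dot += a[i] * b[i]
        na += a[i] * a[i]
        nb += b[i] * b[i]
    }
    return dot / (sqrt(na) * sqrt(nb) + 1e-9f)
}

// Accept the speaker if similarity clears the threshold (0.5f in this commit).
fun wouldVerify(enrolled: FloatArray, probe: FloatArray, threshold: Float): Boolean =
    cosineSimilarity(enrolled, probe) >= threshold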

View File

@ -48,13 +48,10 @@ import com.zs.smarthuman.im.chat.bean.SingleMessage
 import com.zs.smarthuman.kt.releaseIM
 import com.zs.smarthuman.sherpa.TimeoutType
 import com.zs.smarthuman.sherpa.VoiceController
-import com.zs.smarthuman.sherpa.VoiceState
 import com.zs.smarthuman.toast.Toaster
-import com.zs.smarthuman.utils.AudioDebugUtil
 import com.zs.smarthuman.utils.AudioPcmUtil
 import com.zs.smarthuman.utils.DangerousUtils
 import com.zs.smarthuman.utils.LogFileUtils
-import com.zs.smarthuman.utils.SimulateStreamingAsr
 import com.zs.smarthuman.utils.UnityPlayerHolder
 import com.zs.smarthuman.utils.ViewSlideAnimator
@ -101,7 +98,6 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
     }

     override fun initData() {
-        initAsrModel()
         PermissionUtils.permissionGroup(PermissionConstants.MICROPHONE)
             .callback(object : PermissionUtils.FullCallback {
                 override fun onGranted(granted: List<String?>) {
@ -198,7 +194,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
         voiceInfo = mutableListOf<VoiceBeanResp>().apply {
             add(
                 VoiceBeanResp(
-                    audioUrl = "https://static.seerteach.net/aidialogue/systemVoice/aliyun-nv.mp3"
+                    audioUrl = "https://static.seerteach.net/aidialogue/userWakeUpAudio/344.mp3"
                 )
             )
         }
@ -212,17 +208,17 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
             },
             onFinalAudio = { audio ->
                 Log.d("lrsxx", "Speech detected, length=${audio.size}")
-//                mViewModel?.uploadVoice(
-//                    AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)),
-//                    1
-//                )
-                loadLocalJsonAndPlay()
-                val file = File(
-                    getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
-                    "xxx.wav"
-                )
-                AudioDebugUtil.saveFloatPcmAsWav(audio, file)
-                LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
+                mViewModel?.uploadVoice(
+                    AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)),
+                    1
+                )
+//                loadLocalJsonAndPlay()
+//                val file = File(
+//                    getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
+//                    "xxx.wav"
+//                )
+//                AudioDebugUtil.saveFloatPcmAsWav(audio, file)
+//                LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")

                 lifecycleScope.launch(Dispatchers.Main) {
                     mVerticalAnimator?.show()
@ -258,16 +254,11 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
             )
         }

-    private fun initAsrModel() {
-        lifecycleScope.launch(Dispatchers.IO) {
-            SimulateStreamingAsr.initOfflineRecognizer(App.getInstance())
-        }
-    }

     override fun receivedIMMsg(msg: SingleMessage) {
         when (msg.msgContentType) {
             MessageContentType.RECEIVE_VOICE_STREAM.msgContentType -> {
                 lifecycleScope.launch(Dispatchers.IO) {
-                    LogFileUtils.logToFile2(this@MainActivity, msg.content)
+//                    LogFileUtils.logToFile2(this@MainActivity, msg.content)
                     UnityPlayerHolder.getInstance()
                         .startTalking(msg.content)
                     // loadLocalJsonAndPlay()
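The uploadVoice path above converts the float samples to 16-bit PCM and Base64-encodes them. AudioPcmUtil is not shown in this diff; a minimal sketch of what floatToPcm16 and pcm16ToBase64 plausibly do (assumed behavior: clamp to [-1, 1], scale, little-endian bytes):

import java.nio.ByteBuffer
import java.nio.ByteOrder
import java.util.Base64

// Assumed behavior of AudioPcmUtil (not shown in this diff).
fun floatToPcm16(samples: FloatArray): ShortArray =
    ShortArray(samples.size) { i ->
        val v = samples[i].coerceIn(-1f, 1f)
        (v * Short.MAX_VALUE).toInt().toShort()
    }

fun pcm16ToBase64(pcm: ShortArray): String {
    val buf = ByteBuffer.allocate(pcm.size * 2).order(ByteOrder.LITTLE_ENDIAN)
    pcm.forEach { buf.putShort(it) }
    return Base64.getEncoder().encodeToString(buf.array())
}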

View File

@ -1,155 +0,0 @@
package com.zs.smarthuman.utils

import android.content.Context
import android.content.res.AssetManager
import com.blankj.utilcode.util.LogUtils
import com.k2fsa.sherpa.onnx.OfflineModelConfig
import com.k2fsa.sherpa.onnx.OfflineRecognizer
import com.k2fsa.sherpa.onnx.OfflineRecognizerConfig
import com.k2fsa.sherpa.onnx.OfflineWenetCtcModelConfig
import com.k2fsa.sherpa.onnx.Vad
import com.k2fsa.sherpa.onnx.getVadModelConfig
import java.io.File
import java.io.FileOutputStream
import java.io.InputStream
import java.io.OutputStream

fun assetExists(assetManager: AssetManager, path: String): Boolean {
    val dir = path.substringBeforeLast('/', "")
    val fileName = path.substringAfterLast('/')
    val files = assetManager.list(dir) ?: return false
    return files.contains(fileName)
}

fun copyAssetToInternalStorage(path: String, context: Context): String {
    val targetRoot = context.filesDir
    val outFile = File(targetRoot, path)

    if (!assetExists(context.assets, path = path)) {
        // for a context binary, if it does not exist, we return a path
        // that can be written to
        outFile.parentFile?.mkdirs()
        LogUtils.i("VoiceController", "$path does not exist, return ${outFile.absolutePath}")
        return outFile.absolutePath
    }

    if (outFile.exists()) {
        val assetSize = context.assets.open(path).use { it.available() }
        if (outFile.length() == assetSize.toLong()) {
            LogUtils.i("VoiceController", "$targetRoot/$path already exists, skip copying, return $targetRoot/$path")
            return "$targetRoot/$path"
        }
    }

    outFile.parentFile?.mkdirs()
    context.assets.open(path).use { input: InputStream ->
        FileOutputStream(outFile).use { output: OutputStream ->
            input.copyTo(output)
        }
    }
    LogUtils.i("VoiceController", "Copied $path to $targetRoot/$path")
    return outFile.absolutePath
}

object SimulateStreamingAsr {
    private var _recognizer: OfflineRecognizer? = null

    val recognizer: OfflineRecognizer
        get() {
            return _recognizer!!
        }

    fun initOfflineRecognizer(context: Context) {
        synchronized(this) {
            if (_recognizer != null) {
                return
            }

            val wenetConfig = OfflineWenetCtcModelConfig(
                model = "sherpa-onnx-wenetspeech-yue-u2pp-conformer-ctc-zh-en-cantonese-int8-2025-09-10/model.int8.onnx",
            )
            val modelConfig = OfflineModelConfig(
                wenetCtc = wenetConfig,
                tokens = "sherpa-onnx-wenetspeech-yue-u2pp-conformer-ctc-zh-en-cantonese-int8-2025-09-10/tokens.txt"
            )
            val config = OfflineRecognizerConfig(
                modelConfig = modelConfig,
            )

            var assetManager: AssetManager? = context.assets
            if (config.modelConfig.provider == "qnn") {
                // We assume you have copied files like libQnnHtpV81Skel.so to jniLibs/arm64-v8a
                LogUtils.i("VoiceController", "nativeLibraryDir: ${context.applicationInfo.nativeLibraryDir}")

                // If we don't set the environment variable ADSP_LIBRARY_PATH, we will see
                // error code 1008 from qnn_interface.deviceCreate()
                // See also
                // https://workbench.aihub.qualcomm.com/docs/hub/faq.html#why-am-i-seeing-error-1008-when-trying-to-use-htp
                OfflineRecognizer.prependAdspLibraryPath(context.applicationInfo.nativeLibraryDir)

                // for qnn, we need to copy *.so files from the assets folder to the sd card
                if (config.modelConfig.senseVoice.qnnConfig.backendLib.isEmpty() && config.modelConfig.zipformerCtc.qnnConfig.backendLib.isEmpty()) {
                    LogUtils.i("VoiceController", "You should provide libQnnHtp.so for qnn")
                    throw IllegalArgumentException("You should provide libQnnHtp.so for qnn")
                }

                config.modelConfig.tokens =
                    copyAssetToInternalStorage(config.modelConfig.tokens, context)

                if (config.modelConfig.senseVoice.model.isNotEmpty() || assetExists(
                        context.assets,
                        path = config.modelConfig.senseVoice.qnnConfig.contextBinary
                    )
                ) {
                    if (config.modelConfig.senseVoice.model.isNotEmpty()) {
                        config.modelConfig.senseVoice.model =
                            copyAssetToInternalStorage(config.modelConfig.senseVoice.model, context)
                    }
                    config.modelConfig.senseVoice.qnnConfig.contextBinary =
                        copyAssetToInternalStorage(
                            config.modelConfig.senseVoice.qnnConfig.contextBinary,
                            context
                        )
                } else if (config.modelConfig.zipformerCtc.model.isNotEmpty()) {
                    config.modelConfig.zipformerCtc.model =
                        copyAssetToInternalStorage(config.modelConfig.zipformerCtc.model, context)
                    config.modelConfig.zipformerCtc.qnnConfig.contextBinary =
                        copyAssetToInternalStorage(
                            config.modelConfig.zipformerCtc.qnnConfig.contextBinary,
                            context
                        )
                }

                if (config.hr.lexicon.isNotEmpty()) {
                    config.hr.lexicon = copyAssetToInternalStorage(config.hr.lexicon, context)
                }
                if (config.hr.ruleFsts.isNotEmpty()) {
                    // this assumes there is only one fst; otherwise, copy each fst separately
                    config.hr.ruleFsts = copyAssetToInternalStorage(config.hr.ruleFsts, context)
                }

                assetManager = null
            }

            _recognizer = OfflineRecognizer(
                assetManager = assetManager,
                config = config,
            )
            LogUtils.i("VoiceController", "sherpa-onnx offline recognizer initialized")
        }
    }
}
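For reference, a recognizer initialized by this (now deleted) helper would be driven with the standard sherpa-onnx offline decoding pattern; a minimal sketch of how it was typically used before this commit (assumes initOfflineRecognizer was called and samples hold 16 kHz mono audio in [-1, 1]):

// Typical sherpa-onnx offline decoding loop for the recognizer above (sketch).
fun decodeOnce(samples: FloatArray, sampleRate: Int = 16000): String {
    val recognizer = SimulateStreamingAsr.recognizer
    val stream = recognizer.createStream()
    stream.acceptWaveform(samples, sampleRate)
    recognizer.decode(stream)
    val text = recognizer.getResult(stream).text
    stream.release()
    return text
}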