Add speaker verification

林若思 2026-01-10 17:30:32 +08:00
parent d01e43cd56
commit b7fc6d4ee0
8 changed files with 138 additions and 8812 deletions

View File

@@ -1,3 +1,2 @@
x iǎo z ì t óng x ué @小智同学
x iǎo z ì @小智
x iǎo z ì @小志
x iǎo z ì t óng x ué @小志同学

View File

@@ -1,4 +0,0 @@
# Introduction
Model in this directory is converted from
https://huggingface.co/ASLP-lab/WSYue-ASR/tree/main/u2pp_conformer_yue

View File

@@ -1,8 +1,16 @@
package com.zs.smarthuman.sherpa
import android.content.res.AssetManager
import android.text.TextUtils
import com.blankj.utilcode.util.LogUtils
import com.k2fsa.sherpa.onnx.OnlineStream
import com.k2fsa.sherpa.onnx.SpeakerRecognition
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.launch
import java.util.ArrayDeque
import java.util.concurrent.locks.ReentrantLock
import kotlin.concurrent.withLock
class VoiceController(
assetManager: AssetManager,
@@ -118,11 +126,75 @@ class VoiceController(
private var hasInvalidSpeech = false
private var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT
// ========== Core configuration: speaker (voiceprint) verification ==========
private val CURRENT_USER_ID = "current_wakeup_user" // unique id of the current wake-up user
private val ENABLE_STRICT_SPEAKER_VERIFY = true // strict verification switch
private val SPEAKER_VERIFY_THRESHOLD = 0.5f // similarity threshold for speaker verification
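// (Assumption: manager.verify() passes only when the embedding similarity reaches this threshold, so a larger value makes the check stricter.)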
init {
// Initialize the speaker recognizer (wraps the native sherpa-onnx API)
try {
SpeakerRecognition.initExtractor(assetManager) // aligned with the native API
LogUtils.d(TAG, "✅ Speaker recognizer initialized (native Stream version)")
} catch (e: Exception) {
LogUtils.e(TAG, "❌ Failed to initialize the speaker recognizer", e)
throw RuntimeException("Speaker recognition initialization failed", e)
}
}
/* ================= Audio entry point ================= */
fun acceptAudio(samples: FloatArray) {
cachePreBuffer(samples)
wakeupManager.acceptAudio(samples)
if (wakeupManager.consumeWakeupFlag()) {
handleWakeupEvent()
// Register the wake-up user's voiceprint (runs asynchronously)
CoroutineScope(Dispatchers.IO).launch {
var stream: OnlineStream? = null
runCatching {
val wakeupAudio = preBuffer.toFloatArray()
if (wakeupAudio.isEmpty()) {
LogUtils.w(TAG, "❌ Wake-up audio buffer is empty; cannot register the user's voiceprint")
return@launch
}
// Create a native stream and feed it the cached wake-up audio
val enrollStream = SpeakerRecognition.extractor.createStream()
stream = enrollStream
enrollStream.acceptWaveform(samples = wakeupAudio, sampleRate = sampleRate)
enrollStream.inputFinished()
// Compute the embedding and register it (current user only)
if (SpeakerRecognition.extractor.isReady(enrollStream)) {
val embedding = SpeakerRecognition.extractor.compute(enrollStream)
// Drop any previously stored embedding so only the current user is enrolled
SpeakerRecognition.manager.remove(CURRENT_USER_ID)
// Register the current wake-up user via manager.add
val embeddingList: MutableList<FloatArray> = mutableListOf()
embeddingList.add(embedding)
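// The Array<FloatArray> overload can enroll several embeddings under one name; only one is stored here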
val ok = SpeakerRecognition.manager.add(
name = CURRENT_USER_ID,
embedding = embeddingList.toTypedArray()
)
if (ok) {
LogUtils.d(TAG, "✅ 注册当前唤醒用户特征成功 | 特征长度: ${embedding.size}")
} else {
LogUtils.w(TAG, "❌ 注册当前唤醒用户特征失败manager.add返回false")
}
} else {
LogUtils.w(TAG, "❌ 唤醒音频Stream未就绪跳过用户注册")
}
}.onFailure {
LogUtils.e(TAG, "❌ Failed to register the wake-up user's voiceprint", it)
}.also {
// Release the native stream as soon as it is no longer needed
stream?.release()
LogUtils.d(TAG, "🔄 Wake-up enrollment stream released")
}
}
handleWakeupEvent()
return
}
@@ -338,7 +410,7 @@ class VoiceController(
val now = System.currentTimeMillis()
val duration = now - recordingStartMs
// ========== Step 1: basic filtering (speech too short) ==========
if (!vadStarted || duration < MIN_SPEECH_MS) {
LogUtils.d(TAG, "❌ Speech too short: $duration ms | baseline: $currentEnvBaseline")
hasInvalidSpeech = true
@@ -368,6 +440,18 @@ class VoiceController(
return
}
// ========== Speaker verification first (core: only the current wake-up user may pass) ==========
if (ENABLE_STRICT_SPEAKER_VERIFY) {
val isCurrentUser = verifySpeaker(audioBuffer.toFloatArray())
if (!isCurrentUser) {
LogUtils.w(TAG, "❌ Not the current wake-up user; rejecting speech | recording duration: $duration ms")
hasInvalidSpeech = true
resetToWaitSpeech()
return
}
LogUtils.d(TAG, "✅ 当前用户语音,继续处理 | 录音时长: $duration ms")
}
// ========== 1. Hard fallback: normal speech passes directly (lowered thresholds) ==========
val isNormalVoice = avgEnergy >= MIN_NORMAL_VOICE_ENERGY && vadRatio >= MIN_NORMAL_VOICE_VAD_RATIO
if (isNormalVoice) {
@@ -575,4 +659,44 @@ class VoiceController(
val minScore: Int,
val scene: String
)
/* ================= Core: native-stream speaker verification (current user only) ================= */
/**
* Checks whether the given speech belongs to the current wake-up user.
* @param audio speech samples to verify
* @return true if it is the current user, false otherwise
*/
private fun verifySpeaker(audio: FloatArray): Boolean {
var stream: OnlineStream? = null
return try {
stream = SpeakerRecognition.extractor.createStream()
stream.acceptWaveform(samples = audio, sampleRate = sampleRate)
stream.inputFinished()
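// The full recorded utterance is fed in one shot; inputFinished() marks the end of input so compute() can run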
// Compute the embedding and verify it against the enrolled user
if (!SpeakerRecognition.extractor.isReady(stream)) {
LogUtils.w(TAG, "❌ Verification stream not ready; verification failed")
return false
}
val embedding = SpeakerRecognition.extractor.compute(stream)
val verifyPass = SpeakerRecognition.manager.verify(name = CURRENT_USER_ID, embedding = embedding, threshold = SPEAKER_VERIFY_THRESHOLD)
if (verifyPass) {
LogUtils.d(TAG, "✅ Speaker verification passed")
} else {
LogUtils.w(TAG, "❌ Speaker verification failed")
}
verifyPass
} catch (e: Exception) {
LogUtils.e(TAG, "❌ 声纹验证异常", e)
false
} finally {
// Release the native stream once verification is done
stream?.release()
LogUtils.d(TAG, "🔄 Verification stream released")
}
}
}
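For reference, a minimal standalone sketch of the enroll-then-verify round trip implemented above. It assumes SpeakerRecognition.initExtractor(assetManager) has already been called and that both clips are mono float PCM at the given sample rate; the helper name enrollAndVerify is hypothetical, and only calls that already appear in this commit are used.

import com.k2fsa.sherpa.onnx.SpeakerRecognition

// Hypothetical helper: enroll one clip under userId, then check whether a second clip matches it.
fun enrollAndVerify(
    userId: String,
    enrollAudio: FloatArray,
    testAudio: FloatArray,
    sampleRate: Int,
    threshold: Float = 0.5f
): Boolean {
    val extractor = SpeakerRecognition.extractor
    val manager = SpeakerRecognition.manager

    // Compute a speaker embedding from one clip; returns null if the stream never becomes ready.
    fun embed(audio: FloatArray): FloatArray? {
        val stream = extractor.createStream()
        return try {
            stream.acceptWaveform(samples = audio, sampleRate = sampleRate)
            stream.inputFinished()
            if (extractor.isReady(stream)) extractor.compute(stream) else null
        } finally {
            stream.release()
        }
    }

    // Enroll: replace whatever was previously stored under this id.
    val enrollEmbedding = embed(enrollAudio) ?: return false
    manager.remove(userId)
    if (!manager.add(name = userId, embedding = arrayOf(enrollEmbedding))) return false

    // Verify: accept only if the new embedding is close enough to the enrolled one.
    val testEmbedding = embed(testAudio) ?: return false
    return manager.verify(name = userId, embedding = testEmbedding, threshold = threshold)
}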

View File

@@ -48,13 +48,10 @@ import com.zs.smarthuman.im.chat.bean.SingleMessage
import com.zs.smarthuman.kt.releaseIM
import com.zs.smarthuman.sherpa.TimeoutType
import com.zs.smarthuman.sherpa.VoiceController
import com.zs.smarthuman.sherpa.VoiceState
import com.zs.smarthuman.toast.Toaster
import com.zs.smarthuman.utils.AudioDebugUtil
import com.zs.smarthuman.utils.AudioPcmUtil
import com.zs.smarthuman.utils.DangerousUtils
import com.zs.smarthuman.utils.LogFileUtils
import com.zs.smarthuman.utils.SimulateStreamingAsr
import com.zs.smarthuman.utils.UnityPlayerHolder
import com.zs.smarthuman.utils.ViewSlideAnimator
@@ -101,7 +98,6 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
}
override fun initData() {
initAsrModel()
PermissionUtils.permissionGroup(PermissionConstants.MICROPHONE)
.callback(object : PermissionUtils.FullCallback {
override fun onGranted(granted: List<String?>) {
@@ -198,7 +194,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
voiceInfo = mutableListOf<VoiceBeanResp>().apply {
add(
VoiceBeanResp(
audioUrl = "https://static.seerteach.net/aidialogue/systemVoice/aliyun-nv.mp3"
audioUrl = "https://static.seerteach.net/aidialogue/userWakeUpAudio/344.mp3"
)
)
}
@@ -212,17 +208,17 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
},
onFinalAudio = { audio ->
Log.d("lrsxx", "检测到语音,长度=${audio.size}")
// mViewModel?.uploadVoice(
// AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)),
// 1
// )
loadLocalJsonAndPlay()
val file = File(
getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
"xxx.wav"
mViewModel?.uploadVoice(
AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)),
1
)
AudioDebugUtil.saveFloatPcmAsWav(audio, file)
LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
// loadLocalJsonAndPlay()
// val file = File(
// getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
// "xxx.wav"
// )
// AudioDebugUtil.saveFloatPcmAsWav(audio, file)
// LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
lifecycleScope.launch(Dispatchers.Main) {
mVerticalAnimator?.show()
@@ -258,16 +254,11 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
)
}
private fun initAsrModel(){
lifecycleScope.launch(Dispatchers.IO){
SimulateStreamingAsr.initOfflineRecognizer(App.getInstance())
}
}
override fun receivedIMMsg(msg: SingleMessage) {
when (msg.msgContentType) {
MessageContentType.RECEIVE_VOICE_STREAM.msgContentType -> {
lifecycleScope.launch(Dispatchers.IO) {
LogFileUtils.logToFile2(this@MainActivity,msg.content)
// LogFileUtils.logToFile2(this@MainActivity,msg.content)
UnityPlayerHolder.getInstance()
.startTalking(msg.content)
// loadLocalJsonAndPlay()

View File

@@ -1,155 +0,0 @@
package com.zs.smarthuman.utils
import android.content.Context
import android.content.res.AssetManager
import com.blankj.utilcode.util.LogUtils
import com.k2fsa.sherpa.onnx.OfflineModelConfig
import com.k2fsa.sherpa.onnx.OfflineRecognizer
import com.k2fsa.sherpa.onnx.OfflineRecognizerConfig
import com.k2fsa.sherpa.onnx.OfflineWenetCtcModelConfig
import com.k2fsa.sherpa.onnx.Vad
import com.k2fsa.sherpa.onnx.getVadModelConfig
import java.io.File
import java.io.FileOutputStream
import java.io.InputStream
import java.io.OutputStream
fun assetExists(assetManager: AssetManager, path: String): Boolean {
val dir = path.substringBeforeLast('/', "")
val fileName = path.substringAfterLast('/')
val files = assetManager.list(dir) ?: return false
return files.contains(fileName)
}
fun copyAssetToInternalStorage(path: String, context: Context): String {
val targetRoot = context.filesDir
val outFile = File(targetRoot, path)
if (!assetExists(context.assets, path = path)) {
// for the context binary: if it does not exist, we return a path
// that can be written to
outFile.parentFile?.mkdirs()
LogUtils.i("VoiceController", "$path does not exist, return ${outFile.absolutePath}")
return outFile.absolutePath
}
if (outFile.exists()) {
val assetSize = context.assets.open(path).use { it.available() }
if (outFile.length() == assetSize.toLong()) {
LogUtils.i("VoiceController", "$targetRoot/$path already exists, skip copying, return $targetRoot/$path")
return "$targetRoot/$path"
}
}
outFile.parentFile?.mkdirs()
context.assets.open(path).use { input: InputStream ->
FileOutputStream(outFile).use { output: OutputStream ->
input.copyTo(output)
}
}
LogUtils.i("VoiceController", "Copied $path to $targetRoot/$path")
return outFile.absolutePath
}
object SimulateStreamingAsr {
private var _recognizer: OfflineRecognizer? = null
val recognizer: OfflineRecognizer
get() {
return _recognizer!!
}
fun initOfflineRecognizer(context: Context) {
synchronized(this) {
if (_recognizer != null) {
return
}
val wenetConfig = OfflineWenetCtcModelConfig(
model = "sherpa-onnx-wenetspeech-yue-u2pp-conformer-ctc-zh-en-cantonese-int8-2025-09-10/model.int8.onnx",
)
val modelConfig = OfflineModelConfig(
wenetCtc = wenetConfig,
tokens = "sherpa-onnx-wenetspeech-yue-u2pp-conformer-ctc-zh-en-cantonese-int8-2025-09-10/tokens.txt"
)
val config = OfflineRecognizerConfig(
modelConfig = modelConfig,
)
var assetManager: AssetManager? = context.assets
if (config.modelConfig.provider == "qnn") {
// We assume you have copied files like libQnnHtpV81Skel.so to jniLibs/arm64-v8a
LogUtils.i("VoiceController", "nativelibdir: ${context.applicationInfo.nativeLibraryDir}")
// If we don't set the environment variable for ADSP_LIBRARY_PATH, we will see
// the error code 1008 from qnn_interface.deviceCreate()
// See also
// https://workbench.aihub.qualcomm.com/docs/hub/faq.html#why-am-i-seeing-error-1008-when-trying-to-use-htp
OfflineRecognizer.prependAdspLibraryPath(context.applicationInfo.nativeLibraryDir)
// for qnn, we need to copy *.so files from assets folder to sd card
if (config.modelConfig.senseVoice.qnnConfig.backendLib.isEmpty() && config.modelConfig.zipformerCtc.qnnConfig.backendLib.isEmpty()) {
LogUtils.i("VoiceController", "You should provide libQnnHtp.so for qnn")
throw IllegalArgumentException("You should provide libQnnHtp.so for qnn")
}
config.modelConfig.tokens =
copyAssetToInternalStorage(config.modelConfig.tokens, context)
if (config.modelConfig.senseVoice.model.isNotEmpty() || assetExists(
context.assets,
path = config.modelConfig.senseVoice.qnnConfig.contextBinary
)
) {
if (config.modelConfig.senseVoice.model.isNotEmpty()) {
config.modelConfig.senseVoice.model =
copyAssetToInternalStorage(config.modelConfig.senseVoice.model, context)
}
config.modelConfig.senseVoice.qnnConfig.contextBinary =
copyAssetToInternalStorage(
config.modelConfig.senseVoice.qnnConfig.contextBinary,
context
)
} else if (config.modelConfig.zipformerCtc.model.isNotEmpty()) {
config.modelConfig.zipformerCtc.model =
copyAssetToInternalStorage(config.modelConfig.zipformerCtc.model, context)
config.modelConfig.zipformerCtc.qnnConfig.contextBinary =
copyAssetToInternalStorage(
config.modelConfig.zipformerCtc.qnnConfig.contextBinary,
context
)
}
if (config.hr.lexicon.isNotEmpty()) {
config.hr.lexicon = copyAssetToInternalStorage(config.hr.lexicon, context)
}
if (config.hr.ruleFsts.isNotEmpty()) {
// It assumes there is only one FST; otherwise, you need to copy each FST separately
config.hr.ruleFsts = copyAssetToInternalStorage(config.hr.ruleFsts, context)
}
assetManager = null
}
_recognizer = OfflineRecognizer(
assetManager = assetManager,
config = config,
)
LogUtils.i("VoiceController", "sherpa-onnx offline recognizer initialized")
}
}
}