Add speaker verification

林若思 2026-01-10 17:30:32 +08:00
parent d01e43cd56
commit b7fc6d4ee0
8 changed files with 138 additions and 8812 deletions

View File

@@ -1,3 +1,2 @@
x iǎo z ì t óng x ué @小智同学
x iǎo z ì @小智
x iǎo z ì @小志
x iǎo z ì t óng x ué @小志同学

View File

@@ -1,4 +0,0 @@
# Introduction
Model in this directory is converted from
https://huggingface.co/ASLP-lab/WSYue-ASR/tree/main/u2pp_conformer_yue

View File

@@ -1,8 +1,16 @@
package com.zs.smarthuman.sherpa
import android.content.res.AssetManager
import android.text.TextUtils
import com.blankj.utilcode.util.LogUtils
import com.k2fsa.sherpa.onnx.OnlineStream
import com.k2fsa.sherpa.onnx.SpeakerRecognition
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.launch
import java.util.ArrayDeque
import java.util.concurrent.locks.ReentrantLock
import kotlin.concurrent.withLock
class VoiceController(
assetManager: AssetManager,
@@ -118,11 +126,75 @@ class VoiceController(
private var hasInvalidSpeech = false
private var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT
// ========== Core configuration: speaker (voiceprint) verification ==========
private val CURRENT_USER_ID = "current_wakeup_user" // unique id of the current wake-up user
private val ENABLE_STRICT_SPEAKER_VERIFY = true // strict verification switch
private val SPEAKER_VERIFY_THRESHOLD = 0.5f // similarity threshold for speaker verification
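// (Assumption: manager.verify() passes only when the embedding similarity reaches this threshold, so a larger value makes the check stricter.)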
init {
// Initialize the speaker recognizer (wraps the native sherpa-onnx API)
try {
SpeakerRecognition.initExtractor(assetManager) // aligned with the native API
LogUtils.d(TAG, "✅ Speaker recognizer initialized (native Stream version)")
} catch (e: Exception) {
LogUtils.e(TAG, "❌ Failed to initialize the speaker recognizer", e)
throw RuntimeException("Speaker recognition initialization failed", e)
}
}
/* ================= Audio entry point ================= */
fun acceptAudio(samples: FloatArray) {
cachePreBuffer(samples)
wakeupManager.acceptAudio(samples)
if (wakeupManager.consumeWakeupFlag()) {
handleWakeupEvent()
// Register the wake-up user's voiceprint (runs asynchronously)
CoroutineScope(Dispatchers.IO).launch {
var stream: OnlineStream? = null
runCatching {
val wakeupAudio = preBuffer.toFloatArray()
if (wakeupAudio.isEmpty()) {
LogUtils.w(TAG, "❌ Wake-up audio buffer is empty; cannot register the user's voiceprint")
return@launch
}
// Create a native stream and feed it the cached wake-up audio
val enrollStream = SpeakerRecognition.extractor.createStream()
stream = enrollStream
enrollStream.acceptWaveform(samples = wakeupAudio, sampleRate = sampleRate)
enrollStream.inputFinished()
// Compute the embedding and register it (current user only)
if (SpeakerRecognition.extractor.isReady(enrollStream)) {
val embedding = SpeakerRecognition.extractor.compute(enrollStream)
// Drop any previously stored embedding so only the current user is enrolled
SpeakerRecognition.manager.remove(CURRENT_USER_ID)
// Register the current wake-up user via manager.add
val embeddingList: MutableList<FloatArray> = mutableListOf()
embeddingList.add(embedding)
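// The Array<FloatArray> overload can enroll several embeddings under one name; only one is stored here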
val ok = SpeakerRecognition.manager.add(
name = CURRENT_USER_ID,
embedding = embeddingList.toTypedArray()
)
if (ok) {
LogUtils.d(TAG, "✅ 注册当前唤醒用户特征成功 | 特征长度: ${embedding.size}")
} else {
LogUtils.w(TAG, "❌ 注册当前唤醒用户特征失败manager.add返回false")
}
} else {
LogUtils.w(TAG, "❌ 唤醒音频Stream未就绪跳过用户注册")
}
}.onFailure {
LogUtils.e(TAG, "❌ Failed to register the wake-up user's voiceprint", it)
}.also {
// Release the native stream as soon as it is no longer needed
stream?.release()
LogUtils.d(TAG, "🔄 Wake-up enrollment stream released")
}
}
handleWakeupEvent()
return
}
@@ -338,7 +410,7 @@ class VoiceController(
val now = System.currentTimeMillis()
val duration = now - recordingStartMs
// ========== Step 1: basic filtering (speech too short) ==========
if (!vadStarted || duration < MIN_SPEECH_MS) {
LogUtils.d(TAG, "❌ Speech too short: $duration ms | baseline: $currentEnvBaseline")
hasInvalidSpeech = true
@@ -368,6 +440,18 @@ class VoiceController(
return
}
// ========== Speaker verification first (core: only the current wake-up user may pass) ==========
if (ENABLE_STRICT_SPEAKER_VERIFY) {
val isCurrentUser = verifySpeaker(audioBuffer.toFloatArray())
if (!isCurrentUser) {
LogUtils.w(TAG, "❌ Not the current wake-up user; rejecting speech | recording duration: $duration ms")
hasInvalidSpeech = true
resetToWaitSpeech()
return
}
LogUtils.d(TAG, "✅ 当前用户语音,继续处理 | 录音时长: $duration ms")
}
// ========== 1. Hard fallback: normal speech passes directly (lowered thresholds) ==========
val isNormalVoice = avgEnergy >= MIN_NORMAL_VOICE_ENERGY && vadRatio >= MIN_NORMAL_VOICE_VAD_RATIO
if (isNormalVoice) {
@@ -575,4 +659,44 @@ class VoiceController(
val minScore: Int,
val scene: String
)
/* ================= Core: native-stream speaker verification (current user only) ================= */
/**
* Checks whether the given speech belongs to the current wake-up user.
* @param audio speech samples to verify
* @return true if it is the current user, false otherwise
*/
private fun verifySpeaker(audio: FloatArray): Boolean {
var stream: OnlineStream? = null
return try {
stream = SpeakerRecognition.extractor.createStream()
stream.acceptWaveform(samples = audio, sampleRate = sampleRate)
stream.inputFinished()
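// The full recorded utterance is fed in one shot; inputFinished() marks the end of input so compute() can run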
// Compute the embedding and verify it against the enrolled user
if (!SpeakerRecognition.extractor.isReady(stream)) {
LogUtils.w(TAG, "❌ Verification stream not ready; verification failed")
return false
}
val embedding = SpeakerRecognition.extractor.compute(stream)
val verifyPass = SpeakerRecognition.manager.verify(name = CURRENT_USER_ID, embedding = embedding, threshold = SPEAKER_VERIFY_THRESHOLD)
if (verifyPass) {
LogUtils.d(TAG, "✅ Speaker verification passed")
} else {
LogUtils.w(TAG, "❌ Speaker verification failed")
}
verifyPass
} catch (e: Exception) {
LogUtils.e(TAG, "❌ 声纹验证异常", e)
false
} finally {
// Release the native stream once verification is done
stream?.release()
LogUtils.d(TAG, "🔄 Verification stream released")
}
}
}
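For reference, a minimal standalone sketch of the enroll-then-verify round trip implemented above. It assumes SpeakerRecognition.initExtractor(assetManager) has already been called and that both clips are mono float PCM at the given sample rate; the helper name enrollAndVerify is hypothetical, and only calls that already appear in this commit are used.

import com.k2fsa.sherpa.onnx.SpeakerRecognition

// Hypothetical helper: enroll one clip under userId, then check whether a second clip matches it.
fun enrollAndVerify(
    userId: String,
    enrollAudio: FloatArray,
    testAudio: FloatArray,
    sampleRate: Int,
    threshold: Float = 0.5f
): Boolean {
    val extractor = SpeakerRecognition.extractor
    val manager = SpeakerRecognition.manager

    // Compute a speaker embedding from one clip; returns null if the stream never becomes ready.
    fun embed(audio: FloatArray): FloatArray? {
        val stream = extractor.createStream()
        return try {
            stream.acceptWaveform(samples = audio, sampleRate = sampleRate)
            stream.inputFinished()
            if (extractor.isReady(stream)) extractor.compute(stream) else null
        } finally {
            stream.release()
        }
    }

    // Enroll: replace whatever was previously stored under this id.
    val enrollEmbedding = embed(enrollAudio) ?: return false
    manager.remove(userId)
    if (!manager.add(name = userId, embedding = arrayOf(enrollEmbedding))) return false

    // Verify: accept only if the new embedding is close enough to the enrolled one.
    val testEmbedding = embed(testAudio) ?: return false
    return manager.verify(name = userId, embedding = testEmbedding, threshold = threshold)
}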

View File

@@ -48,13 +48,10 @@ import com.zs.smarthuman.im.chat.bean.SingleMessage
import com.zs.smarthuman.kt.releaseIM
import com.zs.smarthuman.sherpa.TimeoutType
import com.zs.smarthuman.sherpa.VoiceController
import com.zs.smarthuman.sherpa.VoiceState
import com.zs.smarthuman.toast.Toaster
import com.zs.smarthuman.utils.AudioDebugUtil
import com.zs.smarthuman.utils.AudioPcmUtil
import com.zs.smarthuman.utils.DangerousUtils
import com.zs.smarthuman.utils.LogFileUtils
import com.zs.smarthuman.utils.SimulateStreamingAsr
import com.zs.smarthuman.utils.UnityPlayerHolder
import com.zs.smarthuman.utils.ViewSlideAnimator
@@ -101,7 +98,6 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
}
override fun initData() {
initAsrModel()
PermissionUtils.permissionGroup(PermissionConstants.MICROPHONE)
.callback(object : PermissionUtils.FullCallback {
override fun onGranted(granted: List<String?>) {
@@ -198,7 +194,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
voiceInfo = mutableListOf<VoiceBeanResp>().apply {
add(
VoiceBeanResp(
audioUrl = "https://static.seerteach.net/aidialogue/systemVoice/aliyun-nv.mp3"
audioUrl = "https://static.seerteach.net/aidialogue/userWakeUpAudio/344.mp3"
)
)
}
@@ -212,17 +208,17 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
},
onFinalAudio = { audio ->
Log.d("lrsxx", "检测到语音,长度=${audio.size}")
// mViewModel?.uploadVoice(
// AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)),
// 1
// )
loadLocalJsonAndPlay()
val file = File(
getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
"xxx.wav"
mViewModel?.uploadVoice(
AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)),
1
)
AudioDebugUtil.saveFloatPcmAsWav(audio, file)
LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
// loadLocalJsonAndPlay()
// val file = File(
// getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
// "xxx.wav"
// )
// AudioDebugUtil.saveFloatPcmAsWav(audio, file)
// LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
lifecycleScope.launch(Dispatchers.Main) {
mVerticalAnimator?.show()
@@ -258,16 +254,11 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
)
}
private fun initAsrModel(){
lifecycleScope.launch(Dispatchers.IO){
SimulateStreamingAsr.initOfflineRecognizer(App.getInstance())
}
}
override fun receivedIMMsg(msg: SingleMessage) {
when (msg.msgContentType) {
MessageContentType.RECEIVE_VOICE_STREAM.msgContentType -> {
lifecycleScope.launch(Dispatchers.IO) {
LogFileUtils.logToFile2(this@MainActivity,msg.content)
// LogFileUtils.logToFile2(this@MainActivity,msg.content)
UnityPlayerHolder.getInstance()
.startTalking(msg.content)
// loadLocalJsonAndPlay()

View File

@@ -1,155 +0,0 @@
package com.zs.smarthuman.utils
import android.content.Context
import android.content.res.AssetManager
import com.blankj.utilcode.util.LogUtils
import com.k2fsa.sherpa.onnx.OfflineModelConfig
import com.k2fsa.sherpa.onnx.OfflineRecognizer
import com.k2fsa.sherpa.onnx.OfflineRecognizerConfig
import com.k2fsa.sherpa.onnx.OfflineWenetCtcModelConfig
import com.k2fsa.sherpa.onnx.Vad
import com.k2fsa.sherpa.onnx.getVadModelConfig
import java.io.File
import java.io.FileOutputStream
import java.io.InputStream
import java.io.OutputStream
fun assetExists(assetManager: AssetManager, path: String): Boolean {
val dir = path.substringBeforeLast('/', "")
val fileName = path.substringAfterLast('/')
val files = assetManager.list(dir) ?: return false
return files.contains(fileName)
}
fun copyAssetToInternalStorage(path: String, context: Context): String {
val targetRoot = context.filesDir
val outFile = File(targetRoot, path)
if (!assetExists(context.assets, path = path)) {
// for the context binary: if it does not exist, we return a path
// that can be written to
outFile.parentFile?.mkdirs()
LogUtils.i("VoiceController", "$path does not exist, return ${outFile.absolutePath}")
return outFile.absolutePath
}
if (outFile.exists()) {
val assetSize = context.assets.open(path).use { it.available() }
if (outFile.length() == assetSize.toLong()) {
LogUtils.i("VoiceController", "$targetRoot/$path already exists, skip copying, return $targetRoot/$path")
return "$targetRoot/$path"
}
}
outFile.parentFile?.mkdirs()
context.assets.open(path).use { input: InputStream ->
FileOutputStream(outFile).use { output: OutputStream ->
input.copyTo(output)
}
}
LogUtils.i("VoiceController", "Copied $path to $targetRoot/$path")
return outFile.absolutePath
}
object SimulateStreamingAsr {
private var _recognizer: OfflineRecognizer? = null
val recognizer: OfflineRecognizer
get() {
return _recognizer!!
}
fun initOfflineRecognizer(context: Context) {
synchronized(this) {
if (_recognizer != null) {
return
}
val wenetConfig = OfflineWenetCtcModelConfig(
model = "sherpa-onnx-wenetspeech-yue-u2pp-conformer-ctc-zh-en-cantonese-int8-2025-09-10/model.int8.onnx",
)
val modelConfig = OfflineModelConfig(
wenetCtc = wenetConfig,
tokens = "sherpa-onnx-wenetspeech-yue-u2pp-conformer-ctc-zh-en-cantonese-int8-2025-09-10/tokens.txt"
)
val config = OfflineRecognizerConfig(
modelConfig = modelConfig,
)
var assetManager: AssetManager? = context.assets
if (config.modelConfig.provider == "qnn") {
// We assume you have copied files like libQnnHtpV81Skel.so to jniLibs/arm64-v8a
LogUtils.i("VoiceController", "nativelibdir: ${context.applicationInfo.nativeLibraryDir}")
// If we don't set the environment variable for ADSP_LIBRARY_PATH, we will see
// the error code 1008 from qnn_interface.deviceCreate()
// See also
// https://workbench.aihub.qualcomm.com/docs/hub/faq.html#why-am-i-seeing-error-1008-when-trying-to-use-htp
OfflineRecognizer.prependAdspLibraryPath(context.applicationInfo.nativeLibraryDir)
// for qnn, we need to copy *.so files from assets folder to sd card
if (config.modelConfig.senseVoice.qnnConfig.backendLib.isEmpty() && config.modelConfig.zipformerCtc.qnnConfig.backendLib.isEmpty()) {
LogUtils.i("VoiceController", "You should provide libQnnHtp.so for qnn")
throw IllegalArgumentException("You should provide libQnnHtp.so for qnn")
}
config.modelConfig.tokens =
copyAssetToInternalStorage(config.modelConfig.tokens, context)
if (config.modelConfig.senseVoice.model.isNotEmpty() || assetExists(
context.assets,
path = config.modelConfig.senseVoice.qnnConfig.contextBinary
)
) {
if (config.modelConfig.senseVoice.model.isNotEmpty()) {
config.modelConfig.senseVoice.model =
copyAssetToInternalStorage(config.modelConfig.senseVoice.model, context)
}
config.modelConfig.senseVoice.qnnConfig.contextBinary =
copyAssetToInternalStorage(
config.modelConfig.senseVoice.qnnConfig.contextBinary,
context
)
} else if (config.modelConfig.zipformerCtc.model.isNotEmpty()) {
config.modelConfig.zipformerCtc.model =
copyAssetToInternalStorage(config.modelConfig.zipformerCtc.model, context)
config.modelConfig.zipformerCtc.qnnConfig.contextBinary =
copyAssetToInternalStorage(
config.modelConfig.zipformerCtc.qnnConfig.contextBinary,
context
)
}
if (config.hr.lexicon.isNotEmpty()) {
config.hr.lexicon = copyAssetToInternalStorage(config.hr.lexicon, context)
}
if (config.hr.ruleFsts.isNotEmpty()) {
// It assumes there is only one FST; otherwise, you need to copy each FST separately
config.hr.ruleFsts = copyAssetToInternalStorage(config.hr.ruleFsts, context)
}
assetManager = null
}
_recognizer = OfflineRecognizer(
assetManager = assetManager,
config = config,
)
LogUtils.i("VoiceController", "sherpa-onnx offline recognizer initialized")
}
}
}