Add speaker verification

林若思 2026-01-10 17:30:32 +08:00
parent d01e43cd56
commit b7fc6d4ee0
8 changed files with 138 additions and 8812 deletions

View File

@ -1,3 +1,2 @@
 x iǎo z ì t óng x ué @小智同学
-x iǎo z ì @小智
-x iǎo z ì @小志
+x iǎo z ì t óng x ué @小志同学
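Note on the hunk above: this is presumably a sherpa-onnx keyword-spotting keywords file, where each line lists a wake phrase as space-separated subword/pinyin tokens followed by @ and its display form; the commit drops the short wake words 小智/小志 and keeps only the full phrases. A minimal Kotlin sketch of parsing this layout (hypothetical helper, assuming the standard tokens-@display format):

data class Keyword(val tokens: List<String>, val display: String)

// Parse lines like "x iǎo z ì t óng x ué @小智同学" into tokens + display form.
fun parseKeywords(lines: List<String>): List<Keyword> =
    lines.filter { it.isNotBlank() }.map { line ->
        val at = line.lastIndexOf('@')
        require(at > 0) { "missing @display form: $line" }
        Keyword(
            tokens = line.substring(0, at).trim().split(Regex("\\s+")),
            display = line.substring(at + 1).trim()
        )
    }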

View File

@ -1,4 +0,0 @@
# Introduction
Model in this directory is converted from
https://huggingface.co/ASLP-lab/WSYue-ASR/tree/main/u2pp_conformer_yue

View File

@ -1,8 +1,16 @@
 package com.zs.smarthuman.sherpa

 import android.content.res.AssetManager
+import android.text.TextUtils
 import com.blankj.utilcode.util.LogUtils
+import com.k2fsa.sherpa.onnx.OnlineStream
+import com.k2fsa.sherpa.onnx.SpeakerRecognition
+import kotlinx.coroutines.CoroutineScope
+import kotlinx.coroutines.Dispatchers
+import kotlinx.coroutines.launch
 import java.util.ArrayDeque
+import java.util.concurrent.locks.ReentrantLock
+import kotlin.concurrent.withLock

 class VoiceController(
     assetManager: AssetManager,
@ -118,11 +126,75 @@ class VoiceController(
     private var hasInvalidSpeech = false
     private var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT

+    // ========== Core configuration: speaker verification ==========
+    private val CURRENT_USER_ID = "current_wakeup_user"  // unique id for the current wake-up user
+    private val ENABLE_STRICT_SPEAKER_VERIFY = true      // strict-verification switch
+    private val SPEAKER_VERIFY_THRESHOLD = 0.5f          // similarity threshold for verification
+
+    init {
+        // Initialize the speaker-embedding extractor
+        try {
+            SpeakerRecognition.initExtractor(assetManager)
+            LogUtils.d(TAG, "✅ Speaker recognizer initialized (native Stream version)")
+        } catch (e: Exception) {
+            LogUtils.e(TAG, "❌ Speaker recognizer initialization failed", e)
+            throw RuntimeException("Speaker recognition initialization failed", e)
+        }
+    }
+
     /* ================= Audio entry point ================= */
     fun acceptAudio(samples: FloatArray) {
         cachePreBuffer(samples)
         wakeupManager.acceptAudio(samples)

         if (wakeupManager.consumeWakeupFlag()) {
-            handleWakeupEvent()
+            // Register the wake-up user's voiceprint (runs asynchronously)
+            CoroutineScope(Dispatchers.IO).launch {
+                var stream: OnlineStream? = null
+                runCatching {
+                    val wakeupAudio = preBuffer.toFloatArray()
+                    if (wakeupAudio.isEmpty()) {
+                        LogUtils.w(TAG, "❌ Wake-up audio buffer is empty, cannot register the user's voiceprint")
+                        return@launch
+                    }
+                    // Create a native stream and feed it the buffered wake-up audio
+                    val regStream = SpeakerRecognition.extractor.createStream()
+                    stream = regStream
+                    regStream.acceptWaveform(samples = wakeupAudio, sampleRate = sampleRate)
+                    regStream.inputFinished()
+                    // Compute the embedding and register it (current user only)
+                    if (SpeakerRecognition.extractor.isReady(regStream)) {
+                        val embedding = SpeakerRecognition.extractor.compute(regStream)
+                        // Drop any previous embedding so the current user is the only one registered
+                        SpeakerRecognition.manager.remove(CURRENT_USER_ID)
+                        val embeddingList: MutableList<FloatArray> = mutableListOf()
+                        embeddingList.add(embedding)
+                        val ok = SpeakerRecognition.manager.add(
+                            name = CURRENT_USER_ID,
+                            embedding = embeddingList.toTypedArray()
+                        )
+                        if (ok) {
+                            LogUtils.d(TAG, "✅ Registered current wake-up user | embedding size: ${embedding.size}")
+                        } else {
+                            LogUtils.w(TAG, "❌ Failed to register current wake-up user (manager.add returned false)")
+                        }
+                    } else {
+                        LogUtils.w(TAG, "❌ Wake-up audio stream not ready, skipping user registration")
+                    }
+                }.onFailure {
+                    LogUtils.e(TAG, "❌ Wake-up user voiceprint registration failed", it)
+                }.also {
+                    // Release the native stream as soon as it is done
+                    stream?.release()
+                    LogUtils.d(TAG, "🔄 Wake-up registration stream released")
+                }
+            }
+            handleWakeupEvent()
             return
         }
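A design note on the registration block above: CoroutineScope(Dispatchers.IO).launch is a fire-and-forget scope tied to no lifecycle, so a pending registration can outlive the controller. A minimal sketch of a controller-owned scope that would make the work cancellable (hypothetical class and method names, not part of this commit):

import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.SupervisorJob
import kotlinx.coroutines.cancel
import kotlinx.coroutines.launch

// Hypothetical wrapper: a controller-owned scope so an in-flight voiceprint
// registration is cancelled when the controller itself is released.
class ScopedRegistration {
    private val scope = CoroutineScope(SupervisorJob() + Dispatchers.IO)

    fun registerAsync(work: suspend () -> Unit) {
        scope.launch { work() }
    }

    fun release() {
        scope.cancel() // cancels any pending registration
    }
}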
@ -338,7 +410,7 @@ class VoiceController(
         val now = System.currentTimeMillis()
         val duration = now - recordingStartMs

+        // ========== Step 0: basic filtering (speech too short) ==========
         if (!vadStarted || duration < MIN_SPEECH_MS) {
             LogUtils.d(TAG, "❌ Speech too short: $duration ms | baseline: $currentEnvBaseline")
             hasInvalidSpeech = true
@ -368,6 +440,18 @@ class VoiceController(
             return
         }

+        // ========== Step 1: speaker verification first (core: only the current user passes) ==========
+        if (ENABLE_STRICT_SPEAKER_VERIFY) {
+            val isCurrentUser = verifySpeaker(audioBuffer.toFloatArray())
+            if (!isCurrentUser) {
+                LogUtils.w(TAG, "❌ Not the current wake-up user, rejecting speech | duration: $duration ms")
+                hasInvalidSpeech = true
+                resetToWaitSpeech()
+                return
+            }
+            LogUtils.d(TAG, "✅ Current user's speech, continuing | duration: $duration ms")
+        }
+
         // ========== 1. Forced fallback: normal speech passes directly (lowered thresholds) ==========
         val isNormalVoice = avgEnergy >= MIN_NORMAL_VOICE_ENERGY && vadRatio >= MIN_NORMAL_VOICE_VAD_RATIO
         if (isNormalVoice) {
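The fallback above gates on avgEnergy and vadRatio, whose definitions sit outside this diff. For intuition, a hypothetical sketch of how such features are commonly computed (RMS energy and voiced-frame ratio; assumed, not this project's actual code):

import kotlin.math.sqrt

// Hypothetical illustration of the two gating features used above.
fun rmsEnergy(samples: FloatArray): Float {
    if (samples.isEmpty()) return 0f
    var sum = 0.0
    for (s in samples) sum += s * s
    return sqrt(sum / samples.size).toFloat()
}

// Fraction of frames the VAD marked as speech.
fun vadRatio(vadFlags: BooleanArray): Float =
    if (vadFlags.isEmpty()) 0f else vadFlags.count { it }.toFloat() / vadFlags.size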
@ -575,4 +659,44 @@ class VoiceController(
         val minScore: Int,
         val scene: String
     )

+    /* ================= Core: native-Stream speaker verification (current user only) ================= */
+    /**
+     * Verify whether the given audio belongs to the current wake-up user.
+     * @param audio audio samples to verify
+     * @return true if it is the current user, false otherwise
+     */
+    private fun verifySpeaker(audio: FloatArray): Boolean {
+        var stream: OnlineStream? = null
+        return try {
+            val s = SpeakerRecognition.extractor.createStream()
+            stream = s
+            s.acceptWaveform(samples = audio, sampleRate = sampleRate)
+            s.inputFinished()
+            // Compute the embedding and verify it against the registered user
+            if (!SpeakerRecognition.extractor.isReady(s)) {
+                LogUtils.w(TAG, "❌ Verification stream not ready, verification failed")
+                return false
+            }
+            val embedding = SpeakerRecognition.extractor.compute(s)
+            val verifyPass = SpeakerRecognition.manager.verify(
+                name = CURRENT_USER_ID,
+                embedding = embedding,
+                threshold = SPEAKER_VERIFY_THRESHOLD
+            )
+            if (verifyPass) {
+                LogUtils.d(TAG, "✅ Speaker verification passed")
+            } else {
+                LogUtils.w(TAG, "❌ Speaker verification failed")
+            }
+            verifyPass
+        } catch (e: Exception) {
+            LogUtils.e(TAG, "❌ Speaker verification exception", e)
+            false
+        } finally {
+            // Release the native stream as soon as it is done
+            stream?.release()
+            LogUtils.d(TAG, "🔄 Verification stream released")
+        }
+    }
 }
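manager.verify above scores the fresh embedding against the enrolled one and applies SPEAKER_VERIFY_THRESHOLD. For intuition, a minimal sketch of the cosine-similarity check that speaker-verification backends typically perform (hypothetical standalone helpers, not the sherpa-onnx internals):

import kotlin.math.sqrt

// Cosine similarity between two speaker embeddings of equal dimension.
fun cosineSimilarity(a: FloatArray, b: FloatArray): Float {
    require(a.size == b.size) { "embedding dimensions must match" }
    var dot = 0f; var na = 0f; var nb = 0f
    for (i in a.indices) {
        dot += a[i] * b[i]
        na += a[i] * a[i]
        nb += b[i] * b[i]
    }
    return dot / (sqrt(na) * sqrt(nb) + 1e-9f)
}

// Accept the speaker if similarity clears the threshold (0.5f in this commit).
fun wouldVerify(enrolled: FloatArray, probe: FloatArray, threshold: Float): Boolean =
    cosineSimilarity(enrolled, probe) >= threshold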

View File

@ -48,13 +48,10 @@ import com.zs.smarthuman.im.chat.bean.SingleMessage
 import com.zs.smarthuman.kt.releaseIM
 import com.zs.smarthuman.sherpa.TimeoutType
 import com.zs.smarthuman.sherpa.VoiceController
-import com.zs.smarthuman.sherpa.VoiceState
 import com.zs.smarthuman.toast.Toaster
-import com.zs.smarthuman.utils.AudioDebugUtil
 import com.zs.smarthuman.utils.AudioPcmUtil
 import com.zs.smarthuman.utils.DangerousUtils
 import com.zs.smarthuman.utils.LogFileUtils
-import com.zs.smarthuman.utils.SimulateStreamingAsr
 import com.zs.smarthuman.utils.UnityPlayerHolder
 import com.zs.smarthuman.utils.ViewSlideAnimator
@ -101,7 +98,6 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
     }

     override fun initData() {
-        initAsrModel()
         PermissionUtils.permissionGroup(PermissionConstants.MICROPHONE)
             .callback(object : PermissionUtils.FullCallback {
                 override fun onGranted(granted: List<String?>) {
@ -198,7 +194,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
         voiceInfo = mutableListOf<VoiceBeanResp>().apply {
             add(
                 VoiceBeanResp(
-                    audioUrl = "https://static.seerteach.net/aidialogue/systemVoice/aliyun-nv.mp3"
+                    audioUrl = "https://static.seerteach.net/aidialogue/userWakeUpAudio/344.mp3"
                 )
             )
         }
@ -212,17 +208,17 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
             },
             onFinalAudio = { audio ->
                 Log.d("lrsxx", "Speech detected, length=${audio.size}")
-//                mViewModel?.uploadVoice(
-//                    AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)),
-//                    1
-//                )
-                loadLocalJsonAndPlay()
-                val file = File(
-                    getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
-                    "xxx.wav"
-                )
-                AudioDebugUtil.saveFloatPcmAsWav(audio, file)
-                LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
+                mViewModel?.uploadVoice(
+                    AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)),
+                    1
+                )
+//                loadLocalJsonAndPlay()
+//                val file = File(
+//                    getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
+//                    "xxx.wav"
+//                )
+//                AudioDebugUtil.saveFloatPcmAsWav(audio, file)
+//                LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")

                 lifecycleScope.launch(Dispatchers.Main) {
                     mVerticalAnimator?.show()
@ -258,16 +254,11 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
             )
         }

-    private fun initAsrModel() {
-        lifecycleScope.launch(Dispatchers.IO) {
-            SimulateStreamingAsr.initOfflineRecognizer(App.getInstance())
-        }
-    }

     override fun receivedIMMsg(msg: SingleMessage) {
         when (msg.msgContentType) {
             MessageContentType.RECEIVE_VOICE_STREAM.msgContentType -> {
                 lifecycleScope.launch(Dispatchers.IO) {
-                    LogFileUtils.logToFile2(this@MainActivity, msg.content)
+//                    LogFileUtils.logToFile2(this@MainActivity, msg.content)
                     UnityPlayerHolder.getInstance()
                         .startTalking(msg.content)
                     // loadLocalJsonAndPlay()
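The uploadVoice path above converts the float samples to 16-bit PCM and Base64-encodes them. AudioPcmUtil is not shown in this diff; a minimal sketch of what floatToPcm16 and pcm16ToBase64 plausibly do (assumed behavior: clamp to [-1, 1], scale, little-endian bytes):

import java.nio.ByteBuffer
import java.nio.ByteOrder
import java.util.Base64

// Assumed behavior of AudioPcmUtil (not shown in this diff).
fun floatToPcm16(samples: FloatArray): ShortArray =
    ShortArray(samples.size) { i ->
        val v = samples[i].coerceIn(-1f, 1f)
        (v * Short.MAX_VALUE).toInt().toShort()
    }

fun pcm16ToBase64(pcm: ShortArray): String {
    val buf = ByteBuffer.allocate(pcm.size * 2).order(ByteOrder.LITTLE_ENDIAN)
    pcm.forEach { buf.putShort(it) }
    return Base64.getEncoder().encodeToString(buf.array())
}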

View File

@ -1,155 +0,0 @@
package com.zs.smarthuman.utils

import android.content.Context
import android.content.res.AssetManager
import com.blankj.utilcode.util.LogUtils
import com.k2fsa.sherpa.onnx.OfflineModelConfig
import com.k2fsa.sherpa.onnx.OfflineRecognizer
import com.k2fsa.sherpa.onnx.OfflineRecognizerConfig
import com.k2fsa.sherpa.onnx.OfflineWenetCtcModelConfig
import com.k2fsa.sherpa.onnx.Vad
import com.k2fsa.sherpa.onnx.getVadModelConfig
import java.io.File
import java.io.FileOutputStream
import java.io.InputStream
import java.io.OutputStream

fun assetExists(assetManager: AssetManager, path: String): Boolean {
    val dir = path.substringBeforeLast('/', "")
    val fileName = path.substringAfterLast('/')
    val files = assetManager.list(dir) ?: return false
    return files.contains(fileName)
}

fun copyAssetToInternalStorage(path: String, context: Context): String {
    val targetRoot = context.filesDir
    val outFile = File(targetRoot, path)

    if (!assetExists(context.assets, path = path)) {
        // for a context binary, if it does not exist, we return a path
        // that can be written to
        outFile.parentFile?.mkdirs()
        LogUtils.i("VoiceController", "$path does not exist, return ${outFile.absolutePath}")
        return outFile.absolutePath
    }

    if (outFile.exists()) {
        val assetSize = context.assets.open(path).use { it.available() }
        if (outFile.length() == assetSize.toLong()) {
            LogUtils.i("VoiceController", "$targetRoot/$path already exists, skip copying, return $targetRoot/$path")
            return "$targetRoot/$path"
        }
    }

    outFile.parentFile?.mkdirs()
    context.assets.open(path).use { input: InputStream ->
        FileOutputStream(outFile).use { output: OutputStream ->
            input.copyTo(output)
        }
    }
    LogUtils.i("VoiceController", "Copied $path to $targetRoot/$path")
    return outFile.absolutePath
}

object SimulateStreamingAsr {
    private var _recognizer: OfflineRecognizer? = null

    val recognizer: OfflineRecognizer
        get() {
            return _recognizer!!
        }

    fun initOfflineRecognizer(context: Context) {
        synchronized(this) {
            if (_recognizer != null) {
                return
            }

            val wenetConfig = OfflineWenetCtcModelConfig(
                model = "sherpa-onnx-wenetspeech-yue-u2pp-conformer-ctc-zh-en-cantonese-int8-2025-09-10/model.int8.onnx",
            )
            val modelConfig = OfflineModelConfig(
                wenetCtc = wenetConfig,
                tokens = "sherpa-onnx-wenetspeech-yue-u2pp-conformer-ctc-zh-en-cantonese-int8-2025-09-10/tokens.txt"
            )
            val config = OfflineRecognizerConfig(
                modelConfig = modelConfig,
            )

            var assetManager: AssetManager? = context.assets
            if (config.modelConfig.provider == "qnn") {
                // We assume you have copied files like libQnnHtpV81Skel.so to jniLibs/arm64-v8a
                LogUtils.i("VoiceController", "nativeLibraryDir: ${context.applicationInfo.nativeLibraryDir}")

                // If we don't set the environment variable ADSP_LIBRARY_PATH, we will see
                // error code 1008 from qnn_interface.deviceCreate()
                // See also
                // https://workbench.aihub.qualcomm.com/docs/hub/faq.html#why-am-i-seeing-error-1008-when-trying-to-use-htp
                OfflineRecognizer.prependAdspLibraryPath(context.applicationInfo.nativeLibraryDir)

                // for qnn, we need to copy *.so files from the assets folder to the sd card
                if (config.modelConfig.senseVoice.qnnConfig.backendLib.isEmpty() && config.modelConfig.zipformerCtc.qnnConfig.backendLib.isEmpty()) {
                    LogUtils.i("VoiceController", "You should provide libQnnHtp.so for qnn")
                    throw IllegalArgumentException("You should provide libQnnHtp.so for qnn")
                }

                config.modelConfig.tokens =
                    copyAssetToInternalStorage(config.modelConfig.tokens, context)

                if (config.modelConfig.senseVoice.model.isNotEmpty() || assetExists(
                        context.assets,
                        path = config.modelConfig.senseVoice.qnnConfig.contextBinary
                    )
                ) {
                    if (config.modelConfig.senseVoice.model.isNotEmpty()) {
                        config.modelConfig.senseVoice.model =
                            copyAssetToInternalStorage(config.modelConfig.senseVoice.model, context)
                    }
                    config.modelConfig.senseVoice.qnnConfig.contextBinary =
                        copyAssetToInternalStorage(
                            config.modelConfig.senseVoice.qnnConfig.contextBinary,
                            context
                        )
                } else if (config.modelConfig.zipformerCtc.model.isNotEmpty()) {
                    config.modelConfig.zipformerCtc.model =
                        copyAssetToInternalStorage(config.modelConfig.zipformerCtc.model, context)
                    config.modelConfig.zipformerCtc.qnnConfig.contextBinary =
                        copyAssetToInternalStorage(
                            config.modelConfig.zipformerCtc.qnnConfig.contextBinary,
                            context
                        )
                }

                if (config.hr.lexicon.isNotEmpty()) {
                    config.hr.lexicon = copyAssetToInternalStorage(config.hr.lexicon, context)
                }
                if (config.hr.ruleFsts.isNotEmpty()) {
                    // this assumes there is only one fst; otherwise, copy each fst separately
                    config.hr.ruleFsts = copyAssetToInternalStorage(config.hr.ruleFsts, context)
                }

                assetManager = null
            }

            _recognizer = OfflineRecognizer(
                assetManager = assetManager,
                config = config,
            )
            LogUtils.i("VoiceController", "sherpa-onnx offline recognizer initialized")
        }
    }
}
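For reference, a recognizer initialized by this (now deleted) helper would be driven with the standard sherpa-onnx offline decoding pattern; a minimal sketch of how it was typically used before this commit (assumes initOfflineRecognizer was called and samples hold 16 kHz mono audio in [-1, 1]):

// Typical sherpa-onnx offline decoding loop for the recognizer above (sketch).
fun decodeOnce(samples: FloatArray, sampleRate: Int = 16000): String {
    val recognizer = SimulateStreamingAsr.recognizer
    val stream = recognizer.createStream()
    stream.acceptWaveform(samples, sampleRate)
    recognizer.decode(stream)
    val text = recognizer.getResult(stream).text
    stream.release()
    return text
}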