Add speaker verification
parent d01e43cd56
commit b7fc6d4ee0
Binary file not shown.
@@ -1,3 +1,2 @@
x iǎo z ì t óng x ué @小智同学
x iǎo z ì @小智
x iǎo z ì @小志
x iǎo z ì t óng x ué @小志同学
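Each entry pairs space-separated pinyin tokens with a hotword phrase after the "@" separator (e.g. "x iǎo z ì @小智"). A minimal Kotlin sketch of that interpretation (parseHotword is an illustrative assumption based on the file format above, not part of the commit):

// Sketch: splitting a hotword entry into its pinyin tokens and target phrase.
fun parseHotword(line: String): Pair<List<String>, String>? {
    val at = line.lastIndexOf('@')
    if (at < 0) return null
    val tokens = line.substring(0, at).trim().split(Regex("\\s+")) // e.g. ["x", "iǎo", "z", "ì"]
    val phrase = line.substring(at + 1).trim()                     // e.g. "小智"
    return tokens to phrase
}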
@@ -1,4 +0,0 @@
# Introduction

Model in this directory is converted from
https://huggingface.co/ASLP-lab/WSYue-ASR/tree/main/u2pp_conformer_yue
Binary file not shown.
File diff suppressed because it is too large
@@ -1,8 +1,16 @@
package com.zs.smarthuman.sherpa

import android.content.res.AssetManager
import android.text.TextUtils
import com.blankj.utilcode.util.LogUtils
import com.k2fsa.sherpa.onnx.OnlineStream
import com.k2fsa.sherpa.onnx.SpeakerRecognition
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.launch
import java.util.ArrayDeque
import java.util.concurrent.locks.ReentrantLock
import kotlin.concurrent.withLock

class VoiceController(
    assetManager: AssetManager,
@@ -118,11 +126,75 @@ class VoiceController(
    private var hasInvalidSpeech = false
    private var currentTimeoutType: TimeoutType = TimeoutType.IDLE_TIMEOUT

    // ========== Core config: speaker verification ==========
    private val CURRENT_USER_ID = "current_wakeup_user" // unique ID of the current wake-up user
    private val ENABLE_STRICT_SPEAKER_VERIFY = true // strict verification switch
    private val SPEAKER_VERIFY_THRESHOLD = 0.5f // verification score threshold
    init {
        // Initialize the speaker recognizer (matching the native SpeakerRecognition API)
        try {
            SpeakerRecognition.initExtractor(assetManager)
            LogUtils.d(TAG, "✅ Speaker recognizer initialized (native Stream version)")
        } catch (e: Exception) {
            LogUtils.e(TAG, "❌ Speaker recognizer initialization failed", e)
            throw RuntimeException("Speaker recognizer initialization failed", e)
        }
    }

    /* ================= Audio entry point ================= */
    fun acceptAudio(samples: FloatArray) {
        cachePreBuffer(samples)
        wakeupManager.acceptAudio(samples)
        if (wakeupManager.consumeWakeupFlag()) {
            handleWakeupEvent()
            // Register the wake-up user's embedding (runs asynchronously)
            CoroutineScope(Dispatchers.IO).launch {
                var stream: OnlineStream? = null
                runCatching {
                    val wakeupAudio = preBuffer.toFloatArray()
                    if (wakeupAudio.isEmpty()) {
                        LogUtils.w(TAG, "❌ Wake-up audio cache is empty; cannot register user embedding")
                        return@launch
                    }

                    // Create a native stream (per the SpeakerRecognition API)
                    val s = SpeakerRecognition.extractor.createStream()
                    stream = s

                    s.acceptWaveform(samples = wakeupAudio, sampleRate = sampleRate)
                    s.inputFinished()

                    // Compute the embedding and register it (current user only)
                    if (SpeakerRecognition.extractor.isReady(s)) {
                        val embedding = SpeakerRecognition.extractor.compute(s)
                        // Clear any previous embedding so only the current user is registered
                        SpeakerRecognition.manager.remove(CURRENT_USER_ID)
                        // Register the current wake-up user via manager.add
                        val embeddingList: MutableList<FloatArray> = mutableListOf()
                        embeddingList.add(embedding)
                        val ok = SpeakerRecognition.manager.add(
                            name = CURRENT_USER_ID,
                            embedding = embeddingList.toTypedArray()
                        )
                        if (ok) {
                            LogUtils.d(TAG, "✅ Registered current wake-up user embedding | length: ${embedding.size}")
                        } else {
                            LogUtils.w(TAG, "❌ Failed to register wake-up user embedding (manager.add returned false)")
                        }
                    } else {
                        LogUtils.w(TAG, "❌ Wake-up audio stream not ready; skipping user registration")
                    }
                }.onFailure {
                    LogUtils.e(TAG, "❌ Failed to register wake-up user embedding", it)
                }.also {
                    // Release the stream (native streams are released right after use)
                    stream?.release()
                    LogUtils.d(TAG, "🔄 Wake-up registration stream released")
                }
            }
            return
        }
@@ -338,7 +410,7 @@ class VoiceController(
        val now = System.currentTimeMillis()
        val duration = now - recordingStartMs

        // ========== Step 1: basic filtering (speech too short) ==========
        if (!vadStarted || duration < MIN_SPEECH_MS) {
            LogUtils.d(TAG, "❌ Speech too short: $duration ms | baseline: $currentEnvBaseline")
            hasInvalidSpeech = true
@@ -368,6 +440,18 @@ class VoiceController(
            return
        }

        // ========== Step 2: speaker verification first (only the current user may pass) ==========
        if (ENABLE_STRICT_SPEAKER_VERIFY) {
            val isCurrentUser = verifySpeaker(audioBuffer.toFloatArray())
            if (!isCurrentUser) {
                LogUtils.w(TAG, "❌ Not the current wake-up user; rejecting speech | duration: $duration ms")
                hasInvalidSpeech = true
                resetToWaitSpeech()
                return
            }
            LogUtils.d(TAG, "✅ Speech from the current user; continuing | duration: $duration ms")
        }

        // ========== Hard fallback: normal speech passes directly (lowered thresholds) ==========
        val isNormalVoice = avgEnergy >= MIN_NORMAL_VOICE_ENERGY && vadRatio >= MIN_NORMAL_VOICE_VAD_RATIO
        if (isNormalVoice) {
@@ -575,4 +659,44 @@ class VoiceController(
        val minScore: Int,
        val scene: String
    )

    /* ================= Core: native-stream speaker verification (current user only) ================= */
    /**
     * Verifies whether the given audio belongs to the current wake-up user.
     * @param audio audio samples to verify
     * @return true if the speaker is the current user, false otherwise
     */
    private fun verifySpeaker(audio: FloatArray): Boolean {
        var stream: OnlineStream? = null
        return try {
            stream = SpeakerRecognition.extractor.createStream()
            stream.acceptWaveform(samples = audio, sampleRate = sampleRate)
            stream.inputFinished()

            // Compute the embedding and verify it against the registered user
            if (!SpeakerRecognition.extractor.isReady(stream)) {
                LogUtils.w(TAG, "❌ Verification stream not ready; verification failed")
                return false
            }

            val embedding = SpeakerRecognition.extractor.compute(stream)

            val verifyPass = SpeakerRecognition.manager.verify(
                name = CURRENT_USER_ID,
                embedding = embedding,
                threshold = SPEAKER_VERIFY_THRESHOLD
            )
            if (verifyPass) {
                LogUtils.d(TAG, "✅ Speaker verification passed")
            } else {
                LogUtils.w(TAG, "❌ Speaker verification failed")
            }
            verifyPass
        } catch (e: Exception) {
            LogUtils.e(TAG, "❌ Speaker verification error", e)
            false
        } finally {
            // Release the stream (native streams are released right after use)
            stream?.release()
            LogUtils.d(TAG, "🔄 Verification stream released")
        }
    }
}
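Taken together, the two code paths above form a register-then-verify loop: the wake-up audio enrolls the speaker, and every later utterance is scored against that enrollment. A minimal sketch of the flow, using only the SpeakerRecognition calls that appear in the diff (demoVerifyFlow, the audio buffers, and the 16 kHz rate are illustrative assumptions):

// Sketch of the register-then-verify flow added by this commit.
fun demoVerifyFlow(wakeupAudio: FloatArray, utterance: FloatArray, sampleRate: Int = 16000): Boolean {
    // Enroll: compute an embedding from the wake-up audio and register it.
    val enroll = SpeakerRecognition.extractor.createStream()
    enroll.acceptWaveform(samples = wakeupAudio, sampleRate = sampleRate)
    enroll.inputFinished()
    val embedding = SpeakerRecognition.extractor.compute(enroll)
    enroll.release()
    SpeakerRecognition.manager.remove("current_wakeup_user") // keep a single enrolled user
    SpeakerRecognition.manager.add(name = "current_wakeup_user", embedding = arrayOf(embedding))

    // Verify: score a later utterance against the enrolled embedding.
    val verify = SpeakerRecognition.extractor.createStream()
    verify.acceptWaveform(samples = utterance, sampleRate = sampleRate)
    verify.inputFinished()
    val sameSpeaker = SpeakerRecognition.manager.verify(
        name = "current_wakeup_user",
        embedding = SpeakerRecognition.extractor.compute(verify),
        threshold = 0.5f
    )
    verify.release()
    return sameSpeaker
}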
@@ -48,13 +48,10 @@ import com.zs.smarthuman.im.chat.bean.SingleMessage
import com.zs.smarthuman.kt.releaseIM
import com.zs.smarthuman.sherpa.TimeoutType
import com.zs.smarthuman.sherpa.VoiceController
import com.zs.smarthuman.sherpa.VoiceState
import com.zs.smarthuman.toast.Toaster
import com.zs.smarthuman.utils.AudioDebugUtil
import com.zs.smarthuman.utils.AudioPcmUtil
import com.zs.smarthuman.utils.DangerousUtils
import com.zs.smarthuman.utils.LogFileUtils
import com.zs.smarthuman.utils.SimulateStreamingAsr
import com.zs.smarthuman.utils.UnityPlayerHolder
import com.zs.smarthuman.utils.ViewSlideAnimator
@@ -101,7 +98,6 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
    }

    override fun initData() {
        initAsrModel()
        PermissionUtils.permissionGroup(PermissionConstants.MICROPHONE)
            .callback(object : PermissionUtils.FullCallback {
                override fun onGranted(granted: List<String?>) {
@@ -198,7 +194,7 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
        voiceInfo = mutableListOf<VoiceBeanResp>().apply {
            add(
                VoiceBeanResp(
                    audioUrl = "https://static.seerteach.net/aidialogue/systemVoice/aliyun-nv.mp3"
                    audioUrl = "https://static.seerteach.net/aidialogue/userWakeUpAudio/344.mp3"
                )
            )
        }
@@ -212,17 +208,17 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
            },
            onFinalAudio = { audio ->
                Log.d("lrsxx", "Speech detected, length=${audio.size}")
                // mViewModel?.uploadVoice(
                //     AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)),
                //     1
                // )
                loadLocalJsonAndPlay()
                val file = File(
                    getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
                    "xxx.wav"
                mViewModel?.uploadVoice(
                    AudioPcmUtil.pcm16ToBase64(AudioPcmUtil.floatToPcm16(audio)),
                    1
                )
                AudioDebugUtil.saveFloatPcmAsWav(audio, file)
                LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
                // loadLocalJsonAndPlay()
                // val file = File(
                //     getExternalFilesDir(Environment.DIRECTORY_DOWNLOADS)!!.getAbsolutePath(),
                //     "xxx.wav"
                // )
                // AudioDebugUtil.saveFloatPcmAsWav(audio, file)
                // LogUtils.dTag("audioxx", "WAV saved: ${file.path}, samples=${audio.size}")
                lifecycleScope.launch(Dispatchers.Main) {
@@ -258,16 +254,11 @@ class MainActivity : BaseViewModelActivity<ActivityMainBinding, MainViewModel>()
        )
    }

    private fun initAsrModel() {
        lifecycleScope.launch(Dispatchers.IO) {
            SimulateStreamingAsr.initOfflineRecognizer(App.getInstance())
        }
    }

    override fun receivedIMMsg(msg: SingleMessage) {
        when (msg.msgContentType) {
            MessageContentType.RECEIVE_VOICE_STREAM.msgContentType -> {
                lifecycleScope.launch(Dispatchers.IO) {
                    LogFileUtils.logToFile2(this@MainActivity, msg.content)
                    // LogFileUtils.logToFile2(this@MainActivity, msg.content)
                    UnityPlayerHolder.getInstance()
                        .startTalking(msg.content)
                    // loadLocalJsonAndPlay()
@@ -1,155 +0,0 @@
package com.zs.smarthuman.utils

import android.content.Context
import android.content.res.AssetManager
import com.blankj.utilcode.util.LogUtils
import com.k2fsa.sherpa.onnx.OfflineModelConfig
import com.k2fsa.sherpa.onnx.OfflineRecognizer
import com.k2fsa.sherpa.onnx.OfflineRecognizerConfig
import com.k2fsa.sherpa.onnx.OfflineWenetCtcModelConfig
import com.k2fsa.sherpa.onnx.Vad
import com.k2fsa.sherpa.onnx.getVadModelConfig
import java.io.File
import java.io.FileOutputStream
import java.io.InputStream
import java.io.OutputStream


fun assetExists(assetManager: AssetManager, path: String): Boolean {
    val dir = path.substringBeforeLast('/', "")
    val fileName = path.substringAfterLast('/')

    val files = assetManager.list(dir) ?: return false
    return files.contains(fileName)
}

fun copyAssetToInternalStorage(path: String, context: Context): String {
    val targetRoot = context.filesDir
    val outFile = File(targetRoot, path)

    if (!assetExists(context.assets, path = path)) {
        // For a context binary: if it does not exist, we return a path
        // that can be written to.
        outFile.parentFile?.mkdirs()
        LogUtils.i("VoiceController", "$path does not exist, return ${outFile.absolutePath}")
        return outFile.absolutePath
    }

    if (outFile.exists()) {
        val assetSize = context.assets.open(path).use { it.available() }
        if (outFile.length() == assetSize.toLong()) {
            LogUtils.i("VoiceController", "$targetRoot/$path already exists, skip copying, return $targetRoot/$path")
            return "$targetRoot/$path"
        }
    }

    outFile.parentFile?.mkdirs()

    context.assets.open(path).use { input: InputStream ->
        FileOutputStream(outFile).use { output: OutputStream ->
            input.copyTo(output)
        }
    }
    LogUtils.i("VoiceController", "Copied $path to $targetRoot/$path")

    return outFile.absolutePath
}


object SimulateStreamingAsr {
    private var _recognizer: OfflineRecognizer? = null
    val recognizer: OfflineRecognizer
        get() {
            return _recognizer!!
        }

    fun initOfflineRecognizer(context: Context) {
        synchronized(this) {
            if (_recognizer != null) {
                return
            }

            val wenetConfig = OfflineWenetCtcModelConfig(
                model = "sherpa-onnx-wenetspeech-yue-u2pp-conformer-ctc-zh-en-cantonese-int8-2025-09-10/model.int8.onnx",
            )

            val modelConfig = OfflineModelConfig(
                wenetCtc = wenetConfig,
                tokens = "sherpa-onnx-wenetspeech-yue-u2pp-conformer-ctc-zh-en-cantonese-int8-2025-09-10/tokens.txt"
            )
            val config = OfflineRecognizerConfig(
                modelConfig = modelConfig,
            )

            var assetManager: AssetManager? = context.assets

            if (config.modelConfig.provider == "qnn") {
                // We assume you have copied files like libQnnHtpV81Skel.so to jniLibs/arm64-v8a
                LogUtils.i("VoiceController", "nativelibdir: ${context.applicationInfo.nativeLibraryDir}")

                // If we don't set the environment variable ADSP_LIBRARY_PATH, we will see
                // error code 1008 from qnn_interface.deviceCreate().
                // See also
                // https://workbench.aihub.qualcomm.com/docs/hub/faq.html#why-am-i-seeing-error-1008-when-trying-to-use-htp
                OfflineRecognizer.prependAdspLibraryPath(context.applicationInfo.nativeLibraryDir)

                // For qnn, we need to copy *.so files from the assets folder to the SD card.
                if (config.modelConfig.senseVoice.qnnConfig.backendLib.isEmpty() && config.modelConfig.zipformerCtc.qnnConfig.backendLib.isEmpty()) {
                    LogUtils.i("VoiceController", "You should provide libQnnHtp.so for qnn")
                    throw IllegalArgumentException("You should provide libQnnHtp.so for qnn")
                }
                config.modelConfig.tokens =
                    copyAssetToInternalStorage(config.modelConfig.tokens, context)

                if (config.modelConfig.senseVoice.model.isNotEmpty() || assetExists(
                        context.assets,
                        path = config.modelConfig.senseVoice.qnnConfig.contextBinary
                    )
                ) {
                    if (config.modelConfig.senseVoice.model.isNotEmpty()) {
                        config.modelConfig.senseVoice.model =
                            copyAssetToInternalStorage(config.modelConfig.senseVoice.model, context)
                    }

                    config.modelConfig.senseVoice.qnnConfig.contextBinary =
                        copyAssetToInternalStorage(
                            config.modelConfig.senseVoice.qnnConfig.contextBinary,
                            context
                        )
                } else if (config.modelConfig.zipformerCtc.model.isNotEmpty()) {
                    config.modelConfig.zipformerCtc.model =
                        copyAssetToInternalStorage(config.modelConfig.zipformerCtc.model, context)

                    config.modelConfig.zipformerCtc.qnnConfig.contextBinary =
                        copyAssetToInternalStorage(
                            config.modelConfig.zipformerCtc.qnnConfig.contextBinary,
                            context
                        )
                }

                if (config.hr.lexicon.isNotEmpty()) {
                    config.hr.lexicon = copyAssetToInternalStorage(config.hr.lexicon, context)
                }

                if (config.hr.ruleFsts.isNotEmpty()) {
                    // This assumes there is only one fst; otherwise, you need to copy each fst separately.
                    config.hr.ruleFsts = copyAssetToInternalStorage(config.hr.ruleFsts, context)
                }

                assetManager = null
            }

            _recognizer = OfflineRecognizer(
                assetManager = assetManager,
                config = config,
            )

            LogUtils.i("VoiceController", "sherpa-onnx offline recognizer initialized")
        }
    }
}
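For context, the deleted SimulateStreamingAsr object only built the recognizer; decoding a buffer with it follows the usual sherpa-onnx offline pattern. A minimal sketch (the decodeOnce helper and the 16 kHz sample rate are illustrative assumptions, not part of this codebase):

// Sketch: decoding one utterance with an already-initialized OfflineRecognizer.
fun decodeOnce(recognizer: OfflineRecognizer, samples: FloatArray, sampleRate: Int = 16000): String {
    val stream = recognizer.createStream()
    stream.acceptWaveform(samples, sampleRate = sampleRate) // feed the whole utterance at once
    recognizer.decode(stream)                               // offline (non-streaming) decoding
    val text = recognizer.getResult(stream).text            // recognized text
    stream.release()
    return text
}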