场景:
- 一段音频中有多个说话人,将不同的人说的话分离出来
- 已知一些人的语音特征,跟分离出来的片段分别求特征的余弦距离,余弦距离最小的作为说话的人
安装:
pip install pyannote.audio
# _*_ coding: utf-8 _*_
import torch
from pyannote.audio import Model, Pipeline, Inference
from pyannote.core import Segment
from scipy.spatial.distance import cosine
def extract_speaker_embedding(pipeline, audio_file, speaker_label, inference=None):
    """Return the voice embedding of the first diarized segment for one speaker.

    Runs the diarization pipeline over *audio_file*, finds the first segment
    attributed to *speaker_label*, and crops an embedding over that segment.

    Parameters
    ----------
    pipeline : pyannote speaker-diarization pipeline; called as ``pipeline(audio_file)``.
    audio_file : audio input accepted by the pipeline (e.g. a file path).
    speaker_label : diarization label (e.g. ``"SPEAKER_00"``) to look for.
    inference : optional ``pyannote.audio.Inference`` embedding model.
        Bug fix: the original body referenced an undefined global ``inference``
        (a NameError at runtime). It is now an explicit parameter; when omitted,
        a module-level ``inference`` object is used if one exists, preserving
        the original call signature for existing callers.

    Returns
    -------
    The embedding of the first matching segment, or ``None`` when
    *speaker_label* never appears in the diarization output.
    """
    if inference is None:
        # Backward-compatible fallback to the module-level object the
        # original code implicitly depended on.
        inference = globals().get("inference")
        if inference is None:
            raise ValueError(
                "an Inference embedding model must be provided "
                "(no module-level `inference` object found)"
            )
    diarization = pipeline(audio_file)
    for turn, _, label in diarization.itertracks(yield_label=True):
        if label == speaker_label:
            # Crop the embedding over just this speaker's first segment.
            return inference.crop(audio_file, Segment(turn.start, turn.end))
    return None
# 对于给定的音频,提取声纹特征并与人库中的声纹进行比较
def recognize_speaker(pipeline, audio_file):
diarization =
标签:file,audio,embedding,python,label,speaker,语音,import,识别
From: https://blog.csdn.net/qq_30895747/article/details/136918857