Speech Recognition


1. Install the required packages
Repository: https://github.com/PaddlePaddle/PaddleSpeech
Install from PyPI:
pip install paddlepaddle==2.4.1 -i https://mirror.baidu.com/pypi/simple
pip install pytest-runner
pip install paddlespeech

Alternatively, install from source:
git clone https://github.com/PaddlePaddle/PaddleSpeech.git
cd PaddleSpeech
pip install pytest-runner
pip install .

# Download the sample audio files
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav


# Synthesize speech from Chinese text via the command-line interface
paddlespeech tts --input "以明月相思为基调创作一首诗吧!" --output output.wav
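For reference, the downloaded zh.wav can also be transcribed directly from the command line (command as shown in the PaddleSpeech README):

# Transcribe the Chinese sample audio via the CLI
paddlespeech asr --lang zh --input zh.wav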

# Speech recognition (ASR): transcribe the downloaded Chinese sample
from paddlespeech.cli.asr.infer import ASRExecutor
asr = ASRExecutor()
result = asr(audio_file="zh.wav")
print(result)


# Feed the ASR transcript to ChatGLM2-6B and get a text reply
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("/mnt/workspace/ChatGLM2-6B-main/chatglm2-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("/mnt/workspace/ChatGLM2-6B-main/chatglm2-6b", trust_remote_code=True).half().cuda()
model = model.eval()
response, history = model.chat(tokenizer,result,history=[])
print(response)
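ChatGLM2 returns the updated conversation history along with the response, so a follow-up turn can reuse it for context. The follow-up question below is just an illustrative placeholder:

# Follow-up turn: pass the returned history back in so the model keeps context
response2, history = model.chat(tokenizer, "请把上面的回答翻译成英文", history=history)
print(response2)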


# Text-to-speech (TTS); the first run downloads the model automatically and can take a while
from paddlespeech.cli.tts.infer import TTSExecutor
tts = TTSExecutor()
text = "买了社保,是不是就不用买商业保险了?"
output_wav = "baoxian_example.wav"
tts(text=text, output=output_wav)


# Speech recognition (ASR); the first run downloads the model automatically and can take a while
from paddlespeech.cli.asr.infer import ASRExecutor
asr = ASRExecutor()
# TTS output is 24 kHz by default while ASR expects 16 kHz; force_yes=True confirms the automatic resampling
asr_result = asr(audio_file=output_wav, force_yes=True)
print("ASR result:", asr_result)


# Punctuation restoration; the first run downloads the model automatically and can take a while
from paddlespeech.cli.text.infer import TextExecutor
text_punc = TextExecutor()
text_punc_result = text_punc(text=asr_result)
print("Punctuation restoration result:", text_punc_result)


# Load the PaddlePaddle / PaddleNLP APIs
import paddlenlp as ppnlp
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import inference

# Maximum sequence length; set it according to the dataset
max_seq_length = 64
batch_size = 64
# Use the RocketQA open-domain QA query encoder
model_name_or_path = 'rocketqa-zh-dureader-query-encoder'
tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained(model_name_or_path)
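As a quick sanity check, the tokenizer maps a raw question to the ID sequences the encoder consumes (the example sentence is arbitrary):

# Tokenize one query; the result contains input_ids and token_type_ids
encoded = tokenizer("买了社保,是不是就不用买商业保险了?")
print(encoded["input_ids"])
print(encoded["token_type_ids"])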


# Load the SimCSE model
from SimCSE import SimCSE
# Key hyperparameters
scale = 20        # recommended range: 10 ~ 30
margin = 0.1      # recommended range: 0.0 ~ 0.2
# Output embedding dimension; set it according to your needs
output_emb_size = 256

# Use the pretrained ERNIE model as the backbone
pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained(model_name_or_path)
# Unsupervised SimCSE + R-Drop, similar to multi-task learning
simcse_model = SimCSE(
    pretrained_model,
    margin=margin,
    scale=scale,
    output_emb_size=output_emb_size)

# Load the trained model weights
state_dict = paddle.load("model_140/model_state.pdparams")
simcse_model.set_state_dict(state_dict)
simcse_model.eval()
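With the weights loaded, a single query can be encoded into a dense vector. The sketch below assumes the local SimCSE class exposes get_pooled_embedding(input_ids, token_type_ids), as the SimCSE model in PaddleNLP's neural_search example does; adjust the call if SimCSE.py differs:

# Encode one query into an output_emb_size-dim embedding (get_pooled_embedding is assumed)
encoded = tokenizer("买了社保,是不是就不用买商业保险了?")
input_ids = paddle.to_tensor([encoded["input_ids"]])
token_type_ids = paddle.to_tensor([encoded["token_type_ids"]])
with paddle.no_grad():
    query_emb = simcse_model.get_pooled_embedding(input_ids, token_type_ids)
print(query_emb.shape)  # expected: [1, output_emb_size]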


# Build the vector-retrieval index
from ann_util import build_index
from data import convert_example_test
from data import gen_id2corpus
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.datasets import load_dataset, MapDataset
from functools import partial

# Plain-text data -> ID-sequence batches
def create_dataloader(dataset,
                      mode='train',
                      batch_size=1,
                      batchify_fn=None,
                      trans_fn=None):
    if trans_fn:
        dataset = dataset.map(trans_fn)

    shuffle = True if mode == 'train' else False
    if mode == 'train':
        batch_sampler = paddle.io.DistributedBatchSampler(
            dataset, batch_size=batch_size, shuffle=shuffle)
    else:
        batch_sampler = paddle.io.BatchSampler(
            dataset, batch_size=batch_size, shuffle=shuffle)

    return paddle.io.DataLoader(
        dataset=dataset,
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn,
        return_list=True)


corpus_file = 'baoxian/corpus.csv'
id2corpus = gen_id2corpus(corpus_file)
# convert_example_test expects dict inputs
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
print(corpus_list[:4])


trans_func_corpus = partial(
    convert_example_test,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length)
batchify_fn_corpus = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),       # text input ids
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # text segment ids
): [data for data in fn(samples)]
corpus_ds = MapDataset(corpus_list)
corpus_data_loader = create_dataloader(
    corpus_ds,
    mode='predict',
    batch_size=batch_size,
    batchify_fn=batchify_fn_corpus,
    trans_fn=trans_func_corpus)

# Maximum number of elements the index can hold
hnsw_max_elements = 1000000
# Parameters trading off query time against recall
hnsw_ef = 100
hnsw_m = 100

final_index = build_index(corpus_data_loader,
                          simcse_model,
                          output_emb_size=output_emb_size,
                          hnsw_max_elements=hnsw_max_elements,
                          hnsw_ef=hnsw_ef,
                          hnsw_m=hnsw_m)
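Once the index is built, the punctuated ASR transcript can be used as a retrieval query. This is only a sketch: it assumes build_index returns an hnswlib index (as in PaddleNLP's neural_search example) and reuses the query_emb computed in the earlier embedding sketch.

# Retrieve the top-5 nearest corpus entries for the query embedding
recalled_idx, distances = final_index.knn_query(query_emb.numpy(), 5)
for idx, dist in zip(recalled_idx[0], distances[0]):
    # hnswlib returns distances; for an inner-product/cosine space, similarity ≈ 1 - distance
    print(id2corpus[idx], 1.0 - dist)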