import torch
import torchaudio
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor


class QwenAudioModel:
    def __init__(self, model_path="Qwen/Qwen2-Audio-7B-Instruct"):
        """Initialize the Qwen2-Audio model and its processor."""
        self.model = Qwen2AudioForConditionalGeneration.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        # AutoProcessor bundles the text tokenizer with the audio feature
        # extractor; a plain AutoTokenizer cannot prepare audio inputs.
        self.processor = AutoProcessor.from_pretrained(model_path)
        # Audio encoder front-end configuration (kept for reference; the
        # processor applies the equivalent settings internally).
        self.audio_encoder_config = {
            'sample_rate': 16000,
            'n_mels': 128,
            'hop_length': 160,
            'n_fft': 400,
            'window_size': 25,  # ms
            'stride': 10        # ms
        }
    def process_audio(self, audio_path):
        """Load an audio file and return a mono 16 kHz waveform."""
        waveform, sample_rate = torchaudio.load(audio_path)
        # Downmix multi-channel recordings to mono
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        # Resample to the 16 kHz rate the audio encoder expects
        if sample_rate != self.audio_encoder_config['sample_rate']:
            resampler = torchaudio.transforms.Resample(
                orig_freq=sample_rate,
                new_freq=self.audio_encoder_config['sample_rate']
            )
            waveform = resampler(waveform)
        # The processor expects the raw 1-D waveform and computes the
        # log-mel spectrogram (the n_mels/n_fft/hop_length settings above)
        # itself, so no manual MelSpectrogram step is needed here.
        return waveform.squeeze(0).numpy()
    def multi_task_inference(self, audio_path, task_type="auto"):
        """Run one of several audio tasks by switching the text prompt."""
        audio = self.process_audio(audio_path)
        if task_type == "auto":
            # Automatic task detection is not implemented here; fall back
            # to captioning, which gives a generic description of the audio.
            task_type = "caption"
        task_prompts = {
            "asr": "Transcribe the speech to text:",
            "translation": "Translate the speech to English:",
            "emotion": "Analyze the emotion in this speech:",
            "speaker": "Identify the speaker characteristics:",
            "caption": "Generate a caption for this audio:",
            "qa": "Answer questions about this audio:"
        }
        prompt = task_prompts.get(task_type, "Process this audio:")
        # Build the chat-format input; the chat template inserts the audio
        # placeholder tokens that the encoded waveform is aligned to.
        conversation = [
            {"role": "user", "content": [
                {"type": "audio", "audio_url": audio_path},
                {"type": "text", "text": prompt}
            ]}
        ]
        text = self.processor.apply_chat_template(
            conversation, add_generation_prompt=True, tokenize=False
        )
        inputs = self.processor(
            text=text,
            audios=[audio],
            return_tensors="pt",
            padding=True
        ).to(self.model.device, dtype=self.model.dtype)
        # Generate the output
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True
            )
        # Decode only the newly generated tokens, dropping the prompt echo
        new_tokens = outputs[:, inputs.input_ids.shape[1]:]
        response = self.processor.batch_decode(
            new_tokens, skip_special_tokens=True
        )[0]
        return response
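A minimal usage sketch follows, assuming a transformers release recent enough to include the Qwen2-Audio classes; `sample.wav` is a hypothetical placeholder for any local recording:

# sample.wav is a hypothetical local file; any speech recording works.
qwen_audio = QwenAudioModel()

# The same checkpoint serves every task; only the prompt changes.
transcript = qwen_audio.multi_task_inference("sample.wav", task_type="asr")
emotion = qwen_audio.multi_task_inference("sample.wav", task_type="emotion")
print(transcript)
print(emotion)

Note that the task switch is purely prompt-side: every task reuses the same weights and the same audio features, which is what keeps this multi-task interface cheap to extend with new prompts.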