# Note: the code below is still affected by numerical accuracy (floating-point precision).
# Tags: 1024, power, torchaudio, mel, torch, ts, length, data
# From: https://www.cnblogs.com/s-tyou/p/18071109
import soundfile as sf
import numpy as np
import torchaudio
import torch
import librosa
# Analysis parameters shared by every transform below, so the librosa and
# torch/torchaudio calls cannot drift apart.
N_FFT = 1024
HOP_LENGTH = 512
N_MELS = 64
F_MAX = 8000.0
TOP_DB = 80.0
# Number of one-sided STFT frequency bins produced by an N_FFT-point transform.
N_STFT = N_FFT // 2 + 1

if __name__ == '__main__':
    # Demo: compute the spectrogram, mel-spectrogram and log-mel-spectrogram
    # of ./test.wav with librosa, torch and torchaudio using matching
    # parameters, so the results can be compared side by side.
    np_data, sr = sf.read('./test.wav')

    # soundfile returns (frames, channels) for multi-channel files, while the
    # transforms below expect time on the last axis; downmix to mono first.
    if np_data.ndim > 1:
        np_data = np_data.mean(axis=1)

    # The torch path is cast to float32, whereas librosa keeps working on the
    # array as read (soundfile's default read dtype is float64), so the two
    # families agree only up to floating-point precision.
    ts_data = torch.from_numpy(np_data).float()

    # the following methods will return the same spectrogram
    li_spec = librosa.stft(
        y=np_data, n_fft=N_FFT, hop_length=HOP_LENGTH, center=True,
        pad_mode='constant')
    ts_spec = torch.stft(
        ts_data, n_fft=N_FFT, hop_length=HOP_LENGTH, center=True,
        pad_mode='constant', return_complex=True,
        window=torch.hann_window(N_FFT))
    # power=None keeps the complex STFT instead of a magnitude/power spectrum.
    ts_spec_1 = torchaudio.transforms.Spectrogram(
        n_fft=N_FFT, win_length=N_FFT, hop_length=HOP_LENGTH, center=True,
        pad_mode='constant', power=None)(ts_data)

    # the following methods will return the same mel-spectrogram
    # (a) librosa, from a precomputed power spectrogram
    li_mel = librosa.feature.melspectrogram(
        S=np.abs(li_spec) ** 2., sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH,
        n_mels=N_MELS, fmax=F_MAX, center=True)
    # (b) librosa, directly from the waveform
    li_mel_1 = librosa.feature.melspectrogram(
        y=np_data, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_MELS,
        fmax=F_MAX, center=True, pad_mode='constant', power=2.0)
    # (c) torchaudio, directly from the waveform; the 'slaney' mel scale and
    # normalization are selected explicitly to line up with the librosa calls.
    ts_mel = torchaudio.transforms.MelSpectrogram(
        window_fn=torch.hann_window, win_length=N_FFT, sample_rate=sr,
        n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_MELS, f_max=F_MAX,
        pad_mode='constant', mel_scale='slaney', norm='slaney')(ts_data)
    # (d) torchaudio, from the precomputed complex spectrogram (|X|^2 -> mel)
    ts_mel_1 = torchaudio.transforms.MelScale(
        n_mels=N_MELS, sample_rate=sr, f_min=0.0, f_max=F_MAX, n_stft=N_STFT,
        norm='slaney', mel_scale='slaney')(torch.square(torch.abs(ts_spec_1)))

    # the following methods will return the same log-mel-spectrogram
    # Both sides receive the same top_db so the dynamic-range clipping matches.
    li_log_mel = librosa.power_to_db(li_mel, top_db=TOP_DB)
    ts_log_mel = torchaudio.transforms.AmplitudeToDB(top_db=TOP_DB)(ts_mel)