对音频(audio)信号做数据增强(Data Augmentation)可以有多种方式,这里通过sox库、soundfile库、librosa库进行实验,希望可以帮助到有需要的人。可用于深度学习音频处理实验的数据预处理。
音高变换增强(Pitch Shift Augmentation)
音高变化增强,是围绕频率轴的±5%范围内的随机滚动。环绕式转换以保留所有信息。
def pitch_shift_spectrogram(wavepath):
    """Pitch-shift augmentation in the spectral domain.

    Rolls the STFT of the audio along the frequency axis by a random
    amount within roughly +/-5% of the number of frequency bins.  The
    roll wraps around, so no spectral information is discarded.

    Args:
        wavepath: path to a WAV file readable by soundfile.

    Returns:
        (shifted_wave, sr): the pitch-shifted waveform and its sample rate.
    """
    wave, sr = sf.read(wavepath)
    # Waveform -> complex spectrogram (frequency bins x time frames).
    spectrogram = librosa.stft(wave)
    nb_bins = spectrogram.shape[0]
    max_shifts = nb_bins // 20  # around 5% shift
    # BUG FIX: randint's upper bound is exclusive, so the original
    # randint(-max_shifts, max_shifts) could never draw +max_shifts and
    # raised ValueError when max_shifts == 0.  Include the upper bound.
    nb_shifts = np.random.randint(-max_shifts, max_shifts + 1)
    # Wrap-around roll along the frequency axis (axis 0).
    shifted_spectrogram = np.roll(spectrogram, nb_shifts, axis=0)
    # Spectrogram -> waveform.
    shifted_wave = librosa.istft(shifted_spectrogram)
    return shifted_wave, sr
时移增强(Time Shift Augmentation)
时移增强是通过沿时间轴滚动信号来随机移位信号。
def time_shift_spectrogram(wavepath):
    """Time-shift augmentation in the spectral domain.

    Circularly rolls the spectrogram along the time axis by a random
    number of frames, then resynthesizes the waveform.
    """
    wave, sr = sf.read(wavepath)
    # Forward STFT: rows are frequency bins, columns are time frames.
    spec = librosa.stft(wave)
    # Draw a random frame offset in [0, number_of_frames).
    n_frames = spec.shape[1]
    offset = np.random.randint(0, n_frames)
    # Wrap-around shift along the time axis keeps every frame.
    rolled = np.roll(spec, offset, axis=1)
    # Inverse STFT back to the time domain.
    shifted = librosa.istft(rolled)
    return shifted, sr
噪声增强(Noise Augmentation)
噪声增强只是在原信号之上叠加一段随机噪声,阻尼系数随机取值并限制在0.4以下。
def noise_augmentation(wavepath):
    """Additive white-noise augmentation.

    Mixes Gaussian white noise into the signal, scaled by a dampening
    factor drawn uniformly from [0.0, 0.4).
    """
    signal, fs = sf.read(wavepath)
    # Random dampening factor, capped at 0.4.
    gain = np.random.uniform(0.0, 0.4)
    # White noise with the same number of samples as the signal.
    white = np.random.randn(len(signal))
    # Mix the scaled noise into the signal.
    return signal + white * gain, fs
相同类别增强(Same Class Augmentation)
相同类别的增强,简单将两个音频片段s1和s2,按照一定比例相加。
def same_class_augmentation(wavepath, class_dir):
    """Mix the wave with a random same-class segment.

    Loads a random .wav file from class_dir, stretches it (linear
    interpolation) to the length of the input wave, and additively
    combines the two with a random mixing weight alpha in [0, 1).

    Args:
        wavepath: path of the input WAV file.
        class_dir: directory containing same-class .wav files.

    Returns:
        (wave, fs): the mixed waveform and the sample rate of the
        augmentation file.  NOTE(review): the input file's rate (fs1)
        is discarded; if the two rates can differ this is probably a
        bug -- confirm against the dataset.
    """
    sig_paths = glob.glob(os.path.join(class_dir, "*.wav"))
    if not sig_paths:
        # Fail with a clear message instead of np.random.choice's opaque error.
        raise ValueError(f"no .wav files found in {class_dir}")
    aug_sig_path = np.random.choice(sig_paths, 1, replace=False)[0]
    print('aug_sig_path:', aug_sig_path)
    aug_sig, fs = sf.read(aug_sig_path)
    wave, fs1 = sf.read(wavepath)
    # Resize aug_sig to the length of wave via linear interpolation.
    # BUG FIX: interpolate along axis 0 (samples); interp1d's default
    # axis=-1 raised for stereo (frames, channels) input.  Mono output
    # is unchanged.
    aug_sig = interp1d(np.arange(len(aug_sig)), aug_sig, axis=0)(
        np.linspace(0, len(aug_sig) - 1, len(wave)))
    alpha = np.random.rand()
    wave = (1.0 - alpha) * wave + alpha * aug_sig
    return wave, fs
变速增强(Time Augmentation)
请注意,需要Python安装sox库,以及Win上下载好并安装sox软件,exe所在文件夹需要加入系统环境变量。
# Speed-change augmentation via the SoX library.
def speed_aug(input_wav, output_wav, speed_factor=1.2):
    """Write a speed-changed copy of input_wav to output_wav.

    speed_factor > 1 speeds the audio up, < 1 slows it down.
    """
    # Configure a SoX transformer with the requested speed change,
    # then run the conversion.
    tfm = sox.Transformer()
    tfm.speed(speed_factor)
    tfm.build(input_wav, output_wav)
代码实操
# -*- coding: utf-8 -*-
# @Time : 2023/5/27 0027 15:18
# @Author : Jason
# -*- coding: utf-8 -*-
# @Time : 2023/5/26 0026 22:07
# @Author : Jason
import glob
import os
import numpy as np
import sox # py
import soundfile as sf
from scipy.interpolate import interp1d
import librosa
from config import *
# sox库
os.environ['PATH'] = 'G:\Program Files (x86)\sox-14-4-2'
# Same-class augmentation: additively mix two clips of the same class.
def same_class_augmentation(wavepath, class_dir):
    """Mix the wave with a random same-class segment.

    Loads a random .wav file from class_dir, stretches it (linear
    interpolation) to the length of the input wave, and additively
    combines the two with a random mixing weight alpha in [0, 1).

    Args:
        wavepath: path of the input WAV file.
        class_dir: directory containing same-class .wav files.

    Returns:
        (wave, fs): the mixed waveform and the sample rate of the
        augmentation file.  NOTE(review): the input file's rate (fs1)
        is discarded; if the two rates can differ this is probably a
        bug -- confirm against the dataset.
    """
    sig_paths = glob.glob(os.path.join(class_dir, "*.wav"))
    if not sig_paths:
        # Fail with a clear message instead of np.random.choice's opaque error.
        raise ValueError(f"no .wav files found in {class_dir}")
    aug_sig_path = np.random.choice(sig_paths, 1, replace=False)[0]
    print('aug_sig_path:', aug_sig_path)
    aug_sig, fs = sf.read(aug_sig_path)
    wave, fs1 = sf.read(wavepath)
    # Resize aug_sig to the length of wave via linear interpolation.
    # BUG FIX: interpolate along axis 0 (samples); interp1d's default
    # axis=-1 raised for stereo (frames, channels) input.  Mono output
    # is unchanged.
    aug_sig = interp1d(np.arange(len(aug_sig)), aug_sig, axis=0)(
        np.linspace(0, len(aug_sig) - 1, len(wave)))
    alpha = np.random.rand()
    wave = (1.0 - alpha) * wave + alpha * aug_sig
    return wave, fs
# Noise augmentation: add random noise on top of the signal, with the
# dampening factor capped at 0.4.
def noise_augmentation(wavepath):
    """Additive white-noise augmentation.

    Mixes Gaussian white noise into the signal, scaled by a dampening
    factor drawn uniformly from [0.0, 0.4).
    """
    signal, fs = sf.read(wavepath)
    # Random dampening factor, capped at 0.4.
    gain = np.random.uniform(0.0, 0.4)
    # White noise with the same number of samples as the signal.
    white = np.random.randn(len(signal))
    # Mix the scaled noise into the signal.
    return signal + white * gain, fs
# Time-shift augmentation: randomly roll the signal along the time
# axis, with wrap-around.
def time_shift_spectrogram(wavepath):
    """Time-shift augmentation in the spectral domain.

    Circularly rolls the spectrogram along the time axis by a random
    number of frames, then resynthesizes the waveform.
    """
    wave, sr = sf.read(wavepath)
    # Forward STFT: rows are frequency bins, columns are time frames.
    spec = librosa.stft(wave)
    # Draw a random frame offset in [0, number_of_frames).
    n_frames = spec.shape[1]
    offset = np.random.randint(0, n_frames)
    # Wrap-around shift along the time axis keeps every frame.
    rolled = np.roll(spec, offset, axis=1)
    # Inverse STFT back to the time domain.
    shifted = librosa.istft(rolled)
    return shifted, sr
# Pitch-shift augmentation: random wrap-around roll of up to ~5% along
# the frequency axis; the wrap preserves all spectral information.
def pitch_shift_spectrogram(wavepath):
    """Pitch-shift augmentation in the spectral domain.

    Rolls the STFT of the audio along the frequency axis by a random
    amount within roughly +/-5% of the number of frequency bins.  The
    roll wraps around, so no spectral information is discarded.

    Args:
        wavepath: path to a WAV file readable by soundfile.

    Returns:
        (shifted_wave, sr): the pitch-shifted waveform and its sample rate.
    """
    wave, sr = sf.read(wavepath)
    # Waveform -> complex spectrogram (frequency bins x time frames).
    spectrogram = librosa.stft(wave)
    nb_bins = spectrogram.shape[0]
    max_shifts = nb_bins // 20  # around 5% shift
    # BUG FIX: randint's upper bound is exclusive, so the original
    # randint(-max_shifts, max_shifts) could never draw +max_shifts and
    # raised ValueError when max_shifts == 0.  Include the upper bound.
    nb_shifts = np.random.randint(-max_shifts, max_shifts + 1)
    # Wrap-around roll along the frequency axis (axis 0).
    shifted_spectrogram = np.roll(spectrogram, nb_shifts, axis=0)
    # Spectrogram -> waveform.
    shifted_wave = librosa.istft(shifted_spectrogram)
    return shifted_wave, sr
# Speed-change augmentation via the SoX library.
def speed_aug(input_wav, output_wav, speed_factor=1.2):
    """Write a speed-changed copy of input_wav to output_wav.

    speed_factor > 1 speeds the audio up, < 1 slows it down.
    """
    # Configure a SoX transformer with the requested speed change,
    # then run the conversion.
    tfm = sox.Transformer()
    tfm.speed(speed_factor)
    tfm.build(input_wav, output_wav)
# Driver: generate augmented copies of each class's audio files.
if __name__ == "__main__":
    # Root folder that holds one sub-folder per audio class.
    rootPath = 'H:\\Codes\\AudioClassification-Pytorch-master\\dataset\\audio'
    sourceType = ['C', 'M', 'E', '0', 'ru']  # class folder names before augmentation
    source_list = []
    newlist = ['C_a', 'M_a', 'E_a', '0_a', 'ru_a']  # class folder names after augmentation
    s = []  # full paths of the current class's source files
    t = []  # full paths where the augmented files will be written
    # NOTE(review): range stops at len(sourceType) - 1, so the last class
    # ('ru') is never processed -- confirm this is intentional.
    for i in range(0, len(sourceType) - 1):
        s = []
        t = []
        sourcePath = os.path.join(rootPath, sourceType[i])
        # print(sourcePath)
        # List every entry (files and sub-folders) of the class folder.
        source_list = os.listdir(sourcePath)
        # print(source_list)
        for j in source_list:
            s.append(os.path.join(sourcePath, j))
        print('s: ', s)
        # Build one numbered output path per input file.
        # NOTE(review): startIdx is not defined in this file; presumably it
        # comes from `from config import *` -- confirm it exists there.
        for x in range(len(s)):
            targetPath = os.path.join(rootPath, newlist[i])
            # print(targetPath)
            # target_list = os.listdir(targetPath)
            # print(target_list)
            t.append(os.path.join(targetPath, str(startIdx)) + '~~~~.wav')
            startIdx += 1
        print('t: ', t)
        """
        从这里开始调用函数生成新样本
        """
        # Speed-change augmentation (uncomment the variant to use).
        # for i in range(len(s)): # 1.2
        # speed_aug(s[i], t[i])
        # for i in range(len(s)):
        # speed_aug(s[i], t[i], 0.8)
        # for i in range(len(s)):
        # speed_aug(s[i], t[i], 1.1)
        # for i in range(len(s)):
        # speed_aug(s[i], t[i], 0.9)
        # 3724-4579  (bookkeeping: output index range produced by this pass)
        # Same-class mixing augmentation.
        # for k in range(len(s)):
        # print('-')
        # print(s[k])
        # # print(os.path.join('H:\Codes\AudioClassification-Pytorch-master\dataset\\audio', sourceType[i]))
        # w, fs = same_class_augmentation(s[k], os.path.join('H:\Codes\AudioClassification-Pytorch-master\dataset\\audio',
        # sourceType[i]))
        # print('t[k]: ', t[k])
        # sf.write(t[k], w, fs)
        # print('-')
        # 4580-5435
        # 5436-6291
        # Additive-noise augmentation.
        # for k in range(len(s)):
        # print('-')
        # print(s[k])
        # # print(os.path.join('H:\Codes\AudioClassification-Pytorch-master\dataset\\audio', sourceType[i]))
        # w, fs = noise_augmentation(s[k])
        # print('t[k]: ', t[k])
        # sf.write(t[k], w, fs)
        # print('-')
        # Time-shift augmentation.
        # 6292-7147~~~
        # for k in range(len(s)):
        # print('-')
        # print(s[k])
        # # print(os.path.join('H:\Codes\AudioClassification-Pytorch-master\dataset\\audio', sourceType[i]))
        # w, fs = time_shift_spectrogram(s[k])
        # print('t[k]: ', t[k])
        # sf.write(t[k], w, fs)
        # print('-')
        # Pitch-shift augmentation (the currently active pass).
        # 7148-
        for k in range(len(s)):
            print('-')
            print(s[k])
            # print(os.path.join('H:\Codes\AudioClassification-Pytorch-master\dataset\\audio', sourceType[i]))
            w, fs = pitch_shift_spectrogram(s[k])
            print('t[k]: ', t[k])
            sf.write(t[k], w, fs)
            print('-')
        """
        结束
        """
        print('---------1 class end')
增强后如图
参考文献:
处理库的官方文档
https://zhuanlan.zhihu.com/p/41679490
https://github.com/johnmartinsson/bird-species-classification/wiki/Data-Augmentation