本次任务是准确识别5个大类、共计50个小类的音频。每个音频文件时长5秒,格式为wav。
数据集来自freesound公开项目,由人工从中挑选提取。训练集共计1600个样本,测试集共计400个样本。
import os
import random
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn import model_selection
from sklearn import preprocessing
import IPython.display as ipd
# define directories: everything lives under ./ESC-50-master
base_dir = "."
esc_dir = os.path.join(base_dir, "ESC-50-master")
meta_file = os.path.join(esc_dir, "esc50.csv")
audio_dir = os.path.join(esc_dir, "audio/audio/")
# load metadata: the CSV is read without a header row and its two columns
# are named "filename" and "target"
meta_data = pd.read_csv(meta_file,header=None,names=["filename","target"])
# bare expression -- presumably a notebook cell that displays the frame
meta_data
# get data size as (rows, columns)
data_size = meta_data.shape
print(data_size)
(2000, 2)
# load a wave data
def load_wave_data(audio_dir, file_name):
    """Load one audio file and return its samples and sampling rate.

    Args:
        audio_dir: Directory containing the audio file.
        file_name: Name of the file inside ``audio_dir``.

    Returns:
        The ``(x, fs)`` tuple produced by ``librosa.load``.
    """
    file_path = os.path.join(audio_dir, file_name)
    # A fixed target rate keeps every clip on the same time grid.
    x, fs = librosa.load(file_path, sr=44100)
    return x, fs
# change wave data to mel-stft
def calculate_melsp(x, n_fft=1024, hop_length=128):
    """Convert a waveform into a mel-scaled spectrogram.

    Args:
        x: 1-D waveform samples.
        n_fft: FFT window size passed to ``librosa.stft``.
        hop_length: Hop between successive STFT frames.

    Returns:
        The array returned by ``librosa.feature.melspectrogram`` with
        ``n_mels=128``.
    """
    # Power spectrogram: squared magnitude of the STFT.
    stft = np.abs(librosa.stft(x, n_fft=n_fft, hop_length=hop_length)) ** 2
    log_stft = librosa.power_to_db(stft)
    # NOTE(review): the dB-scaled spectrogram (not the raw power) is fed to
    # the mel filter bank; kept as in the original pipeline -- confirm intent.
    melsp = librosa.feature.melspectrogram(S=log_stft, n_mels=128)
    return melsp
# display wave in plots
def show_wave(x):
    """Plot the waveform samples ``x`` as a line chart and show the figure."""
    plt.plot(x)
    plt.show()
# display wave in heatmap
def show_melsp(melsp, fs):
    """Draw a spectrogram as a heatmap with a colour bar.

    Args:
        melsp: 2-D spectrogram array to display.
        fs: Sampling rate forwarded to ``librosa.display.specshow``.
    """
    librosa.display.specshow(melsp, sr=fs)
    plt.colorbar()
    plt.show()
# example data: load the first clip listed in the metadata and inspect it
x, fs = load_wave_data(audio_dir, meta_data.loc[0, "filename"])
melsp = calculate_melsp(x)
# "\n" separators restored so each value is printed on its own line
print("wave size:{0}\nmelsp size:{1}\nsamping rate:{2}".format(x.shape, melsp.shape, fs))
show_wave(x)
show_melsp(melsp, fs)
wave size:(220500,)
melsp size:(128, 1723)
samping rate:44100
ipd.Audio(x, rate=fs)
# data augmentation: add white noise
def add_white_noise(x, rate=0.002):
    """Return ``x`` with scaled standard-normal noise added to every sample.

    Args:
        x: 1-D waveform samples.
        rate: Multiplier applied to the noise before it is added.

    Returns:
        A new array of the same length as ``x``.
    """
    noise = np.random.randn(len(x))
    return x + rate * noise
# apply the white-noise augmentation to the example clip and inspect it
x_wn = add_white_noise(x)
melsp = calculate_melsp(x_wn)
# "\n" separators restored so each value is printed on its own line
print("wave size:{0}\nmelsp size:{1}\nsamping rate:{2}".format(x_wn.shape, melsp.shape, fs))
show_wave(x_wn)
show_melsp(melsp, fs)
wave size:(220500,)
melsp size:(128, 1723)
samping rate:44100
ipd.Audio(x_wn, rate=fs)
# data augmentation: shift sound in timeframe
def shift_sound(x, rate=2):
    """Circularly shift ``x`` by ``len(x) // rate`` samples.

    Args:
        x: 1-D waveform samples.
        rate: Divisor of the signal length; ``rate=2`` rotates by half.

    Returns:
        A rotated copy of ``x`` (samples pushed off the end wrap around).
    """
    return np.roll(x, int(len(x) // rate))
# apply the time-shift augmentation to the example clip and inspect it
x_ss = shift_sound(x)
melsp = calculate_melsp(x_ss)
# "\n" separators restored so each value is printed on its own line
print("wave size:{0}\nmelsp size:{1}\nsamping rate:{2}".format(x_ss.shape, melsp.shape, fs))
show_wave(x_ss)
show_melsp(melsp, fs)
melsp size:(128, 1723)
samping rate:44100
ipd.Audio(x_ss, rate=fs)
# data augmentation: stretch sound
def stretch_sound(x, rate=1.1):
    """Time-stretch ``x`` and force the result back to the input length.

    Args:
        x: 1-D waveform samples.
        rate: Stretch factor forwarded to ``librosa.effects.time_stretch``.

    Returns:
        An array with exactly ``len(x)`` samples: the stretched signal is
        truncated when it is longer and zero-padded at the end otherwise.
    """
    input_length = len(x)
    # ``rate`` is passed by keyword: recent librosa releases reject it as a
    # positional argument, while older ones accept either form.
    x = librosa.effects.time_stretch(x, rate=rate)
    if len(x) > input_length:
        return x[:input_length]
    return np.pad(x, (0, max(0, input_length - len(x))), "constant")
# apply the stretch augmentation to the example clip and inspect it
x_st = stretch_sound(x)
melsp = calculate_melsp(x_st)
# "\n" separators restored so each value is printed on its own line
print("wave size:{0}\nmelsp size:{1}\nsamping rate:{2}".format(x_st.shape, melsp.shape, fs))
show_wave(x_st)
show_melsp(melsp, fs)
wave size:(220500,)
melsp size:(128, 1723)
samping rate:44100
ipd.Audio(x_st, rate=fs)
# get training dataset and target dataset
x = list(meta_data.loc[:, "filename"])
y = list(meta_data.loc[:, "target"])
# stratify=y keeps the class proportions the same in both partitions
# NOTE(review): no random_state is given, so the split differs between runs
# while the cached .npz files below are reused -- confirm this is acceptable.
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=0.2, stratify=y)
print("x train:{0}\ny train:{1}\nx test:{2}\ny test:{3}".format(len(x_train), len(y_train), len(x_test), len(y_test)))
x train:1600
y train:1600
x test:400
y test:400
freq = 128
time = 1723
# save wave data in npz, with augmentation
def save_np_data(filename, x, y, aug=None, rates=None):
    """Convert audio files to mel spectrograms and store them in one .npz.

    Args:
        filename: Destination path handed to ``np.savez``.
        x: Sequence of audio file names, resolved against ``audio_dir``.
        y: Sequence of targets, parallel to ``x``.
        aug: Optional augmentation callable invoked as
            ``aug(x=wave, rate=rates[i])`` before the spectrogram is computed.
        rates: Per-sample rates; indexed only when ``aug`` is given.

    The archive holds the spectrograms under key ``"x"`` and the targets
    under key ``"y"``.
    """
    # Allocate the (samples, freq, time) buffer directly instead of
    # reshaping a flat zero vector.
    np_data = np.zeros((len(x), freq, time))
    np_targets = np.zeros(len(y))
    for i, (file_name, target) in enumerate(zip(x, y)):
        _x, fs = load_wave_data(audio_dir, file_name)
        if aug is not None:
            _x = aug(x=_x, rate=rates[i])
        _x = calculate_melsp(_x)
        np_data[i] = _x
        np_targets[i] = target
    np.savez(filename, x=np_data, y=np_targets)
# Each archive is generated only when it does not already exist on disk.
# save raw training dataset
if not os.path.exists("esc_melsp_all_train_raw.npz"):
    save_np_data("esc_melsp_all_train_raw.npz", x_train, y_train)
# save test dataset
if not os.path.exists("esc_melsp_all_test.npz"):
    save_np_data("esc_melsp_all_test.npz", x_test, y_test)
# save training dataset with white noise
if not os.path.exists("esc_melsp_train_white_noise.npz"):
    rates = np.random.randint(1, 50, len(x_train)) / 10000
    save_np_data("esc_melsp_train_white_noise.npz", x_train, y_train, aug=add_white_noise, rates=rates)
# save training dataset with sound shift
if not os.path.exists("esc_melsp_train_shift_sound.npz"):
    rates = np.random.choice(np.arange(2, 6), len(y_train))
    save_np_data("esc_melsp_train_shift_sound.npz", x_train, y_train, aug=shift_sound, rates=rates)
# save training dataset with stretch
if not os.path.exists("esc_melsp_train_stretch_sound.npz"):
    rates = np.random.choice(np.arange(80, 120), len(y_train)) / 100
    save_np_data("esc_melsp_train_stretch_sound.npz", x_train, y_train, aug=stretch_sound, rates=rates)
# save training dataset with combination of white noise and shift or stretch
if not os.path.exists("esc_melsp_train_combination.npz"):
    np_data = np.zeros((len(x_train), freq, time))
    np_targets = np.zeros(len(y_train))
    for i in range(len(y_train)):
        x, fs = load_wave_data(audio_dir, x_train[i])
        # NOTE(review): the divisor here is 1000, whereas the white-noise-only
        # dataset uses 10000 -- confirm the stronger noise is intended.
        x = add_white_noise(x=x, rate=np.random.randint(1, 50) / 1000)
        # Coin flip between the two remaining augmentations.
        if np.random.choice((True, False)):
            x = shift_sound(x=x, rate=np.random.choice(np.arange(2, 6)))
        else:
            x = stretch_sound(x=x, rate=np.random.choice(np.arange(80, 120)) / 100)
        x = calculate_melsp(x)
        np_data[i] = x
        np_targets[i] = y_train[i]
    np.savez("esc_melsp_train_combination.npz", x=np_data, y=np_targets)
import keras
from keras.models import Model, load_model
from keras.layers import Input, Dense, Dropout, Activation
from keras.layers import Conv2D, GlobalAveragePooling2D
from keras.layers import BatchNormalization, Add
from keras.callbacks import EarlyStopping, ModelCheckpoint,ReduceLROnPlateau
import warnings
# silence library warnings for the rest of the session
warnings.filterwarnings("ignore")
# dataset files: only the raw (un-augmented) training archive is listed here
train_files = ["esc_melsp_all_train_raw.npz"]
test_file = "esc_melsp_all_test.npz"
# sample counts are taken from the file-name lists built by the split above
train_num = len(x_train)
test_num = len(x_test)
print(train_num)
print(test_num)
1600
400
# define dataset placeholders: one slot of train_num samples per archive
x_train = np.zeros((train_num * len(train_files), freq, time))
y_train = np.zeros(train_num * len(train_files))
# load dataset: archive i fills rows [i*train_num, (i+1)*train_num)
for i, train_file in enumerate(train_files):
    data = np.load(train_file)
    x_train[i * train_num:(i + 1) * train_num] = data["x"]
    y_train[i * train_num:(i + 1) * train_num] = data["y"]
# load test dataset
test_data = np.load(test_file)
x_test = test_data["x"]
y_test = test_data["y"]
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
(1600, 128, 1723)
(1600,)
(400, 128, 1723)
(400,)
# redefine target data into one hot vector
classes = 50
y_train = keras.utils.to_categorical(y_train, classes)
y_test = keras.utils.to_categorical(y_test, classes)
# reshape datasets: append a trailing channel axis of size 1 for Conv2D
x_train = x_train.reshape(train_num * 1, freq, time, 1)
x_test = x_test.reshape(test_num, freq, time, 1)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
(1600, 128, 1723, 1)
(1600, 50)
(400, 128, 1723, 1)
(400, 50)
# "\n" separators restored so each shape is printed on its own line
print("x train:{0}\ny train:{1}\nx test:{2}\ny test:{3}".format(x_train.shape, y_train.shape, x_test.shape, y_test.shape))
x train:(1600, 128, 1723, 1)
y train:(1600, 50)
x test:(400, 128, 1723, 1)
y test:(400, 50)
def cba(inputs, filters, kernel_size, strides):
    """Apply a Conv2D -> BatchNormalization -> ReLU stack to ``inputs``.

    Args:
        inputs: Input tensor of the block.
        filters: Number of convolution filters.
        kernel_size: Convolution kernel size.
        strides: Convolution strides.

    Returns:
        The output tensor of the ReLU activation.
    """
    x = Conv2D(filters, kernel_size=kernel_size, strides=strides, padding='same')(inputs)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    return x
# define CNN
inputs = Input(shape=(x_train.shape[1:]))

# Four parallel branches that differ only in kernel length (8/16/32/64).
# Each alternates a (1, k) and a (k, 1) kernel; every layer strides by 2
# along the axis its kernel spans.
x_1 = cba(inputs, filters=32, kernel_size=(1,8), strides=(1,2))
x_1 = cba(x_1, filters=32, kernel_size=(8,1), strides=(2,1))
x_1 = cba(x_1, filters=64, kernel_size=(1,8), strides=(1,2))
x_1 = cba(x_1, filters=64, kernel_size=(8,1), strides=(2,1))

x_2 = cba(inputs, filters=32, kernel_size=(1,16), strides=(1,2))
x_2 = cba(x_2, filters=32, kernel_size=(16,1), strides=(2,1))
x_2 = cba(x_2, filters=64, kernel_size=(1,16), strides=(1,2))
x_2 = cba(x_2, filters=64, kernel_size=(16,1), strides=(2,1))

x_3 = cba(inputs, filters=32, kernel_size=(1,32), strides=(1,2))
x_3 = cba(x_3, filters=32, kernel_size=(32,1), strides=(2,1))
x_3 = cba(x_3, filters=64, kernel_size=(1,32), strides=(1,2))
x_3 = cba(x_3, filters=64, kernel_size=(32,1), strides=(2,1))

x_4 = cba(inputs, filters=32, kernel_size=(1,64), strides=(1,2))
x_4 = cba(x_4, filters=32, kernel_size=(64,1), strides=(2,1))
x_4 = cba(x_4, filters=64, kernel_size=(1,64), strides=(1,2))
x_4 = cba(x_4, filters=64, kernel_size=(64,1), strides=(2,1))

# Merge the branches by element-wise sum, then two more conv blocks.
x = Add()([x_1, x_2, x_3, x_4])

x = cba(x, filters=128, kernel_size=(1,16), strides=(1,2))
x = cba(x, filters=128, kernel_size=(16,1), strides=(2,1))

# Classification head: global pooling, dense layer, softmax over classes.
x = GlobalAveragePooling2D()(x)
x = Dense(classes)(x)
x = Activation("softmax")(x)

model = Model(inputs, x)
model.summary()
# initiate Adam optimizer (class name ``Adam``; the lowercase alias is not
# available in every Keras release)
opt = keras.optimizers.Adam(lr=0.0001, decay=1e-6, amsgrad=True)
# Let's train the model using Adam with amsgrad
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
# directory for model checkpoints
model_dir = "./models"
# exist_ok avoids the separate existence check before creating the folder
os.makedirs(model_dir, exist_ok=True)
# early stopping and model checkpoint
es_cb = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True,verbose=1, mode='min')
# checkpoint file names embed the epoch number and validation metrics
chkpt = os.path.join(model_dir, 'esc50_.{epoch:02d}_{val_loss:.4f}_{val_acc:.4f}.hdf5')
cp_cb = ModelCheckpoint(filepath = chkpt, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# multiply the learning rate by 0.6 after 3 epochs without val_loss improvement
reduce_lr = ReduceLROnPlateau(monitor="val_loss",factor=0.6,patience=3,verbose=1,mode="min")
# between class data generator
class MixupGenerator():
    """Endless generator of mixed-up (x, y) training batches.

    Each batch is built from ``2 * batch_size`` samples: the first half is
    blended with the second half using per-sample weights drawn from
    ``Beta(alpha, alpha)``; the same weights are applied to inputs and labels.
    """

    def __init__(self, x_train, y_train, batch_size=16, alpha=0.2, shuffle=True):
        """Store the dataset and mixing parameters.

        Args:
            x_train: 4-D array of inputs (first axis indexes samples).
            y_train: 2-D array of targets, parallel to ``x_train``.
            batch_size: Number of mixed samples per yielded batch.
            alpha: Both shape parameters of the Beta distribution.
            shuffle: Whether to reshuffle sample order on every pass.

        Raises:
            ValueError: If there are fewer than ``2 * batch_size`` samples;
                the generator would otherwise loop forever without ever
                yielding a batch.
        """
        self.x_train = x_train
        self.y_train = y_train
        self.batch_size = batch_size
        self.alpha = alpha
        self.shuffle = shuffle
        self.sample_num = len(x_train)
        if self.sample_num < 2 * batch_size:
            raise ValueError(
                "MixupGenerator needs at least 2 * batch_size samples "
                "({0}), got {1}".format(2 * batch_size, self.sample_num)
            )

    def __call__(self):
        """Yield ``(x, y)`` batches indefinitely."""
        pair_size = self.batch_size * 2
        while True:
            indexes = self.__get_exploration_order()
            itr_num = int(len(indexes) // pair_size)
            for i in range(itr_num):
                batch_ids = indexes[i * pair_size:(i + 1) * pair_size]
                x, y = self.__data_generation(batch_ids)
                yield x, y

    def __get_exploration_order(self):
        """Return the sample indices for one pass, shuffled if requested."""
        indexes = np.arange(self.sample_num)
        if self.shuffle:
            np.random.shuffle(indexes)
        return indexes

    def __data_generation(self, batch_ids):
        """Blend the first half of ``batch_ids`` with the second half."""
        # The unpacking doubles as a rank check (4-D inputs, 2-D targets).
        _, h, w, c = self.x_train.shape
        _, class_num = self.y_train.shape
        first_ids = batch_ids[:self.batch_size]
        second_ids = batch_ids[self.batch_size:]
        x1 = self.x_train[first_ids]
        x2 = self.x_train[second_ids]
        y1 = self.y_train[first_ids]
        y2 = self.y_train[second_ids]
        # One mixing weight per sample, broadcast over the remaining axes.
        l = np.random.beta(self.alpha, self.alpha, self.batch_size)
        x_l = l.reshape(self.batch_size, 1, 1, 1)
        y_l = l.reshape(self.batch_size, 1)
        x = x1 * x_l + x2 * (1 - x_l)
        y = y1 * y_l + y2 * (1 - y_l)
        return x, y
# train model
batch_size = 16
epochs = 1000
# pass batch_size explicitly so the generator and steps_per_epoch agree
training_generator = MixupGenerator(x_train, y_train, batch_size=batch_size)()
# the held-out test set serves as validation data for the callbacks
model.fit_generator(generator=training_generator,
                    steps_per_epoch=x_train.shape[0] // batch_size,
                    validation_data=(x_test, y_test),
                    epochs=epochs,
                    verbose=1,
                    shuffle=True,
                    callbacks=[es_cb, cp_cb, reduce_lr])
# evaluate once on the test set (the duplicated second pass was redundant)
evaluation = model.evaluate(x_test, y_test)
print(evaluation)
320/320 [==============================] - 48s 151ms/step
[1.4442142248153687, 0.675]
# spectrogram dimensions, restated for the prediction stage
freq = 128
time = 1723
# directory holding the clips to be predicted
predict_audio_dir = os.path.join(esc_dir, "audio/test/")
predict_file = os.path.join(esc_dir,"test.csv")
# NOTE(review): read_csv is given the bare "test.csv", not predict_file
# built on the previous line -- confirm which location is intended.
predict_data = pd.read_csv("test.csv",header=None,names=["filename"])
# bare expressions -- presumably notebook cells displaying the values
predict_data
predict_data.shape[0]
400
predict = list(predict_data.loc[:,"filename"])
# save wave data in npz, with augmentation
def save_np_data(filename, x, aug=None, rates=None):
    """Convert unlabeled audio files to mel spectrograms and store them.

    Args:
        filename: Destination path handed to ``np.savez``.
        x: Sequence of audio file names, resolved against
            ``predict_audio_dir``.
        aug: Optional augmentation callable invoked as
            ``aug(x=wave, rate=rates[i])`` before the spectrogram is computed.
        rates: Per-sample rates; indexed only when ``aug`` is given.

    The archive holds the spectrograms under key ``"x"``.
    """
    np_data = np.zeros((len(x), freq, time))
    # Iterate over the argument itself, not the module-level ``predict`` list.
    for i, file_name in enumerate(x):
        _x, fs = load_wave_data(predict_audio_dir, file_name)
        if aug is not None:
            _x = aug(x=_x, rate=rates[i])
        _x = calculate_melsp(_x)
        np_data[i] = _x
    np.savez(filename, x=np_data)
# save raw prediction dataset (generated only when the archive is missing)
if not os.path.exists("esc_melsp_predict_raw.npz"):
    save_np_data("esc_melsp_predict_raw.npz", predict)
predict_file = "esc_melsp_predict_raw.npz"
# load prediction dataset; ``predict`` is rebound from the file-name list
# to the loaded archive here
predict = np.load(predict_file)
x_predict = predict["x"]
# append a trailing channel axis of size 1, matching the training tensors
x_predict = x_predict.reshape(predict_data.shape[0], freq, time, 1)
# ensemble: sum the class probabilities predicted by several checkpoints
pred = None
for model_path in ["models/esc50_.14_1.3803_0.7312.hdf5","models/esc50_.18_1.2065_0.7000.hdf5","models/esc50_.20_1.1664_0.7594.hdf5"]:
    model = load_model(model_path)
    if pred is None:
        pred = model.predict(x_predict)
    else:
        pred += model.predict(x_predict)
print(pred.shape)
# the class with the largest summed score is the final prediction
res = np.argmax(pred, axis=1)
print(res[:5])
import pandas as pd
# one row per predicted clip: file name and predicted class index
df = pd.DataFrame({"img_path": predict_data["filename"], "tags": res})
# write without the index column and without a header row
df.to_csv("submit.csv", index=False, header=False)
下次见!!!
本文发布于:2024-02-02 14:32:54,感谢您对本站的认可!
本文链接:https://www.4u4v.net/it/170685557544439.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
留言与评论(共有 0 条评论) |