This code implements a DNN. See the .git repository for the base resources.
# Author: Sining Sun, Zhanheng Yang, Binbin Zhang
import math
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import kaldi_io
from utils import *


def plot_spectrogram(spec, file_name):
    fig = plt.figure(figsize=(20, 10))
    plt.plot(spec)
    plt.xlabel('epochs')
    plt.ylabel('loss')
    plt.savefig(file_name)
    plt.show()


targets_list = ['Z', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'O']
targets_mapping = {}
for i, x in enumerate(targets_list):
    targets_mapping[x] = i


class Layer:
    def forward(self, input):
        ''' Forward function by input
        Args:
            input: input, B * N matrix, B for batch size
        Returns:
            output when this layer is applied
        '''
        raise NotImplementedError('forward is not implemented')

    def backward(self, input, output, d_output):
        ''' Compute the gradient of this layer's input from
            (input, output, d_output), as well as the gradient of this
            layer's parameters
        Args:
            input: input of this layer
            output: output of this layer
            d_output: accumulated gradient from the final output to this
                layer's output
        Returns:
            accumulated gradient from the final output to this layer's input
        '''
        raise NotImplementedError('backward is not implemented')

    def set_learning_rate(self, lr):
        ''' Set learning rate of this layer'''
        self.learning_rate = lr

    def update(self):
        ''' Update this layer's parameters if it has any, otherwise do nothing'''


class ReLU(Layer):
    def forward(self, input):
        mat = np.maximum(0, input)
        return mat.T

    def backward(self, input, output, d_output):
        mat = np.array(d_output, copy=True)
        mat[input <= 0] = 0
        return mat.T


class FullyConnect(Layer):
    def __init__(self, in_dim, out_dim):
        # He initialization for the weight, zero initialization for the bias
        self.w = np.random.randn(out_dim, in_dim) * np.sqrt(2.0 / in_dim)  # weight
        self.b = np.zeros((out_dim, 1))  # bias
        self.dw = np.zeros((out_dim, in_dim))
        self.db = np.zeros((out_dim, 1))

    def forward(self, input):
        mat = np.dot(self.w, input.T) + self.b
        return mat

    def backward(self, input, output, d_output):
        batch_size = input.shape[0]
        in_diff = None
        # BEGIN_LAB, compute in_diff/dw/db here
        # (no division by batch_size here; the normalization is applied once, below)
        self.dw = np.dot(d_output, input)
        self.db = np.sum(d_output, axis=1, keepdims=True)
        in_diff = np.dot(self.w.T, d_output).T
        # END_LAB
        # Normalize dw/db by batch size
        self.dw = self.dw / batch_size
        self.db = self.db / batch_size
        return in_diff

    def update(self):
        self.w = self.w - self.learning_rate * self.dw
        self.b = self.b - self.learning_rate * self.db


class Softmax(Layer):
    def forward(self, input):
        _input = input.T
        # Subtract the row max for numerical stability before exponentiating
        row_max = _input.max(axis=1).reshape(_input.shape[0], 1)
        x = _input - row_max
        return np.exp(x) / np.sum(np.exp(x), axis=1).reshape(x.shape[0], 1)

    def backward(self, input, output, d_output):
        ''' Directly return d_output: as shown in train(), the grad is to
            the activation (input) of softmax
        '''
        return d_output


class DNN:
    def __init__(self, in_dim, out_dim, hidden_dim, num_hidden):
        self.layers = []
        self.layers.append(FullyConnect(in_dim, hidden_dim[0]))
        self.layers.append(ReLU())
        for i in range(num_hidden):
            self.layers.append(FullyConnect(hidden_dim[i], hidden_dim[i + 1]))
            self.layers.append(ReLU())
        self.layers.append(FullyConnect(hidden_dim[len(hidden_dim) - 1], out_dim))
        self.layers.append(Softmax())

    def set_learning_rate(self, lr):
        for layer in self.layers:
            layer.set_learning_rate(lr)

    def forward(self, input):
        self.forward_buf = []
        out = input
        self.forward_buf.append(out)
        for i in range(len(self.layers)):
            out = self.layers[i].forward(out)
            self.forward_buf.append(out)
        assert (len(self.forward_buf) == len(self.layers) + 1)
        return out

    def backward(self, grad):
        '''
        Args:
            grad: the grad is to the activation before softmax
        '''
        self.backward_buf = [None] * len(self.layers)
        self.backward_buf[len(self.layers) - 1] = grad
        for i in range(len(self.layers) - 2, -1, -1):
            grad = self.layers[i].backward(self.forward_buf[i],
                                           self.forward_buf[i + 1],
                                           self.backward_buf[i + 1].T)
            self.backward_buf[i] = grad

    def update(self):
        for layer in self.layers:
            layer.update()


def one_hot(labels, total_label):
    output = np.zeros((labels.shape[0], total_label))
    for i in range(labels.shape[0]):
        output[i][labels[i]] = 1.0
    return output
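# --- Optional sanity check (not part of the original lab) ---
# The training loop below backpropagates `grad = out - one_hot_label`, i.e. the
# gradient of the cross-entropy loss with respect to the pre-softmax activation.
# This is a minimal sketch that verifies the formula numerically on a tiny
# random example with central differences; the function name and the sizes used
# here are arbitrary choices for illustration.
def _check_softmax_ce_grad(eps=1e-6):
    rng = np.random.RandomState(0)
    z = rng.randn(1, 4)          # pre-softmax activation for one sample
    label = np.zeros((1, 4))
    label[0, 2] = 1.0            # arbitrary one-hot target

    def ce_loss(logits):
        shifted = logits - logits.max(axis=1, keepdims=True)
        prob = np.exp(shifted) / np.sum(np.exp(shifted), axis=1, keepdims=True)
        return -np.sum(np.log(prob + 1e-20) * label)

    # Analytic gradient: softmax(z) - one_hot, the same formula used in train()
    prob = np.exp(z - z.max(axis=1, keepdims=True))
    prob = prob / np.sum(prob, axis=1, keepdims=True)
    analytic = prob - label
    # Numerical gradient via central differences
    numeric = np.zeros_like(z)
    for j in range(z.shape[1]):
        z_plus, z_minus = z.copy(), z.copy()
        z_plus[0, j] += eps
        z_minus[0, j] -= eps
        numeric[0, j] = (ce_loss(z_plus) - ce_loss(z_minus)) / (2 * eps)
    assert np.allclose(analytic, numeric, atol=1e-5)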
def train(dnn):
    utt2feat, utt2target = read_feats_and_targets('train/feats.scp',
                                                  'train/text')
    inputs, labels = build_input(targets_mapping, utt2feat, utt2target)
    num_samples = inputs.shape[0]
    # Shuffle data
    permute = np.random.permutation(num_samples)
    inputs = inputs[permute]
    labels = labels[permute]
    num_epochs = 100
    batch_size = 100
    avg_loss = np.zeros(num_epochs)
    for i in range(num_epochs):
        cur = 0
        while cur < num_samples:
            end = min(cur + batch_size, num_samples)
            input = inputs[cur:end]
            label = labels[cur:end]
            # Step1: forward
            out = dnn.forward(input)
            one_hot_label = one_hot(label, out.shape[1])
            # Step2: Compute cross entropy loss and backward
            loss = -np.sum(np.log(out + 1e-20) * one_hot_label) / out.shape[0]
            # The grad is to the activation before softmax
            grad = out - one_hot_label
            dnn.backward(grad)
            # Step3: update parameters
            dnn.update()
            print('Epoch {} num_samples {} loss {}'.format(i, cur, loss))
            avg_loss[i] += loss
            cur += batch_size
        avg_loss[i] /= math.ceil(num_samples / batch_size)
    plot_spectrogram(avg_loss, 'loss.png')


def test(dnn):
    utt2feat, utt2target = read_feats_and_targets('test/feats.scp',
                                                  'test/text')
    total = len(utt2feat)
    correct = 0
    for utt in utt2feat:
        t = utt2target[utt]
        ark = utt2feat[utt]
        mat = kaldi_io.read_mat(ark)
        mat = splice(mat, 5, 5)
        posterior = dnn.forward(mat)
        # Average the frame-level posteriors over the utterance
        posterior = np.sum(posterior, axis=0) / float(mat.shape[0])
        predict = targets_list[np.argmax(posterior)]
        if t == predict:
            correct += 1
        print('label: {} predict: {}'.format(t, predict))
    print('Acc: {}'.format(float(correct) / total))


def main():
    np.random.seed(777)
    # We splice the raw feat with left 5 frames and right 5 frames
    # So the input here is 39 * (5 + 1 + 5) = 429
    dnn = DNN(429, 11, [170, 200, 150], 2)
    dnn.set_learning_rate(2e-2)
    train(dnn)
    test(dnn)


if __name__ == '__main__':
    main()
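# --- Illustration only ---
# The `splice` helper used in test() comes from utils.py, which is not shown in
# this post. The sketch below is a hypothetical frame-splicing function that
# matches the comment in main() (left 5 + current + right 5 frames,
# 39 * 11 = 429 dims); the real utils.splice may handle the edges differently.
def splice_sketch(feats, left, right):
    num_frames, feat_dim = feats.shape
    # Pad by repeating the first/last frame so every frame has full context
    padded = np.concatenate([np.repeat(feats[:1], left, axis=0),
                             feats,
                             np.repeat(feats[-1:], right, axis=0)], axis=0)
    spliced = np.zeros((num_frames, feat_dim * (left + 1 + right)))
    for t in range(num_frames):
        # Flatten the context window centered on frame t into one row
        spliced[t] = padded[t:t + left + 1 + right].reshape(-1)
    return spliced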