How to Build a Neural Network
- First, import all the packages needed for this assignment
- Initialize the parameters of an L-layer neural network
- Forward propagation
- Compute the cost function
- Backward propagation
- Update the parameters
Import all the packages needed for this assignment
import numpy as np                # numpy is the main package for scientific computing in Python
import h5py
import matplotlib.pyplot as plt   # matplotlib is a library for plotting graphs in Python
from testCases_v2 import *
from dnn_utils_v2 import sigmoid, sigmoid_backward, relu, relu_backward  # dnn_utils_v2 provides some necessary helper functions

%matplotlib inline
plt.rcParams['figure.figsize'] = (5.0, 4.0)  # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
# plt (matplotlib.pyplot) reads the rc configuration ("rc parameters") to customize the default properties of figures

# auto-reload extension
%load_ext autoreload
%autoreload 2

np.random.seed(1)  # fix the random seed so the results are reproducible
Initialize the parameters
Initializing a deep L-layer network is more involved because there are more weight matrices and bias vectors. When completing initialize_parameters_deep(), make sure the dimensions of adjacent layers match.
def initialize_parameters_deep(layer_dims):
    # layer_dims is a list holding the size of each layer, which determines the dimensions of the weight matrices
    np.random.seed(3)
    parameters = {}
    L = len(layer_dims)  # number of layers in the network
    for l in range(1, L):
        parameters["W" + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * 0.01
        parameters["b" + str(l)] = np.zeros((layer_dims[l], 1))
        assert(parameters['W' + str(l)].shape == (layer_dims[l], layer_dims[l-1]))
        assert(parameters['b' + str(l)].shape == (layer_dims[l], 1))
    return parameters
Test it:
parameters = initialize_parameters_deep([5,4,3])
print("W1 = " + str(parameters["W1"]))
print("b1 = " + str(parameters["b1"]))
print("W2 = " + str(parameters["W2"]))
print("b2 = " + str(parameters["b2"]))
The output is:
W1 = [[ 0.01788628  0.0043651   0.00096497 -0.01863493 -0.00277388]
 [-0.00354759 -0.00082741 -0.00627001 -0.00043818 -0.00477218]
 [-0.01313865  0.00884622  0.00881318  0.01709573  0.00050034]
 [-0.00404677 -0.0054536  -0.01546477  0.00982367 -0.01101068]]
b1 = [[0.]
 [0.]
 [0.]
 [0.]]
W2 = [[-0.01185047 -0.0020565   0.01486148  0.00236716]
 [-0.01023785 -0.00712993  0.00625245 -0.00160513]
 [-0.00768836 -0.00230031  0.00745056  0.01976111]]
b2 = [[0.]
 [0.]
 [0.]]
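As a quick sanity check of the dimension rule (W[l] has shape (layer_dims[l], layer_dims[l-1]) and b[l] has shape (layer_dims[l], 1)), here is a small illustrative snippet; the variable names are arbitrary:
layer_dims = [5, 4, 3]
params = initialize_parameters_deep(layer_dims)
for l in range(1, len(layer_dims)):
    print("W" + str(l), params["W" + str(l)].shape, "b" + str(l), params["b" + str(l)].shape)
# Expected: W1 (4, 5)  b1 (4, 1)
#           W2 (3, 4)  b2 (3, 1)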
Forward propagation
Factor the linear part of forward propagation, Z[l] = W[l] A[l-1] + b[l], together with the caching of its inputs, into a shared helper linear_forward().
def linear_forward(A, W, b):
    Z = np.dot(W, A) + b
    assert(Z.shape == (W.shape[0], A.shape[1]))
    cache = (A, W, b)
    return Z, cache
Because the model has L-1 layers with the ReLU activation followed by one layer with the sigmoid activation, linear_activation_forward() takes an extra argument that selects between relu and sigmoid.
def linear_activation_forward(A_prev, W, b, activation):
    if activation == "sigmoid":
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = sigmoid(Z)
    elif activation == "relu":
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = relu(Z)
    assert (A.shape == (W.shape[0], A_prev.shape[1]))
    cache = (linear_cache, activation_cache)
    return A, cache
The sigmoid function: A = 1/(1+np.exp(-Z))
The ReLU function: A = np.maximum(0, Z)
linear_cache: (A, W, b)
activation_cache: Z
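The forward helpers imported from dnn_utils_v2 are not shown in this post; below is a minimal sketch of what they might look like, assuming each one returns the activation A together with Z as its cache (as described above):
def sigmoid(Z):
    # Sigmoid activation; returns A and caches Z for the backward pass
    A = 1 / (1 + np.exp(-Z))
    cache = Z
    return A, cache

def relu(Z):
    # ReLU activation; returns A and caches Z for the backward pass
    A = np.maximum(0, Z)
    cache = Z
    return A, cache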
Test it:
A_prev, W, b = linear_activation_forward_test_case()
A, linear_activation_cache = linear_activation_forward(A_prev, W, b, activation = "sigmoid")
print("With sigmoid: A = " + str(A))
A, linear_activation_cache = linear_activation_forward(A_prev, W, b, activation = "relu")
print("With ReLU: A = " + str(A))
The output is:
With sigmoid: A = [[0.96890023 0.11013289]]
With ReLU: A = [[3.43896131 0. ]]
Having completed the two functions above, we can now implement forward propagation for the full L-layer model.
X: the data, a numpy array of shape (input size, number of examples)
parameters: the output of initialize_parameters_deep()
L: parameters stores one (W, b) pair per layer, so dividing its length by 2 gives the number of layers
def L_model_forward(X, parameters):
    caches = []
    A = X
    L = len(parameters) // 2  # number of layers in the network
    for l in range(1, L):
        A_prev = A
        A, linear_activation_cache = linear_activation_forward(A_prev, parameters["W" + str(l)], parameters["b" + str(l)], activation = "relu")
        caches.append(linear_activation_cache)
    AL, linear_activation_cache = linear_activation_forward(A, parameters["W" + str(L)], parameters["b" + str(L)], activation = "sigmoid")
    caches.append(linear_activation_cache)
    assert(AL.shape == (1, X.shape[1]))
    return AL, caches
Test it:
X, parameters = L_model_forward_test_case()
AL, caches = L_model_forward(X, parameters)
print("AL = " + str(AL))
print("Length of caches list = " + str(len(caches)))
The output is:
AL = [[0.17007265 0.2524272 ]]
Length of caches list = 2
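Each element of caches is the (linear_cache, activation_cache) pair saved by one layer, i.e. ((A_prev, W, b), Z). A hypothetical way to inspect the shapes it stores:
# Unpack each layer's cache: linear_cache = (A_prev, W, b), activation_cache = Z
for l, ((A_prev, W, b), Z) in enumerate(caches, start=1):
    print("layer", l, "| A_prev", A_prev.shape, "| W", W.shape, "| b", b.shape, "| Z", Z.shape)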
Compute the cost function
The cost function measures how far your predictions are from the true values. Here it is the cross-entropy cost: cost = -(1/m) * np.sum(Y * np.log(AL) + (1 - Y) * np.log(1 - AL)).
def compute_cost(AL, Y):
    m = Y.shape[1]
    cost = (-1/m) * np.sum(np.log(AL) * Y + np.log(1 - AL) * (1 - Y))
    cost = np.squeeze(cost)
    assert(cost.shape == ())
    return cost
Test it:
Y, AL = compute_cost_test_case()
print("cost = " + str(compute_cost(AL, Y)))
The output is:
cost = 0.41493159961539694
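As an illustration with hypothetical values: when all labels are 1, the cost is just the average negative log of the predicted probabilities.
# Hypothetical example: three training examples, all with label 1
Y_demo  = np.array([[1, 1, 1]])
AL_demo = np.array([[0.8, 0.9, 0.4]])
# cost = -(1/3) * (ln(0.8) + ln(0.9) + ln(0.4)) ≈ 0.4149
print("cost = " + str(compute_cost(AL_demo, Y_demo)))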
Backward propagation
Backward propagation computes the gradients of the cost function with respect to the parameters.
As with forward propagation, you will build backward propagation in three steps:
- linear backward
- linear -> activation backward, where the activation part computes the derivative of ReLU or sigmoid
- full-model backward ([LINEAR -> RELU] * (L-1) -> LINEAR -> SIGMOID)
For the first step, suppose you have already computed dZ[l]; you want dW[l], db[l] and dA[l-1]. The formulas are:
dW[l] = np.dot(dZ[l], A[l-1].T) / m
db[l] = np.sum(dZ[l], axis=1, keepdims=True) / m
dA[l-1] = np.dot(W[l].T, dZ[l])
Step 1:
def linear_backward(dZ, cache):
    A_prev, W, b = cache
    m = A_prev.shape[1]  # number of examples in A_prev
    dA_prev = np.dot(W.T, dZ)
    dW = np.dot(dZ, A_prev.T) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    assert (dA_prev.shape == A_prev.shape)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)
    return dA_prev, dW, db
Step 2:
def linear_activation_backward(dA, cache, activation):
    linear_cache, activation_cache = cache
    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
    return dA_prev, dW, db
The helper functions are as follows:
def relu_backward(dA, cache):
    Z = cache
    dZ = np.array(dA, copy=True)
    dZ[Z <= 0] = 0
    assert (dZ.shape == Z.shape)
    return dZ
Because the derivative of ReLU is 1 where the input is greater than 0 and 0 where it is less than 0, the backward pass only needs to set the gradient to 0 wherever the input Z was non-positive.
def sigmoid_backward(dA, cache):
    Z = cache
    s = 1 / (1 + np.exp(-Z))
    dZ = dA * s * (1 - s)
    assert (dZ.shape == Z.shape)
    return dZ
Step 3:
With the per-layer backward functions written, we can now write the full-model backward pass. The final output is AL = sigmoid(ZL), so we need the gradient dAL that starts backpropagation, given by the derivative of the cost with respect to AL:
dAL = -(Y/AL - (1 - Y)/(1 - AL)), implemented as - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL)).
def L_model_backward(AL, Y, caches):
    grads = {}
    L = len(caches)
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    current_cache = caches[L-1]
    grads["dA" + str(L)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, activation = "sigmoid")
    for l in reversed(range(L - 1)):
        current_cache = caches[l]
        dA_prev, dW, db = linear_activation_backward(grads["dA" + str(l + 2)], current_cache, activation = "relu")
        grads["dA" + str(l+1)] = dA_prev
        grads["dW" + str(l+1)] = dW
        grads["db" + str(l+1)] = db
    return grads
Test it:
AL, Y_assess, caches = L_model_backward_test_case()
grads = L_model_backward(AL, Y_assess, caches)
print ("dW1 = "+ str(grads["dW1"]))
print ("db1 = "+ str(grads["db1"]))
print ("dA1 = "+ str(grads["dA1"]))
The output is:
dW1 = [[0.41010002 0.07807203 0.13798444 0.10502167]
 [0.         0.         0.         0.        ]
 [0.05283652 0.01005865 0.01777766 0.0135308 ]]
db1 = [[-0.22007063]
 [ 0.        ]
 [-0.02835349]]
dA1 = [[ 0.          0.52257901]
 [ 0.         -0.3269206 ]
 [ 0.         -0.32070404]
 [ 0.         -0.74079187]]
Update the parameters
Each parameter is updated with gradient descent: W[l] = W[l] - learning_rate * dW[l] and b[l] = b[l] - learning_rate * db[l].
def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2  # parameters holds one (W, b) pair per layer
    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * grads["dW" + str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * grads["db" + str(l+1)]
    return parameters
Test it:
parameters, grads = update_parameters_test_case()
parameters = update_parameters(parameters, grads, 0.1)
print ("W1 = "+ str(parameters["W1"]))
print ("b1 = "+ str(parameters["b1"]))
print ("W2 = "+ str(parameters["W2"]))
print ("b2 = "+ str(parameters["b2"]))
The output is:
W1 = [[-0.59562069 -0.09991781 -2.14584584  1.82662008]
 [-1.76569676 -0.80627147  0.51115557 -1.18258802]
 [-1.0535704  -0.86128581  0.68284052  2.20374577]]
b1 = [[-0.04659241]
 [-1.28888275]
 [ 0.53405496]]
W2 = [[-0.55569196 0.0354055 1.32964895]]
b2 = [[-0.84610769]]
Assembling all of the steps above in order gives you a simple deep neural network.
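For reference, here is a minimal sketch of how the pieces might be assembled into a training loop. The function name L_layer_model and the hyperparameter defaults below are illustrative assumptions, not part of the code above:
def L_layer_model(X, Y, layer_dims, learning_rate=0.0075, num_iterations=2500):
    # Assemble the helpers above: initialize, then loop over forward -> cost -> backward -> update
    parameters = initialize_parameters_deep(layer_dims)
    for i in range(num_iterations):
        AL, caches = L_model_forward(X, parameters)            # forward propagation
        cost = compute_cost(AL, Y)                             # cross-entropy cost
        grads = L_model_backward(AL, Y, caches)                # backward propagation
        parameters = update_parameters(parameters, grads, learning_rate)  # gradient descent step
        if i % 100 == 0:
            print("Cost after iteration %i: %f" % (i, cost))
    return parameters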