All of the code in this assignment uses numpy arrays for the computations rather than np.matrix. The two types use different operators for the matrix product and the elementwise product,
so it is best not to mix them; decide which one your code will use before you start.
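A quick illustration of the difference, as a minimal sketch (the arrays here are arbitrary examples):
import numpy as np
a = np.array([[1, 2], [3, 4]])
m = np.matrix(a)
print(a * a)  # for arrays, * is elementwise: [[1 4] [9 16]]
print(a @ a)  # for arrays, @ is the matrix product: [[7 10] [15 22]]
print(m * m)  # for np.matrix, * is already the matrix product: [[7 10] [15 22]]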
Neural Networks
Visualizing the data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from scipy.io import loadmat
data = loadmat('ex4data1.mat')
data
{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Oct 16 13:09:09 2011',
'__version__': '1.0',
'__globals__': [],
'X': array([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]]),
'y': array([[10],
[10],
[10],
...,
[ 9],
[ 9],
[ 9]], dtype=uint8)}
data['X'].shape, data['y'].shape
((5000, 400), (5000, 1))
# Randomly pick 100 images
sample_idx = np.random.choice(np.arange(data['X'].shape[0]), 100)
sample_images = data['X'][sample_idx, :]
sample_images.shape
(100, 400)
# Display the 100 sampled images
fig, ax = plt.subplots(10, 10, sharey=True, sharex=True, figsize=(10, 10))
for r in range(10):
for c in range(10):
ax[r, c].matshow(np.array(sample_images[10 * r + c].reshape(20, 20)).T, cmap=matplotlib.cm.binary)
plt.xticks(np.array([]))
plt.yticks(np.array([]))
Preparing the training data
Recoding the labels
First we recode each label (1, 2, 3, ..., 10) as a one-hot vector: the entry at position y[i]-1 is 1 and every other entry is 0.
For example, y[0] = 6 becomes y[0] = [0, 0, 0, 0, 0, 1, 0, 0, 0, 0].
def expend_y(y):
result = []
    # Turn each element of y into a vector that is 1 at the label's position and 0 elsewhere
for label in y:
y_array = np.zeros(10)
y_array[label-1] = 1
result.append(y_array)
return np.array(result)
'''
Alternatively, use sklearn's one-hot encoder
(note: newer versions of scikit-learn take sparse_output=False instead of sparse=False)
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False)
y_onehot = encoder.fit_transform(y)
y_onehot.shape
'''
Sanity-checking the encoding
y1 = np.arange(1, 11, 1).reshape((10, 1))
y1
array([[ 1],
[ 2],
[ 3],
[ 4],
[ 5],
[ 6],
[ 7],
[ 8],
[ 9],
[10]])
y2 = expend_y(y1)
y2.shape
(10, 10)
y2
array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])
Building the training set
X = np.insert(data['X'], 0, np.ones(len(data['X'])), axis=1)
raw_y = data['y']
y = expend_y(raw_y)
X.shape, y.shape
((5000, 401), (5000, 10))
Loading the pre-trained weights
weight = loadmat('ex4weights.mat')
weight
{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Tue Oct 18 14:57:02 2011',
'__version__': '1.0',
'__globals__': [],
'Theta1': array([[-2.25623899e-02, -1.05624163e-08, 2.19414684e-09, ...,
-1.30529929e-05, -5.04175101e-06, 2.80464449e-09],
[-9.83811294e-02, 7.66168682e-09, -9.75873689e-09, ...,
-5.60134007e-05, 2.00940969e-07, 3.54422854e-09],
[ 1.16156052e-01, -8.77654466e-09, 8.16037764e-09, ...,
-1.20951657e-04, -2.33669661e-06, -7.50668099e-09],
...,
[-1.83220638e-01, -8.89272060e-09, -9.81968100e-09, ...,
2.35311186e-05, -3.25484493e-06, 9.02499060e-09],
[-7.02096331e-01, 3.05178374e-10, 2.56061008e-09, ...,
-8.61759744e-04, 9.43449909e-05, 3.83761998e-09],
[-3.50933229e-01, 8.85876862e-09, -6.57515140e-10, ...,
-1.80365926e-06, -8.14464807e-06, 8.79454531e-09]]),
'Theta2': array([[-0.76100352, -1.21244498, -0.10187131, -2.36850085, -1.05778129,
-2.20823629, 0.56383834, 1.21105294, 2.21030997, 0.44456156,
-1.18244872, 1.04289112, -1.60558756, 1.30419943, 1.37175046,
1.74825095, -0.23365648, -1.52014483, 1.15324176, 0.10368082,
-0.37207719, -0.61530019, -0.1256836 , -2.27193038, -0.71836208,
-1.29690315],
[-0.61785176, 0.61559207, -1.26550639, 1.85745418, -0.91853319,
-0.05502589, -0.38589806, 1.29520853, -1.56843297, -0.97026419,
-2.18334895, -2.85033578, -2.07733086, 1.63163164, 0.3490229 ,
1.82789117, -2.44174379, -0.8563034 , -0.2982564 , -2.07947873,
-1.2933238 , 0.89982032, 0.28306578, 2.31180525, -2.46444086,
1.45656548],
[-0.68934072, -1.94538151, 2.01360618, -3.12316188, -0.2361763 ,
1.38680947, 0.90982429, -1.54774416, -0.79830896, -0.65599834,
0.7353833 , -2.58593294, 0.47210839, 0.55349499, 2.51255453,
-2.4167454 , -1.63898627, 1.2027302 , -1.20245851, -1.83445959,
-1.88013027, -0.34056098, 0.23692483, -1.06137919, 1.02759232,
-0.47690832],
[-0.67832479, 0.46299226, 0.58492321, -0.1650184 , 1.93264192,
-0.22965765, -1.84731492, 0.49011768, 1.07146054, -3.31905643,
1.54113507, 0.37371947, -0.86484681, -2.58273522, 0.97062447,
-0.51021867, -0.68427897, -1.64713607, 0.21153145, -0.27422442,
1.72599755, 1.32418658, -2.63984479, -0.08055871, -2.03510803,
-1.46123776],
[-0.59664339, -2.04481799, 2.05698407, 1.95100909, 0.17637699,
-2.16141218, -0.40394736, 1.80157532, -1.56278739, -0.25253004,
0.23586497, 0.71656699, 1.07689092, -0.35457279, -1.67743058,
-0.12939255, -0.67488849, 1.14066535, 1.32431237, 3.21158484,
-2.15888898, -2.60164082, -3.2226466 , -1.89612906, -0.87488068,
2.51038628],
[-0.87794907, 0.4344112 , -0.93161049, 0.18390778, -0.36078216,
0.61958137, 0.38624948, -2.65150343, 2.29710773, -2.08818098,
-1.86382323, 1.06057836, 0.77562146, 2.1346861 , -1.14973702,
-0.52081426, 0.99743429, -1.48309353, -2.3139424 , 0.29517333,
-0.38704879, -2.20607697, 0.30702191, -1.17646114, -1.63462966,
-0.82467661],
[-0.52746527, 1.21564288, -1.50095981, -2.03195359, -1.52366734,
-2.43732079, -2.37570311, -1.39987277, -0.88735315, -0.63278873,
1.50450176, -1.580763 , 0.58599217, -0.77540416, 0.94257331,
2.10919653, 0.54479132, 0.43773612, -1.28024228, -0.04360994,
1.4774997 , -1.13276949, -0.72846904, 0.04734716, 1.6574566 ,
1.68540944],
[-0.7490154 , -0.72249056, -3.15228173, 0.36577778, 0.19811362,
-0.73059946, 1.65263918, -2.300357 , -1.87468162, 0.98095387,
-1.58825159, 1.35434142, 2.17895331, -1.99239762, -2.00371362,
-0.388613 , -2.33992976, -2.91719062, 0.99398645, -2.70476768,
-1.27139772, 1.86091461, -1.20519404, -0.38014194, 0.7087181 ,
-2.11014003],
[-0.6665468 , 0.53601845, 1.30307573, -1.03372714, -4.03084753,
0.58173469, -2.65717902, 0.80379994, -1.09241928, 2.49910058,
0.362008 , 0.66195337, -0.92160534, -0.83123666, -2.00200952,
-2.94897501, 0.64564202, -1.10114694, 0.74510309, 0.58506717,
-1.99545251, 0.62591105, 1.80596103, -0.22309315, -1.40442136,
-2.1319153 ],
[-0.46089119, -1.43944954, -1.21809509, 0.71093011, 0.45216919,
-0.35953381, 0.62284954, -0.67005297, -0.7069138 , 0.06311351,
-1.23199074, -1.74645233, -2.71960897, -2.21437178, -1.69307505,
-0.90927394, 0.87852311, 1.18664814, -1.87041262, 0.39796295,
1.72113872, -1.36934055, 0.8580668 , -0.24779579, 1.28009118,
-1.32752042]])}
theta1 = weight['Theta1']
theta2 = weight['Theta2']
Unrolling/recombining the parameters
When we train the network with an advanced optimizer, the parameter matrices must be unrolled into a single vector before being passed to the optimization function, and reshaped back afterwards.
def serialize(a, b):
    # Unroll both parameter matrices into one flat vector
    return np.concatenate((np.ravel(a), np.ravel(b)))
a = np.arange(10)
b = np.ones(5)
c = serialize(a, b)
c
array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 1., 1., 1., 1., 1.])
def deserialize(seq):
    # Theta1 is 25x401 (hidden units x input units incl. bias); Theta2 is 10x26
    return seq[:25*401].reshape(25, 401), seq[25*401:].reshape(10, 26)
Model representation
Our network has three layers: an input layer, a hidden layer, and an output layer. The input is the pixel values of the digit images; since each image is 20x20, the input layer has 400 units (not counting the bias unit that is always added). The hidden layer has 25 units and the output layer has 10, one per digit class.
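In symbols, the forward pass implemented in the Feedforward section below is (row-vector convention, one example per row):

$$z^{(2)} = a^{(1)}(\Theta^{(1)})^{T}, \quad a^{(2)} = g(z^{(2)})$$

$$z^{(3)} = a^{(2)}(\Theta^{(2)})^{T}, \quad h_\Theta(x) = a^{(3)} = g(z^{(3)})$$

where $g$ is the sigmoid function and a column of ones (the bias unit) is prepended to $a^{(1)}$ and $a^{(2)}$ before each product.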
Feedforward
# Sigmoid activation function
def sigmoid(z):
return 1 / (1 + np.exp(-z))
# Forward propagation
def forward_propagation(theta, X):  # theta and X are numpy arrays, not matrices
t1, t2 = deserialize(theta)
a1 = X # 5000*401
z2 = a1 @ t1.T # 5000*25
a2 = sigmoid(z2)
a2 = np.insert(a2, 0, np.ones(len(a2)), axis=1) # 5000*26
z3 = a2 @ t2.T # 5000*10
a3 = sigmoid(z3)
h = a3 # 5000*10
return a1, z2, a2, z3, h
theta = serialize(theta1, theta2)
a1, z2, a2, z3, h = forward_propagation(theta, X)
a1.shape, z2.shape, a2.shape, z3.shape, h.shape
((5000, 401), (5000, 25), (5000, 26), (5000, 10), (5000, 10))
forward_propagation(theta, X)[-1]
array([[1.12661530e-04, 1.74127856e-03, 2.52696959e-03, ...,
4.01468105e-04, 6.48072305e-03, 9.95734012e-01],
[4.79026796e-04, 2.41495958e-03, 3.44755685e-03, ...,
2.39107046e-03, 1.97025086e-03, 9.95696931e-01],
[8.85702310e-05, 3.24266731e-03, 2.55419797e-02, ...,
6.22892325e-02, 5.49803551e-03, 9.28008397e-01],
...,
[5.17641791e-02, 3.81715020e-03, 2.96297510e-02, ...,
2.15667361e-03, 6.49826950e-01, 2.42384687e-05],
[8.30631310e-04, 6.22003774e-04, 3.14518512e-04, ...,
1.19366192e-02, 9.71410499e-01, 2.06173648e-04],
[4.81465717e-05, 4.58821829e-04, 2.15146201e-05, ...,
5.73434571e-03, 6.96288990e-01, 8.18576980e-02]])
Cost function
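The cost computed by the code below is the (unregularized) cross-entropy cost

$$J(\Theta) = \frac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K}\left[-y_k^{(i)}\log\big(h_\Theta(x^{(i)})_k\big) - \big(1 - y_k^{(i)}\big)\log\big(1 - h_\Theta(x^{(i)})_k\big)\right]$$

with $K = 10$ classes and $m = 5000$ examples.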
def nnCost(theta, X, y):  # all arguments are numpy arrays
h = forward_propagation(theta, X)[-1]
temp = -y * np.log(h) - (1 - y) * np.log(1- h)
return temp.sum() / len(X)
nnCost(theta, X, y)
0.2876291651613189
Regularized cost function
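Adding the regularization term (the bias columns are excluded) gives

$$J_{reg}(\Theta) = J(\Theta) + \frac{\lambda}{2m}\left[\sum_{j,k\geq 1}\big(\Theta^{(1)}_{j,k}\big)^{2} + \sum_{j,k\geq 1}\big(\Theta^{(2)}_{j,k}\big)^{2}\right]$$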
def nnCostReg(theta, X, y, lambdaRate):
theta1, theta2 = deserialize(theta)
first = nnCost(theta, X, y)
reg1 = (np.power(theta1[:, 1: ], 2)).sum()
reg2 = (np.power(theta2[:, 1: ], 2)).sum()
reg = lambdaRate / (2 * len(X)) * (reg1 + reg2)
return first + reg
nnCostReg(theta, X, y, lambdaRate=1)
0.38376985909092365
Back propagation
Sigmoid gradient
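The derivative of the sigmoid has a simple closed form, which the code below implements:

$$g'(z) = \frac{d}{dz}\frac{1}{1 + e^{-z}} = g(z)\big(1 - g(z)\big)$$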
def sigmoid_gradient(z):
return sigmoid(z) * (1- sigmoid(z))
sigmoid_gradient(0)
0.25
siggra_x = np.arange(-10, 10, 0.01)
plt.plot(siggra_x, sigmoid_gradient(siggra_x))
plt.grid()
plt.show()
As the plot shows, the sigmoid derivative peaks at 0.25 (at z = 0, where g(z) = 0.5). This is one reason sigmoid is ill-suited to deep networks: gradients shrink as they are propagated back through many layers.
Random initialization of theta
When training a neural network, it is important to initialize the parameters randomly in order to break symmetry between units.
An effective strategy is to sample uniformly from (-ε, ε); choosing ε = 0.12 keeps the parameters small enough for training to be efficient. (The exercise's heuristic ε = √6 / √(L_in + L_out) gives roughly this value for these layer sizes.)
# Draw `size` values uniformly at random from (-0.12, 0.12)
def random_init(size):
return np.random.uniform(-0.12, 0.12, size)
random_init((3, 4))
array([[-0.03588089, -0.01860106, 0.0231179 , 0.08834786],
[-0.00270229, -0.04455686, -0.10708566, 0.09343351],
[-0.04520061, 0.06483762, -0.08712267, 0.0488686 ]])
Back propagation
Goal: obtain the gradient of the cost function of the whole network, so that it can be used by an optimization algorithm.
Procedure: for the training set, first run forward propagation; then, for each node in every layer, compute an error term that measures how much that node "contributed" to the error in the final output.
You have to understand both the forward and backward passes here to keep track of the dimensions of every parameter in the network; it helps to write out each propagation step by hand.
Formulas (row-vector convention, matching the code below):

1. $\delta^{(3)} = h - y$
2. $\delta^{(2)} = \delta^{(3)}\,\Theta^{(2)}_{:,1:} \circ g'(z^{(2)})$
3. $\Delta^{(1)} = (\delta^{(2)})^{T} a^{(1)}, \quad \Delta^{(2)} = (\delta^{(3)})^{T} a^{(2)}$
4. $D^{(l)} = \frac{1}{m}\Delta^{(l)}$
Computing the gradient
# Compute the gradient of the cost function via back propagation
def nnGradient(theta, X, y):
theta1, theta2 = deserialize(theta)
    # First run forward propagation
    a1, z2, a2, z3, h = forward_propagation(theta, X)
    # Then compute the error term of each node in every layer
    d3 = h - y  # output-layer error 5000*10
    d2 = d3 @ theta2[:, 1:] * sigmoid_gradient(z2)  # hidden-layer error 5000*25
delta1 = d2.T @ a1 # 25 * 401
delta2 = d3.T @ a2 # 10 * 26
delta = serialize(delta1, delta2) # (10285, )
return delta / len(X) # (10285, )
d1, d2 = deserialize(nnGradient(theta, X, y))
d1.shape, d2.shape
((25, 401), (10, 26))
theta1.shape, theta2.shape
((25, 401), (10, 26))
Gradient checking
For gradient checking, the parameters Θ(1) and Θ(2) are unrolled into one long vector θ. Each partial derivative can then be approximated numerically as

$$\frac{\partial J}{\partial \theta_i} \approx \frac{J(\theta + \epsilon e_i) - J(\theta - \epsilon e_i)}{2\epsilon}$$

where $e_i$ is the i-th unit vector.
def gradient_checking(theta, X, y, eps, regularized=False):
    m = len(theta)
    def a_numeric_grad(plus, minus, X, y):
        # lambdaRate=1 here, to match the analytic gradient below
        if regularized:
            return (nnCostReg(plus, X, y, 1) - nnCostReg(minus, X, y, 1)) / (2 * eps)
        else:
            return (nnCost(plus, X, y) - nnCost(minus, X, y)) / (2 * eps)
    approxGrad = np.zeros(m)
    # Numerically approximate each partial derivative
    for i in range(m):
        thetaPlus = theta.copy()
        thetaPlus[i] = theta[i] + eps
        thetaMinus = theta.copy()
        thetaMinus[i] = theta[i] - eps
        grad = a_numeric_grad(thetaPlus, thetaMinus, X, y)
        approxGrad[i] = grad
    # Analytic gradient from back propagation
    funcGrad = nnGradientReg(theta, X, y, 1) if regularized else nnGradient(theta, X, y)
    diff = np.linalg.norm(approxGrad - funcGrad) / np.linalg.norm(approxGrad + funcGrad)
    print('If your backpropagation implementation is correct,\n'
          + 'the relative difference will be smaller than 1e-9 (assume epsilon=0.0001).\n'
          + 'Relative Difference: {}\n'.format(diff))
# gradient_checking(theta, X, y, eps=0.0001, regularized=False)  # very slow -- run with care
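The full check above evaluates the cost twice for every one of the 10285 parameters. A cheaper spot check is sketched below; quick_gradient_check and its defaults are our own addition, not part of the exercise:
# Spot-check a random handful of coordinates instead of all 10285
def quick_gradient_check(theta, X, y, eps=1e-4, n_checks=10):
    idx = np.random.choice(len(theta), n_checks, replace=False)
    analytic = nnGradient(theta, X, y)[idx]
    numeric = np.zeros(n_checks)
    for j, i in enumerate(idx):
        plus, minus = theta.copy(), theta.copy()
        plus[i] += eps
        minus[i] -= eps
        numeric[j] = (nnCost(plus, X, y) - nnCost(minus, X, y)) / (2 * eps)
    print('max abs difference:', np.abs(analytic - numeric).max())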
Regularized gradient
def nnGradientReg(theta, X, y, lambdaRate):
    first = nnGradient(theta, X, y)
    t1, t2 = deserialize(theta)
    # Copy before zeroing the bias columns: deserialize returns views into
    # theta, so writing into t1/t2 directly would corrupt the caller's theta
    t1 = t1.copy()
    t2 = t2.copy()
    t1[:, 0] = 0  # the bias terms are not regularized
    t2[:, 0] = 0
    reg = lambdaRate / len(X) * serialize(t1, t2)
    return first + reg
d1, d2 = deserialize(nnGradientReg(theta, X, y, lambdaRate=1))
d1.shape, d2.shape
((25, 401), (10, 26))
Learning the parameters (fmincg in the original exercise; here scipy.optimize)
from scipy import optimize as opt
def nnTraining(X, y):
    # Randomly initialize theta (25*401 + 10*26 = 10285 parameters)
init_theta = random_init(10285)
res = opt.minimize(fun=nnCostReg,
x0=init_theta,
args=(X, y, 1),
method='TNC',
jac=nnGradientReg,
options={'maxiter':500})
return res
res = nnTraining(X, y)
res
fun: 0.33298285824302043
jac: array([ 1.41541160e-04, -2.29401553e-07, -2.55382916e-07, ...,
-5.01885346e-06, -3.60910199e-05, -2.47061167e-05])
message: 'Max. number of function evaluations reached'
nfev: 250
nit: 21
status: 3
success: False
x: array([ 0.00000000e+00, -1.14700777e-03, -1.27691458e-03, ...,
1.63199029e+00, -2.96556538e+00, -1.95288757e+00])
Evaluating performance
final_theta = res.x
final_theta.shape
(10285,)
h = forward_propagation(final_theta, X)[-1]
h.shape
(5000, 10)
y_predict = np.argmax(h, axis=1) + 1
y_predict = y_predict.reshape(5000, 1)
y_predict
array([[10],
[10],
[10],
...,
[ 9],
[ 9],
[10]], dtype=int64)
raw_y
array([[10],
[10],
[10],
...,
[ 9],
[ 9],
[ 9]], dtype=uint8)
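Overall accuracy can also be read off directly (a one-line check):
# Fraction of predictions that match the original labels
print('accuracy:', (y_predict == raw_y).mean())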
from sklearn.metrics import classification_report
print(classification_report(raw_y, y_predict))
precision recall f1-score support
1 0.97 0.99 0.98 500
2 0.98 0.98 0.98 500
3 0.98 0.98 0.98 500
4 1.00 0.95 0.97 500
5 1.00 0.98 0.99 500
6 0.98 0.99 0.99 500
7 0.98 0.99 0.98 500
8 0.98 0.99 0.99 500
9 0.99 0.96 0.97 500
10 0.96 1.00 0.98 500
accuracy 0.98 5000
macro avg 0.98 0.98 0.98 5000
weighted avg 0.98 0.98 0.98 5000
Visualizing the hidden layer
theta1, theta2 = deserialize(final_theta)
hidden_layer = theta1[:, 1: ]
hidden_layer.shape
(25, 400)
fig, ax = plt.subplots(5, 5, sharey=True, sharex=True, figsize=(10, 10))
for r in range(5):
for c in range(5):
ax[r, c].matshow(np.array(hidden_layer[5 * r + c].reshape(20, 20)).T, cmap=matplotlib.cm.binary)
plt.xticks(np.array([]))
plt.yticks(np.array([]))