Full-matrix approach to backpropagation in an Artificial Neural Network

I have recently been learning about Artificial Neural Networks (ANN), and I have working Python code for training one with mini-batches. I have been following Michael Nielsen's book Neural Networks and Deep Learning, which explains every algorithm step by step for beginners. It also comes with fully working code for handwritten digit recognition, which works for me as well.
However, I am trying to improve the code a bit by pushing the whole mini-batch through backpropagation at once, in matrix form. I have developed working code for this as well, but it runs very slowly. My idea was to assemble the per-example inputs, weights, and biases into block-diagonal matrices with scipy.linalg.block_diag, but this makes every matrix in the pass n times larger in each dimension (and mostly zeros), which I suspect is what kills the performance. Is there a way to implement a full-matrix approach to mini-batch learning based on the backpropagation algorithm?
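For reference, this is the kind of fully vectorized pass I am aiming for: the examples of a mini-batch are stacked as columns of a single matrix, the (k, 1) bias vectors are broadcast across those columns, and the per-example gradients are summed by one matrix product (for the weights) and one column sum (for the biases). This is only a minimal sketch; the name forward_backward and the (features, batch_size) shapes are my own assumptions, not code from the book.

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def sigmoid_prime(z):
    s = sigmoid(z)
    return s * (1.0 - s)

def forward_backward(weights, biases, X, Y):
    # X and Y hold one example per column: X is (features, n), Y is (outputs, n).
    # They can be built from per-example column vectors with e.g.
    #   X = np.hstack([x for x, y in mini_batch]); Y = np.hstack([y for x, y in mini_batch])
    zs = []
    activations = [X]
    a = X
    for w, b in zip(weights, biases):
        z = np.dot(w, a) + b              # b is (k, 1) and broadcasts over the n columns
        zs.append(z)
        a = sigmoid(z)
        activations.append(a)
    # Output-layer error for the quadratic cost (eq. BP1), shape (outputs, n).
    delta = (activations[-1] - Y) * sigmoid_prime(zs[-1])
    nabla_b = [None] * len(biases)
    nabla_w = [None] * len(weights)
    nabla_b[-1] = delta.sum(axis=1, keepdims=True)  # column sum = sum over the batch (eq. BP3)
    nabla_w[-1] = np.dot(delta, activations[-2].T)  # the product already sums over the batch (eq. BP4)
    for l in range(2, len(weights) + 1):
        delta = np.dot(weights[-l + 1].T, delta) * sigmoid_prime(zs[-l])  # eq. BP2
        nabla_b[-l] = delta.sum(axis=1, keepdims=True)
        nabla_w[-l] = np.dot(delta, activations[-l - 1].T)
    return nabla_w, nabla_b

With this layout the update is simply w - (eta / n) * nw per layer, with no block-diagonal bookkeeping or split_cases-style summing afterwards. My full block-diagonal attempt is below: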
import numpy as np
import pandas as pd
from scipy.linalg import block_diag
class Network:
    def __init__(self, sizes):
        self.layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x) for y, x in zip(sizes[1:], sizes[:-1])]

    def feed_forward(self, a):
        for w, b in zip(self.weights, self.biases):
            a = sigmoid(np.dot(w, a) + b)
        return a

    # Calculate the cost derivative (gradient of C w.r.t. 'a' - nabla C(a))
    def cost_derivative(self, output_activation, y):
        return (output_activation - y)
    def update_mini_batch(self, mini_batch, eta):
        n = len(mini_batch)
        # Stack the mini-batch inputs and targets into block-diagonal matrices.
        xs = [x for x, y in mini_batch]
        features = block_diag(*xs)
        ys = [y for x, y in mini_batch]
        responses = block_diag(*ys)
        # Replicate each weight matrix n times and arrange the copies block-diagonally.
        ws = [a for a in self.weights for i in xrange(n)]
        new_list = []
        k = 0
        while k < len(ws):
            new_list.append(ws[k: k + n])
            k += n
        weights = [block_diag(*elems) for elems in new_list]
        # Do the same for the biases.
        bs = [b for b in self.biases for i in xrange(n)]
        new_list2 = []
        j = 0
        while j < len(bs):
            new_list2.append(bs[j: j + n])
            j += n
        biases = [block_diag(*elems) for elems in new_list2]
        # Tile the biases and weights up to the enlarged dimensions; these are
        # only used to size the gradient accumulators below.
        biases_dim_1 = [np.dot(np.ones((n * b.shape[0], b.shape[0])), b) for b in self.biases]
        biases_dim_2 = [np.dot(b, np.ones((b.shape[1], n * b.shape[1]))) for b in biases_dim_1]
        weights_dim_1 = [np.dot(np.ones((n * w.shape[0], w.shape[0])), w) for w in self.weights]
        weights_dim_2 = [np.dot(w, np.ones((w.shape[1], n * w.shape[1]))) for w in weights_dim_1]
        nabla_b = [np.zeros(b.shape) for b in biases_dim_2]
        nabla_w = [np.zeros(w.shape) for w in weights_dim_2]
        # Forward pass over the whole block-diagonal mini-batch.
        zs = []
        activation = features
        activations = [features]
        for w, b in zip(weights, biases):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        # Backward pass: output-layer error first (eq. BP1).
        delta = self.cost_derivative(activations[-1], responses) * sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())
        for l in xrange(2, self.layers):
            z = zs[-l]  # the weighted input for that layer
            activation_prime = sigmoid_prime(z)  # the derivative of activation for the layer
            delta = np.dot(weights[-l + 1].transpose(), delta) * activation_prime  # propagate the error back (eq. BP2)
            nabla_b[-l] = delta  # the bias adjustments (eq. BP3)
            nabla_w[-l] = np.dot(delta, activations[-l - 1].transpose())  # the weight adjustments (eq. BP4)
        # Collapse the per-example blocks back to the original shapes and update.
        delta_b = [self.split_cases(b, n) for b in nabla_b]
        delta_w = [self.split_cases(w, n) for w in nabla_w]
        self.weights = [w - (eta / n) * nw for w, nw in zip(self.weights, delta_w)]
        self.biases = [b - (eta / n) * nb for b, nb in zip(self.biases, delta_b)]
    def split_cases(self, mat, mini_batch_size):
        """Sum the per-example blocks lying along the diagonal of `mat`."""
        i = 0
        j = 0
        dim1 = mat.shape[0] / mini_batch_size
        dim2 = mat.shape[1] / mini_batch_size
        sum_samples = np.zeros((dim1, dim2))
        while i < len(mat):
            sum_samples = sum_samples + mat[i: i + dim1, j: j + dim2]
            i += dim1
            j += dim2
        return sum_samples
"""Stochastic Gradient Descent for training in epochs"""
def SGD(self, training_data, epochs, mini_batch_size, eta, test_data = None):
n = len(training_data)
if test_data:
n_test = len(test_data)
for j in xrange(epochs):
np.random.shuffle(training_data) # for each epochs the mini-batches are selected randomly
mini_batches = [training_data[k: k+mini_batch_size] for k in xrange(0, n, mini_batch_size)] # select equal sizes of mini-batches for the epochs (last mini_batch size might differ however)
c = 1
for mini_batch in mini_batches:
print "Updating mini-batch {0}".format(c)
self.update_mini_batch(mini_batch, eta)
c += 1
if test_data:
print "Epoch {0}: {1}/{2}".format(j, self.evaluate(test_data), n_test)
else:
print "Epoch {0} completed.".format(j)
    def evaluate(self, test_data):
        test_results = [(np.argmax(self.feed_forward(x)), y) for (x, y) in test_data]
        return sum(int(x == y) for x, y in test_results)

    def export_results(self, test_data):
        results = [(np.argmax(self.feed_forward(x)), y) for (x, y) in test_data]
        k = pd.DataFrame(results)
        k.to_csv('net_results.csv')
# Global functions
## Activation function (sigmoid)
@np.vectorize
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

## Activation derivative (sigmoid_prime)
@np.vectorize
def sigmoid_prime(z):
    return sigmoid(z) * (1 - sigmoid(z))
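For completeness, a hypothetical driver script showing how I train and evaluate the network, mirroring the book's MNIST example (the mnist_loader module with load_data_wrapper() ships with the book's code and is assumed to be available here; the 784-30-10 layout is the one used in the book):

# mnist_loader.load_data_wrapper() returns (x, y) pairs where x has shape
# (784, 1), y is a one-hot (10, 1) vector for the training data and an
# integer label for the test data - exactly what evaluate() above expects.
import mnist_loader

training_data, validation_data, test_data = mnist_loader.load_data_wrapper()
net = Network([784, 30, 10])
net.SGD(training_data, epochs=30, mini_batch_size=10, eta=3.0, test_data=test_data)
net.export_results(test_data)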