Full-matrix approach to backpropagation in an Artificial Neural Network

I have recently been learning about Artificial Neural Networks (ANNs) and have working Python code based on mini-batch training. I followed Michael Nielsen's book Neural Networks and Deep Learning, which explains each algorithm step by step for beginners. It also comes with fully working code for handwritten digit recognition, and that code works for me as well.

However, I am trying to improve the code a bit by passing the whole mini-batch through backpropagation in matrix form for training. I have developed working code for that as well, but it runs very slowly. Is there a way to implement a full-matrix approach to mini-batch learning of the network based on the backpropagation algorithm?
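To make the question concrete, here is a minimal sketch (with made-up layer sizes, not taken from the network below) of the block-diagonal idea my code uses: each input and each copy of the weights sits in its own diagonal block, so a single np.dot covers the whole mini-batch.

import numpy as np 
from scipy.linalg import block_diag 

w = np.random.randn(3, 4)                               # one layer's weights: 3 outputs, 4 inputs 
x1, x2 = np.random.randn(4, 1), np.random.randn(4, 1)   # a mini-batch of two input columns 

big_w = block_diag(w, w)      # shape (6, 8): one copy of the weights per example 
big_x = block_diag(x1, x2)    # shape (8, 2): one example per diagonal block 
z = np.dot(big_w, big_x)      # shape (6, 2): w.x1 and w.x2 each land in their own block 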

import numpy as np 
import pandas as pd 
from scipy.linalg import block_diag 

class Network: 

    def __init__(self, sizes): 
     self.layers = len(sizes) 
     self.sizes = sizes 

     self.biases = [np.random.randn(y, 1) for y in sizes[1:]] 
     self.weights = [np.random.randn(y, x) for y, x in zip(sizes[1:], sizes[:-1])] 

    def feed_forward(self, a): 
     for w, b in zip(self.weights, self.biases): 
      a = sigmoid(np.dot(w,a) + b) 
     return a 

    # Calculate the cost derivative (Gradient of C w.r.t. 'a' - Nabla C(a)) 
    def cost_derivative(self, output_activation, y): 
     return (output_activation - y) 


    def update_mini_batch(self, mini_batch, eta): 

     n = len(mini_batch) 

     # Pack all mini-batch inputs into one block-diagonal matrix... 
     xs = [x for x, y in mini_batch] 
     features = block_diag(*xs) 

     # ...and the corresponding targets into another 
     ys = [y for x, y in mini_batch] 
     responses = block_diag(*ys) 

     # Replicate every layer's weight matrix once per training example 
     ws = [a for a in self.weights for i in xrange(n)] 

     new_list = [] 
     k = 0 
     while (k < len(ws)): 
      new_list.append(ws[k: k + n]) 
      k += n 

     # Arrange the n copies of each layer's weights block-diagonally 
     weights = [block_diag(*elems) for elems in new_list] 

     # The same replication for the biases 
     bs = [b for b in self.biases for i in xrange(n)] 

     new_list2 = [] 
     j = 0 
     while (j < len(bs)): 
      new_list2.append(bs[j : j + n]) 
      j += n 

     biases = [block_diag(*elems) for elems in new_list2] 

     # Expanded shapes, used only to pre-allocate the gradient accumulators below 
     biases_dim_1 = [np.dot(np.ones((n*b.shape[0], b.shape[0])), b) for b in self.biases] 
     biases_dim_2 = [np.dot(b, np.ones((b.shape[1], n*b.shape[1]))) for b in biases_dim_1] 
     weights_dim_1 = [np.dot(np.ones((n*w.shape[0], w.shape[0])), w) for w in self.weights] 
     weights_dim_2 = [np.dot(w, np.ones((w.shape[1], n*w.shape[1]))) for w in weights_dim_1] 

     nabla_b = [np.zeros(b.shape) for b in biases_dim_2] 
     nabla_w = [np.zeros(w.shape) for w in weights_dim_2] 

     delta_b = [np.zeros(b.shape) for b in self.biases] 
     delta_w = [np.zeros(w.shape) for w in self.weights] 

     zs = [] 
     activation = features 
     activations = [features] 

     for w, b in zip(weights, biases): 

      z = np.dot(w, activation) + b 
      zs.append(z) 
      activation = sigmoid(z) 
      activations.append(activation) 

     delta = self.cost_derivative(activations[-1], responses) * sigmoid_prime(zs[-1]) 
     nabla_b[-1] = delta 
     nabla_w[-1] = np.dot(delta, activations[-2].transpose()) 

     for l in xrange(2, self.layers): 
      z = zs[-l]                  # the weighted input for that layer 
      activation_prime = sigmoid_prime(z)            # the derivative of activation for the layer 
      delta = np.dot(weights[-l + 1].transpose(), delta) * activation_prime   # calculate the adjustment term (delta) for that layer 
      nabla_b[-l] = delta                # calculate the bias adjustments - by means of using eq-BP3. 
      nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())     # calculate the weight adjustments - by means of using eq-BP4. 

     delta_b = [self.split_cases(b, n) for b in nabla_b] 
     delta_w = [self.split_cases(w, n) for w in nabla_w] 

     self.weights = [w - (eta/n) * nw for w, nw in zip(self.weights, delta_w)] 
     self.biases = [b - (eta/ n) * nb for b, nb in zip(self.biases, delta_b)] 



    def split_cases(self, mat, mini_batch_size): 
     # Sum the diagonal blocks of a stacked gradient matrix, collapsing the 
     # per-example gradients back to the original parameter shape 
     i = 0 
     j = 0 
     dim1 = mat.shape[0]/mini_batch_size 
     dim2 = mat.shape[1]/mini_batch_size 
     sum_samples = np.zeros((dim1, dim2)) 
     while i < len(mat): 

      sum_samples = sum_samples + mat[i: i + dim1, j : j + dim2] 
      i += dim1 
      j += dim2 

     return sum_samples 

    """Stochastic Gradient Descent for training in epochs""" 
    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data = None): 

     n = len(training_data) 

     if test_data: 
      n_test = len(test_data) 

     for j in xrange(epochs): 
      np.random.shuffle(training_data)                 # for each epochs the mini-batches are selected randomly 
      mini_batches = [training_data[k: k+mini_batch_size] for k in xrange(0, n, mini_batch_size)]  # select equal sizes of mini-batches for the epochs (last mini_batch size might differ however) 

      c = 1 

      for mini_batch in mini_batches: 
       print "Updating mini-batch {0}".format(c) 
       self.update_mini_batch(mini_batch, eta) 
       c += 1 
      if test_data: 
       print "Epoch {0}: {1}/{2}".format(j, self.evaluate(test_data), n_test) 

      else: 
       print "Epoch {0} completed.".format(j) 

    def evaluate(self, test_data): 
     test_results = [(np.argmax(self.feed_forward(x)), y) for (x, y) in test_data] 
     return (sum(int(x == y) for x, y in test_results)) 

    def export_results(self, test_data): 
     results = [(np.argmax(self.feed_forward(x)), y) for (x, y) in test_data] 
     k = pd.DataFrame(results) 
     k.to_csv('net_results.csv') 


# Global functions 

## Activation function (sigmoid) 
@np.vectorize 
def sigmoid(z): 
    return 1.0/(1.0 + np.exp(-z)) 

## Activation derivative (sigmoid_prime) 
@np.vectorize 
def sigmoid_prime(z): 
    return sigmoid(z)*(1 - sigmoid(z)) 

Answer

Here is my code. The time needed to run through 30 epochs drops from 800+ seconds to 200+ seconds on my machine.
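Roughly speaking (this comparison is my own framing, with illustrative sizes): the block-diagonal matrices built in the question grow with the batch size in both dimensions, so most of every np.dot there is spent multiplying blocks of zeros, whereas stacking the examples as columns keeps every weight matrix at its original size.

import numpy as np 
from scipy.linalg import block_diag 

n, n_in, n_out = 10, 784, 30              # hypothetical mini-batch and layer sizes 
w = np.random.randn(n_out, n_in) 

big_w = block_diag(*[w] * n)              # shape (300, 7840), but only n blocks are non-zero 
print(big_w.size / float(w.size))         # -> 100.0, i.e. n**2 times as many entries for n = 10 

# Stacking the inputs as columns of an (n_in, n) matrix and calling the plain 
# np.dot(w, xs) instead computes the same per-example products w.x without any 
# zero padding, which is what the code below does. 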

Since I am new to Python, I use what is readily available. This snippet only needs numpy to run.

Give it a try.

def feedforward2(self, a): 
    # Forward pass that also records each layer's weighted inputs (zs) and activations 
    zs = [] 
    activations = [a] 

    activation = a 
    for b, w in zip(self.biases, self.weights): 
     z = np.dot(w, activation) + b 
     zs.append(z) 
     activation = sigmoid(z) 
     activations.append(activation) 

    return (zs, activations) 

def update_mini_batch2(self, mini_batch, eta): 
    batch_size = len(mini_batch) 

    # transform to (input x batch_size) matrix 
    x = np.asarray([_x.ravel() for _x, _y in mini_batch]).transpose() 
    # transform to (output x batch_size) matrix 
    y = np.asarray([_y.ravel() for _x, _y in mini_batch]).transpose() 

    nabla_b, nabla_w = self.backprop2(x, y) 
    self.weights = [w - (eta/batch_size) * nw for w, nw in zip(self.weights, nabla_w)] 
    self.biases = [b - (eta/batch_size) * nb for b, nb in zip(self.biases, nabla_b)] 

    return 

def backprop2(self, x, y): 
    # x and y hold the whole mini-batch, one training example per column 

    nabla_b = [0 for i in self.biases] 
    nabla_w = [0 for i in self.weights] 

    # feedforward 
    zs, activations = self.feedforward2(x) 

    # backward pass 
    delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1]) 
    nabla_b[-1] = delta.sum(1).reshape([len(delta), 1]) # reshape to (n x 1) matrix 
    nabla_w[-1] = np.dot(delta, activations[-2].transpose()) 

    for l in xrange(2, self.num_layers):   # note: the class in the question calls this attribute self.layers 
     z = zs[-l] 
     sp = sigmoid_prime(z) 
     delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp 
     nabla_b[-l] = delta.sum(1).reshape([len(delta), 1]) # reshape to (n x 1) matrix 
     nabla_w[-l] = np.dot(delta, activations[-l - 1].transpose()) 

    return (nabla_b, nabla_w) 
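To see the shapes involved, here is a small check of my own (it assumes MNIST-style data, i.e. (784, 1) input columns and (10, 1) target columns, and a batch size of 32):

import numpy as np 

mini_batch = [(np.random.randn(784, 1), np.random.randn(10, 1)) for _ in range(32)] 

# The ravel/transpose step in update_mini_batch2 turns the list of column 
# vectors into two dense matrices with one training example per column: 
x = np.asarray([_x.ravel() for _x, _y in mini_batch]).transpose()   # shape (784, 32) 
y = np.asarray([_y.ravel() for _x, _y in mini_batch]).transpose()   # shape (10, 32) 

# Inside feedforward2, np.dot(w, activation) + b then evaluates each layer for 
# all 32 examples at once (the (n, 1) bias broadcasts across the columns), and 
# delta.sum(1) in backprop2 adds up the per-example bias gradients so that the 
# (eta/batch_size) factor in update_mini_batch2 averages them, just like the 
# original one-example-at-a-time loop. 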