author     Alex Auvolat <alex@adnab.me>   2016-03-29 12:27:35 +0200
committer  Alex Auvolat <alex@adnab.me>   2016-03-29 12:27:35 +0200
commit     30faf44a08edcc2075362c4633f6b1d291944cd3 (patch)
tree       bb4a53766d87fc3437999631eb08158a4a26e812
parent     62c05c06013e7204c1e7681a7e2ac7541f2acbcb (diff)
download   text-rnn-30faf44a08edcc2075362c4633f6b1d291944cd3.tar.gz
           text-rnn-30faf44a08edcc2075362c4633f6b1d291944cd3.zip
This HPC stuff doesn't work very well.
-rw-r--r--  config/hpc-gru-0.py   |  52
-rw-r--r--  config/hpc-gru-1.py   |  52
-rw-r--r--  config/hpc-lstm-0.py  |  43
-rw-r--r--  config/hpc-lstm-2.py  |  46
-rw-r--r--  config/hpc-lstm-3.py  |  53
-rw-r--r--  model/hpc_gru.py      | 134
-rw-r--r--  model/hpc_lstm.py     |  16
-rwxr-xr-x  train.py              |   2
8 files changed, 394 insertions(+), 4 deletions(-)
diff --git a/config/hpc-gru-0.py b/config/hpc-gru-0.py
new file mode 100644
index 0000000..ab58a86
--- /dev/null
+++ b/config/hpc-gru-0.py
@@ -0,0 +1,52 @@
+import numpy
+from numpy.random import RandomState
+
+from blocks.algorithms import AdaDelta, Momentum, RMSProp, CompositeRule, BasicMomentum
+from blocks.bricks import Tanh, Rectifier
+from blocks.initialization import IsotropicGaussian, Constant
+
+from model.hpc_gru import Model
+
+dataset = 'data/logcompil-2016-03-07.txt'
+
+io_dim = 256
+repr_dim = 64
+embedding_matrix = (RandomState(42).binomial(1, 10./repr_dim, ((io_dim, repr_dim)))
+                    -RandomState(123).binomial(1, 10./repr_dim, ((io_dim, repr_dim))))
+
+# An epoch will be composed of 'num_seqs' sequences of length 'seq_len'
+# divided into chunks of length 'seq_div_size'
+num_seqs = 100
+seq_len = 2000
+seq_div_size = 100
+
+hidden_dims = [128, 384, 1024]
+cost_factors = [1., 1., 1.]
+hidden_q = [0.5, 0.5, 0.5]
+activation_function = Tanh()
+
+out_hidden = [512]
+out_hidden_act = [Tanh]
+
+weight_noise = 0
+
+step_rule = AdaDelta()
+#step_rule = CompositeRule([RMSProp(learning_rate=0.01),
+#                           BasicMomentum(momentum=0.9)])
+#step_rule = Momentum(learning_rate=.1, momentum=0.9)
+
+weights_init = IsotropicGaussian(0.1)
+biases_init = Constant(0.)
+
+# parameter saving freq (number of batches)
+monitor_freq = 100
+save_freq = 100
+
+# used for sample generation and IRC mode
+sample_temperature = 0.5 #0.7
+
+# do we want to generate samples at times during training?
+sample_len = 1000
+sample_freq = 100
+sample_init = '\nalex\ttu crois?\n'
diff --git a/config/hpc-gru-1.py b/config/hpc-gru-1.py
new file mode 100644
index 0000000..b59b025
--- /dev/null
+++ b/config/hpc-gru-1.py
@@ -0,0 +1,52 @@
+import numpy
+from numpy.random import RandomState
+
+from blocks.algorithms import AdaDelta, Momentum, RMSProp, CompositeRule, BasicMomentum, Adam
+from blocks.bricks import Tanh, Rectifier
+from blocks.initialization import IsotropicGaussian, Constant
+
+from model.hpc_gru import Model
+
+dataset = 'data/logcompil-2016-03-07.txt'
+
+io_dim = 256
+repr_dim = 128
+embedding_matrix = (RandomState(42).binomial(1, 0.1, ((io_dim, repr_dim)))
+                    -RandomState(123).binomial(1, 0.1, ((io_dim, repr_dim))))
+
+# An epoch will be composed of 'num_seqs' sequences of length 'seq_len'
+# divided into chunks of length 'seq_div_size'
+num_seqs = 20
+seq_len = 5000
+seq_div_size = 50
+
+hidden_dims = [128, 192, 256, 512]
+cost_factors = [1., 1., 1., 1.]
+hidden_q = [0.5, 0.5, 0.5, 0.5]
+activation_function = Tanh()
+
+out_hidden = [512]
+out_hidden_act = [Rectifier]
+
+weight_noise = 0.05
+
+step_rule = Adam()
+#step_rule = CompositeRule([RMSProp(learning_rate=0.01),
+#                           BasicMomentum(momentum=0.9)])
+#step_rule = Momentum(learning_rate=.1, momentum=0.9)
+
+weights_init = IsotropicGaussian(0.1)
+biases_init = Constant(0.01)
+
+# parameter saving freq (number of batches)
+monitor_freq = 500
+save_freq = monitor_freq
+
+# used for sample generation and IRC mode
+sample_temperature = 0.5 #0.7
+
+# do we want to generate samples at times during training?
+sample_len = 1000
+sample_freq = monitor_freq
+sample_init = '\nalex\ttu crois?\n'
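The two GRU configs above replace the one-hot byte embedding with a fixed random ternary code: two independent Bernoulli draws are subtracted, giving entries in {-1, 0, 1}. A standalone sketch of the construction and its expected density (not code from this commit; values taken from hpc-gru-0.py):

    import numpy
    from numpy.random import RandomState

    io_dim, repr_dim = 256, 64
    p = 10. / repr_dim   # ~10 expected nonzero draws per row in each matrix
    embedding = (RandomState(42).binomial(1, p, (io_dim, repr_dim))
                 - RandomState(123).binomial(1, p, (io_dim, repr_dim)))

    assert set(numpy.unique(embedding)) <= {-1, 0, 1}
    # An entry is nonzero iff exactly one of the two draws fired,
    # which happens with probability 2*p*(1-p).
    print(numpy.abs(embedding).mean())   # ~0.26 for p = 10/64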
diff --git a/config/hpc-lstm-0.py b/config/hpc-lstm-0.py
new file mode 100644
index 0000000..afb6471
--- /dev/null
+++ b/config/hpc-lstm-0.py
@@ -0,0 +1,43 @@
+import numpy
+from numpy.random import RandomState
+
+from blocks.algorithms import AdaDelta, Momentum
+from blocks.bricks import Tanh, Rectifier
+
+from model.hpc_lstm import Model
+
+dataset = 'data/logcompil-2016-03-07.txt'
+
+io_dim = 256
+repr_dim = 256
+embedding_matrix = numpy.eye(io_dim)
+
+# An epoch will be composed of 'num_seqs' sequences of length 'seq_len'
+# divided into chunks of length 'seq_div_size'
+num_seqs = 100
+seq_len = 2000
+seq_div_size = 100
+
+hidden_dims = [128, 128, 256, 512]
+cost_factors = [1., 1., 1., 1.]
+hidden_q = [0.1, 0.15, 0.22, 0.33]
+activation_function = Tanh()
+
+out_hidden = [512]
+out_hidden_act = [Rectifier]
+
+step_rule = AdaDelta()
+#step_rule = Momentum(learning_rate=0.0001, momentum=0.99)
+
+# parameter saving freq (number of batches)
+monitor_freq = 10
+save_freq = 100
+
+# used for sample generation and IRC mode
+sample_temperature = 0.7 #0.5
+
+# do we want to generate samples at times during training?
+sample_len = 1000
+sample_freq = 100
+sample_init = '\nalex\ttu crois?\n'
diff --git a/config/hpc-lstm-2.py b/config/hpc-lstm-2.py
new file mode 100644
index 0000000..aaed80e
--- /dev/null
+++ b/config/hpc-lstm-2.py
@@ -0,0 +1,46 @@
+import numpy
+from numpy.random import RandomState
+
+from blocks.algorithms import AdaDelta, Momentum, RMSProp, CompositeRule, BasicMomentum
+from blocks.bricks import Tanh, Rectifier
+
+from model.hpc_lstm import Model
+
+dataset = 'data/logcompil-2016-03-07.txt'
+
+io_dim = 256
+repr_dim = 64
+embedding_matrix = (RandomState(42).binomial(1, 10./repr_dim, ((io_dim, repr_dim)))
+                    -RandomState(123).binomial(1, 10./repr_dim, ((io_dim, repr_dim))))
+
+# An epoch will be composed of 'num_seqs' sequences of length 'seq_len'
+# divided into chunks of length 'seq_div_size'
+num_seqs = 100
+seq_len = 2000
+seq_div_size = 100
+
+hidden_dims = [64, 256, 1024]
+cost_factors = [1., 1., 1.]
+hidden_q = [0.5, 0.5, 0.5]
+activation_function = Tanh()
+
+out_hidden = [512]
+out_hidden_act = [Rectifier]
+
+step_rule = AdaDelta()
+#step_rule = CompositeRule([RMSProp(learning_rate=0.01),
+#                           BasicMomentum(momentum=0.9)])
+#step_rule = Momentum(learning_rate=.1, momentum=0.9)
+
+# parameter saving freq (number of batches)
+monitor_freq = 100
+save_freq = 100
+
+# used for sample generation and IRC mode
+sample_temperature = 0.7 #0.5
+
+# do we want to generate samples at times during training?
+sample_len = 1000
+sample_freq = 100
+sample_init = '\nalex\ttu crois?\n'
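The LSTM configs keep the epoch scheme described in their comments: 'num_seqs' parallel sequences of 'seq_len' bytes, fed to the network in 'seq_div_size' chunks so backpropagation is truncated while hidden state carries across chunks. A minimal sketch of that batching, assuming the dataset is one long byte array (the repo's actual iterator may differ):

    import numpy

    def iterate_epoch(data, num_seqs=100, seq_len=2000, seq_div_size=100,
                      rng=numpy.random.RandomState(0)):
        # num_seqs random windows of seq_len bytes from the corpus
        starts = rng.randint(0, len(data) - seq_len, size=num_seqs)
        seqs = numpy.stack([data[s:s + seq_len] for s in starts])
        # consecutive seq_div_size chunks -> truncated-BPTT batches
        for i in range(0, seq_len, seq_div_size):
            yield seqs[:, i:i + seq_div_size]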
diff --git a/config/hpc-lstm-3.py b/config/hpc-lstm-3.py
new file mode 100644
index 0000000..fa0f77e
--- /dev/null
+++ b/config/hpc-lstm-3.py
@@ -0,0 +1,53 @@
+
+import numpy
+from numpy.random import RandomState
+
+from blocks.algorithms import AdaDelta, Momentum, RMSProp, CompositeRule, BasicMomentum, Adam
+from blocks.bricks import Tanh, Rectifier
+from blocks.initialization import IsotropicGaussian, Constant
+
+from model.hpc_lstm import Model
+
+dataset = 'data/logcompil-2016-03-07.txt'
+
+io_dim = 256
+repr_dim = 128
+embedding_matrix = (RandomState(42).binomial(1, 0.1, ((io_dim, repr_dim)))
+                    -RandomState(123).binomial(1, 0.1, ((io_dim, repr_dim))))
+
+# An epoch will be composed of 'num_seqs' sequences of length 'seq_len'
+# divided into chunks of length 'seq_div_size'
+num_seqs = 20
+seq_len = 5000
+seq_div_size = 50
+
+hidden_dims = [128, 192, 256, 512]
+cost_factors = [1., 1., 1., 1.]
+hidden_q = [0.5, 0.5, 0.5, 0.5]
+activation_function = Tanh()
+
+out_hidden = [512]
+out_hidden_act = [Rectifier]
+
+weight_noise = 0.05
+
+step_rule = Adam()
+#step_rule = CompositeRule([RMSProp(learning_rate=0.01),
+#                           BasicMomentum(momentum=0.9)])
+#step_rule = Momentum(learning_rate=.1, momentum=0.9)
+
+weights_init = IsotropicGaussian(0.1)
+biases_init = Constant(0.01)
+
+# parameter saving freq (number of batches)
+monitor_freq = 500
+save_freq = monitor_freq
+
+# used for sample generation and IRC mode
+sample_temperature = 0.5 #0.7
+
+# do we want to generate samples at times during training?
+sample_len = 1000
+sample_freq = monitor_freq
+sample_init = '\nalex\ttu crois?\n'
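All five configs expose 'sample_temperature' for the sampling and IRC extensions. The standard recipe this parameter implies (a generic sketch, not the repo's sampler): divide the output logits by the temperature before the softmax, so T < 1 sharpens the byte distribution and T > 1 flattens it.

    import numpy

    def sample_byte(logits, temperature=0.5,
                    rng=numpy.random.RandomState(0)):
        z = logits / temperature      # T < 1 sharpens, T > 1 flattens
        z = z - z.max()               # numerical stability
        p = numpy.exp(z)
        p /= p.sum()
        return rng.choice(len(p), p=p)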
diff --git a/model/hpc_gru.py b/model/hpc_gru.py
new file mode 100644
index 0000000..bd01633
--- /dev/null
+++ b/model/hpc_gru.py
@@ -0,0 +1,134 @@
+# HPC-GRU : Hierarchical Predictive Coding GRU
+
+import theano
+from theano import tensor
+import numpy
+
+from blocks.bricks import Softmax, Tanh, Logistic, Linear, MLP, Identity
+from blocks.bricks.recurrent import GatedRecurrent
+
+from blocks.filter import VariableFilter
+from blocks.roles import WEIGHT
+from blocks.graph import ComputationGraph, apply_noise
+
+
+class Model():
+    def __init__(self, config):
+        inp = tensor.imatrix('bytes')
+
+        embed = theano.shared(config.embedding_matrix.astype(theano.config.floatX),
+                              name='embedding_matrix')
+        in_repr = embed[inp.flatten(), :].reshape((inp.shape[0], inp.shape[1], config.repr_dim))
+        in_repr.name = 'in_repr'
+
+        bricks = []
+        states = []
+
+        # Construct predictive GRU hierarchy
+        hidden = []
+        costs = []
+        next_target = in_repr.dimshuffle(1, 0, 2)
+        for i, (hdim, cf, q) in enumerate(zip(config.hidden_dims,
+                                              config.cost_factors,
+                                              config.hidden_q)):
+            init_state = theano.shared(numpy.zeros((config.num_seqs, hdim)).astype(theano.config.floatX),
+                                       name='st0_%d'%i)
+
+            linear = Linear(input_dim=config.repr_dim, output_dim=3*hdim,
+                            name="lstm_in_%d"%i)
+            lstm = GatedRecurrent(dim=hdim, activation=config.activation_function,
+                                  name="lstm_rec_%d"%i)
+            linear2 = Linear(input_dim=hdim, output_dim=config.repr_dim, name='lstm_out_%d'%i)
+            tanh = Tanh('lstm_out_tanh_%d'%i)
+            bricks += [linear, lstm, linear2, tanh]
+            if i > 0:
+                linear1 = Linear(input_dim=config.hidden_dims[i-1], output_dim=3*hdim,
+                                 name='lstm_in2_%d'%i)
+                bricks += [linear1]
+
+            next_target = tensor.cast(next_target, dtype=theano.config.floatX)
+            inter = linear.apply(theano.gradient.disconnected_grad(next_target))
+            if i > 0:
+                inter += linear1.apply(theano.gradient.disconnected_grad(hidden[-1][:-1,:,:]))
+            new_hidden = lstm.apply(inputs=inter[:,:,:hdim],
+                                    gate_inputs=inter[:,:,hdim:],
+                                    states=init_state)
+            states.append((init_state, new_hidden[-1, :, :]))
+
+            hidden += [tensor.concatenate([init_state[None,:,:], new_hidden],axis=0)]
+            pred = tanh.apply(linear2.apply(hidden[-1][:-1,:,:]))
+            costs += [numpy.float32(cf) * (-next_target * pred).sum(axis=2).mean()]
+            costs += [numpy.float32(cf) * q * abs(pred).sum(axis=2).mean()]
+            diff = next_target - pred
+            next_target = tensor.ge(diff, 0.5) - tensor.le(diff, -0.5)
+
+
+        # Construct output from hidden states
+        hidden = [s.dimshuffle(1, 0, 2) for s in hidden]
+
+        out_parts = []
+        out_dims = config.out_hidden + [config.io_dim]
+        for i, (dim, state) in enumerate(zip(config.hidden_dims, hidden)):
+            pred_linear = Linear(input_dim=dim, output_dim=out_dims[0],
+                                 name='pred_linear_%d'%i)
+            bricks.append(pred_linear)
+            lin = theano.gradient.disconnected_grad(state)
+            out_parts.append(pred_linear.apply(lin))
+
+        # Do prediction and calculate cost
+        out = sum(out_parts)
+
+        if len(out_dims) > 1:
+            out = config.out_hidden_act[0](name='out_act0').apply(out)
+            mlp = MLP(dims=out_dims,
+                      activations=[x(name='out_act%d'%i) for i, x in enumerate(config.out_hidden_act[1:])]
+                                  +[Identity()],
+                      name='out_mlp')
+            bricks.append(mlp)
+            out = mlp.apply(out.reshape((inp.shape[0]*(inp.shape[1]+1),-1))
+                           ).reshape((inp.shape[0],inp.shape[1]+1,-1))
+
+        pred = out.argmax(axis=2)
+
+        cost = Softmax().categorical_cross_entropy(inp.flatten(),
+                                                   out[:,:-1,:].reshape((inp.shape[0]*inp.shape[1],
+                                                                         config.io_dim))).mean()
+        error_rate = tensor.neq(inp.flatten(), pred[:,:-1].flatten()).mean()
+
+        sgd_cost = cost + sum(costs)
+
+        # Initialize all bricks
+        for brick in bricks:
+            brick.weights_init = config.weights_init
+            brick.biases_init = config.biases_init
+            brick.initialize()
+
+        # apply noise
+        cg = ComputationGraph([sgd_cost, cost, error_rate]+costs)
+        if config.weight_noise > 0:
+            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
+            cg = apply_noise(cg, noise_vars, config.weight_noise)
+        sgd_cost = cg.outputs[0]
+        cost = cg.outputs[1]
+        error_rate = cg.outputs[2]
+        costs = cg.outputs[3:]
+
+
+        # put stuff into self that is useful for training or extensions
+        self.sgd_cost = sgd_cost
+
+        sgd_cost.name = 'sgd_cost'
+        for i in range(len(costs)):
+            costs[i].name = 'pred_cost_%d'%i
+        cost.name = 'cost'
+        error_rate.name = 'error_rate'
+        self.monitor_vars = [costs, [cost],
+                             [error_rate]]
+
+        self.out = out[:,1:,:]
+        self.pred = pred[:,1:]
+
+        self.states = states
+
+
+# vim: set sts=4 ts=4 sw=4 tw=0 et :
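The core of hpc_gru.py is the ternarized error signal between layers: each level predicts its own input, and only the sign of prediction errors with magnitude at least 0.5 is passed up as the next level's target. The same operation in plain NumPy, for illustration:

    import numpy

    def ternary_residual(target, pred):
        # mirrors: tensor.ge(diff, 0.5) - tensor.le(diff, -0.5)
        diff = target - pred
        return (diff >= 0.5).astype('int8') - (diff <= -0.5).astype('int8')

    # ternary_residual(numpy.array([1., 0., -1.]),
    #                  numpy.array([0.2, 0.1, -0.9]))
    # -> array([1, 0, 0], dtype=int8)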
diff --git a/model/hpc_lstm.py b/model/hpc_lstm.py
index 395646c..d3c33a2 100644
--- a/model/hpc_lstm.py
+++ b/model/hpc_lstm.py
@@ -10,7 +10,7 @@ from blocks.initialization import IsotropicGaussian, Constant
 
 from blocks.filter import VariableFilter
 from blocks.roles import WEIGHT
-from blocks.graph import ComputationGraph, apply_noise, apply_dropout
+from blocks.graph import ComputationGraph, apply_noise
 
 
 class Model():
@@ -103,10 +103,20 @@ class Model():
 
         # Initialize all bricks
         for brick in bricks:
-            brick.weights_init = IsotropicGaussian(0.1)
-            brick.biases_init = Constant(0.)
+            brick.weights_init = config.weights_init
+            brick.biases_init = config.biases_init
             brick.initialize()
 
+        # apply noise
+        cg = ComputationGraph([sgd_cost, cost, error_rate]+costs)
+        if config.weight_noise > 0:
+            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
+            cg = apply_noise(cg, noise_vars, config.weight_noise)
+        sgd_cost = cg.outputs[0]
+        cost = cg.outputs[1]
+        error_rate = cg.outputs[2]
+        costs = cg.outputs[3:]
+
         # put stuff into self that is usefull for training or extensions
         self.sgd_cost = sgd_cost
 
diff --git a/train.py b/train.py
--- a/train.py
+++ b/train.py
@@ -70,11 +70,11 @@ if __name__ == "__main__":
     monitor_vars = list(set(v for p in m.monitor_vars for v in p))
 
     extensions = [
+            ProgressBar(),
             TrainingDataMonitoring(
                 monitor_vars,
                 prefix='train', every_n_batches=config.monitor_freq),
             Printing(every_n_batches=config.monitor_freq, after_epoch=False),
-            ProgressBar(),
 
             ResetStates([v for v, _ in m.states], after_epoch=True)
     ]
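train.py's ResetStates extension receives the shared initial-state variables stored in m.states (the (init_state, last_state) pairs built by both models) and is invoked with after_epoch=True. Its implementation is not shown in this diff; a minimal Blocks extension with the behavior the call site suggests, zeroing the stored states between epochs, could look like this (signature and semantics assumed, not the repo's code):

    import numpy
    from blocks.extensions import SimpleExtension

    class ResetStatesSketch(SimpleExtension):
        def __init__(self, state_vars, **kwargs):
            super(ResetStatesSketch, self).__init__(**kwargs)
            self.state_vars = state_vars  # Theano shared variables

        def do(self, which_callback, *args):
            # zero every stored recurrent state, e.g. at epoch boundaries
            for v in self.state_vars:
                v.set_value(numpy.zeros_like(v.get_value()))

It would be used as ResetStatesSketch([v for v, _ in m.states], after_epoch=True), matching the call in the hunk above.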