From c5e1cd9c8c896096ad1630909a655b06eb398abb Mon Sep 17 00:00:00 2001
From: Alex Auvolat
Date: Tue, 9 Jun 2015 16:27:06 -0400
Subject: Now learning something

---
 datastream.py | 42 ++++++++++++++++++++++---
 lstm.py       | 99 +++++++++++++++++++++++++++++++++++++++++++++--------------
 train.py      | 44 +++++++++++++++++++++-----
 3 files changed, 151 insertions(+), 34 deletions(-)

diff --git a/datastream.py b/datastream.py
index 5d9441f..8025945 100644
--- a/datastream.py
+++ b/datastream.py
@@ -61,16 +61,50 @@ class BytesToIndices(Transformer):
         data = next(self.child_epoch_iterator)
         return numpy.array([ord(i) for i in data[0]], dtype='int16'),
 
-def setup_datastream(filename, seq_len, num_seqs_per_epoch=100):
+class ParallelSequences(Transformer):
+    def __init__(self, stream, num_seqs, seq_div_size, **kwargs):
+        self.sources = ('bytes',)
+
+        self.num_seqs = num_seqs
+        self.div_size = seq_div_size
+
+        self.tmp = None
+        self.i = 0
+
+        super(ParallelSequences, self).__init__(stream, **kwargs)
+
+    def get_data(self, request=None):
+        if request is not None:
+            raise ValueError('Unsupported: request')
+
+        if self.tmp is None or self.i >= self.tmp.shape[1]:
+            self.tmp = numpy.concatenate([next(self.child_epoch_iterator)[0][None, :]
+                                          for _ in xrange(self.num_seqs)],
+                                         axis=0)
+            self.i = 0
+
+        ret = self.tmp[:, self.i:self.i + self.div_size]
+        self.i += self.div_size
+
+        return ret,
+
+
+
+def setup_datastream(filename, num_seqs, seq_len, seq_div_size):
     ds = BinaryFileDataset(filename)
-    it = RandomBlockIterator(ds.num_examples(), seq_len, num_seqs_per_epoch)
+    it = RandomBlockIterator(ds.num_examples(), seq_len, num_seqs)
     stream = DataStream(ds, iteration_scheme=it)
     stream = BytesToIndices(stream)
+    stream = ParallelSequences(stream, num_seqs, seq_div_size)
 
     return stream
 
 if __name__ == "__main__":
     # Test
-    stream = setup_datastream("data/logcompil.txt", 100)
-    print(next(stream.get_epoch_iterator()))
+    stream = setup_datastream("data/logcompil.txt", 2, 60, 20)
+    it = stream.get_epoch_iterator()
+    for d, in stream.get_epoch_iterator():
+        print '--'
+        for u in range(d.shape[0]):
+            print ''.join(chr(i) for i in d[u])

diff --git a/lstm.py b/lstm.py
index 72b67f1..32cdb9b 100644
--- a/lstm.py
+++ b/lstm.py
@@ -1,60 +1,108 @@
 import theano
 from theano import tensor
+import numpy
 
-from blocks.algorithms import Momentum, AdaDelta
+from blocks.algorithms import Momentum, AdaDelta, RMSProp
 from blocks.bricks import Tanh, Softmax, Linear, MLP
 from blocks.bricks.recurrent import LSTM
 from blocks.initialization import IsotropicGaussian, Constant
 
 from blocks.filter import VariableFilter
 from blocks.roles import WEIGHT
-from blocks.graph import ComputationGraph, apply_noise
+from blocks.graph import ComputationGraph, apply_noise, apply_dropout
 
-chars_per_seq = 100
-seqs_per_epoch = 1
+# An epoch will be composed of 'num_seqs' sequences of len 'seq_len'
+# divided in chunks of length 'seq_div_size'
+num_seqs = 10
+seq_len = 2000
+seq_div_size = 100
 
 io_dim = 256
 
-hidden_dims = [200, 500]
+hidden_dims = [512, 512]
 activation_function = Tanh()
 
+all_hidden_for_output = False
+
 w_noise_std = 0.01
+i_dropout = 0.5
+
+step_rule = 'adadelta'
 
-step_rule = AdaDelta()
-pt_freq = 1
+param_desc = '%s-%sHO-n%s-d%s-%dx%d(%d)-%s' % (
+        repr(hidden_dims),
+        'all' if all_hidden_for_output else 'last',
+        repr(w_noise_std),
+        repr(i_dropout),
+        num_seqs, seq_len, seq_div_size,
+        step_rule
+        )
 
-param_desc = ''     # todo
+if step_rule == 'rmsprop':
+    step_rule = RMSProp()
+elif step_rule == 'adadelta':
+    step_rule = AdaDelta()
+else:
+    assert(False)
 
 
 class Model():
     def __init__(self):
-        inp = tensor.lvector('bytes')
+        inp = tensor.lmatrix('bytes')
 
-        in_onehot = tensor.eq(tensor.arange(io_dim, dtype='int16').reshape((1, io_dim)),
-                              inp[:, None])
+        in_onehot = tensor.eq(tensor.arange(io_dim, dtype='int16').reshape((1, 1, io_dim)),
+                              inp[:, :, None])
 
         dims = [io_dim] + hidden_dims
-        prev = in_onehot[None, :, :]
+        states = [in_onehot.dimshuffle(1, 0, 2)]
         bricks = []
+        updates = []
         for i in xrange(1, len(dims)):
+            init_state = theano.shared(numpy.zeros((num_seqs, dims[i])).astype(theano.config.floatX),
+                                       name='st0_%d'%i)
+            init_cell = theano.shared(numpy.zeros((num_seqs, dims[i])).astype(theano.config.floatX),
+                                      name='cell0_%d'%i)
+
             linear = Linear(input_dim=dims[i-1], output_dim=4*dims[i],
                             name="lstm_in_%d"%i)
             lstm = LSTM(dim=dims[i], activation=activation_function,
                         name="lstm_rec_%d"%i)
-            prev = lstm.apply(linear.apply(prev))[0]
+
+            new_states, new_cells = lstm.apply(linear.apply(states[-1]),
+                                               states=init_state,
+                                               cells=init_cell)
+            updates.append((init_state, new_states[-1, :, :]))
+            updates.append((init_cell, new_cells[-1, :, :]))
+
+            states.append(new_states)
+
             bricks = bricks + [linear, lstm]
 
-        top_linear = MLP(dims=[hidden_dims[-1], io_dim],
-                         activations=[Softmax()],
-                         name="pred_mlp")
-        bricks.append(top_linear)
+        states = [s.dimshuffle(1, 0, 2).reshape((inp.shape[0] * inp.shape[1], dim))
+                  for dim, s in zip(dims, states)]
 
-        out = top_linear.apply(prev.reshape((inp.shape[0], hidden_dims[-1])))
+        if all_hidden_for_output:
+            top_linear = MLP(dims=[sum(hidden_dims), io_dim],
+                             activations=[Softmax()],
+                             name="pred_mlp")
+            bricks.append(top_linear)
 
-        pred = out.argmax(axis=1)
+            out = top_linear.apply(tensor.concatenate(states[1:], axis=1))
+        else:
+            top_linear = MLP(dims=[hidden_dims[-1], io_dim],
+                             activations=[None],
+                             name="pred_mlp")
+            bricks.append(top_linear)
 
-        cost = Softmax().categorical_cross_entropy(inp[:-1], out[1:])
-        error_rate = tensor.neq(inp[:-1], pred[1:]).mean()
+            out = top_linear.apply(states[-1])
+
+        out = out.reshape((inp.shape[0], inp.shape[1], io_dim))
+
+        pred = out.argmax(axis=2)
+
+        cost = Softmax().categorical_cross_entropy(inp[:, 1:].flatten(),
+                                                   out[:, :-1, :].reshape((inp.shape[0]*(inp.shape[1]-1),
+                                                                           io_dim)))
+        error_rate = tensor.neq(inp[:, 1:].flatten(), pred[:, :-1].flatten()).mean()
 
         # Initialize
         for brick in bricks:
@@ -64,8 +112,11 @@ class Model():
 
         # apply noise
         cg = ComputationGraph([cost, error_rate])
-        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
-        cg = apply_noise(cg, noise_vars, w_noise_std)
+        if w_noise_std > 0:
+            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
+            cg = apply_noise(cg, noise_vars, w_noise_std)
+        if i_dropout > 0:
+            cg = apply_dropout(cg, states[1:], i_dropout)
         [cost_reg, error_rate_reg] = cg.outputs
 
         self.cost = cost
@@ -74,3 +125,5 @@ class Model():
         self.error_rate_reg = error_rate_reg
 
         self.pred = pred
+        self.updates = updates
+

diff --git a/train.py b/train.py
index ab973a1..7857f3f 100755
--- a/train.py
+++ b/train.py
@@ -5,16 +5,18 @@ import numpy
 import sys
 import importlib
 
+import theano
+from theano import tensor
+
 from blocks.dump import load_parameter_values
 from blocks.dump import MainLoopDumpManager
-from blocks.extensions import Printing
+from blocks.extensions import Printing, SimpleExtension
 from blocks.extensions.monitoring import DataStreamMonitoring, TrainingDataMonitoring
 from blocks.extensions.plot import Plot
 from blocks.graph import ComputationGraph
 from blocks.main_loop import MainLoop
 from blocks.model import Model
 from blocks.algorithms import GradientDescent
-from theano import tensor
 
 import datastream
 # from apply_model import Apply
 
@@ -30,6 +32,29 @@ if __name__ == "__main__":
 
     config = importlib.import_module('%s' % model_name)
 
+class GenText(SimpleExtension):
+    def __init__(self, model, init_text, max_bytes, **kwargs):
+        self.init_text = init_text
+        self.max_bytes = max_bytes
+
+        cg = ComputationGraph([model.pred])
+        assert(len(cg.inputs) == 1)
+        assert(cg.inputs[0].name == 'bytes')
+        self.f = theano.function(inputs=cg.inputs, outputs=[model.pred])
+
+        super(GenText, self).__init__(**kwargs)
+
+    def do(self, which_callback, *args):
+        v = numpy.array([ord(i) for i in self.init_text],
+                        dtype='int16')[None, :].repeat(axis=0, repeats=config.num_seqs)
+
+        while v.shape[1] < self.max_bytes:
+            pred, = self.f(v)
+            v = numpy.concatenate([v, pred[:, -1:]], axis=1)
+
+        for i in range(v.shape[0]):
+            print "Sample:", ''.join([chr(int(v[i, j])) for j in range(v.shape[1])])
+
 def train_model(m, train_stream, load_location=None, save_location=None):
 
     # Define the model
@@ -44,6 +69,9 @@ def train_model(m, train_stream, load_location=None, save_location=None):
     algorithm = GradientDescent(cost=m.cost_reg,
                                 step_rule=config.step_rule,
                                 params=cg.parameters)
+
+    algorithm.add_updates(m.updates)
+
     main_loop = MainLoop(
         model=model,
         data_stream=train_stream,
@@ -51,12 +79,13 @@ def train_model(m, train_stream, load_location=None, save_location=None):
         extensions=[
             TrainingDataMonitoring(
                 [m.cost_reg, m.error_rate_reg, m.cost, m.error_rate],
-                prefix='train', every_n_epochs=1*config.pt_freq),
-            Printing(every_n_epochs=1*config.pt_freq, after_epoch=False),
+                prefix='train', every_n_epochs=1),
+            Printing(every_n_epochs=1, after_epoch=False),
             Plot(document='tr_'+model_name+'_'+config.param_desc,
                  channels=[['train_cost', 'train_cost_reg'],
                            ['train_error_rate', 'train_error_rate_reg']],
-                 every_n_epochs=1*config.pt_freq, after_epoch=False)
+                 every_n_epochs=1, after_epoch=False),
+            GenText(m, '\t', 20, every_n_epochs=1, after_epoch=False)
         ]
     )
     main_loop.run()
@@ -72,8 +101,9 @@ def train_model(m, train_stream, load_location=None, save_location=None):
 if __name__ == "__main__":
     # Build datastream
     train_stream = datastream.setup_datastream('data/logcompil.txt',
-                                               config.num_seqs,
-                                               config.seq_len,
-                                               config.seq_div_size)
+                                               config.num_seqs,
+                                               config.seq_len,
+                                               config.seq_div_size)
 
     # Build model
     m = config.Model()
-- 
cgit v1.2.3
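
Editor's note (not part of the patch): the sketch below illustrates, in plain NumPy with made-up toy values, the chunking that the new ParallelSequences transformer performs in get_data(): it stacks 'num_seqs' byte sequences into a (num_seqs, seq_len) block and then emits consecutive (num_seqs, seq_div_size) slices.

# Standalone sketch of ParallelSequences-style chunking, without Fuel.
# Values and names here are hypothetical; only the slicing mirrors the patch.
import numpy

num_seqs, seq_len, seq_div_size = 2, 12, 4

# Two byte-index sequences, as BytesToIndices would yield them.
seqs = [numpy.arange(k * 100, k * 100 + seq_len, dtype='int16')
        for k in range(num_seqs)]

# Stack into a (num_seqs, seq_len) block, then print consecutive
# (num_seqs, seq_div_size) chunks, as get_data() returns them.
block = numpy.concatenate([s[None, :] for s in seqs], axis=0)
for i in range(0, seq_len, seq_div_size):
    print(block[:, i:i + seq_div_size])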