author | Alex Auvolat <alex.auvolat@ens.fr> | 2015-06-09 16:27:06 -0400
---|---|---
committer | Alex Auvolat <alex.auvolat@ens.fr> | 2015-06-09 16:27:06 -0400
commit | c5e1cd9c8c896096ad1630909a655b06eb398abb (patch) |
tree | 836ef331e1e2ec96b6f634e53eb42e7e781319f0 /lstm.py |
parent | 6a5ed2e43a5885eeb3c5e202ed5bb473f6065401 (diff) |
download | text-rnn-c5e1cd9c8c896096ad1630909a655b06eb398abb.tar.gz, text-rnn-c5e1cd9c8c896096ad1630909a655b06eb398abb.zip |
Now learning something
Diffstat (limited to 'lstm.py')
-rw-r--r-- | lstm.py | 99
1 file changed, 76 insertions, 23 deletions
```diff
@@ -1,60 +1,108 @@
 import theano
 from theano import tensor
+import numpy
 
-from blocks.algorithms import Momentum, AdaDelta
+from blocks.algorithms import Momentum, AdaDelta, RMSProp
 from blocks.bricks import Tanh, Softmax, Linear, MLP
 from blocks.bricks.recurrent import LSTM
 from blocks.initialization import IsotropicGaussian, Constant
 
 from blocks.filter import VariableFilter
 from blocks.roles import WEIGHT
-from blocks.graph import ComputationGraph, apply_noise
+from blocks.graph import ComputationGraph, apply_noise, apply_dropout
 
-chars_per_seq = 100
-seqs_per_epoch = 1
+# An epoch will be composed of 'num_seqs' sequences of length 'seq_len',
+# divided into chunks of length 'seq_div_size'
+num_seqs = 10
+seq_len = 2000
+seq_div_size = 100
 
 io_dim = 256
 
-hidden_dims = [200, 500]
+hidden_dims = [512, 512]
 activation_function = Tanh()
 
+all_hidden_for_output = False
+
 w_noise_std = 0.01
+i_dropout = 0.5
+
+step_rule = 'adadelta'
 
-step_rule = AdaDelta()
-pt_freq = 1
+param_desc = '%s-%sHO-n%s-d%s-%dx%d(%d)-%s' % (
+        repr(hidden_dims),
+        'all' if all_hidden_for_output else 'last',
+        repr(w_noise_std),
+        repr(i_dropout),
+        num_seqs, seq_len, seq_div_size,
+        step_rule
+    )
 
-param_desc = ''  # todo
+if step_rule == 'rmsprop':
+    step_rule = RMSProp()
+elif step_rule == 'adadelta':
+    step_rule = AdaDelta()
+else:
+    assert(False)
 
 class Model():
     def __init__(self):
-        inp = tensor.lvector('bytes')
+        inp = tensor.lmatrix('bytes')
 
-        in_onehot = tensor.eq(tensor.arange(io_dim, dtype='int16').reshape((1, io_dim)),
-                              inp[:, None])
+        in_onehot = tensor.eq(tensor.arange(io_dim, dtype='int16').reshape((1, 1, io_dim)),
+                              inp[:, :, None])
 
         dims = [io_dim] + hidden_dims
-        prev = in_onehot[None, :, :]
+        states = [in_onehot.dimshuffle(1, 0, 2)]
         bricks = []
+        updates = []
         for i in xrange(1, len(dims)):
+            init_state = theano.shared(numpy.zeros((num_seqs, dims[i])).astype(theano.config.floatX),
+                                       name='st0_%d' % i)
+            init_cell = theano.shared(numpy.zeros((num_seqs, dims[i])).astype(theano.config.floatX),
+                                      name='cell0_%d' % i)
+
             linear = Linear(input_dim=dims[i-1], output_dim=4*dims[i],
                             name="lstm_in_%d" % i)
             lstm = LSTM(dim=dims[i], activation=activation_function,
                         name="lstm_rec_%d" % i)
-            prev = lstm.apply(linear.apply(prev))[0]
+
+            new_states, new_cells = lstm.apply(linear.apply(states[-1]),
+                                               states=init_state,
+                                               cells=init_cell)
+            updates.append((init_state, new_states[-1, :, :]))
+            updates.append((init_cell, new_cells[-1, :, :]))
+
+            states.append(new_states)
             bricks = bricks + [linear, lstm]
 
-        top_linear = MLP(dims=[hidden_dims[-1], io_dim],
-                         activations=[Softmax()],
-                         name="pred_mlp")
-        bricks.append(top_linear)
+        states = [s.dimshuffle(1, 0, 2).reshape((inp.shape[0] * inp.shape[1], dim))
+                  for dim, s in zip(dims, states)]
 
-        out = top_linear.apply(prev.reshape((inp.shape[0], hidden_dims[-1])))
+        if all_hidden_for_output:
+            top_linear = MLP(dims=[sum(hidden_dims), io_dim],
+                             activations=[Softmax()],
+                             name="pred_mlp")
+            bricks.append(top_linear)
 
-        pred = out.argmax(axis=1)
+            out = top_linear.apply(tensor.concatenate(states[1:], axis=1))
+        else:
+            top_linear = MLP(dims=[hidden_dims[-1], io_dim],
+                             activations=[None],
+                             name="pred_mlp")
+            bricks.append(top_linear)
 
-        cost = Softmax().categorical_cross_entropy(inp[:-1], out[1:])
-        error_rate = tensor.neq(inp[:-1], pred[1:]).mean()
+            out = top_linear.apply(states[-1])
+
+        out = out.reshape((inp.shape[0], inp.shape[1], io_dim))
+
+        pred = out.argmax(axis=2)
+
+        cost = Softmax().categorical_cross_entropy(inp[:, 1:].flatten(),
+                                                   out[:, :-1, :].reshape((inp.shape[0]*(inp.shape[1]-1),
+                                                                           io_dim)))
+        error_rate = tensor.neq(inp[:, 1:].flatten(), pred[:, :-1].flatten()).mean()
 
         # Initialize
         for brick in bricks:
@@ -64,8 +112,11 @@ class Model():
 
         # apply noise
        cg = ComputationGraph([cost, error_rate])
-        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
-        cg = apply_noise(cg, noise_vars, w_noise_std)
+        if w_noise_std > 0:
+            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
+            cg = apply_noise(cg, noise_vars, w_noise_std)
+        if i_dropout > 0:
+            cg = apply_dropout(cg, states[1:], i_dropout)
         [cost_reg, error_rate_reg] = cg.outputs
 
         self.cost = cost
@@ -74,3 +125,5 @@ class Model():
         self.error_rate_reg = error_rate_reg
 
         self.pred = pred
+
+        self.updates = updates
```
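The central change in this commit is that the LSTM state is no longer reset between batches: each layer's final hidden state and cell are written back into the `st0_%d` / `cell0_%d` shared variables through the `updates` list, so a `seq_len`-byte sequence can be processed `seq_div_size` bytes at a time with the recurrent state carried across chunks (truncated backpropagation through time). Below is a minimal numpy sketch of that carry-over mechanism, not the repository's code: `lstm_step` is a hypothetical stand-in for the Linear + LSTM bricks, and the shapes are scaled down so the loop runs instantly.

```python
import numpy as np

# Scaled-down stand-ins for the hyperparameters above (hypothetical values).
num_seqs = 2       # sequences processed in parallel
seq_len = 8        # total length of each sequence in an epoch
seq_div_size = 4   # length of each chunk fed to the network
hidden_dim = 3

# One epoch of byte data: 'num_seqs' parallel sequences of 'seq_len' bytes.
data = np.random.randint(0, 256, size=(num_seqs, seq_len))

# Persistent state, playing the role of the 'st0_%d' shared variable:
# it is carried from one chunk to the next instead of being reset to zero.
state = np.zeros((num_seqs, hidden_dim))

def lstm_step(state, x):
    # Hypothetical stand-in for the Linear + LSTM bricks: any recurrence
    # of the form state' = f(state, input) illustrates the point.
    return np.tanh(state + 0.01 * x[:, None])

for start in range(0, seq_len, seq_div_size):
    chunk = data[:, start:start + seq_div_size]  # (num_seqs, seq_div_size)
    for t in range(chunk.shape[1]):
        state = lstm_step(state, chunk[:, t])
    # This assignment is what the (init_state, new_states[-1, :, :]) pairs
    # in 'updates' express in Theano: the shared variable is overwritten
    # with the last time step's state, so the next chunk resumes here.
    print("chunk %d:%d done; state carried over" % (start, start + seq_div_size))
```

For this to take effect in the Blocks version, the update pairs have to be applied once per chunk by whatever Theano function the training loop compiles; the `self.updates = updates` line added at the end of `__init__` exposes them to the training script for exactly that purpose.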