summaryrefslogtreecommitdiff
path: root/lstm.py
diff options
context:
space:
mode:
Diffstat (limited to 'lstm.py')
-rw-r--r--lstm.py160
1 files changed, 0 insertions, 160 deletions
diff --git a/lstm.py b/lstm.py
deleted file mode 100644
index 1750d58..0000000
--- a/lstm.py
+++ /dev/null
@@ -1,160 +0,0 @@
-import theano
-from theano import tensor
-import numpy
-
-from blocks.algorithms import Momentum, AdaDelta, RMSProp
-from blocks.bricks import Tanh, Softmax, Linear, MLP
-from blocks.bricks.recurrent import LSTM
-from blocks.initialization import IsotropicGaussian, Constant
-
-from blocks.filter import VariableFilter
-from blocks.roles import WEIGHT
-from blocks.graph import ComputationGraph, apply_noise, apply_dropout
-
-# Data layout: every epoch is composed of 'num_seqs' sequences of length
-# 'seq_len', and each sequence is divided in chunks of length 'seq_div_size'.
-num_seqs, seq_len, seq_div_size = 20, 5000, 200
-
-# Width of the one-hot input encoding and of the output layer.
-io_dim = 256
-
-# One LSTM layer is built per entry of 'hidden_dims', using this activation.
-hidden_dims = [1024] * 3
-activation_function = Tanh()
-
-# Connectivity switches:
-#   i2h_all -- feed the input to all hidden layers (otherwise only the first)
-#   h2o_all -- connect all hidden layers to the output (otherwise only the last)
-i2h_all = h2o_all = True
-
-# Regularisation knobs read by Model: weight noise level, dropout rate on the
-# hidden states, and coefficient of the L1 penalty on the hidden states.
-# A value of 0 switches the corresponding term off.
-w_noise_std = 0.02
-i_dropout = 0.5
-l1_reg = 0
-
-# Optimiser selection.  'learning_rate' and 'momentum' are only used when
-# 'step_rule' is 'momentum'.
-step_rule = 'adadelta'
-learning_rate = 0.1
-momentum = 0.9
-
-
-# Compact textual summary of the settings above ('step_rule' is still the
-# rule *name* at this point).
-param_desc = '%s-%sIH,%sHO-n%s-d%s-l1r%s-%dx%d(%d)-%s' % (
-    repr(hidden_dims),
-    'all' if i2h_all else 'first', 'all' if h2o_all else 'last',
-    repr(w_noise_std), repr(i_dropout), repr(l1_reg),
-    num_seqs, seq_len, seq_div_size,
-    step_rule,
-)
-
-# Not read in the visible part of this file -- presumably consumed by the
-# training script (TODO confirm).
-save_freq = 5
-on_irc = True
-
-# Parameters for sample generation (also not read in this file).
-sample_len = 1000
-sample_temperature = 0.7  # previously tried: 0.5
-sample_freq = None
-
-# Replace the step rule *name* chosen in the configuration by the matching
-# Blocks step rule object.  Only the 'momentum' rule receives the configured
-# 'learning_rate' and 'momentum'; the other rules are built with their
-# default arguments.
-if step_rule == 'rmsprop':
-    step_rule = RMSProp()
-elif step_rule == 'adadelta':
-    step_rule = AdaDelta()
-elif step_rule == 'momentum':
-    step_rule = Momentum(learning_rate=learning_rate, momentum=momentum)
-else:
-    # Raise explicitly rather than `assert(False)`: assertions are removed
-    # under `python -O`, which would silently leave `step_rule` as a string,
-    # and a bare assertion gives no hint about which value was rejected.
-    raise ValueError(
-        "unknown step_rule %r (expected 'rmsprop', 'adadelta' or 'momentum')"
-        % (step_rule,))
-
-class Model():
-    """Stacked LSTM that predicts the next symbol of an integer sequence.
-
-    The whole Theano graph is built in the constructor from the module-level
-    configuration (`io_dim`, `hidden_dims`, `num_seqs`, `i2h_all`, `h2o_all`,
-    `w_noise_std`, `i_dropout`, `l1_reg`, `activation_function`).
-
-    Attributes set by the constructor:
-
-    * ``cost`` / ``error_rate`` -- cross-entropy and misclassification rate
-      of the next-symbol prediction, taken from the unmodified graph.
-    * ``cost_reg`` / ``error_rate_reg`` -- the same two quantities taken from
-      the graph after weight noise and dropout were applied (when enabled);
-      ``cost_reg`` additionally includes the L1 penalty when ``l1_reg > 0``.
-    * ``out`` -- summed output of the top linear layers, one score per
-      symbol for every position.
-    * ``pred`` -- ``out.argmax(axis=2)``.
-    * ``states`` -- list of ``(shared_variable, new_value)`` pairs, two per
-      LSTM layer (hidden state, then cells), where ``new_value`` is the
-      value at the last step of the recurrence.  Presumably used by the
-      training code as updates so that state carries over between
-      consecutive chunks (TODO confirm against the caller).
-    """
-
-    def __init__(self):
-        """Build the graph; takes no arguments (reads module-level config)."""
-        # Integer matrix of symbols.  The first axis lines up with the
-        # (num_seqs, dim) initial states below, the second is the position
-        # inside the sequence.
-        inp = tensor.lmatrix('bytes')
-
-        # One-hot encode by comparing every symbol with 0..io_dim-1 along a
-        # new trailing axis.
-        in_onehot = tensor.eq(
-            tensor.arange(io_dim, dtype='int16').reshape((1, 1, io_dim)),
-            inp[:, :, None])
-        in_onehot.name = 'in_onehot'
-
-        # Construct hidden states
-        dims = [io_dim] + hidden_dims
-        # Swap the first two axes for the recurrent bricks: `new_hidden[-1]`
-        # is taken as the final state below, so the leading axis is the one
-        # the recurrence runs over.
-        hidden = [in_onehot.dimshuffle(1, 0, 2)]
-        bricks = []
-        states = []
-        # `range` (not `xrange`) so the module also runs under Python 3; the
-        # iteration is identical under Python 2.
-        for i in range(1, len(dims)):
-            # Shared variables holding the initial hidden state and cells of
-            # layer i, one row per sequence.
-            init_state = theano.shared(
-                numpy.zeros((num_seqs, dims[i])).astype(theano.config.floatX),
-                name='st0_%d' % i)
-            init_cell = theano.shared(
-                numpy.zeros((num_seqs, dims[i])).astype(theano.config.floatX),
-                name='cell0_%d' % i)
-
-            # Projection of the previous layer to the 4*dims[i] wide input
-            # that is fed to the LSTM brick.
-            linear = Linear(input_dim=dims[i-1], output_dim=4*dims[i],
-                            name="lstm_in_%d" % i)
-            bricks.append(linear)
-            inter = linear.apply(hidden[-1])
-
-            # Optional skip connection from the raw input to every layer
-            # above the first (the first already receives the input).
-            if i2h_all and i > 1:
-                linear2 = Linear(input_dim=dims[0], output_dim=4*dims[i],
-                                 name="lstm_in0_%d" % i)
-                bricks.append(linear2)
-                inter = inter + linear2.apply(hidden[0])
-                inter.name = 'inter_bis_%d' % i
-
-            lstm = LSTM(dim=dims[i], activation=activation_function,
-                        name="lstm_rec_%d" % i)
-            bricks.append(lstm)
-
-            new_hidden, new_cells = lstm.apply(inter,
-                                               states=init_state,
-                                               cells=init_cell)
-            # Remember the value at the last step next to the shared
-            # variable it corresponds to.
-            states.append((init_state, new_hidden[-1, :, :]))
-            states.append((init_cell, new_cells[-1, :, :]))
-
-            hidden.append(new_hidden)
-
-        # Back to the axis order of `inp` for the output layers.
-        hidden = [s.dimshuffle(1, 0, 2) for s in hidden]
-
-        # Construct output from hidden states
-        out = None
-        # `zip` returns an iterator on Python 3 and cannot be sliced, so
-        # materialise it first.  Entry 0 (the one-hot input) is dropped.
-        layers = list(zip(dims, hidden))[1:]
-        if not h2o_all:
-            layers = [layers[-1]]
-        for i, (dim, state) in enumerate(layers):
-            top_linear = Linear(input_dim=dim, output_dim=io_dim,
-                                name='top_linear_%d' % i)
-            bricks.append(top_linear)
-            out_i = top_linear.apply(state)
-            # Contributions of the selected layers are summed.
-            out = out_i if out is None else out + out_i
-            out.name = 'out_part_%d' % i
-
-        # Do prediction and calculate cost
-        pred = out.argmax(axis=2)
-
-        # The output at position t is scored against the symbol at t+1, so
-        # the first target and the last output are dropped.
-        cost = Softmax().categorical_cross_entropy(
-            inp[:, 1:].flatten(),
-            out[:, :-1, :].reshape((inp.shape[0]*(inp.shape[1]-1),
-                                    io_dim)))
-        error_rate = tensor.neq(inp[:, 1:].flatten(),
-                                pred[:, :-1].flatten()).mean()
-
-        # Initialize all bricks
-        for brick in bricks:
-            brick.weights_init = IsotropicGaussian(0.1)
-            brick.biases_init = Constant(0.)
-            brick.initialize()
-
-        # Apply noise and dropout
-        cg = ComputationGraph([cost, error_rate])
-        if w_noise_std > 0:
-            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
-            cg = apply_noise(cg, noise_vars, w_noise_std)
-        if i_dropout > 0:
-            cg = apply_dropout(cg, hidden[1:], i_dropout)
-        [cost_reg, error_rate_reg] = cg.outputs
-
-        # add l1 regularization
-        if l1_reg > 0:
-            l1pen = sum(abs(st).mean() for st in hidden[1:])
-            cost_reg = cost_reg + l1_reg * l1pen
-
-        self.cost = cost
-        self.error_rate = error_rate
-        self.cost_reg = cost_reg
-        self.error_rate_reg = error_rate_reg
-        self.out = out
-        self.pred = pred
-
-        self.states = states
-