Diffstat (limited to 'lstm.py')
-rw-r--r-- | lstm.py | 99
1 file changed, 76 insertions(+), 23 deletions(-)
@@ -1,60 +1,108 @@
 import theano
 from theano import tensor
+import numpy
 
-from blocks.algorithms import Momentum, AdaDelta
+from blocks.algorithms import Momentum, AdaDelta, RMSProp
 from blocks.bricks import Tanh, Softmax, Linear, MLP
 from blocks.bricks.recurrent import LSTM
 from blocks.initialization import IsotropicGaussian, Constant
 from blocks.filter import VariableFilter
 from blocks.roles import WEIGHT
-from blocks.graph import ComputationGraph, apply_noise
+from blocks.graph import ComputationGraph, apply_noise, apply_dropout
 
-chars_per_seq = 100
-seqs_per_epoch = 1
+# An epoch is composed of 'num_seqs' sequences of length 'seq_len',
+# divided into chunks of length 'seq_div_size'
+num_seqs = 10
+seq_len = 2000
+seq_div_size = 100
 
 io_dim = 256
 
-hidden_dims = [200, 500]
+hidden_dims = [512, 512]
 activation_function = Tanh()
 
+all_hidden_for_output = False
+
 w_noise_std = 0.01
+i_dropout = 0.5
+
+step_rule = 'adadelta'
 
-step_rule = AdaDelta()
-pt_freq = 1
+param_desc = '%s-%sHO-n%s-d%s-%dx%d(%d)-%s' % (
+        repr(hidden_dims),
+        'all' if all_hidden_for_output else 'last',
+        repr(w_noise_std),
+        repr(i_dropout),
+        num_seqs, seq_len, seq_div_size,
+        step_rule
+    )
 
-param_desc = ''     # todo
+if step_rule == 'rmsprop':
+    step_rule = RMSProp()
+elif step_rule == 'adadelta':
+    step_rule = AdaDelta()
+else:
+    assert False
 
 class Model():
     def __init__(self):
-        inp = tensor.lvector('bytes')
+        inp = tensor.lmatrix('bytes')
 
-        in_onehot = tensor.eq(tensor.arange(io_dim, dtype='int16').reshape((1, io_dim)),
-                              inp[:, None])
+        in_onehot = tensor.eq(tensor.arange(io_dim, dtype='int16').reshape((1, 1, io_dim)),
+                              inp[:, :, None])
 
         dims = [io_dim] + hidden_dims
-        prev = in_onehot[None, :, :]
+        states = [in_onehot.dimshuffle(1, 0, 2)]
         bricks = []
+        updates = []
        for i in xrange(1, len(dims)):
+            init_state = theano.shared(numpy.zeros((num_seqs, dims[i])).astype(theano.config.floatX),
+                                       name='st0_%d'%i)
+            init_cell = theano.shared(numpy.zeros((num_seqs, dims[i])).astype(theano.config.floatX),
+                                      name='cell0_%d'%i)
+
             linear = Linear(input_dim=dims[i-1], output_dim=4*dims[i],
                             name="lstm_in_%d"%i)
             lstm = LSTM(dim=dims[i], activation=activation_function,
                         name="lstm_rec_%d"%i)
-            prev = lstm.apply(linear.apply(prev))[0]
+
+            new_states, new_cells = lstm.apply(linear.apply(states[-1]),
+                                               states=init_state,
+                                               cells=init_cell)
+            updates.append((init_state, new_states[-1, :, :]))
+            updates.append((init_cell, new_cells[-1, :, :]))
+
+            states.append(new_states)
+
             bricks = bricks + [linear, lstm]
 
-        top_linear = MLP(dims=[hidden_dims[-1], io_dim],
-                         activations=[Softmax()],
-                         name="pred_mlp")
-        bricks.append(top_linear)
+        states = [s.dimshuffle(1, 0, 2).reshape((inp.shape[0] * inp.shape[1], dim))
+                  for dim, s in zip(dims, states)]
 
-        out = top_linear.apply(prev.reshape((inp.shape[0], hidden_dims[-1])))
+        if all_hidden_for_output:
+            top_linear = MLP(dims=[sum(hidden_dims), io_dim],
+                             activations=[Softmax()],
+                             name="pred_mlp")
+            bricks.append(top_linear)
 
-        pred = out.argmax(axis=1)
+            out = top_linear.apply(tensor.concatenate(states[1:], axis=1))
+        else:
+            top_linear = MLP(dims=[hidden_dims[-1], io_dim],
+                             activations=[None],
+                             name="pred_mlp")
+            bricks.append(top_linear)
 
-        cost = Softmax().categorical_cross_entropy(inp[:-1], out[1:])
-        error_rate = tensor.neq(inp[:-1], pred[1:]).mean()
+            out = top_linear.apply(states[-1])
+
+        out = out.reshape((inp.shape[0], inp.shape[1], io_dim))
+
+        pred = out.argmax(axis=2)
+
+        cost = Softmax().categorical_cross_entropy(inp[:, 1:].flatten(),
+                                                   out[:, :-1, :].reshape((inp.shape[0]*(inp.shape[1]-1),
+                                                                           io_dim)))
+        error_rate = tensor.neq(inp[:, 1:].flatten(), pred[:, :-1].flatten()).mean()
 
         # Initialize
         for brick in bricks:
@@ -64,8 +112,11 @@ class Model():
 
         # apply noise
         cg = ComputationGraph([cost, error_rate])
-        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
-        cg = apply_noise(cg, noise_vars, w_noise_std)
+        if w_noise_std > 0:
+            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
+            cg = apply_noise(cg, noise_vars, w_noise_std)
+        if i_dropout > 0:
+            cg = apply_dropout(cg, states[1:], i_dropout)
         [cost_reg, error_rate_reg] = cg.outputs
 
         self.cost = cost
@@ -74,3 +125,5 @@ class Model():
         self.error_rate_reg = error_rate_reg
 
         self.pred = pred
+
+        self.updates = updates
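The main technique in this revision is stateful, truncated training: each 'seq_div_size'-long chunk of a sequence starts from the hidden and cell states left behind by the previous chunk, which is what the 'init_state'/'init_cell' shared variables and the 'updates' pairs implement. Below is a minimal pure-numpy sketch of that hand-off, with a placeholder recurrence standing in for Blocks' LSTM (all names here are illustrative, not from the repository):

import numpy as np

num_seqs, seq_len, seq_div_size, dim = 10, 2000, 100, 512

def rnn_chunk(x_chunk, h0):
    # Stand-in for one recurrent pass over a chunk; returns every step's state.
    h, states = h0, []
    for t in range(x_chunk.shape[1]):
        h = np.tanh(h + x_chunk[:, t, None])    # placeholder recurrence
        states.append(h)
    return np.stack(states, axis=1)             # (num_seqs, chunk_len, dim)

x = np.random.randn(num_seqs, seq_len)
h = np.zeros((num_seqs, dim))                   # analogue of init_state
for start in range(0, seq_len, seq_div_size):
    states = rnn_chunk(x[:, start:start + seq_div_size], h)
    h = states[:, -1, :]                        # analogue of the updates pairs

In the Theano version the same hand-off is presumably realized when the (shared, new_value) pairs in 'self.updates' are compiled into the training function: each call then overwrites 'init_state'/'init_cell' with the last time step of the chunk just processed, so gradients stay truncated at chunk boundaries while the forward state flows through the whole sequence.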
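The cost keeps the old code's next-byte objective but on a 2D (sequence, position) batch: the output at position t is scored against the input byte at position t+1, hence targets 'inp[:, 1:]' versus predictions 'out[:, :-1, :]'. A numpy sketch of the same index bookkeeping under assumed shapes (again, not code from the repository):

import numpy as np

num_seqs, chunk_len, io_dim = 10, 100, 256
inp = np.random.randint(0, io_dim, size=(num_seqs, chunk_len))
out = np.random.randn(num_seqs, chunk_len, io_dim)   # pre-softmax energies

targets = inp[:, 1:].flatten()                       # bytes 1 .. T-1
scores = out[:, :-1, :].reshape((num_seqs * (chunk_len - 1), io_dim))

# Softmax cross-entropy over the flattened pairs, mirroring what
# Softmax().categorical_cross_entropy computes in the graph above.
logp = scores - np.log(np.exp(scores).sum(axis=1, keepdims=True))
cost = -logp[np.arange(targets.size), targets].mean()

pred = out.argmax(axis=2)
error_rate = (pred[:, :-1].flatten() != targets).mean()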