3 files changed, 85 insertions, 12 deletions
diff --git a/config/lstm-frigo-irc.py b/config/lstm-frigo-irc.py
index 4ae21e2..a0f5b5c 100644
--- a/config/lstm-frigo-irc.py
+++ b/config/lstm-frigo-irc.py
@@ -12,7 +12,11 @@ num_seqs = 50
 seq_len = 5000
 seq_div_size = 200
 
-hidden_dims = [1024, 1024, 1024]
+layers = [
+	{'dim':		1024},
+	{'dim':		1024},
+	{'dim':		1024},
+]
 activation_function = Tanh()
 
 i2h_all = True             # input to all hidden layers or only first layer
diff --git a/config/lstm-xreg.py b/config/lstm-xreg.py
new file mode 100644
index 0000000..66f7c51
--- /dev/null
+++ b/config/lstm-xreg.py
@@ -0,0 +1,48 @@
+from blocks.algorithms import AdaDelta
+from blocks.bricks import Tanh
+
+from model.lstm import Model
+
+dataset = 'data/logcompil.txt'
+io_dim = 256
+
+# An epoch will be composed of 'num_seqs' sequences of len 'seq_len'
+# divided into chunks of length 'seq_div_size'
+num_seqs = 50
+seq_len = 5000
+seq_div_size = 200
+
+layers = [
+	{'dim':		1024,
+	 'xreg': 	(768, 0.1, 10, 10, 6)  # (n, s, w1, w2, w3) as unpacked in model/lstm.py: s is subtracted from the hidden activations, w1..w3 scale the three xreg cost terms
+	},
+	{'dim':		1024,
+	 'xreg': 	(768, 0.1, 10, 10, 6)
+	},
+	{'dim':		1024,
+	},
+]
+activation_function = Tanh()
+
+i2h_all = True             # input to all hidden layers or only first layer
+h2o_all = True             # all hidden layers to output or only last layer
+
+w_noise_std = 0.02
+i_dropout = 0.5
+
+l1_reg = 0
+
+step_rule = AdaDelta()
+
+# parameter saving freq (number of batches)
+monitor_freq = 100
+save_freq = 100
+
+# used for sample generation and IRC mode
+sample_temperature = 0.7 #0.5
+
+# do we want to generate samples at times during training?
+sample_len = 1000
+sample_freq = 100
+sample_init = '\nalex\ttu crois?\n'
+
diff --git a/model/lstm.py b/model/lstm.py
index 4d715d5..d928c88 100644
--- a/model/lstm.py
+++ b/model/lstm.py
@@ -19,30 +19,34 @@ class Model():
                               inp[:, :, None]).astype(theano.config.floatX)
         in_onehot.name = 'in_onehot'
 
+        costs_xreg = []
+
         # Construct hidden states
-        dims = [config.io_dim] + config.hidden_dims
+        dims = [config.io_dim]
         hidden = [in_onehot.dimshuffle(1, 0, 2)]
         bricks = []
         states = []
-        for i in xrange(1, len(dims)):
-            init_state = theano.shared(numpy.zeros((config.num_seqs, dims[i])).astype(theano.config.floatX),
+        for i in xrange(1, len(config.layers)+1):
+            p = config.layers[i-1]
+
+            init_state = theano.shared(numpy.zeros((config.num_seqs, p['dim'])).astype(theano.config.floatX),
                                        name='st0_%d'%i)
-            init_cell = theano.shared(numpy.zeros((config.num_seqs, dims[i])).astype(theano.config.floatX),
+            init_cell = theano.shared(numpy.zeros((config.num_seqs, p['dim'])).astype(theano.config.floatX),
                                        name='cell0_%d'%i)
 
-            linear = Linear(input_dim=dims[i-1], output_dim=4*dims[i],
+            linear = Linear(input_dim=dims[i-1], output_dim=4*p['dim'],
                             name="lstm_in_%d"%i)
             bricks.append(linear)
             inter = linear.apply(hidden[-1])
 
             if config.i2h_all and i > 1:
-                linear2 = Linear(input_dim=dims[0], output_dim=4*dims[i],
+                linear2 = Linear(input_dim=dims[0], output_dim=4*p['dim'],
                                  name="lstm_in0_%d"%i)
                 bricks.append(linear2)
                 inter = inter + linear2.apply(hidden[0])
                 inter.name = 'inter_bis_%d'%i
 
-            lstm = LSTM(dim=dims[i], activation=config.activation_function,
+            lstm = LSTM(dim=p['dim'], activation=config.activation_function,
                         name="lstm_rec_%d"%i)
             bricks.append(lstm)
 
@@ -52,6 +56,17 @@ class Model():
             states.append((init_state, new_hidden[-1, :, :]))
             states.append((init_cell, new_cells[-1, :, :]))
 
+            if 'xreg' in p and p['xreg'] is not None:
+                n, s, w1, w2, w3 = p['xreg']
+                cost_x1 = w1 * ((new_hidden.mean(axis=2) - s)**2).mean()
+                cost_x2 = w2 * ((new_hidden.mean(axis=(0,1)) - s)**2).mean()
+                cost_x3 = -w3 * abs(new_hidden - s).mean()
+                cost_x1.name = 'cost_x1_%d'%i
+                cost_x2.name = 'cost_x2_%d'%i
+                cost_x3.name = 'cost_x3_%d'%i
+                costs_xreg += [cost_x1, cost_x2, cost_x3]
+
+            dims.append(p['dim'])
             hidden.append(new_hidden)
 
         for i, (u, v) in enumerate(states):
@@ -79,13 +94,17 @@ class Model():
         print "****         inp", inp.dtype
         print "****         out", out.dtype
         print "****         pred", pred.dtype
-        cost = Softmax().categorical_cross_entropy(inp[:, 1:].flatten(),
+        cost0 = Softmax().categorical_cross_entropy(inp[:, 1:].flatten(),
                                                    out[:, :-1, :].reshape((inp.shape[0]*(inp.shape[1]-1),
                                                                            config.io_dim))).mean()
+        cost0.name = 'cost0'
         error_rate = tensor.neq(inp[:, 1:].flatten(), pred[:, :-1].flatten()).astype(theano.config.floatX).mean()
-        print "****         cost", cost.dtype
+        print "****         cost0", cost0.dtype
         print "****         error_rate", error_rate.dtype
 
+        costs = [cost0] + costs_xreg
+        cost = sum(costs)
+
         # Initialize all bricks
         for brick in bricks:
             brick.weights_init = IsotropicGaussian(0.1)
@@ -93,13 +112,14 @@ class Model():
             brick.initialize()
 
         # Apply noise and dropout
-        cg = ComputationGraph([cost, error_rate])
+        cg = ComputationGraph([cost, error_rate] + costs)
         if config.w_noise_std > 0:
             noise_vars = VariableFilter(roles=[WEIGHT])(cg)
             cg = apply_noise(cg, noise_vars, config.w_noise_std)
         if config.i_dropout > 0:
             cg = apply_dropout(cg, hidden[1:], config.i_dropout)
-        [cost_reg, error_rate_reg] = cg.outputs
+        [cost_reg, error_rate_reg] = cg.outputs[:2]
+        costs_reg = cg.outputs[2:]
         print "****         cost_reg", cost_reg.dtype
         print "****         error_rate_reg", error_rate_reg.dtype
 
@@ -119,6 +139,7 @@ class Model():
         error_rate.name = 'error_rate'
         error_rate_reg.name = 'error_rate_reg'
         self.monitor_vars = [[cost_reg],
+                             costs_reg,
                              [error_rate_reg]]
 
         self.out = out