diff options
-rw-r--r-- | config/lstm-frigo-irc.py | 6 | ||||
-rw-r--r-- | config/lstm-xreg.py | 48 | ||||
-rw-r--r-- | model/lstm.py | 43 |
3 files changed, 85 insertions, 12 deletions
diff --git a/config/lstm-frigo-irc.py b/config/lstm-frigo-irc.py index 4ae21e2..a0f5b5c 100644 --- a/config/lstm-frigo-irc.py +++ b/config/lstm-frigo-irc.py @@ -12,7 +12,11 @@ num_seqs = 50 seq_len = 5000 seq_div_size = 200 -hidden_dims = [1024, 1024, 1024] +layers = [ + {'dim': 1024}, + {'dim': 1024}, + {'dim': 1024}, +] activation_function = Tanh() i2h_all = True # input to all hidden layers or only first layer diff --git a/config/lstm-xreg.py b/config/lstm-xreg.py new file mode 100644 index 0000000..66f7c51 --- /dev/null +++ b/config/lstm-xreg.py @@ -0,0 +1,48 @@ +from blocks.algorithms import AdaDelta +from blocks.bricks import Tanh + +from model.lstm import Model + +dataset = 'data/logcompil.txt' +io_dim = 256 + +# An epoch will be composed of 'num_seqs' sequences of len 'seq_len' +# divided in chunks of length 'seq_div_size' +num_seqs = 50 +seq_len = 5000 +seq_div_size = 200 + +layers = [ + {'dim': 1024, + 'xreg': (768, 0.1, 10, 10, 6) + }, + {'dim': 1024, + 'xreg': (768, 0.1, 10, 10, 6) + }, + {'dim': 1024, + }, +] +activation_function = Tanh() + +i2h_all = True # input to all hidden layers or only first layer +h2o_all = True # all hidden layers to output or only last layer + +w_noise_std = 0.02 +i_dropout = 0.5 + +l1_reg = 0 + +step_rule = AdaDelta() + +# parameter saving freq (number of batches) +monitor_freq = 100 +save_freq = 100 + +# used for sample generation and IRC mode +sample_temperature = 0.7 #0.5 + +# do we want to generate samples at times during training? 
+sample_len = 1000 +sample_freq = 100 +sample_init = '\nalex\ttu crois?\n' + diff --git a/model/lstm.py b/model/lstm.py index 4d715d5..d928c88 100644 --- a/model/lstm.py +++ b/model/lstm.py @@ -19,30 +19,34 @@ class Model(): inp[:, :, None]).astype(theano.config.floatX) in_onehot.name = 'in_onehot' + costs_xreg = [] + # Construct hidden states - dims = [config.io_dim] + config.hidden_dims + dims = [config.io_dim] hidden = [in_onehot.dimshuffle(1, 0, 2)] bricks = [] states = [] - for i in xrange(1, len(dims)): - init_state = theano.shared(numpy.zeros((config.num_seqs, dims[i])).astype(theano.config.floatX), + for i in xrange(1, len(config.layers)+1): + p = config.layers[i-1] + + init_state = theano.shared(numpy.zeros((config.num_seqs, p['dim'])).astype(theano.config.floatX), name='st0_%d'%i) - init_cell = theano.shared(numpy.zeros((config.num_seqs, dims[i])).astype(theano.config.floatX), + init_cell = theano.shared(numpy.zeros((config.num_seqs, p['dim'])).astype(theano.config.floatX), name='cell0_%d'%i) - linear = Linear(input_dim=dims[i-1], output_dim=4*dims[i], + linear = Linear(input_dim=dims[i-1], output_dim=4*p['dim'], name="lstm_in_%d"%i) bricks.append(linear) inter = linear.apply(hidden[-1]) if config.i2h_all and i > 1: - linear2 = Linear(input_dim=dims[0], output_dim=4*dims[i], + linear2 = Linear(input_dim=dims[0], output_dim=4*p['dim'], name="lstm_in0_%d"%i) bricks.append(linear2) inter = inter + linear2.apply(hidden[0]) inter.name = 'inter_bis_%d'%i - lstm = LSTM(dim=dims[i], activation=config.activation_function, + lstm = LSTM(dim=p['dim'], activation=config.activation_function, name="lstm_rec_%d"%i) bricks.append(lstm) @@ -52,6 +56,17 @@ class Model(): states.append((init_state, new_hidden[-1, :, :])) states.append((init_cell, new_cells[-1, :, :])) + if 'xreg' in p and p['xreg'] is not None: + n, s, w1, w2, w3 = p['xreg'] + cost_x1 = w1 * ((new_hidden.mean(axis=2) - s)**2).mean() + cost_x2 = w2 * ((new_hidden.mean(axis=(0,1)) - s)**2).mean() + cost_x3 = 
-w3 * abs(new_hidden - s).mean() + cost_x1.name = 'cost_x1_%d'%i + cost_x2.name = 'cost_x2_%d'%i + cost_x3.name = 'cost_x3_%d'%i + costs_xreg += [cost_x1, cost_x2, cost_x3] + + dims.append(p['dim']) hidden.append(new_hidden) for i, (u, v) in enumerate(states): @@ -79,13 +94,17 @@ class Model(): print "**** inp", inp.dtype print "**** out", out.dtype print "**** pred", pred.dtype - cost = Softmax().categorical_cross_entropy(inp[:, 1:].flatten(), + cost0 = Softmax().categorical_cross_entropy(inp[:, 1:].flatten(), out[:, :-1, :].reshape((inp.shape[0]*(inp.shape[1]-1), config.io_dim))).mean() + cost0.name = 'cost0' error_rate = tensor.neq(inp[:, 1:].flatten(), pred[:, :-1].flatten()).astype(theano.config.floatX).mean() - print "**** cost", cost.dtype + print "**** cost0", cost0.dtype print "**** error_rate", error_rate.dtype + costs = [cost0] + costs_xreg + cost = sum(costs) + # Initialize all bricks for brick in bricks: brick.weights_init = IsotropicGaussian(0.1) @@ -93,13 +112,14 @@ class Model(): brick.initialize() # Apply noise and dropout - cg = ComputationGraph([cost, error_rate]) + cg = ComputationGraph([cost, error_rate] + costs) if config.w_noise_std > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise_std) if config.i_dropout > 0: cg = apply_dropout(cg, hidden[1:], config.i_dropout) - [cost_reg, error_rate_reg] = cg.outputs + [cost_reg, error_rate_reg] = cg.outputs[:2] + costs_reg = cg.outputs[2:] print "**** cost_reg", cost_reg.dtype print "**** error_rate_reg", error_rate_reg.dtype @@ -119,6 +139,7 @@ class Model(): error_rate.name = 'error_rate' error_rate_reg.name = 'error_rate_reg' self.monitor_vars = [[cost_reg], + costs_reg, [error_rate_reg]] self.out = out |