summary | refs | log | tree | commit | diff
path: root/lstm.py
diff options
context:
space:
mode:
Diffstat (limited to 'lstm.py')
-rw-r--r-- lstm.py 49
1 files changed, 29 insertions, 20 deletions
diff --git a/lstm.py b/lstm.py
index e294793..dbe46dc 100644
--- a/lstm.py
+++ b/lstm.py
@@ -14,31 +14,34 @@ from blocks.graph import ComputationGraph, apply_noise, apply_dropout
# An epoch will be composed of 'num_seqs' sequences of length 'seq_len',
# divided into chunks of length 'seq_div_size'
num_seqs = 20
-seq_len = 2000
-seq_div_size = 100
+seq_len = 5000
+seq_div_size = 200
io_dim = 256
-hidden_dims = [512, 512, 512]
+hidden_dims = [1024, 1024, 1024]
activation_function = Tanh()
i2h_all = True # input to all hidden layers or only first layer
h2o_all = True # all hidden layers to output or only last layer
-w_noise_std = 0.01
+w_noise_std = 0.02
i_dropout = 0.5
-step_rule = 'momentum'
+l1_reg = 0
+
+step_rule = 'adadelta'
learning_rate = 0.1
momentum = 0.9
-param_desc = '%s-%sIH,%sHO-n%s-d%s-%dx%d(%d)-%s' % (
+param_desc = '%s-%sIH,%sHO-n%s-d%s-l1r%s-%dx%d(%d)-%s' % (
repr(hidden_dims),
'all' if i2h_all else 'first',
'all' if h2o_all else 'last',
repr(w_noise_std),
repr(i_dropout),
+ repr(l1_reg),
num_seqs, seq_len, seq_div_size,
step_rule
)
@@ -46,8 +49,9 @@ param_desc = '%s-%sIH,%sHO-n%s-d%s-%dx%d(%d)-%s' % (
save_freq = 5
# parameters for sample generation
-sample_len = 60
-sample_temperature = 0.3
+sample_len = 1000
+sample_temperature = 0.7 #0.5
+sample_freq = 10
if step_rule == 'rmsprop':
step_rule = RMSProp()
@@ -68,9 +72,9 @@ class Model():
# Construct hidden states
dims = [io_dim] + hidden_dims
- states = [in_onehot.dimshuffle(1, 0, 2)]
+ hidden = [in_onehot.dimshuffle(1, 0, 2)]
bricks = []
- updates = []
+ states = []
for i in xrange(1, len(dims)):
init_state = theano.shared(numpy.zeros((num_seqs, dims[i])).astype(theano.config.floatX),
name='st0_%d'%i)
@@ -80,32 +84,32 @@ class Model():
linear = Linear(input_dim=dims[i-1], output_dim=4*dims[i],
name="lstm_in_%d"%i)
bricks.append(linear)
- inter = linear.apply(states[-1])
+ inter = linear.apply(hidden[-1])
if i2h_all and i > 1:
linear2 = Linear(input_dim=dims[0], output_dim=4*dims[i],
name="lstm_in0_%d"%i)
bricks.append(linear2)
- inter = inter + linear2.apply(states[0])
+ inter = inter + linear2.apply(hidden[0])
inter.name = 'inter_bis_%d'%i
lstm = LSTM(dim=dims[i], activation=activation_function,
name="lstm_rec_%d"%i)
bricks.append(lstm)
- new_states, new_cells = lstm.apply(inter,
+ new_hidden, new_cells = lstm.apply(inter,
states=init_state,
cells=init_cell)
- updates.append((init_state, new_states[-1, :, :]))
- updates.append((init_cell, new_cells[-1, :, :]))
+ states.append((init_state, new_hidden[-1, :, :]))
+ states.append((init_cell, new_cells[-1, :, :]))
- states.append(new_states)
+ hidden.append(new_hidden)
- states = [s.dimshuffle(1, 0, 2) for s in states]
+ hidden = [s.dimshuffle(1, 0, 2) for s in hidden]
# Construct output from hidden states
out = None
- layers = zip(dims, states)[1:]
+ layers = zip(dims, hidden)[1:]
if not h2o_all:
layers = [layers[-1]]
for i, (dim, state) in enumerate(layers):
@@ -136,9 +140,14 @@ class Model():
noise_vars = VariableFilter(roles=[WEIGHT])(cg)
cg = apply_noise(cg, noise_vars, w_noise_std)
if i_dropout > 0:
- cg = apply_dropout(cg, states[1:], i_dropout)
+ cg = apply_dropout(cg, hidden[1:], i_dropout)
[cost_reg, error_rate_reg] = cg.outputs
+ # add l1 regularization
+ if l1_reg > 0:
+ l1pen = sum(abs(st).mean() for st in hidden[1:])
+ cost_reg = cost_reg + l1_reg * l1pen
+
self.cost = cost
self.error_rate = error_rate
self.cost_reg = cost_reg
@@ -146,5 +155,5 @@ class Model():
self.out = out
self.pred = pred
- self.updates = updates
+ self.states = states