summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lstm.py49
-rwxr-xr-xtrain.py71
2 files changed, 83 insertions, 37 deletions
diff --git a/lstm.py b/lstm.py
index e294793..dbe46dc 100644
--- a/lstm.py
+++ b/lstm.py
@@ -14,31 +14,34 @@ from blocks.graph import ComputationGraph, apply_noise, apply_dropout
# An epoch will be composed of 'num_seqs' sequences of len 'seq_len'
# divided into chunks of length 'seq_div_size'
num_seqs = 20
-seq_len = 2000
-seq_div_size = 100
+seq_len = 5000
+seq_div_size = 200
io_dim = 256
-hidden_dims = [512, 512, 512]
+hidden_dims = [1024, 1024, 1024]
activation_function = Tanh()
i2h_all = True # input to all hidden layers or only first layer
h2o_all = True # all hidden layers to output or only last layer
-w_noise_std = 0.01
+w_noise_std = 0.02
i_dropout = 0.5
-step_rule = 'momentum'
+l1_reg = 0
+
+step_rule = 'adadelta'
learning_rate = 0.1
momentum = 0.9
-param_desc = '%s-%sIH,%sHO-n%s-d%s-%dx%d(%d)-%s' % (
+param_desc = '%s-%sIH,%sHO-n%s-d%s-l1r%s-%dx%d(%d)-%s' % (
repr(hidden_dims),
'all' if i2h_all else 'first',
'all' if h2o_all else 'last',
repr(w_noise_std),
repr(i_dropout),
+ repr(l1_reg),
num_seqs, seq_len, seq_div_size,
step_rule
)
@@ -46,8 +49,9 @@ param_desc = '%s-%sIH,%sHO-n%s-d%s-%dx%d(%d)-%s' % (
save_freq = 5
# parameters for sample generation
-sample_len = 60
-sample_temperature = 0.3
+sample_len = 1000
+sample_temperature = 0.7 #0.5
+sample_freq = 10
if step_rule == 'rmsprop':
step_rule = RMSProp()
@@ -68,9 +72,9 @@ class Model():
# Construct hidden states
dims = [io_dim] + hidden_dims
- states = [in_onehot.dimshuffle(1, 0, 2)]
+ hidden = [in_onehot.dimshuffle(1, 0, 2)]
bricks = []
- updates = []
+ states = []
for i in xrange(1, len(dims)):
init_state = theano.shared(numpy.zeros((num_seqs, dims[i])).astype(theano.config.floatX),
name='st0_%d'%i)
@@ -80,32 +84,32 @@ class Model():
linear = Linear(input_dim=dims[i-1], output_dim=4*dims[i],
name="lstm_in_%d"%i)
bricks.append(linear)
- inter = linear.apply(states[-1])
+ inter = linear.apply(hidden[-1])
if i2h_all and i > 1:
linear2 = Linear(input_dim=dims[0], output_dim=4*dims[i],
name="lstm_in0_%d"%i)
bricks.append(linear2)
- inter = inter + linear2.apply(states[0])
+ inter = inter + linear2.apply(hidden[0])
inter.name = 'inter_bis_%d'%i
lstm = LSTM(dim=dims[i], activation=activation_function,
name="lstm_rec_%d"%i)
bricks.append(lstm)
- new_states, new_cells = lstm.apply(inter,
+ new_hidden, new_cells = lstm.apply(inter,
states=init_state,
cells=init_cell)
- updates.append((init_state, new_states[-1, :, :]))
- updates.append((init_cell, new_cells[-1, :, :]))
+ states.append((init_state, new_hidden[-1, :, :]))
+ states.append((init_cell, new_cells[-1, :, :]))
- states.append(new_states)
+ hidden.append(new_hidden)
- states = [s.dimshuffle(1, 0, 2) for s in states]
+ hidden = [s.dimshuffle(1, 0, 2) for s in hidden]
# Construct output from hidden states
out = None
- layers = zip(dims, states)[1:]
+ layers = zip(dims, hidden)[1:]
if not h2o_all:
layers = [layers[-1]]
for i, (dim, state) in enumerate(layers):
@@ -136,9 +140,14 @@ class Model():
noise_vars = VariableFilter(roles=[WEIGHT])(cg)
cg = apply_noise(cg, noise_vars, w_noise_std)
if i_dropout > 0:
- cg = apply_dropout(cg, states[1:], i_dropout)
+ cg = apply_dropout(cg, hidden[1:], i_dropout)
[cost_reg, error_rate_reg] = cg.outputs
+ # add l1 regularization
+ if l1_reg > 0:
+ l1pen = sum(abs(st).mean() for st in hidden[1:])
+ cost_reg = cost_reg + l1_reg * l1pen
+
self.cost = cost
self.error_rate = error_rate
self.cost_reg = cost_reg
@@ -146,5 +155,5 @@ class Model():
self.out = out
self.pred = pred
- self.updates = updates
+ self.states = states
diff --git a/train.py b/train.py
index a8e9ef2..a8c246a 100755
--- a/train.py
+++ b/train.py
@@ -27,6 +27,8 @@ import datastream
logging.basicConfig(level='INFO')
logger = logging.getLogger(__name__)
+sys.setrecursionlimit(1500)
+
if __name__ == "__main__":
if len(sys.argv) != 2:
print >> sys.stderr, 'Usage: %s config' % sys.argv[0]
@@ -37,34 +39,64 @@ if __name__ == "__main__":
class GenText(SimpleExtension):
def __init__(self, model, init_text, max_bytes, **kwargs):
+ super(GenText, self).__init__(**kwargs)
+
self.init_text = init_text
self.max_bytes = max_bytes
-
out = model.out[:, -1, :] / numpy.float32(config.sample_temperature)
prob = tensor.nnet.softmax(out)
cg = ComputationGraph([prob])
assert(len(cg.inputs) == 1)
assert(cg.inputs[0].name == 'bytes')
- self.f = theano.function(inputs=cg.inputs, outputs=[prob])
- super(GenText, self).__init__(**kwargs)
+ state_vars = [theano.shared(v[0:1, :].zeros_like().eval(), v.name+'-gen')
+ for v, _ in model.states]
+ givens = [(v, x) for (v, _), x in zip(model.states, state_vars)]
+ updates= [(x, upd) for x, (_, upd) in zip(state_vars, model.states)]
+
+ self.f = theano.function(inputs=cg.inputs, outputs=[prob],
+ givens=givens, updates=updates)
+ self.reset_states = theano.function(inputs=[], outputs=[],
+ updates=[(v, v.zeros_like()) for v in state_vars])
def do(self, which_callback, *args):
+
+ print "Sample:"
+ print "-------"
+
+ self.reset_states()
+
v = numpy.array([ord(i) for i in self.init_text],
- dtype='int16')[None, :].repeat(axis=0, repeats=config.num_seqs)
+ dtype='int16')[None, :]
+ prob, = self.f(v)
+ sys.stdout.write(self.init_text)
while v.shape[1] < self.max_bytes:
- prob, = self.f(v)
prob = prob / 1.00001
- pred = numpy.zeros((prob.shape[0],), dtype='int16')
- for i in range(prob.shape[0]):
- pred[i] = numpy.random.multinomial(1, prob[i, :]).nonzero()[0][0]
- v = numpy.concatenate([v, pred[:, None]], axis=1)
+ pred = numpy.random.multinomial(1, prob[0, :]).nonzero()[0][0]
+
+ v = numpy.concatenate([v, pred[None, None]], axis=1)
+ sys.stdout.write(chr(int(pred)))
+ sys.stdout.flush()
+
+ prob, = self.f(pred[None, None])
+ print
+ print "-------"
+ print
+
- for i in range(v.shape[0]):
- print "Sample:", ''.join([chr(int(v[i, j])) for j in range(v.shape[1])])
+class ResetStates(SimpleExtension):
+ def __init__(self, state_vars, **kwargs):
+ super(ResetStates, self).__init__(**kwargs)
+
+ self.f = theano.function(
+ inputs=[], outputs=[],
+ updates=[(v, v.zeros_like()) for v in state_vars])
+
+ def do(self, which_callback, *args):
+ self.f()
def train_model(m, train_stream, dump_path=None):
@@ -76,17 +108,17 @@ def train_model(m, train_stream, dump_path=None):
step_rule=config.step_rule,
params=cg.parameters)
- algorithm.add_updates(m.updates)
+ algorithm.add_updates(m.states)
# Load the parameters from a dumped model
if dump_path is not None:
try:
- logger.info('Loading parameters...')
with closing(numpy.load(dump_path)) as source:
+ logger.info('Loading parameters...')
param_values = {'/' + name.replace(BRICK_DELIMITER, '/'): source[name]
for name in source.keys()
if name != 'pkl' and not 'None' in name}
- model.set_param_values(param_values)
+ model.set_param_values(param_values)
except IOError:
pass
@@ -96,19 +128,24 @@ def train_model(m, train_stream, dump_path=None):
algorithm=algorithm,
extensions=[
Checkpoint(path=dump_path,
- after_epoch=False, every_n_epochs=config.save_freq),
+ after_epoch=False,
+ use_cpickle=True,
+ every_n_epochs=config.save_freq),
TrainingDataMonitoring(
[m.cost_reg, m.error_rate_reg, m.cost, m.error_rate],
prefix='train', every_n_epochs=1),
Printing(every_n_epochs=1, after_epoch=False),
- Plot(document='tr_'+model_name+'_'+config.param_desc,
+ Plot(document='text_'+model_name+'_'+config.param_desc,
channels=[['train_cost', 'train_cost_reg'],
['train_error_rate', 'train_error_rate_reg']],
server_url='http://eos21:4201/',
every_n_epochs=1, after_epoch=False),
- GenText(m, ' ', config.sample_len, every_n_epochs=1, after_epoch=False)
+ GenText(m, '\nalex\ttu crois ?\n', config.sample_len,
+ every_n_epochs=config.sample_freq,
+ after_epoch=False, before_training=True),
+ ResetStates([v for v, _ in m.states], after_epoch=True)
]
)
main_loop.run()