-rw-r--r--  datastream.py | 42
-rw-r--r--  lstm.py       | 99
-rwxr-xr-x  train.py      | 44
3 files changed, 151 insertions(+), 34 deletions(-)
diff --git a/datastream.py b/datastream.py
index 5d9441f..8025945 100644
--- a/datastream.py
+++ b/datastream.py
@@ -61,16 +61,50 @@ class BytesToIndices(Transformer):
data = next(self.child_epoch_iterator)
return numpy.array([ord(i) for i in data[0]], dtype='int16'),
-def setup_datastream(filename, seq_len, num_seqs_per_epoch=100):
+class ParallelSequences(Transformer):
+ def __init__(self, stream, num_seqs, seq_div_size, **kwargs):
+ self.sources = ('bytes',)
+
+ self.num_seqs = num_seqs
+ self.div_size = seq_div_size
+
+ self.tmp = None
+ self.i = 0
+
+ super(ParallelSequences, self).__init__(stream, **kwargs)
+
+ def get_data(self, request=None):
+ if request is not None:
+ raise ValueError('Unsupported: request')
+
+ if self.tmp is None or self.i >= self.tmp.shape[1]:
+ self.tmp = numpy.concatenate([next(self.child_epoch_iterator)[0][None, :]
+ for _ in xrange(self.num_seqs)],
+ axis=0)
+ self.i = 0
+
+ ret = self.tmp[:, self.i:self.i + self.div_size]
+ self.i += self.div_size
+
+ return ret,
+
+
+
+def setup_datastream(filename, num_seqs, seq_len, seq_div_size):
ds = BinaryFileDataset(filename)
- it = RandomBlockIterator(ds.num_examples(), seq_len, num_seqs_per_epoch)
+ it = RandomBlockIterator(ds.num_examples(), seq_len, num_seqs)
stream = DataStream(ds, iteration_scheme=it)
stream = BytesToIndices(stream)
+ stream = ParallelSequences(stream, num_seqs, seq_div_size)
return stream
if __name__ == "__main__":
# Test
- stream = setup_datastream("data/logcompil.txt", 100)
- print(next(stream.get_epoch_iterator()))
+ stream = setup_datastream("data/logcompil.txt", 2, 60, 20)
+ it = stream.get_epoch_iterator()
+ for d, in stream.get_epoch_iterator():
+ print '--'
+ for u in range(d.shape[0]):
+ print ''.join(chr(i) for i in d[u])
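[Editor's note: to make the new transformer concrete, here is a minimal standalone sketch, in plain numpy with hypothetical toy values and not part of the commit, of what ParallelSequences yields: it stacks num_seqs byte sequences into one (num_seqs, seq_len) matrix and then returns consecutive seq_div_size-wide column blocks, one per get_data() call.]

    import numpy

    # Toy stand-in for the child stream: num_seqs sequences of length seq_len.
    num_seqs, seq_len, seq_div_size = 2, 12, 4
    seqs = [numpy.arange(k, k + seq_len, dtype='int16') for k in range(num_seqs)]

    # Stack into one block, as ParallelSequences does with the child iterator output.
    block = numpy.concatenate([s[None, :] for s in seqs], axis=0)

    # Walk over the block in chunks of seq_div_size columns.
    i = 0
    while i < block.shape[1]:
        chunk = block[:, i:i + seq_div_size]   # shape (num_seqs, seq_div_size)
        print(chunk)
        i += seq_div_size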
diff --git a/lstm.py b/lstm.py
index 72b67f1..32cdb9b 100644
--- a/lstm.py
+++ b/lstm.py
@@ -1,60 +1,108 @@
import theano
from theano import tensor
+import numpy
-from blocks.algorithms import Momentum, AdaDelta
+from blocks.algorithms import Momentum, AdaDelta, RMSProp
from blocks.bricks import Tanh, Softmax, Linear, MLP
from blocks.bricks.recurrent import LSTM
from blocks.initialization import IsotropicGaussian, Constant
from blocks.filter import VariableFilter
from blocks.roles import WEIGHT
-from blocks.graph import ComputationGraph, apply_noise
+from blocks.graph import ComputationGraph, apply_noise, apply_dropout
-chars_per_seq = 100
-seqs_per_epoch = 1
+# An epoch is composed of 'num_seqs' sequences of length 'seq_len',
+# divided into chunks of length 'seq_div_size'
+num_seqs = 10
+seq_len = 2000
+seq_div_size = 100
io_dim = 256
-hidden_dims = [200, 500]
+hidden_dims = [512, 512]
activation_function = Tanh()
+all_hidden_for_output = False
+
w_noise_std = 0.01
+i_dropout = 0.5
+
+step_rule = 'adadelta'
-step_rule = AdaDelta()
-pt_freq = 1
+param_desc = '%s-%sHO-n%s-d%s-%dx%d(%d)-%s' % (
+ repr(hidden_dims),
+ 'all' if all_hidden_for_output else 'last',
+ repr(w_noise_std),
+ repr(i_dropout),
+ num_seqs, seq_len, seq_div_size,
+ step_rule
+ )
-param_desc = '' # todo
+if step_rule == 'rmsprop':
+ step_rule = RMSProp()
+elif step_rule == 'adadelta':
+ step_rule = AdaDelta()
+else:
+ assert(False)
class Model():
def __init__(self):
- inp = tensor.lvector('bytes')
+ inp = tensor.lmatrix('bytes')
- in_onehot = tensor.eq(tensor.arange(io_dim, dtype='int16').reshape((1, io_dim)),
- inp[:, None])
+ in_onehot = tensor.eq(tensor.arange(io_dim, dtype='int16').reshape((1, 1, io_dim)),
+ inp[:, :, None])
dims = [io_dim] + hidden_dims
- prev = in_onehot[None, :, :]
+ states = [in_onehot.dimshuffle(1, 0, 2)]
bricks = []
+ updates = []
for i in xrange(1, len(dims)):
+ init_state = theano.shared(numpy.zeros((num_seqs, dims[i])).astype(theano.config.floatX),
+ name='st0_%d'%i)
+ init_cell = theano.shared(numpy.zeros((num_seqs, dims[i])).astype(theano.config.floatX),
+ name='cell0_%d'%i)
+
linear = Linear(input_dim=dims[i-1], output_dim=4*dims[i],
name="lstm_in_%d"%i)
lstm = LSTM(dim=dims[i], activation=activation_function,
name="lstm_rec_%d"%i)
- prev = lstm.apply(linear.apply(prev))[0]
+
+ new_states, new_cells = lstm.apply(linear.apply(states[-1]),
+ states=init_state,
+ cells=init_cell)
+ updates.append((init_state, new_states[-1, :, :]))
+ updates.append((init_cell, new_cells[-1, :, :]))
+
+ states.append(new_states)
bricks = bricks + [linear, lstm]
- top_linear = MLP(dims=[hidden_dims[-1], io_dim],
- activations=[Softmax()],
- name="pred_mlp")
- bricks.append(top_linear)
+ states = [s.dimshuffle(1, 0, 2).reshape((inp.shape[0] * inp.shape[1], dim))
+ for dim, s in zip(dims, states)]
- out = top_linear.apply(prev.reshape((inp.shape[0], hidden_dims[-1])))
+ if all_hidden_for_output:
+ top_linear = MLP(dims=[sum(hidden_dims), io_dim],
+ activations=[Softmax()],
+ name="pred_mlp")
+ bricks.append(top_linear)
- pred = out.argmax(axis=1)
+ out = top_linear.apply(tensor.concatenate(states[1:], axis=1))
+ else:
+ top_linear = MLP(dims=[hidden_dims[-1], io_dim],
+ activations=[None],
+ name="pred_mlp")
+ bricks.append(top_linear)
- cost = Softmax().categorical_cross_entropy(inp[:-1], out[1:])
- error_rate = tensor.neq(inp[:-1], pred[1:]).mean()
+ out = top_linear.apply(states[-1])
+
+ out = out.reshape((inp.shape[0], inp.shape[1], io_dim))
+
+ pred = out.argmax(axis=2)
+
+ cost = Softmax().categorical_cross_entropy(inp[:, 1:].flatten(),
+ out[:, :-1, :].reshape((inp.shape[0]*(inp.shape[1]-1),
+ io_dim)))
+ error_rate = tensor.neq(inp[:, 1:].flatten(), pred[:, :-1].flatten()).mean()
# Initialize
for brick in bricks:
@@ -64,8 +112,11 @@ class Model():
# apply noise
cg = ComputationGraph([cost, error_rate])
- noise_vars = VariableFilter(roles=[WEIGHT])(cg)
- cg = apply_noise(cg, noise_vars, w_noise_std)
+ if w_noise_std > 0:
+ noise_vars = VariableFilter(roles=[WEIGHT])(cg)
+ cg = apply_noise(cg, noise_vars, w_noise_std)
+ if i_dropout > 0:
+ cg = apply_dropout(cg, states[1:], i_dropout)
[cost_reg, error_rate_reg] = cg.outputs
self.cost = cost
@@ -74,3 +125,5 @@ class Model():
self.error_rate_reg = error_rate_reg
self.pred = pred
+ self.updates = updates
+
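[Editor's note: the slicing and reshaping in the cost above can be hard to read; the following is a small hypothetical numpy sketch, with random data and not part of the commit, of the intended alignment: the output at position t is scored against the input byte at position t+1, so predictions come from out[:, :-1, :] and targets from inp[:, 1:].]

    import numpy

    num_seqs, chunk_len, io_dim = 2, 5, 256
    inp = numpy.random.randint(0, io_dim, size=(num_seqs, chunk_len))   # byte matrix
    out = numpy.random.rand(num_seqs, chunk_len, io_dim)                # per-position scores

    targets = inp[:, 1:].flatten()                 # byte t+1 is the target for position t
    scores = out[:, :-1, :].reshape((-1, io_dim))  # drop the last position, flatten batch/time
    pred = scores.argmax(axis=1)
    error_rate = (pred != targets).mean()
    print(error_rate)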
diff --git a/train.py b/train.py
index ab973a1..7857f3f 100755
--- a/train.py
+++ b/train.py
@@ -5,16 +5,18 @@ import numpy
import sys
import importlib
+import theano
+from theano import tensor
+
from blocks.dump import load_parameter_values
from blocks.dump import MainLoopDumpManager
-from blocks.extensions import Printing
+from blocks.extensions import Printing, SimpleExtension
from blocks.extensions.monitoring import DataStreamMonitoring, TrainingDataMonitoring
from blocks.extensions.plot import Plot
from blocks.graph import ComputationGraph
from blocks.main_loop import MainLoop
from blocks.model import Model
from blocks.algorithms import GradientDescent
-from theano import tensor
import datastream
# from apply_model import Apply
@@ -30,6 +32,29 @@ if __name__ == "__main__":
config = importlib.import_module('%s' % model_name)
+class GenText(SimpleExtension):
+ def __init__(self, model, init_text, max_bytes, **kwargs):
+ self.init_text = init_text
+ self.max_bytes = max_bytes
+
+ cg = ComputationGraph([model.pred])
+ assert(len(cg.inputs) == 1)
+ assert(cg.inputs[0].name == 'bytes')
+ self.f = theano.function(inputs=cg.inputs, outputs=[model.pred])
+
+ super(GenText, self).__init__(**kwargs)
+
+ def do(self, which_callback, *args):
+ v = numpy.array([ord(i) for i in self.init_text],
+ dtype='int16')[None, :].repeat(axis=0, repeats=config.num_seqs)
+
+ while v.shape[1] < self.max_bytes:
+ pred, = self.f(v)
+ v = numpy.concatenate([v, pred[:, -1:]], axis=1)
+
+ for i in range(v.shape[0]):
+ print "Sample:", ''.join([chr(int(v[i, j])) for j in range(v.shape[1])])
+
def train_model(m, train_stream, load_location=None, save_location=None):
# Define the model
@@ -44,6 +69,9 @@ def train_model(m, train_stream, load_location=None, save_location=None):
algorithm = GradientDescent(cost=m.cost_reg,
step_rule=config.step_rule,
params=cg.parameters)
+
+ algorithm.add_updates(m.updates)
+
main_loop = MainLoop(
model=model,
data_stream=train_stream,
@@ -51,12 +79,13 @@ def train_model(m, train_stream, load_location=None, save_location=None):
extensions=[
TrainingDataMonitoring(
[m.cost_reg, m.error_rate_reg, m.cost, m.error_rate],
- prefix='train', every_n_epochs=1*config.pt_freq),
- Printing(every_n_epochs=1*config.pt_freq, after_epoch=False),
+ prefix='train', every_n_epochs=1),
+ Printing(every_n_epochs=1, after_epoch=False),
Plot(document='tr_'+model_name+'_'+config.param_desc,
channels=[['train_cost', 'train_cost_reg'],
['train_error_rate', 'train_error_rate_reg']],
- every_n_epochs=1*config.pt_freq, after_epoch=False)
+ every_n_epochs=1, after_epoch=False),
+ GenText(m, '\t', 20, every_n_epochs=1, after_epoch=False)
]
)
main_loop.run()
@@ -72,8 +101,9 @@ def train_model(m, train_stream, load_location=None, save_location=None):
if __name__ == "__main__":
# Build datastream
train_stream = datastream.setup_datastream('data/logcompil.txt',
- config.chars_per_seq,
- config.seqs_per_epoch)
+ config.num_seqs,
+ config.seq_len,
+ config.seq_div_size)
# Build model
m = config.Model()
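[Editor's note: for reference, this is a hypothetical mock, in pure numpy with a dummy predict() standing in for the compiled Theano function and not part of the commit, of the sampling loop GenText runs: the last predicted byte of every sequence is appended to the input, which is fed back in until max_bytes is reached.]

    import numpy

    def predict(v):
        # dummy model: "predicts" the previous byte plus one, modulo 256
        return (v + 1) % 256

    init_text, max_bytes, num_seqs = '\t', 20, 2
    v = numpy.array([ord(c) for c in init_text],
                    dtype='int16')[None, :].repeat(repeats=num_seqs, axis=0)

    while v.shape[1] < max_bytes:
        pred = predict(v)
        v = numpy.concatenate([v, pred[:, -1:]], axis=1)

    for k in range(v.shape[0]):
        print(''.join(chr(int(b)) for b in v[k]))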