Adapt model to hdf5 dataset. WIP

author: Étienne Simon <esimon@esimon.eu> 2015-04-29 15:40:05 -0400
committer: Étienne Simon <esimon@esimon.eu> 2015-04-29 15:40:51 -0400
commit: f768d3e770216d4227ffd989cf98f1628fc476a3 (patch)
tree: b5620a34eaebac5290b74882018bab16f4658e27
parent: 61e0d47b6c6a570feebb43d474138020b13495aa (diff)
download: taxi-f768d3e770216d4227ffd989cf98f1628fc476a3.tar.gz
taxi-f768d3e770216d4227ffd989cf98f1628fc476a3.zip
5 files changed, 108 insertions, 100 deletions
diff --git a/config/__init__.py b/config/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/config/__init__.py
diff --git a/config/model_0.py b/config/model_0.py
new file mode 100644
index 0000000..ba04a15
--- /dev/null
+++ b/config/model_0.py
@@ -0,0 +1,21 @@
+n_dow = 7       # number of divisions for day of week
+n_dom = 31      # number of divisions for day of month
+n_hour = 24     # number of divisions for hour of day
+
+n_clients = 57124  # was 57105 when this constant lived in model.py
+n_stands = 63      # the stand lookup table is built with n_stands + 1 rows
+
+n_begin_end_pts = 5     # how many points we consider at the beginning and end of the known trajectory
+n_end_pts = 5           # NOTE(review): not referenced by the code in this patch -- confirm it is still needed
+
+train_size = 1710670    # rows of the 'train' set that are used; the last n_valid of them are held out
+n_valid = 1000          # number of trailing training examples used as the validation set
+
+dim_embed = 50          # width of the client and stand embeddings
+dim_input = n_begin_end_pts * 2 * 2 + dim_embed + dim_embed  # (first k + last k) * (lat + lon) + two embeddings
+dim_hidden = [200]      # sizes of the MLP hidden layers
+dim_output = 2          # destination latitude and longitude
+
+learning_rate = 0.002   # passed to the Momentum step rule
+momentum = 0.9          # passed to the Momentum step rule
+batch_size = 32         # size of the training batches
diff --git a/data.py b/data.py
index 351c90a..d38df10 100644
--- a/data.py
+++ b/data.py
@@ -21,8 +21,8 @@ def get_client_id(n):
     else:
         return 0
 
-porto_center = numpy.array([[ -8.61612, 41.1573]], dtype=theano.config.floatX)
-data_std = numpy.sqrt(numpy.array([[ 0.00333233, 0.00549598]], dtype=theano.config.floatX))
+porto_center = numpy.array([41.1573, -8.61612], dtype=theano.config.floatX)  # index 0: latitude, index 1: longitude
+data_std = numpy.sqrt(numpy.array([0.00549598, 0.00333233], dtype=theano.config.floatX))  # same ordering as porto_center
 
 class CallType(Enum):
     CENTRAL = 0
diff --git a/model.py b/model.py
index 405ad47..8e10e24 100644
--- a/model.py
+++ b/model.py
@@ -1,5 +1,7 @@
 import logging
 import os
+import sys
+import importlib
 from argparse import ArgumentParser
 
 import csv
@@ -19,9 +21,10 @@ from blocks.bricks.lookup import LookupTable
 from blocks.initialization import IsotropicGaussian, Constant
 from blocks.model import Model
 
+from fuel.datasets.hdf5 import H5PYDataset
 from fuel.transformers import Batch
 from fuel.streams import DataStream
-from fuel.schemes import ConstantScheme
+from fuel.schemes import ConstantScheme, SequentialExampleScheme
 
 from blocks.algorithms import GradientDescent, Scale, AdaDelta, Momentum
 from blocks.graph import ComputationGraph
@@ -35,58 +38,59 @@ import transformers
 import hdist
 import apply_model
 
-n_dow = 7       # number of division for dayofweek/dayofmonth/hourofday
-n_dom = 31
-n_hour = 24
-
-n_clients = 57105
-n_stands = 63
-
-n_begin_end_pts = 5     # how many points we consider at the beginning and end of the known trajectory
-n_end_pts = 5
-
-dim_embed = 50
-dim_input = n_begin_end_pts * 2 * 2 + dim_embed + dim_embed
-dim_hidden = [200]
-dim_output = 2
+if __name__ == "__main__":  # NOTE: 'config' is only bound when run as a script; the functions below read it as a global
+    if len(sys.argv) != 2:
+        print >> sys.stderr, 'Usage: %s config' % sys.argv[0]
+        sys.exit(1)
+    config = importlib.import_module(sys.argv[1])  # the argument is an importable module name, e.g. config.model_0
 
-learning_rate = 0.002
-momentum = 0.9
-batch_size = 32
+def setup_stream():
+    # Load the training and validation data (both are slices of the 'train' set of the HDF5 file)
+    train = H5PYDataset('/data/lisatmp3/simonet/taxi/data.hdf5', which_set='train', subset=slice(0, config.train_size - config.n_valid), load_in_memory=True)
+    train = DataStream(train, iteration_scheme=SequentialExampleScheme(config.train_size - config.n_valid))
+    train = transformers.add_first_k(config.n_begin_end_pts, train)
+    train = transformers.add_random_k(config.n_begin_end_pts, train)  # a random window of k points is emitted under the last_k_* names
+    train = transformers.add_destination(train)
+    train = transformers.Select(train, ('origin_stand', 'origin_call', 'first_k_latitude', 'last_k_latitude', 'first_k_longitude', 'last_k_longitude', 'destination_latitude', 'destination_longitude'))
+    train_stream = Batch(train, iteration_scheme=ConstantScheme(config.batch_size))
+
+    valid = H5PYDataset('/data/lisatmp3/simonet/taxi/data.hdf5', which_set='train', subset=slice(config.train_size - config.n_valid, config.train_size), load_in_memory=True)
+    valid = DataStream(valid, iteration_scheme=SequentialExampleScheme(config.n_valid))
+    valid = transformers.add_first_k(config.n_begin_end_pts, valid)
+    valid = transformers.add_last_k(config.n_begin_end_pts, valid)  # validation uses the true last k points
+    valid = transformers.add_destination(valid)
+    valid = transformers.Select(valid, ('origin_stand', 'origin_call', 'first_k_latitude', 'last_k_latitude', 'first_k_longitude', 'last_k_longitude', 'destination_latitude', 'destination_longitude'))
+    valid_stream = Batch(valid, iteration_scheme=ConstantScheme(1000))  # hard-coded batch size; equals n_valid in config/model_0.py
+    
+    return (train_stream, valid_stream)
 
 def main():
     # The input and the targets
-    x_firstk = tensor.matrix('first_k')
-    n = x_firstk.shape[0]
-    x_firstk = (x_firstk.reshape((n, n_begin_end_pts, 2)) - data.porto_center[None, None, :]) / data.data_std[None, None, :]
-    x_firstk = x_firstk.reshape((n, 2 * n_begin_end_pts))
+    x_firstk_latitude = (tensor.matrix('first_k_latitude') - data.porto_center[0]) / data.data_std[0]
+    x_firstk_longitude = (tensor.matrix('first_k_longitude') - data.porto_center[1]) / data.data_std[1]
+    x_firstk = tensor.concatenate((x_firstk_latitude, x_firstk_longitude), axis=1)
 
-    x_lastk = tensor.matrix('last_k')
-    n = x_lastk.shape[0]
-    x_lastk = (x_lastk.reshape((n, n_begin_end_pts, 2)) - data.porto_center[None, None, :]) / data.data_std[None, None, :]
-    x_lastk = x_lastk.reshape((n, 2 * n_begin_end_pts))
+    x_lastk_latitude = (tensor.matrix('last_k_latitude') - data.porto_center[0]) / data.data_std[0]
+    x_lastk_longitude = (tensor.matrix('last_k_longitude') - data.porto_center[1]) / data.data_std[1]
+    x_lastk = tensor.concatenate((x_lastk_latitude, x_lastk_longitude), axis=1)
 
     x_client = tensor.lvector('origin_call')
     x_stand = tensor.lvector('origin_stand')
-    y = tensor.matrix('destination')
+    y = tensor.concatenate((tensor.vector('destination_latitude')[:, None], tensor.vector('destination_longitude')[:, None]), axis=1)
 
     # Define the model
-    client_embed_table = LookupTable(length=n_clients+1, dim=dim_embed, name='client_lookup')
-    stand_embed_table = LookupTable(length=n_stands+1, dim=dim_embed, name='stand_lookup')
-    hidden_layer = MLP(activations=[Rectifier() for _ in dim_hidden],
-                       dims=[dim_input] + dim_hidden)
-    output_layer = Linear(input_dim=dim_hidden[-1], output_dim=dim_output)
+    client_embed_table = LookupTable(length=config.n_clients+1, dim=config.dim_embed, name='client_lookup')
+    stand_embed_table = LookupTable(length=config.n_stands+1, dim=config.dim_embed, name='stand_lookup')
+    mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden] + [None],
+                       dims=[config.dim_input] + config.dim_hidden + [config.dim_output])
 
     # Create the Theano variables
-
     client_embed = client_embed_table.apply(x_client).flatten(ndim=2)
     stand_embed = stand_embed_table.apply(x_stand).flatten(ndim=2)
     inputs = tensor.concatenate([x_firstk, x_lastk, client_embed, stand_embed],
                                 axis=1)
     # inputs = theano.printing.Print("inputs")(inputs)
-    hidden = hidden_layer.apply(inputs)
-    # hidden = theano.printing.Print("hidden")(hidden)
-    outputs = output_layer.apply(hidden)
+    outputs = mlp.apply(inputs)
 
     # Normalize & Center
     outputs = data.data_std * outputs + data.porto_center
@@ -101,42 +105,22 @@ def main():
     # Initialization
     client_embed_table.weights_init = IsotropicGaussian(0.001)
     stand_embed_table.weights_init = IsotropicGaussian(0.001)
-    hidden_layer.weights_init = IsotropicGaussian(0.01)
-    hidden_layer.biases_init = Constant(0.001)
-    output_layer.weights_init = IsotropicGaussian(0.01)
-    output_layer.biases_init = Constant(0.001)
+    mlp.weights_init = IsotropicGaussian(0.01)
+    mlp.biases_init = Constant(0.001)
 
     client_embed_table.initialize()
     stand_embed_table.initialize()
-    hidden_layer.initialize()
-    output_layer.initialize()
-
-    # Load the training and test data
-    train = data.train_data
-    train = DataStream(train)
-    train = transformers.add_first_k(n_begin_end_pts, train)
-    train = transformers.add_random_k(n_begin_end_pts, train)
-    train = transformers.add_destination(train)
-    train = transformers.Select(train, ('origin_stand', 'origin_call', 'first_k', 'last_k', 'destination'))
-    train_stream = Batch(train, iteration_scheme=ConstantScheme(batch_size))
-
-    valid = data.valid_data
-    valid = DataStream(valid)
-    valid = transformers.add_first_k(n_begin_end_pts, valid)
-    valid = transformers.add_last_k(n_begin_end_pts, valid)
-    valid = transformers.concat_destination_xy(valid)
-    valid = transformers.Select(valid, ('origin_stand', 'origin_call', 'first_k', 'last_k', 'destination'))
-    valid_stream = Batch(valid, iteration_scheme=ConstantScheme(1000))
+    mlp.initialize()
 
+    (train_stream, valid_stream) = setup_stream()
 
     # Training
     cg = ComputationGraph(cost)
-    params = VariableFilter(bricks=[Linear])(cg.parameters)
     algorithm = GradientDescent(
         cost=cost,
         # step_rule=AdaDelta(decay_rate=0.5),
-        step_rule=Momentum(learning_rate=learning_rate, momentum=momentum),
-        params=params)
+        step_rule=Momentum(learning_rate=config.learning_rate, momentum=config.momentum),
+        params=cg.parameters)
 
     extensions=[DataStreamMonitoring([cost, hcost], valid_stream,
                                      prefix='valid',
@@ -154,10 +138,11 @@ def main():
     main_loop.run()
 
     # Produce an output on the test data
+    '''
     test = data.test_data
     test = DataStream(test)
-    test = transformers.add_first_k(n_begin_end_pts, test)
-    test = transformers.add_last_k(n_begin_end_pts, test)
+    test = transformers.add_first_k(config.n_begin_end_pts, test)
+    test = transformers.add_last_k(config.n_begin_end_pts, test)
     test = transformers.Select(test, ('trip_id', 'origin_stand', 'origin_call', 'first_k', 'last_k'))
     test_stream = Batch(test, iteration_scheme=ConstantScheme(1000))
 
@@ -169,6 +154,7 @@ def main():
         for i, trip in enumerate(out['trip_id']):
             outcsv.writerow([trip, repr(dest[i, 1]), repr(dest[i, 0])])
     outfile.close()
+    '''
 
 
 if __name__ == "__main__":
diff --git a/transformers.py b/transformers.py
index c60d362..13852ac 100644
--- a/transformers.py
+++ b/transformers.py
@@ -3,15 +3,15 @@ import numpy
 import theano
 import random
 
-def at_least_k(k, pl, pad_at_begin):
-    if len(pl) == 0:
-        pl = [[ -8.61612, 41.1573]]
-    if len(pl) < k:
+def at_least_k(k, v, pad_at_begin, is_longitude):  # return v padded to at least k values by repeating an end value
+    if len(v) == 0:  # empty trajectory: fall back to the coordinate of Porto's center
+        v = numpy.array([-8.61612 if is_longitude else 41.1573], dtype=theano.config.floatX)  # longitude is -8.61612, latitude is 41.1573
+    if len(v) < k:
         if pad_at_begin:
-            pl = [pl[0]] * (k - len(pl)) + pl
+            v = numpy.concatenate((numpy.full((k - len(v),), v[0]), v))  # repeat the first value in front
         else:
-            pl = pl + [pl[-1]] * (k - len(pl))
-    return pl
+            v = numpy.concatenate((v, numpy.full((k - len(v),), v[-1])))  # repeat the last value at the back
+    return v
 
 
 class Select(Transformer):
@@ -27,38 +27,39 @@ class Select(Transformer):
         return [data[id] for id in self.ids]
 
 def add_first_k(k, stream):
-    id_polyline=stream.sources.index('polyline')
-    def first_k(x):
-        pl = at_least_k(k, x[id_polyline], False)
-        return (numpy.array(pl[:k], dtype=theano.config.floatX).flatten(),)
-    stream = Mapping(stream, first_k, ('first_k',))
-    return stream
+    id_latitude = stream.sources.index('latitude')  # position of each coordinate source within an example
+    id_longitude = stream.sources.index('longitude')
+    return Mapping(stream,
+        lambda data:
+            (numpy.array(at_least_k(k, data[id_latitude], False, False)[:k], dtype=theano.config.floatX),  # first k latitudes, padded at the end if shorter
+             numpy.array(at_least_k(k, data[id_longitude], False, True)[:k], dtype=theano.config.floatX)),  # first k longitudes, padded the same way
+        ('first_k_latitude', 'first_k_longitude'))  # names of the two sources added to the stream
 
 def add_random_k(k, stream):
-    id_polyline=stream.sources.index('polyline')
+    id_latitude = stream.sources.index('latitude')  # position of each coordinate source within an example
+    id_longitude = stream.sources.index('longitude')
     def random_k(x):
-        pl = at_least_k(k, x[id_polyline], True)
-        loc = random.randrange(len(pl)-k+1)
-        return (numpy.array(pl[loc:loc+k], dtype=theano.config.floatX).flatten(),)
-    stream = Mapping(stream, random_k, ('last_k',))
-    return stream
+        lat = at_least_k(k, x[id_latitude], True, False)
+        lon = at_least_k(k, x[id_longitude], True, True)  # must read the longitude source, not latitude
+        loc = random.randrange(len(lat)-k+1)  # one random window start shared by both coordinates
+        return (numpy.array(lat[loc:loc+k], dtype=theano.config.floatX),
+                numpy.array(lon[loc:loc+k], dtype=theano.config.floatX))  # plain 2-tuple: one entry per added source
+    return Mapping(stream, random_k, ('last_k_latitude', 'last_k_longitude'))  # the random window is exposed under the last_k_* names
 
 def add_last_k(k, stream):
-    id_polyline=stream.sources.index('polyline')
-    def last_k(x):
-        pl = at_least_k(k, x[id_polyline], True)
-        return (numpy.array(pl[-k:], dtype=theano.config.floatX).flatten(),)
-    stream = Mapping(stream, last_k, ('last_k',))
-    return stream
+    id_latitude = stream.sources.index('latitude')  # position of each coordinate source within an example
+    id_longitude = stream.sources.index('longitude')
+    return Mapping(stream,
+        lambda data:
+            (numpy.array(at_least_k(k, data[id_latitude], True, False)[-k:], dtype=theano.config.floatX),  # last k latitudes, padded at the front if shorter
+             numpy.array(at_least_k(k, data[id_longitude], True, True)[-k:], dtype=theano.config.floatX)),  # last k longitudes, padded the same way
+        ('last_k_latitude', 'last_k_longitude'))  # names of the two sources added to the stream
 
 def add_destination(stream):
-    id_polyline=stream.sources.index('polyline')
+    id_latitude = stream.sources.index('latitude')  # position of each coordinate source within an example
+    id_longitude = stream.sources.index('longitude')
     return Mapping(stream,
-        lambda x:
-            (numpy.array(at_least_k(1, x[id_polyline], True)[-1], dtype=theano.config.floatX),),
-        ('destination',))
-
-def concat_destination_xy(stream):
-    id_dx=stream.sources.index('destination_x')
-    id_dy=stream.sources.index('destination_y')
-    return Mapping(stream, lambda x: (numpy.array([x[id_dx], x[id_dy]], dtype=theano.config.floatX),), ('destination',))
+        lambda data:
+            (numpy.array(at_least_k(1, data[id_latitude], True, False)[-1], dtype=theano.config.floatX),  # last latitude; at_least_k supplies a default for an empty trip
+             numpy.array(at_least_k(1, data[id_longitude], True, True)[-1], dtype=theano.config.floatX)),  # last longitude, same fallback
+        ('destination_latitude', 'destination_longitude'))  # names of the two sources added to the stream
author	Étienne Simon <esimon@esimon.eu>	2015-04-29 15:40:05 -0400
committer	Étienne Simon <esimon@esimon.eu>	2015-04-29 15:40:51 -0400
commit	f768d3e770216d4227ffd989cf98f1628fc476a3 (patch)
tree	b5620a34eaebac5290b74882018bab16f4658e27
parent	61e0d47b6c6a570feebb43d474138020b13495aa (diff)
download	taxi-f768d3e770216d4227ffd989cf98f1628fc476a3.tar.gz taxi-f768d3e770216d4227ffd989cf98f1628fc476a3.zip