diff options
author | Alex Auvolat <alex.auvolat@ens.fr> | 2015-05-05 14:15:21 -0400 |
---|---|---|
committer | Alex Auvolat <alex.auvolat@ens.fr> | 2015-05-05 14:15:21 -0400 |
commit | 54613c1f9cf510ca7a71d6619418f2247515aec6 (patch) | |
tree | bed9a5a11ef5b7feecee44095a29400e32f76b05 | |
parent | 712035b88be1816d3fbd58ce69ae6464767c780e (diff) | |
download | taxi-54613c1f9cf510ca7a71d6619418f2247515aec6.tar.gz taxi-54613c1f9cf510ca7a71d6619418f2247515aec6.zip |
Add models for time prediction
-rw-r--r-- | config/dest_simple_mlp_2_cs.py (renamed from config/simple_mlp_2_cs.py) | 2 | ||||
-rw-r--r-- | config/dest_simple_mlp_2_cswdt.py (renamed from config/simple_mlp_2_cswdt.py) | 2 | ||||
-rw-r--r-- | config/dest_simple_mlp_2_noembed.py (renamed from config/simple_mlp_2_noembed.py) | 2 | ||||
-rw-r--r-- | config/dest_simple_mlp_tgtcls_0_cs.py (renamed from config/simple_mlp_tgtcls_0_cs.py) | 2 | ||||
-rw-r--r-- | config/dest_simple_mlp_tgtcls_1_cs.py (renamed from config/simple_mlp_tgtcls_1_cs.py) | 2 | ||||
-rw-r--r-- | config/dest_simple_mlp_tgtcls_1_cswdt.py (renamed from config/simple_mlp_tgtcls_1_cswdt.py) | 2 | ||||
-rw-r--r-- | config/dest_simple_mlp_tgtcls_1_cswdtx.py (renamed from config/simple_mlp_tgtcls_1_cswdtx.py) | 2 | ||||
-rw-r--r-- | config/time_simple_mlp_1.py | 19 | ||||
-rw-r--r-- | config/time_simple_mlp_2_cswdtx.py | 26 | ||||
-rw-r--r-- | data.py | 6 | ||||
-rw-r--r-- | error.py (renamed from hdist.py) | 3 | ||||
-rw-r--r-- | model/dest_simple_mlp.py (renamed from model/simple_mlp.py) | 10 | ||||
-rw-r--r-- | model/dest_simple_mlp_tgtcls.py (renamed from model/simple_mlp_tgtcls.py) | 10 | ||||
-rw-r--r-- | model/time_simple_mlp.py | 65 | ||||
-rw-r--r-- | train.py | 27 | ||||
-rw-r--r-- | transformers.py | 4 |
16 files changed, 152 insertions, 32 deletions
diff --git a/config/simple_mlp_2_cs.py b/config/dest_simple_mlp_2_cs.py index fa2f4c1..2cec78d 100644 --- a/config/simple_mlp_2_cs.py +++ b/config/dest_simple_mlp_2_cs.py @@ -1,4 +1,4 @@ -import model.simple_mlp as model +import model.dest_simple_mlp as model import data diff --git a/config/simple_mlp_2_cswdt.py b/config/dest_simple_mlp_2_cswdt.py index 05c9450..f6ddf34 100644 --- a/config/simple_mlp_2_cswdt.py +++ b/config/dest_simple_mlp_2_cswdt.py @@ -1,4 +1,4 @@ -import model.simple_mlp as model +import model.dest_simple_mlp as model import data diff --git a/config/simple_mlp_2_noembed.py b/config/dest_simple_mlp_2_noembed.py index 2f45f63..3832146 100644 --- a/config/simple_mlp_2_noembed.py +++ b/config/dest_simple_mlp_2_noembed.py @@ -1,4 +1,4 @@ -import model.simple_mlp as model +import model.dest_simple_mlp as model import data diff --git a/config/simple_mlp_tgtcls_0_cs.py b/config/dest_simple_mlp_tgtcls_0_cs.py index 96faca0..a8a5a0e 100644 --- a/config/simple_mlp_tgtcls_0_cs.py +++ b/config/dest_simple_mlp_tgtcls_0_cs.py @@ -2,7 +2,7 @@ import cPickle import data -import model.simple_mlp_tgtcls as model +import model.dest_simple_mlp_tgtcls as model n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory n_end_pts = 5 diff --git a/config/simple_mlp_tgtcls_1_cs.py b/config/dest_simple_mlp_tgtcls_1_cs.py index 293a0ab..8136f10 100644 --- a/config/simple_mlp_tgtcls_1_cs.py +++ b/config/dest_simple_mlp_tgtcls_1_cs.py @@ -2,7 +2,7 @@ import cPickle import data -import model.simple_mlp_tgtcls as model +import model.dest_simple_mlp_tgtcls as model n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory n_end_pts = 5 diff --git a/config/simple_mlp_tgtcls_1_cswdt.py b/config/dest_simple_mlp_tgtcls_1_cswdt.py index 45bd39e..af7b2a3 100644 --- a/config/simple_mlp_tgtcls_1_cswdt.py +++ b/config/dest_simple_mlp_tgtcls_1_cswdt.py @@ -2,7 +2,7 @@ import cPickle import data -import 
model.simple_mlp_tgtcls as model +import model.dest_simple_mlp_tgtcls as model n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory n_end_pts = 5 diff --git a/config/simple_mlp_tgtcls_1_cswdtx.py b/config/dest_simple_mlp_tgtcls_1_cswdtx.py index d51ddde..b9832df 100644 --- a/config/simple_mlp_tgtcls_1_cswdtx.py +++ b/config/dest_simple_mlp_tgtcls_1_cswdtx.py @@ -2,7 +2,7 @@ import cPickle import data -import model.simple_mlp_tgtcls as model +import model.dest_simple_mlp_tgtcls as model n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory n_end_pts = 5 diff --git a/config/time_simple_mlp_1.py b/config/time_simple_mlp_1.py new file mode 100644 index 0000000..eea4159 --- /dev/null +++ b/config/time_simple_mlp_1.py @@ -0,0 +1,19 @@ +import model.time_simple_mlp as model + +import data + +n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory +n_end_pts = 5 + +n_valid = 1000 + +dim_embeddings = [ +] + +dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings) +dim_hidden = [200] +dim_output = 1 + +learning_rate = 0.00001 +momentum = 0.99 +batch_size = 32 diff --git a/config/time_simple_mlp_2_cswdtx.py b/config/time_simple_mlp_2_cswdtx.py new file mode 100644 index 0000000..ceb66e8 --- /dev/null +++ b/config/time_simple_mlp_2_cswdtx.py @@ -0,0 +1,26 @@ +import model.time_simple_mlp as model + +import data + +n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory +n_end_pts = 5 + +n_valid = 1000 + +dim_embeddings = [ + ('origin_call', data.n_train_clients+1, 10), + ('origin_stand', data.n_stands+1, 10), + ('week_of_year', 52, 10), + ('day_of_week', 7, 10), + ('qhour_of_day', 24 * 4, 10), + ('day_type', 3, 10), + ('taxi_id', 448, 10), +] + +dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings) +dim_hidden = [500, 100] +dim_output = 1 + +learning_rate = 
0.00001 +momentum = 0.99 +batch_size = 32 @@ -179,15 +179,13 @@ taxi_columns_valid = taxi_columns + [ ("time", lambda l: int(l[11])), ] -train_files=["%s/split/train-%02d.csv" % (DATA_PATH, i) for i in range(100)] -valid_files=["%s/split/valid2-cut.csv" % (DATA_PATH,)] +valid_files=["%s/valid2-cut.csv" % (DATA_PATH,)] test_file="%s/test.csv" % (DATA_PATH,) -train_data=TaxiData(train_files, taxi_columns) valid_data = TaxiData(valid_files, taxi_columns_valid) test_data = TaxiData(test_file, taxi_columns, has_header=True) -valid_trips = [l for l in open(DATA_PATH + "/split/valid2-cut-ids.txt")] +valid_trips = [l for l in open(DATA_PATH + "/valid2-cut-ids.txt")] def train_it(): return DataIterator(DataStream(train_data)) @@ -35,3 +35,6 @@ def erdist(a, b): x = (lon2-lon1) * tensor.cos((lat1+lat2)/2) y = (lat2-lat1) return tensor.sqrt(tensor.sqr(x) + tensor.sqr(y)) * rearth + +def rmsle(a, b): + return tensor.sqrt( ( (tensor.log(a+1)-tensor.log(b+1)) ** 2 ).mean() ) diff --git a/model/simple_mlp.py b/model/dest_simple_mlp.py index fc065f7..896f219 100644 --- a/model/simple_mlp.py +++ b/model/dest_simple_mlp.py @@ -6,7 +6,7 @@ from blocks.initialization import IsotropicGaussian, Constant from theano import tensor import data -import hdist +import error class Model(object): def __init__(self, config): @@ -51,9 +51,9 @@ class Model(object): outputs.name = 'outputs' # Calculate the cost - cost = hdist.erdist(outputs, y).mean() + cost = error.erdist(outputs, y).mean() cost.name = 'cost' - hcost = hdist.hdist(outputs, y).mean() + hcost = error.hdist(outputs, y).mean() hcost.name = 'hcost' # Initialization @@ -67,5 +67,7 @@ class Model(object): mlp.initialize() self.cost = cost - self.hcost = hcost + self.monitor = [cost, hcost] self.outputs = outputs + self.pred_vars = ['destination_latitude', 'destination_longitude'] + diff --git a/model/simple_mlp_tgtcls.py b/model/dest_simple_mlp_tgtcls.py index b2a1a6e..d8fdeb3 100644 --- a/model/simple_mlp_tgtcls.py +++ 
b/model/dest_simple_mlp_tgtcls.py @@ -9,7 +9,7 @@ from blocks.bricks.lookup import LookupTable from blocks.initialization import IsotropicGaussian, Constant import data -import hdist +import error class Model(object): def __init__(self, config): @@ -53,9 +53,9 @@ class Model(object): outputs.name = 'outputs' # Calculate the cost - cost = hdist.erdist(outputs, y).mean() + cost = error.erdist(outputs, y).mean() cost.name = 'cost' - hcost = hdist.hdist(outputs, y).mean() + hcost = error.hdist(outputs, y).mean() hcost.name = 'hcost' # Initialization @@ -69,5 +69,7 @@ class Model(object): mlp.initialize() self.cost = cost - self.hcost = hcost + self.monitor = [cost, hcost] self.outputs = outputs + self.pred_vars = ['destination_latitude', 'destination_longitude'] + diff --git a/model/time_simple_mlp.py b/model/time_simple_mlp.py new file mode 100644 index 0000000..1568ed3 --- /dev/null +++ b/model/time_simple_mlp.py @@ -0,0 +1,65 @@ +from blocks.bricks import MLP, Rectifier, Linear, Sigmoid, Identity +from blocks.bricks.lookup import LookupTable + +from blocks.initialization import IsotropicGaussian, Constant + +from theano import tensor + +import data +import error + +class Model(object): + def __init__(self, config): + # The input and the targets + x_firstk_latitude = (tensor.matrix('first_k_latitude') - data.porto_center[0]) / data.data_std[0] + x_firstk_longitude = (tensor.matrix('first_k_longitude') - data.porto_center[1]) / data.data_std[1] + + x_lastk_latitude = (tensor.matrix('last_k_latitude') - data.porto_center[0]) / data.data_std[0] + x_lastk_longitude = (tensor.matrix('last_k_longitude') - data.porto_center[1]) / data.data_std[1] + + input_list = [x_firstk_latitude, x_firstk_longitude, x_lastk_latitude, x_lastk_longitude] + embed_tables = [] + + self.require_inputs = ['first_k_latitude', 'first_k_longitude', 'last_k_latitude', 'last_k_longitude'] + + for (varname, num, dim) in config.dim_embeddings: + self.require_inputs.append(varname) + vardata = 
tensor.lvector(varname) + tbl = LookupTable(length=num, dim=dim, name='%s_lookup'%varname) + embed_tables.append(tbl) + input_list.append(tbl.apply(vardata)) + + y = tensor.lvector('time') + + # Define the model + mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden] + [Identity()], + dims=[config.dim_input] + config.dim_hidden + [config.dim_output]) + + # Create the Theano variables + inputs = tensor.concatenate(input_list, axis=1) + # inputs = theano.printing.Print("inputs")(inputs) + outputs = tensor.exp(mlp.apply(inputs) + 2) + + # outputs = theano.printing.Print("outputs")(outputs) + # y = theano.printing.Print("y")(y) + + outputs.name = 'outputs' + + # Calculate the cost + cost = error.rmsle(outputs.flatten(), y.flatten()) + cost.name = 'cost' + + # Initialization + for tbl in embed_tables: + tbl.weights_init = IsotropicGaussian(0.001) + mlp.weights_init = IsotropicGaussian(0.01) + mlp.biases_init = Constant(0.001) + + for tbl in embed_tables: + tbl.initialize() + mlp.initialize() + + self.cost = cost + self.monitor = [cost] + self.outputs = outputs + self.pred_vars = ['time'] @@ -20,7 +20,7 @@ from blocks.model import Model from fuel.datasets.hdf5 import H5PYDataset from fuel.transformers import Batch from fuel.streams import DataStream -from fuel.schemes import ConstantScheme, SequentialExampleScheme +from fuel.schemes import ConstantScheme, SequentialExampleScheme, ShuffledExampleScheme from blocks.algorithms import GradientDescent, Scale, AdaDelta, Momentum from blocks.graph import ComputationGraph @@ -31,7 +31,6 @@ from blocks.extensions.monitoring import DataStreamMonitoring import data import transformers -import hdist import apply_model if __name__ == "__main__": @@ -48,7 +47,7 @@ def setup_train_stream(req_vars): which_set='train', subset=slice(0, data.dataset_size), load_in_memory=True) - train = DataStream(train, iteration_scheme=SequentialExampleScheme(data.dataset_size - config.n_valid)) + train = DataStream(train, 
iteration_scheme=ShuffledExampleScheme(data.dataset_size)) train = transformers.TaxiExcludeTrips(data.valid_trips, train) train = transformers.TaxiGenerateSplits(train, max_splits=100) @@ -91,10 +90,9 @@ def main(): model = config.model.Model(config) cost = model.cost - hcost = model.hcost outputs = model.outputs - req_vars = model.require_inputs + [ 'destination_latitude', 'destination_longitude' ] + req_vars = model.require_inputs + model.pred_vars req_vars_test = model.require_inputs + [ 'trip_id' ] train_stream = setup_train_stream(req_vars) @@ -109,7 +107,7 @@ def main(): step_rule=Momentum(learning_rate=config.learning_rate, momentum=config.momentum), params=params) - extensions=[DataStreamMonitoring([cost, hcost], valid_stream, + extensions=[DataStreamMonitoring(model.monitor, valid_stream, prefix='valid', every_n_batches=1000), Printing(every_n_batches=1000), @@ -132,11 +130,18 @@ def main(): outfile = open("output/test-output-%s.csv" % model_name, "w") outcsv = csv.writer(outfile) - outcsv.writerow(["TRIP_ID", "LATITUDE", "LONGITUDE"]) - for out in apply_model.Apply(outputs=outputs, stream=test_stream, return_vars=['trip_id', 'outputs']): - dest = out['outputs'] - for i, trip in enumerate(out['trip_id']): - outcsv.writerow([trip, repr(dest[i, 0]), repr(dest[i, 1])]) + if model.pred_vars == ['time']: + outcsv.writerow(["TRIP_ID", "TRAVEL_TIME"]) + for out in apply_model.Apply(outputs=outputs, stream=test_stream, return_vars=['trip_id', 'outputs']): + time = out['outputs'] + for i, trip in enumerate(out['trip_id']): + outcsv.writerow([trip, int(time[i, 0])]) + else: + outcsv.writerow(["TRIP_ID", "LATITUDE", "LONGITUDE"]) + for out in apply_model.Apply(outputs=outputs, stream=test_stream, return_vars=['trip_id', 'outputs']): + dest = out['outputs'] + for i, trip in enumerate(out['trip_id']): + outcsv.writerow([trip, repr(dest[i, 0]), repr(dest[i, 1])]) outfile.close() diff --git a/transformers.py b/transformers.py index 6ee0df1..73e3868 100644 --- 
a/transformers.py +++ b/transformers.py @@ -32,7 +32,7 @@ class Select(Transformer): class TaxiGenerateSplits(Transformer): def __init__(self, data_stream, max_splits=-1): super(TaxiGenerateSplits, self).__init__(data_stream) - self.sources = data_stream.sources + ('destination_latitude', 'destination_longitude') + self.sources = data_stream.sources + ('destination_latitude', 'destination_longitude', 'time') self.max_splits = max_splits self.data = None self.splits = [] @@ -63,7 +63,7 @@ class TaxiGenerateSplits(Transformer): dlat = numpy.float32(self.data[self.id_latitude][-1]) dlon = numpy.float32(self.data[self.id_longitude][-1]) - return tuple(r + [dlat, dlon]) + return tuple(r + [dlat, dlon, 15 * (len(self.data[self.id_longitude]) - 1)]) class TaxiAddFirstK(Transformer): def __init__(self, k, stream): |