about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Alex Auvolat <alex.auvolat@ens.fr> 2015-05-05 14:15:21 -0400
committer: Alex Auvolat <alex.auvolat@ens.fr> 2015-05-05 14:15:21 -0400
commit: 54613c1f9cf510ca7a71d6619418f2247515aec6 (patch)
tree: bed9a5a11ef5b7feecee44095a29400e32f76b05
parent: 712035b88be1816d3fbd58ce69ae6464767c780e (diff)
download: taxi-54613c1f9cf510ca7a71d6619418f2247515aec6.tar.gz
download: taxi-54613c1f9cf510ca7a71d6619418f2247515aec6.zip
Add models for time prediction
-rw-r--r-- config/dest_simple_mlp_2_cs.py (renamed from config/simple_mlp_2_cs.py) | 2
-rw-r--r-- config/dest_simple_mlp_2_cswdt.py (renamed from config/simple_mlp_2_cswdt.py) | 2
-rw-r--r-- config/dest_simple_mlp_2_noembed.py (renamed from config/simple_mlp_2_noembed.py) | 2
-rw-r--r-- config/dest_simple_mlp_tgtcls_0_cs.py (renamed from config/simple_mlp_tgtcls_0_cs.py) | 2
-rw-r--r-- config/dest_simple_mlp_tgtcls_1_cs.py (renamed from config/simple_mlp_tgtcls_1_cs.py) | 2
-rw-r--r-- config/dest_simple_mlp_tgtcls_1_cswdt.py (renamed from config/simple_mlp_tgtcls_1_cswdt.py) | 2
-rw-r--r-- config/dest_simple_mlp_tgtcls_1_cswdtx.py (renamed from config/simple_mlp_tgtcls_1_cswdtx.py) | 2
-rw-r--r-- config/time_simple_mlp_1.py | 19
-rw-r--r-- config/time_simple_mlp_2_cswdtx.py | 26
-rw-r--r-- data.py | 6
-rw-r--r-- error.py (renamed from hdist.py) | 3
-rw-r--r-- model/dest_simple_mlp.py (renamed from model/simple_mlp.py) | 10
-rw-r--r-- model/dest_simple_mlp_tgtcls.py (renamed from model/simple_mlp_tgtcls.py) | 10
-rw-r--r-- model/time_simple_mlp.py | 65
-rw-r--r-- train.py | 27
-rw-r--r-- transformers.py | 4
16 files changed, 152 insertions, 32 deletions
diff --git a/config/simple_mlp_2_cs.py b/config/dest_simple_mlp_2_cs.py
index fa2f4c1..2cec78d 100644
--- a/config/simple_mlp_2_cs.py
+++ b/config/dest_simple_mlp_2_cs.py
@@ -1,4 +1,4 @@
-import model.simple_mlp as model
+import model.dest_simple_mlp as model
import data
diff --git a/config/simple_mlp_2_cswdt.py b/config/dest_simple_mlp_2_cswdt.py
index 05c9450..f6ddf34 100644
--- a/config/simple_mlp_2_cswdt.py
+++ b/config/dest_simple_mlp_2_cswdt.py
@@ -1,4 +1,4 @@
-import model.simple_mlp as model
+import model.dest_simple_mlp as model
import data
diff --git a/config/simple_mlp_2_noembed.py b/config/dest_simple_mlp_2_noembed.py
index 2f45f63..3832146 100644
--- a/config/simple_mlp_2_noembed.py
+++ b/config/dest_simple_mlp_2_noembed.py
@@ -1,4 +1,4 @@
-import model.simple_mlp as model
+import model.dest_simple_mlp as model
import data
diff --git a/config/simple_mlp_tgtcls_0_cs.py b/config/dest_simple_mlp_tgtcls_0_cs.py
index 96faca0..a8a5a0e 100644
--- a/config/simple_mlp_tgtcls_0_cs.py
+++ b/config/dest_simple_mlp_tgtcls_0_cs.py
@@ -2,7 +2,7 @@ import cPickle
import data
-import model.simple_mlp_tgtcls as model
+import model.dest_simple_mlp_tgtcls as model
n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory
n_end_pts = 5
diff --git a/config/simple_mlp_tgtcls_1_cs.py b/config/dest_simple_mlp_tgtcls_1_cs.py
index 293a0ab..8136f10 100644
--- a/config/simple_mlp_tgtcls_1_cs.py
+++ b/config/dest_simple_mlp_tgtcls_1_cs.py
@@ -2,7 +2,7 @@ import cPickle
import data
-import model.simple_mlp_tgtcls as model
+import model.dest_simple_mlp_tgtcls as model
n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory
n_end_pts = 5
diff --git a/config/simple_mlp_tgtcls_1_cswdt.py b/config/dest_simple_mlp_tgtcls_1_cswdt.py
index 45bd39e..af7b2a3 100644
--- a/config/simple_mlp_tgtcls_1_cswdt.py
+++ b/config/dest_simple_mlp_tgtcls_1_cswdt.py
@@ -2,7 +2,7 @@ import cPickle
import data
-import model.simple_mlp_tgtcls as model
+import model.dest_simple_mlp_tgtcls as model
n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory
n_end_pts = 5
diff --git a/config/simple_mlp_tgtcls_1_cswdtx.py b/config/dest_simple_mlp_tgtcls_1_cswdtx.py
index d51ddde..b9832df 100644
--- a/config/simple_mlp_tgtcls_1_cswdtx.py
+++ b/config/dest_simple_mlp_tgtcls_1_cswdtx.py
@@ -2,7 +2,7 @@ import cPickle
import data
-import model.simple_mlp_tgtcls as model
+import model.dest_simple_mlp_tgtcls as model
n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory
n_end_pts = 5
diff --git a/config/time_simple_mlp_1.py b/config/time_simple_mlp_1.py
new file mode 100644
index 0000000..eea4159
--- /dev/null
+++ b/config/time_simple_mlp_1.py
@@ -0,0 +1,19 @@
+import model.time_simple_mlp as model
+
+import data
+
+n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory
+n_end_pts = 5
+
+n_valid = 1000
+
+dim_embeddings = [
+]
+
+dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
+dim_hidden = [200]
+dim_output = 1
+
+learning_rate = 0.00001
+momentum = 0.99
+batch_size = 32
diff --git a/config/time_simple_mlp_2_cswdtx.py b/config/time_simple_mlp_2_cswdtx.py
new file mode 100644
index 0000000..ceb66e8
--- /dev/null
+++ b/config/time_simple_mlp_2_cswdtx.py
@@ -0,0 +1,26 @@
+import model.time_simple_mlp as model
+
+import data
+
+n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory
+n_end_pts = 5
+
+n_valid = 1000
+
+dim_embeddings = [
+ ('origin_call', data.n_train_clients+1, 10),
+ ('origin_stand', data.n_stands+1, 10),
+ ('week_of_year', 52, 10),
+ ('day_of_week', 7, 10),
+ ('qhour_of_day', 24 * 4, 10),
+ ('day_type', 3, 10),
+ ('taxi_id', 448, 10),
+]
+
+dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
+dim_hidden = [500, 100]
+dim_output = 1
+
+learning_rate = 0.00001
+momentum = 0.99
+batch_size = 32
diff --git a/data.py b/data.py
index 39603fc..42ebe1c 100644
--- a/data.py
+++ b/data.py
@@ -179,15 +179,13 @@ taxi_columns_valid = taxi_columns + [
("time", lambda l: int(l[11])),
]
-train_files=["%s/split/train-%02d.csv" % (DATA_PATH, i) for i in range(100)]
-valid_files=["%s/split/valid2-cut.csv" % (DATA_PATH,)]
+valid_files=["%s/valid2-cut.csv" % (DATA_PATH,)]
test_file="%s/test.csv" % (DATA_PATH,)
-train_data=TaxiData(train_files, taxi_columns)
valid_data = TaxiData(valid_files, taxi_columns_valid)
test_data = TaxiData(test_file, taxi_columns, has_header=True)
-valid_trips = [l for l in open(DATA_PATH + "/split/valid2-cut-ids.txt")]
+valid_trips = [l for l in open(DATA_PATH + "/valid2-cut-ids.txt")]
def train_it():
return DataIterator(DataStream(train_data))
diff --git a/hdist.py b/error.py
index 866aea4..5ea37ad 100644
--- a/hdist.py
+++ b/error.py
@@ -35,3 +35,6 @@ def erdist(a, b):
x = (lon2-lon1) * tensor.cos((lat1+lat2)/2)
y = (lat2-lat1)
return tensor.sqrt(tensor.sqr(x) + tensor.sqr(y)) * rearth
+
+def rmsle(a, b):
+ return tensor.sqrt( ( (tensor.log(a+1)-tensor.log(b+1)) ** 2 ).mean() )
diff --git a/model/simple_mlp.py b/model/dest_simple_mlp.py
index fc065f7..896f219 100644
--- a/model/simple_mlp.py
+++ b/model/dest_simple_mlp.py
@@ -6,7 +6,7 @@ from blocks.initialization import IsotropicGaussian, Constant
from theano import tensor
import data
-import hdist
+import error
class Model(object):
def __init__(self, config):
@@ -51,9 +51,9 @@ class Model(object):
outputs.name = 'outputs'
# Calculate the cost
- cost = hdist.erdist(outputs, y).mean()
+ cost = error.erdist(outputs, y).mean()
cost.name = 'cost'
- hcost = hdist.hdist(outputs, y).mean()
+ hcost = error.hdist(outputs, y).mean()
hcost.name = 'hcost'
# Initialization
@@ -67,5 +67,7 @@ class Model(object):
mlp.initialize()
self.cost = cost
- self.hcost = hcost
+ self.monitor = [cost, hcost]
self.outputs = outputs
+ self.pred_vars = ['destination_latitude', 'destination_longitude']
+
diff --git a/model/simple_mlp_tgtcls.py b/model/dest_simple_mlp_tgtcls.py
index b2a1a6e..d8fdeb3 100644
--- a/model/simple_mlp_tgtcls.py
+++ b/model/dest_simple_mlp_tgtcls.py
@@ -9,7 +9,7 @@ from blocks.bricks.lookup import LookupTable
from blocks.initialization import IsotropicGaussian, Constant
import data
-import hdist
+import error
class Model(object):
def __init__(self, config):
@@ -53,9 +53,9 @@ class Model(object):
outputs.name = 'outputs'
# Calculate the cost
- cost = hdist.erdist(outputs, y).mean()
+ cost = error.erdist(outputs, y).mean()
cost.name = 'cost'
- hcost = hdist.hdist(outputs, y).mean()
+ hcost = error.hdist(outputs, y).mean()
hcost.name = 'hcost'
# Initialization
@@ -69,5 +69,7 @@ class Model(object):
mlp.initialize()
self.cost = cost
- self.hcost = hcost
+ self.monitor = [cost, hcost]
self.outputs = outputs
+ self.pred_vars = ['destination_latitude', 'destination_longitude']
+
diff --git a/model/time_simple_mlp.py b/model/time_simple_mlp.py
new file mode 100644
index 0000000..1568ed3
--- /dev/null
+++ b/model/time_simple_mlp.py
@@ -0,0 +1,65 @@
+from blocks.bricks import MLP, Rectifier, Linear, Sigmoid, Identity
+from blocks.bricks.lookup import LookupTable
+
+from blocks.initialization import IsotropicGaussian, Constant
+
+from theano import tensor
+
+import data
+import error
+
+class Model(object):
+ def __init__(self, config):
+ # The input and the targets
+ x_firstk_latitude = (tensor.matrix('first_k_latitude') - data.porto_center[0]) / data.data_std[0]
+ x_firstk_longitude = (tensor.matrix('first_k_longitude') - data.porto_center[1]) / data.data_std[1]
+
+ x_lastk_latitude = (tensor.matrix('last_k_latitude') - data.porto_center[0]) / data.data_std[0]
+ x_lastk_longitude = (tensor.matrix('last_k_longitude') - data.porto_center[1]) / data.data_std[1]
+
+ input_list = [x_firstk_latitude, x_firstk_longitude, x_lastk_latitude, x_lastk_longitude]
+ embed_tables = []
+
+ self.require_inputs = ['first_k_latitude', 'first_k_longitude', 'last_k_latitude', 'last_k_longitude']
+
+ for (varname, num, dim) in config.dim_embeddings:
+ self.require_inputs.append(varname)
+ vardata = tensor.lvector(varname)
+ tbl = LookupTable(length=num, dim=dim, name='%s_lookup'%varname)
+ embed_tables.append(tbl)
+ input_list.append(tbl.apply(vardata))
+
+ y = tensor.lvector('time')
+
+ # Define the model
+ mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden] + [Identity()],
+ dims=[config.dim_input] + config.dim_hidden + [config.dim_output])
+
+ # Create the Theano variables
+ inputs = tensor.concatenate(input_list, axis=1)
+ # inputs = theano.printing.Print("inputs")(inputs)
+ outputs = tensor.exp(mlp.apply(inputs) + 2)
+
+ # outputs = theano.printing.Print("outputs")(outputs)
+ # y = theano.printing.Print("y")(y)
+
+ outputs.name = 'outputs'
+
+ # Calculate the cost
+ cost = error.rmsle(outputs.flatten(), y.flatten())
+ cost.name = 'cost'
+
+ # Initialization
+ for tbl in embed_tables:
+ tbl.weights_init = IsotropicGaussian(0.001)
+ mlp.weights_init = IsotropicGaussian(0.01)
+ mlp.biases_init = Constant(0.001)
+
+ for tbl in embed_tables:
+ tbl.initialize()
+ mlp.initialize()
+
+ self.cost = cost
+ self.monitor = [cost]
+ self.outputs = outputs
+ self.pred_vars = ['time']
diff --git a/train.py b/train.py
index 2c9522e..4cbd526 100644
--- a/train.py
+++ b/train.py
@@ -20,7 +20,7 @@ from blocks.model import Model
from fuel.datasets.hdf5 import H5PYDataset
from fuel.transformers import Batch
from fuel.streams import DataStream
-from fuel.schemes import ConstantScheme, SequentialExampleScheme
+from fuel.schemes import ConstantScheme, SequentialExampleScheme, ShuffledExampleScheme
from blocks.algorithms import GradientDescent, Scale, AdaDelta, Momentum
from blocks.graph import ComputationGraph
@@ -31,7 +31,6 @@ from blocks.extensions.monitoring import DataStreamMonitoring
import data
import transformers
-import hdist
import apply_model
if __name__ == "__main__":
@@ -48,7 +47,7 @@ def setup_train_stream(req_vars):
which_set='train',
subset=slice(0, data.dataset_size),
load_in_memory=True)
- train = DataStream(train, iteration_scheme=SequentialExampleScheme(data.dataset_size - config.n_valid))
+ train = DataStream(train, iteration_scheme=ShuffledExampleScheme(data.dataset_size))
train = transformers.TaxiExcludeTrips(data.valid_trips, train)
train = transformers.TaxiGenerateSplits(train, max_splits=100)
@@ -91,10 +90,9 @@ def main():
model = config.model.Model(config)
cost = model.cost
- hcost = model.hcost
outputs = model.outputs
- req_vars = model.require_inputs + [ 'destination_latitude', 'destination_longitude' ]
+ req_vars = model.require_inputs + model.pred_vars
req_vars_test = model.require_inputs + [ 'trip_id' ]
train_stream = setup_train_stream(req_vars)
@@ -109,7 +107,7 @@ def main():
step_rule=Momentum(learning_rate=config.learning_rate, momentum=config.momentum),
params=params)
- extensions=[DataStreamMonitoring([cost, hcost], valid_stream,
+ extensions=[DataStreamMonitoring(model.monitor, valid_stream,
prefix='valid',
every_n_batches=1000),
Printing(every_n_batches=1000),
@@ -132,11 +130,18 @@ def main():
outfile = open("output/test-output-%s.csv" % model_name, "w")
outcsv = csv.writer(outfile)
- outcsv.writerow(["TRIP_ID", "LATITUDE", "LONGITUDE"])
- for out in apply_model.Apply(outputs=outputs, stream=test_stream, return_vars=['trip_id', 'outputs']):
- dest = out['outputs']
- for i, trip in enumerate(out['trip_id']):
- outcsv.writerow([trip, repr(dest[i, 0]), repr(dest[i, 1])])
+ if model.pred_vars == ['time']:
+ outcsv.writerow(["TRIP_ID", "TRAVEL_TIME"])
+ for out in apply_model.Apply(outputs=outputs, stream=test_stream, return_vars=['trip_id', 'outputs']):
+ time = out['outputs']
+ for i, trip in enumerate(out['trip_id']):
+ outcsv.writerow([trip, int(time[i, 0])])
+ else:
+ outcsv.writerow(["TRIP_ID", "LATITUDE", "LONGITUDE"])
+ for out in apply_model.Apply(outputs=outputs, stream=test_stream, return_vars=['trip_id', 'outputs']):
+ dest = out['outputs']
+ for i, trip in enumerate(out['trip_id']):
+ outcsv.writerow([trip, repr(dest[i, 0]), repr(dest[i, 1])])
outfile.close()
diff --git a/transformers.py b/transformers.py
index 6ee0df1..73e3868 100644
--- a/transformers.py
+++ b/transformers.py
@@ -32,7 +32,7 @@ class Select(Transformer):
class TaxiGenerateSplits(Transformer):
def __init__(self, data_stream, max_splits=-1):
super(TaxiGenerateSplits, self).__init__(data_stream)
- self.sources = data_stream.sources + ('destination_latitude', 'destination_longitude')
+ self.sources = data_stream.sources + ('destination_latitude', 'destination_longitude', 'time')
self.max_splits = max_splits
self.data = None
self.splits = []
@@ -63,7 +63,7 @@ class TaxiGenerateSplits(Transformer):
dlat = numpy.float32(self.data[self.id_latitude][-1])
dlon = numpy.float32(self.data[self.id_longitude][-1])
- return tuple(r + [dlat, dlon])
+ return tuple(r + [dlat, dlon, 15 * (len(self.data[self.id_longitude]) - 1)])
class TaxiAddFirstK(Transformer):
def __init__(self, k, stream):