From 54613c1f9cf510ca7a71d6619418f2247515aec6 Mon Sep 17 00:00:00 2001
From: Alex Auvolat <alex.auvolat@ens.fr>
Date: Tue, 5 May 2015 14:15:21 -0400
Subject: Add models for time prediction

---
 config/dest_simple_mlp_2_cs.py            | 21 +++++++++
 config/dest_simple_mlp_2_cswdt.py         | 25 +++++++++++
 config/dest_simple_mlp_2_noembed.py       | 18 ++++++++
 config/dest_simple_mlp_tgtcls_0_cs.py     | 25 +++++++++++
 config/dest_simple_mlp_tgtcls_1_cs.py     | 25 +++++++++++
 config/dest_simple_mlp_tgtcls_1_cswdt.py  | 29 ++++++++++++
 config/dest_simple_mlp_tgtcls_1_cswdtx.py | 30 +++++++++++++
 config/simple_mlp_2_cs.py                 | 21 ---------
 config/simple_mlp_2_cswdt.py              | 25 -----------
 config/simple_mlp_2_noembed.py            | 18 --------
 config/simple_mlp_tgtcls_0_cs.py          | 25 -----------
 config/simple_mlp_tgtcls_1_cs.py          | 25 -----------
 config/simple_mlp_tgtcls_1_cswdt.py       | 29 ------------
 config/simple_mlp_tgtcls_1_cswdtx.py      | 30 -------------
 config/time_simple_mlp_1.py               | 19 ++++++++
 config/time_simple_mlp_2_cswdtx.py        | 26 +++++++++++
 data.py                                   |  6 +--
 error.py                                  | 40 +++++++++++++++++
 hdist.py                                  | 37 ---------------
 model/dest_simple_mlp.py                  | 73 ++++++++++++++++++++++++++++++
 model/dest_simple_mlp_tgtcls.py           | 75 +++++++++++++++++++++++++++++++
 model/simple_mlp.py                       | 71 -----------------------------
 model/simple_mlp_tgtcls.py                | 73 ------------------------------
 model/time_simple_mlp.py                  | 65 +++++++++++++++++++++++++++
 train.py                                  | 27 ++++++-----
 transformers.py                           |  4 +-
 26 files changed, 491 insertions(+), 371 deletions(-)
 create mode 100644 config/dest_simple_mlp_2_cs.py
 create mode 100644 config/dest_simple_mlp_2_cswdt.py
 create mode 100644 config/dest_simple_mlp_2_noembed.py
 create mode 100644 config/dest_simple_mlp_tgtcls_0_cs.py
 create mode 100644 config/dest_simple_mlp_tgtcls_1_cs.py
 create mode 100644 config/dest_simple_mlp_tgtcls_1_cswdt.py
 create mode 100644 config/dest_simple_mlp_tgtcls_1_cswdtx.py
 delete mode 100644 config/simple_mlp_2_cs.py
 delete mode 100644 config/simple_mlp_2_cswdt.py
 delete mode 100644 config/simple_mlp_2_noembed.py
 delete mode 100644 config/simple_mlp_tgtcls_0_cs.py
 delete mode 100644 config/simple_mlp_tgtcls_1_cs.py
 delete mode 100644 config/simple_mlp_tgtcls_1_cswdt.py
 delete mode 100644 config/simple_mlp_tgtcls_1_cswdtx.py
 create mode 100644 config/time_simple_mlp_1.py
 create mode 100644 config/time_simple_mlp_2_cswdtx.py
 create mode 100644 error.py
 delete mode 100644 hdist.py
 create mode 100644 model/dest_simple_mlp.py
 create mode 100644 model/dest_simple_mlp_tgtcls.py
 delete mode 100644 model/simple_mlp.py
 delete mode 100644 model/simple_mlp_tgtcls.py
 create mode 100644 model/time_simple_mlp.py

diff --git a/config/dest_simple_mlp_2_cs.py b/config/dest_simple_mlp_2_cs.py
new file mode 100644
index 0000000..2cec78d
--- /dev/null
+++ b/config/dest_simple_mlp_2_cs.py
@@ -0,0 +1,21 @@
+import model.dest_simple_mlp as model
+
+import data
+
+n_begin_end_pts = 5     # how many points we consider at the beginning and end of the known trajectory
+n_end_pts = 5
+
+n_valid = 1000
+
+dim_embeddings = [
+    ('origin_call', data.n_train_clients+1, 10),
+    ('origin_stand', data.n_stands+1, 10)
+]
+
+dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
+dim_hidden = [200, 100]
+dim_output = 2
+
+learning_rate = 0.0001
+momentum = 0.99
+batch_size = 32
diff --git a/config/dest_simple_mlp_2_cswdt.py b/config/dest_simple_mlp_2_cswdt.py
new file mode 100644
index 0000000..f6ddf34
--- /dev/null
+++ b/config/dest_simple_mlp_2_cswdt.py
@@ -0,0 +1,25 @@
+import model.dest_simple_mlp as model
+
+import data
+
+n_begin_end_pts = 5     # how many points we consider at the beginning and end of the known trajectory
+n_end_pts = 5
+
+n_valid = 1000
+
+dim_embeddings = [
+    ('origin_call', data.n_train_clients+1, 10),
+    ('origin_stand', data.n_stands+1, 10),
+    ('week_of_year', 52, 10),
+    ('day_of_week', 7, 10),
+    ('qhour_of_day', 24 * 4, 10),
+    ('day_type', 3, 10),
+]
+
+dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
+dim_hidden = [200, 100]
+dim_output = 2
+
+learning_rate = 0.0001
+momentum = 0.99
+batch_size = 32
diff --git a/config/dest_simple_mlp_2_noembed.py b/config/dest_simple_mlp_2_noembed.py
new file mode 100644
index 0000000..3832146
--- /dev/null
+++ b/config/dest_simple_mlp_2_noembed.py
@@ -0,0 +1,18 @@
+import model.dest_simple_mlp as model
+
+import data
+
+n_begin_end_pts = 5     # how many points we consider at the beginning and end of the known trajectory
+n_end_pts = 5
+
+n_valid = 1000
+
+dim_embeddings = []   # do not use embeddings
+
+dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
+dim_hidden = [200, 100]
+dim_output = 2
+
+learning_rate = 0.0001
+momentum = 0.99
+batch_size = 32
diff --git a/config/dest_simple_mlp_tgtcls_0_cs.py b/config/dest_simple_mlp_tgtcls_0_cs.py
new file mode 100644
index 0000000..a8a5a0e
--- /dev/null
+++ b/config/dest_simple_mlp_tgtcls_0_cs.py
@@ -0,0 +1,25 @@
+import cPickle
+
+import data
+
+import model.dest_simple_mlp_tgtcls as model
+
+n_begin_end_pts = 5     # how many points we consider at the beginning and end of the known trajectory
+n_end_pts = 5
+
+n_valid = 1000
+
+with open(data.DATA_PATH + "/arrival-clusters.pkl") as f: tgtcls = cPickle.load(f)
+
+dim_embeddings = [
+    ('origin_call', data.n_train_clients+1, 10),
+    ('origin_stand', data.n_stands+1, 10)
+]
+
+dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
+dim_hidden = []
+dim_output = tgtcls.shape[0]
+
+learning_rate = 0.0001
+momentum = 0.99
+batch_size = 32
diff --git a/config/dest_simple_mlp_tgtcls_1_cs.py b/config/dest_simple_mlp_tgtcls_1_cs.py
new file mode 100644
index 0000000..8136f10
--- /dev/null
+++ b/config/dest_simple_mlp_tgtcls_1_cs.py
@@ -0,0 +1,25 @@
+import cPickle
+
+import data
+
+import model.dest_simple_mlp_tgtcls as model
+
+n_begin_end_pts = 5     # how many points we consider at the beginning and end of the known trajectory
+n_end_pts = 5
+
+n_valid = 1000
+
+with open(data.DATA_PATH + "/arrival-clusters.pkl") as f: tgtcls = cPickle.load(f)
+
+dim_embeddings = [
+    ('origin_call', data.n_train_clients+1, 10),
+    ('origin_stand', data.n_stands+1, 10)
+]
+
+dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
+dim_hidden = [500]
+dim_output = tgtcls.shape[0]
+
+learning_rate = 0.0001
+momentum = 0.99
+batch_size = 32
diff --git a/config/dest_simple_mlp_tgtcls_1_cswdt.py b/config/dest_simple_mlp_tgtcls_1_cswdt.py
new file mode 100644
index 0000000..af7b2a3
--- /dev/null
+++ b/config/dest_simple_mlp_tgtcls_1_cswdt.py
@@ -0,0 +1,29 @@
+import cPickle
+
+import data
+
+import model.dest_simple_mlp_tgtcls as model
+
+n_begin_end_pts = 5     # how many points we consider at the beginning and end of the known trajectory
+n_end_pts = 5
+
+n_valid = 1000
+
+with open(data.DATA_PATH + "/arrival-clusters.pkl") as f: tgtcls = cPickle.load(f)
+
+dim_embeddings = [
+    ('origin_call', data.n_train_clients+1, 10),
+    ('origin_stand', data.n_stands+1, 10),
+    ('week_of_year', 52, 10),
+    ('day_of_week', 7, 10),
+    ('qhour_of_day', 24 * 4, 10),
+    ('day_type', 3, 10),
+]
+
+dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
+dim_hidden = [500]
+dim_output = tgtcls.shape[0]
+
+learning_rate = 0.0001
+momentum = 0.99
+batch_size = 32
diff --git a/config/dest_simple_mlp_tgtcls_1_cswdtx.py b/config/dest_simple_mlp_tgtcls_1_cswdtx.py
new file mode 100644
index 0000000..b9832df
--- /dev/null
+++ b/config/dest_simple_mlp_tgtcls_1_cswdtx.py
@@ -0,0 +1,30 @@
+import cPickle
+
+import data
+
+import model.dest_simple_mlp_tgtcls as model
+
+n_begin_end_pts = 5     # how many points we consider at the beginning and end of the known trajectory
+n_end_pts = 5
+
+n_valid = 1000
+
+with open(data.DATA_PATH + "/arrival-clusters.pkl") as f: tgtcls = cPickle.load(f)
+
+dim_embeddings = [
+    ('origin_call', data.n_train_clients+1, 10),
+    ('origin_stand', data.n_stands+1, 10),
+    ('week_of_year', 52, 10),
+    ('day_of_week', 7, 10),
+    ('qhour_of_day', 24 * 4, 10),
+    ('day_type', 3, 10),
+    ('taxi_id', 448, 10),
+]
+
+dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
+dim_hidden = [500]
+dim_output = tgtcls.shape[0]
+
+learning_rate = 0.0001
+momentum = 0.99
+batch_size = 32
diff --git a/config/simple_mlp_2_cs.py b/config/simple_mlp_2_cs.py
deleted file mode 100644
index fa2f4c1..0000000
--- a/config/simple_mlp_2_cs.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import model.simple_mlp as model
-
-import data
-
-n_begin_end_pts = 5     # how many points we consider at the beginning and end of the known trajectory
-n_end_pts = 5
-
-n_valid = 1000
-
-dim_embeddings = [
-    ('origin_call', data.n_train_clients+1, 10),
-    ('origin_stand', data.n_stands+1, 10)
-]
-
-dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
-dim_hidden = [200, 100]
-dim_output = 2
-
-learning_rate = 0.0001
-momentum = 0.99
-batch_size = 32
diff --git a/config/simple_mlp_2_cswdt.py b/config/simple_mlp_2_cswdt.py
deleted file mode 100644
index 05c9450..0000000
--- a/config/simple_mlp_2_cswdt.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import model.simple_mlp as model
-
-import data
-
-n_begin_end_pts = 5     # how many points we consider at the beginning and end of the known trajectory
-n_end_pts = 5
-
-n_valid = 1000
-
-dim_embeddings = [
-    ('origin_call', data.n_train_clients+1, 10),
-    ('origin_stand', data.n_stands+1, 10),
-    ('week_of_year', 52, 10),
-    ('day_of_week', 7, 10),
-    ('qhour_of_day', 24 * 4, 10),
-    ('day_type', 3, 10),
-]
-
-dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
-dim_hidden = [200, 100]
-dim_output = 2
-
-learning_rate = 0.0001
-momentum = 0.99
-batch_size = 32
diff --git a/config/simple_mlp_2_noembed.py b/config/simple_mlp_2_noembed.py
deleted file mode 100644
index 2f45f63..0000000
--- a/config/simple_mlp_2_noembed.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import model.simple_mlp as model
-
-import data
-
-n_begin_end_pts = 5     # how many points we consider at the beginning and end of the known trajectory
-n_end_pts = 5
-
-n_valid = 1000
-
-dim_embeddings = []   # do not use embeddings
-
-dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
-dim_hidden = [200, 100]
-dim_output = 2
-
-learning_rate = 0.0001
-momentum = 0.99
-batch_size = 32
diff --git a/config/simple_mlp_tgtcls_0_cs.py b/config/simple_mlp_tgtcls_0_cs.py
deleted file mode 100644
index 96faca0..0000000
--- a/config/simple_mlp_tgtcls_0_cs.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import cPickle
-
-import data
-
-import model.simple_mlp_tgtcls as model
-
-n_begin_end_pts = 5     # how many points we consider at the beginning and end of the known trajectory
-n_end_pts = 5
-
-n_valid = 1000
-
-with open(data.DATA_PATH + "/arrival-clusters.pkl") as f: tgtcls = cPickle.load(f)
-
-dim_embeddings = [
-    ('origin_call', data.n_train_clients+1, 10),
-    ('origin_stand', data.n_stands+1, 10)
-]
-
-dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
-dim_hidden = []
-dim_output = tgtcls.shape[0]
-
-learning_rate = 0.0001
-momentum = 0.99
-batch_size = 32
diff --git a/config/simple_mlp_tgtcls_1_cs.py b/config/simple_mlp_tgtcls_1_cs.py
deleted file mode 100644
index 293a0ab..0000000
--- a/config/simple_mlp_tgtcls_1_cs.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import cPickle
-
-import data
-
-import model.simple_mlp_tgtcls as model
-
-n_begin_end_pts = 5     # how many points we consider at the beginning and end of the known trajectory
-n_end_pts = 5
-
-n_valid = 1000
-
-with open(data.DATA_PATH + "/arrival-clusters.pkl") as f: tgtcls = cPickle.load(f)
-
-dim_embeddings = [
-    ('origin_call', data.n_train_clients+1, 10),
-    ('origin_stand', data.n_stands+1, 10)
-]
-
-dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
-dim_hidden = [500]
-dim_output = tgtcls.shape[0]
-
-learning_rate = 0.0001
-momentum = 0.99
-batch_size = 32
diff --git a/config/simple_mlp_tgtcls_1_cswdt.py b/config/simple_mlp_tgtcls_1_cswdt.py
deleted file mode 100644
index 45bd39e..0000000
--- a/config/simple_mlp_tgtcls_1_cswdt.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import cPickle
-
-import data
-
-import model.simple_mlp_tgtcls as model
-
-n_begin_end_pts = 5     # how many points we consider at the beginning and end of the known trajectory
-n_end_pts = 5
-
-n_valid = 1000
-
-with open(data.DATA_PATH + "/arrival-clusters.pkl") as f: tgtcls = cPickle.load(f)
-
-dim_embeddings = [
-    ('origin_call', data.n_train_clients+1, 10),
-    ('origin_stand', data.n_stands+1, 10),
-    ('week_of_year', 52, 10),
-    ('day_of_week', 7, 10),
-    ('qhour_of_day', 24 * 4, 10),
-    ('day_type', 3, 10),
-]
-
-dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
-dim_hidden = [500]
-dim_output = tgtcls.shape[0]
-
-learning_rate = 0.0001
-momentum = 0.99
-batch_size = 32
diff --git a/config/simple_mlp_tgtcls_1_cswdtx.py b/config/simple_mlp_tgtcls_1_cswdtx.py
deleted file mode 100644
index d51ddde..0000000
--- a/config/simple_mlp_tgtcls_1_cswdtx.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import cPickle
-
-import data
-
-import model.simple_mlp_tgtcls as model
-
-n_begin_end_pts = 5     # how many points we consider at the beginning and end of the known trajectory
-n_end_pts = 5
-
-n_valid = 1000
-
-with open(data.DATA_PATH + "/arrival-clusters.pkl") as f: tgtcls = cPickle.load(f)
-
-dim_embeddings = [
-    ('origin_call', data.n_train_clients+1, 10),
-    ('origin_stand', data.n_stands+1, 10),
-    ('week_of_year', 52, 10),
-    ('day_of_week', 7, 10),
-    ('qhour_of_day', 24 * 4, 10),
-    ('day_type', 3, 10),
-    ('taxi_id', 448, 10),
-]
-
-dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
-dim_hidden = [500]
-dim_output = tgtcls.shape[0]
-
-learning_rate = 0.0001
-momentum = 0.99
-batch_size = 32
diff --git a/config/time_simple_mlp_1.py b/config/time_simple_mlp_1.py
new file mode 100644
index 0000000..eea4159
--- /dev/null
+++ b/config/time_simple_mlp_1.py
@@ -0,0 +1,19 @@
+import model.time_simple_mlp as model
+
+import data
+
+n_begin_end_pts = 5     # how many points we consider at the beginning and end of the known trajectory
+n_end_pts = 5           # NOTE(review): not used by dim_input below; confirm whether anything still reads it
+
+n_valid = 1000
+
+dim_embeddings = [      # (input name, lookup-table length, embedding width) triples; empty here
+]
+
+dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)  # coordinate inputs + embedding widths
+dim_hidden = [200]      # one hidden layer
+dim_output = 1          # single output unit; model.time_simple_mlp compares it with the 'time' target
+
+learning_rate = 0.00001
+momentum = 0.99
+batch_size = 32
diff --git a/config/time_simple_mlp_2_cswdtx.py b/config/time_simple_mlp_2_cswdtx.py
new file mode 100644
index 0000000..ceb66e8
--- /dev/null
+++ b/config/time_simple_mlp_2_cswdtx.py
@@ -0,0 +1,26 @@
+import model.time_simple_mlp as model
+
+import data
+
+n_begin_end_pts = 5     # how many points we consider at the beginning and end of the known trajectory
+n_end_pts = 5           # NOTE(review): not used by dim_input below; confirm whether anything still reads it
+
+n_valid = 1000
+
+dim_embeddings = [      # (input name, lookup-table length, embedding width) triples
+    ('origin_call', data.n_train_clients+1, 10),
+    ('origin_stand', data.n_stands+1, 10),
+    ('week_of_year', 52, 10),
+    ('day_of_week', 7, 10),
+    ('qhour_of_day', 24 * 4, 10),
+    ('day_type', 3, 10),
+    ('taxi_id', 448, 10),
+]
+
+dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)  # coordinate inputs + embedding widths
+dim_hidden = [500, 100] # two hidden layers
+dim_output = 1          # single output unit; model.time_simple_mlp compares it with the 'time' target
+
+learning_rate = 0.00001
+momentum = 0.99
+batch_size = 32
diff --git a/data.py b/data.py
index 39603fc..42ebe1c 100644
--- a/data.py
+++ b/data.py
@@ -179,15 +179,13 @@ taxi_columns_valid = taxi_columns + [
     ("time", lambda l: int(l[11])),
 ]
 
-train_files=["%s/split/train-%02d.csv" % (DATA_PATH, i) for i in range(100)]
-valid_files=["%s/split/valid2-cut.csv" % (DATA_PATH,)]
+valid_files=["%s/valid2-cut.csv" % (DATA_PATH,)]
 test_file="%s/test.csv" % (DATA_PATH,)
 
-train_data=TaxiData(train_files, taxi_columns)
 valid_data = TaxiData(valid_files, taxi_columns_valid)
 test_data = TaxiData(test_file, taxi_columns, has_header=True)
 
-valid_trips = [l for l in open(DATA_PATH + "/split/valid2-cut-ids.txt")]
+valid_trips = [l for l in open(DATA_PATH + "/valid2-cut-ids.txt")]
 
 def train_it():
     return DataIterator(DataStream(train_data))
diff --git a/error.py b/error.py
new file mode 100644
index 0000000..5ea37ad
--- /dev/null
+++ b/error.py
@@ -0,0 +1,40 @@
+from theano import tensor
+import theano
+import numpy
+
+def const(v):
+    """Return v as a numpy scalar of the width selected by theano.config.floatX."""
+    # Any floatX setting other than 'float32' falls through to float64.
+    scalar_type = numpy.float32 if theano.config.floatX == 'float32' else numpy.float64
+    return scalar_type(v)
+
+rearth = const(6371)
+deg2rad = const(3.141592653589793 / 180)
+
+def hdist(a, b):
+    """Haversine distance between matching rows of a and b; columns 0/1 are latitude/longitude in degrees."""
+    lat1 = a[:, 0] * deg2rad
+    lon1 = a[:, 1] * deg2rad
+    lat2 = b[:, 0] * deg2rad
+    lon2 = b[:, 1] * deg2rad
+    dlat = abs(lat1-lat2)
+    dlon = abs(lon1-lon2)
+
+    # al is the haversine term; arctan2 gives half the central angle, scaled below by 2 * rearth.
+    al = tensor.sin(dlat/2)**2  + tensor.cos(lat1) * tensor.cos(lat2) * (tensor.sin(dlon/2)**2)
+    d = tensor.arctan2(tensor.sqrt(al), tensor.sqrt(const(1)-al))
+    hd = const(2) * rearth * d
+    # NaN never compares equal (eq(hd, nan) is always false), so detect it with isnan before falling back.
+    return tensor.switch(tensor.isnan(hd), (a-b).norm(2, axis=1), hd)
+
+def erdist(a, b):
+    """Equirectangular approximation of the distance between rows of a and b ((lat, lon) in degrees)."""
+    lat1, lon1 = a[:, 0] * deg2rad, a[:, 1] * deg2rad
+    lat2, lon2 = b[:, 0] * deg2rad, b[:, 1] * deg2rad
+    # The east-west offset is shrunk by the cosine of the mean latitude; the north-south offset is used as is.
+    dx = (lon2 - lon1) * tensor.cos((lat1 + lat2) / 2)
+    dy = lat2 - lat1
+    return tensor.sqrt(tensor.sqr(dx) + tensor.sqr(dy)) * rearth
+
+def rmsle(a, b):  # root of the mean squared difference between log(a+1) and log(b+1)
+    return tensor.sqrt(tensor.mean((tensor.log(a+1) - tensor.log(b+1)) ** 2))
diff --git a/hdist.py b/hdist.py
deleted file mode 100644
index 866aea4..0000000
--- a/hdist.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from theano import tensor
-import theano
-import numpy
-
-def const(v):
-    if theano.config.floatX == 'float32':
-        return numpy.float32(v)
-    else:
-        return numpy.float64(v)
-
-rearth = const(6371)
-deg2rad = const(3.141592653589793 / 180)
-
-def hdist(a, b):
-    lat1 = a[:, 0] * deg2rad
-    lon1 = a[:, 1] * deg2rad
-    lat2 = b[:, 0] * deg2rad
-    lon2 = b[:, 1] * deg2rad
-
-    dlat = abs(lat1-lat2)
-    dlon = abs(lon1-lon2)
-
-    al = tensor.sin(dlat/2)**2  + tensor.cos(lat1) * tensor.cos(lat2) * (tensor.sin(dlon/2)**2)
-    d = tensor.arctan2(tensor.sqrt(al), tensor.sqrt(const(1)-al))
-
-    hd = const(2) * rearth * d
-
-    return tensor.switch(tensor.eq(hd, float('nan')), (a-b).norm(2, axis=1), hd)
-
-def erdist(a, b):
-    lat1 = a[:, 0] * deg2rad
-    lon1 = a[:, 1] * deg2rad
-    lat2 = b[:, 0] * deg2rad
-    lon2 = b[:, 1] * deg2rad
-    x = (lon2-lon1) * tensor.cos((lat1+lat2)/2)
-    y = (lat2-lat1)
-    return tensor.sqrt(tensor.sqr(x) + tensor.sqr(y)) * rearth
diff --git a/model/dest_simple_mlp.py b/model/dest_simple_mlp.py
new file mode 100644
index 0000000..896f219
--- /dev/null
+++ b/model/dest_simple_mlp.py
@@ -0,0 +1,73 @@
+from blocks.bricks import MLP, Rectifier, Linear, Sigmoid, Identity
+from blocks.bricks.lookup import LookupTable
+
+from blocks.initialization import IsotropicGaussian, Constant
+
+from theano import tensor
+
+import data
+import error
+
+class Model(object):
+    def __init__(self, config):
+        """Build the Theano graph of an MLP that regresses (latitude, longitude) from the inputs described by config."""
+        x_firstk_latitude = (tensor.matrix('first_k_latitude') - data.porto_center[0]) / data.data_std[0]
+        x_firstk_longitude = (tensor.matrix('first_k_longitude') - data.porto_center[1]) / data.data_std[1]
+        # Every coordinate input is centred on data.porto_center and divided by data.data_std.
+        x_lastk_latitude = (tensor.matrix('last_k_latitude') - data.porto_center[0]) / data.data_std[0]
+        x_lastk_longitude = (tensor.matrix('last_k_longitude') - data.porto_center[1]) / data.data_std[1]
+
+        input_list = [x_firstk_latitude, x_firstk_longitude, x_lastk_latitude, x_lastk_longitude]
+        embed_tables = []
+        # Names of the Theano input variables declared by this model (extended by the embedding loop below).
+        self.require_inputs = ['first_k_latitude', 'first_k_longitude', 'last_k_latitude', 'last_k_longitude']
+        # Each config.dim_embeddings entry (name, table length, embedding dim) adds an integer input and its lookup table.
+        for (varname, num, dim) in config.dim_embeddings:
+            self.require_inputs.append(varname)
+            vardata = tensor.lvector(varname)
+            tbl = LookupTable(length=num, dim=dim, name='%s_lookup'%varname)
+            embed_tables.append(tbl)
+            input_list.append(tbl.apply(vardata))
+        # Target: one row per example, columns (destination_latitude, destination_longitude).
+        y = tensor.concatenate((tensor.vector('destination_latitude')[:, None],
+                                tensor.vector('destination_longitude')[:, None]), axis=1)
+
+        # Define the model: one Rectifier layer per entry of config.dim_hidden, then an Identity (linear) output layer
+        mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden] + [Identity()],
+                           dims=[config.dim_input] + config.dim_hidden + [config.dim_output])
+
+        # Wire the graph: concatenate all input pieces along axis 1 and feed them to the MLP
+        inputs = tensor.concatenate(input_list, axis=1)
+        # inputs = theano.printing.Print("inputs")(inputs)
+        outputs = mlp.apply(inputs)
+
+        # Undo the input normalisation: scale the MLP output by data.data_std and shift it by data.porto_center
+        # outputs = theano.printing.Print("normal_outputs")(outputs)
+        outputs = data.data_std * outputs + data.porto_center
+
+        # outputs = theano.printing.Print("outputs")(outputs)
+        # y = theano.printing.Print("y")(y)
+
+        outputs.name = 'outputs'
+
+        # Training cost is the mean error.erdist; the mean error.hdist is exposed only through self.monitor
+        cost = error.erdist(outputs, y).mean()
+        cost.name = 'cost'
+        hcost = error.hdist(outputs, y).mean()
+        hcost.name = 'hcost'
+
+        # Initialization: assign the schemes to every brick first, then initialize the parameters
+        for tbl in embed_tables:
+            tbl.weights_init = IsotropicGaussian(0.001)
+        mlp.weights_init = IsotropicGaussian(0.01)
+        mlp.biases_init = Constant(0.001)
+
+        for tbl in embed_tables:
+            tbl.initialize()
+        mlp.initialize()
+        # Public attributes; pred_vars holds the names of the target variables (consumers are outside this file).
+        self.cost = cost
+        self.monitor = [cost, hcost]
+        self.outputs = outputs
+        self.pred_vars = ['destination_latitude', 'destination_longitude']
+
diff --git a/model/dest_simple_mlp_tgtcls.py b/model/dest_simple_mlp_tgtcls.py
new file mode 100644
index 0000000..d8fdeb3
--- /dev/null
+++ b/model/dest_simple_mlp_tgtcls.py
@@ -0,0 +1,75 @@
+import numpy 
+
+import theano
+from theano import tensor
+
+from blocks.bricks import MLP, Rectifier, Linear, Sigmoid, Identity, Softmax
+from blocks.bricks.lookup import LookupTable
+
+from blocks.initialization import IsotropicGaussian, Constant
+
+import data
+import error
+
+class Model(object):
+    def __init__(self, config):
+        """Build the Theano graph of an MLP whose softmax output weights the rows of config.tgtcls to form the prediction."""
+        x_firstk_latitude = (tensor.matrix('first_k_latitude') - data.porto_center[0]) / data.data_std[0]
+        x_firstk_longitude = (tensor.matrix('first_k_longitude') - data.porto_center[1]) / data.data_std[1]
+        # Every coordinate input is centred on data.porto_center and divided by data.data_std.
+        x_lastk_latitude = (tensor.matrix('last_k_latitude') - data.porto_center[0]) / data.data_std[0]
+        x_lastk_longitude = (tensor.matrix('last_k_longitude') - data.porto_center[1]) / data.data_std[1]
+
+        input_list = [x_firstk_latitude, x_firstk_longitude, x_lastk_latitude, x_lastk_longitude]
+        embed_tables = []
+        # Names of the Theano input variables declared by this model (extended by the embedding loop below).
+        self.require_inputs = ['first_k_latitude', 'first_k_longitude', 'last_k_latitude', 'last_k_longitude']
+        # Each config.dim_embeddings entry (name, table length, embedding dim) adds an integer input and its lookup table.
+        for (varname, num, dim) in config.dim_embeddings:
+            self.require_inputs.append(varname)
+            vardata = tensor.lvector(varname)
+            tbl = LookupTable(length=num, dim=dim, name='%s_lookup'%varname)
+            embed_tables.append(tbl)
+            input_list.append(tbl.apply(vardata))
+        # Target: one row per example, columns (destination_latitude, destination_longitude).
+        y = tensor.concatenate((tensor.vector('destination_latitude')[:, None],
+                                tensor.vector('destination_longitude')[:, None]), axis=1)
+
+        # Define the model: one Rectifier layer per entry of config.dim_hidden, then a Softmax over config.dim_output units
+        mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden] + [Softmax()],
+                           dims=[config.dim_input] + config.dim_hidden + [config.dim_output])
+        classes = theano.shared(numpy.array(config.tgtcls, dtype=theano.config.floatX), name='classes')
+
+        # Wire the graph: concatenate all input pieces along axis 1 and feed them to the MLP
+        inputs = tensor.concatenate(input_list, axis=1)
+        # The prediction is the probability-weighted sum of the rows of `classes` (presumably cluster centres; confirm).
+        # inputs = theano.printing.Print("inputs")(inputs)
+        cls_probas = mlp.apply(inputs)
+        outputs = tensor.dot(cls_probas, classes)
+
+        # outputs = theano.printing.Print("outputs")(outputs)
+        # y = theano.printing.Print("y")(y)
+
+        outputs.name = 'outputs'
+
+        # Training cost is the mean error.erdist; the mean error.hdist is exposed only through self.monitor
+        cost = error.erdist(outputs, y).mean()
+        cost.name = 'cost'
+        hcost = error.hdist(outputs, y).mean()
+        hcost.name = 'hcost'
+
+        # Initialization: assign the schemes to every brick first, then initialize the parameters
+        for tbl in embed_tables:
+            tbl.weights_init = IsotropicGaussian(0.001)
+        mlp.weights_init = IsotropicGaussian(0.01)
+        mlp.biases_init = Constant(0.001)
+
+        for tbl in embed_tables:
+            tbl.initialize()
+        mlp.initialize()
+        # Public attributes; pred_vars holds the names of the target variables (consumers are outside this file).
+        self.cost = cost
+        self.monitor = [cost, hcost]
+        self.outputs = outputs
+        self.pred_vars = ['destination_latitude', 'destination_longitude']
+
diff --git a/model/simple_mlp.py b/model/simple_mlp.py
deleted file mode 100644
index fc065f7..0000000
--- a/model/simple_mlp.py
+++ /dev/null
@@ -1,71 +0,0 @@
-from blocks.bricks import MLP, Rectifier, Linear, Sigmoid, Identity
-from blocks.bricks.lookup import LookupTable
-
-from blocks.initialization import IsotropicGaussian, Constant
-
-from theano import tensor
-
-import data
-import hdist
-
-class Model(object):
-    def __init__(self, config):
-        # The input and the targets
-        x_firstk_latitude = (tensor.matrix('first_k_latitude') - data.porto_center[0]) / data.data_std[0]
-        x_firstk_longitude = (tensor.matrix('first_k_longitude') - data.porto_center[1]) / data.data_std[1]
-
-        x_lastk_latitude = (tensor.matrix('last_k_latitude') - data.porto_center[0]) / data.data_std[0]
-        x_lastk_longitude = (tensor.matrix('last_k_longitude') - data.porto_center[1]) / data.data_std[1]
-
-        input_list = [x_firstk_latitude, x_firstk_longitude, x_lastk_latitude, x_lastk_longitude]
-        embed_tables = []
-
-        self.require_inputs = ['first_k_latitude', 'first_k_longitude', 'last_k_latitude', 'last_k_longitude']
-
-        for (varname, num, dim) in config.dim_embeddings:
-            self.require_inputs.append(varname)
-            vardata = tensor.lvector(varname)
-            tbl = LookupTable(length=num, dim=dim, name='%s_lookup'%varname)
-            embed_tables.append(tbl)
-            input_list.append(tbl.apply(vardata))
-
-        y = tensor.concatenate((tensor.vector('destination_latitude')[:, None],
-                                tensor.vector('destination_longitude')[:, None]), axis=1)
-
-        # Define the model
-        mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden] + [Identity()],
-                           dims=[config.dim_input] + config.dim_hidden + [config.dim_output])
-
-        # Create the Theano variables
-        inputs = tensor.concatenate(input_list, axis=1)
-        # inputs = theano.printing.Print("inputs")(inputs)
-        outputs = mlp.apply(inputs)
-
-        # Normalize & Center
-        # outputs = theano.printing.Print("normal_outputs")(outputs)
-        outputs = data.data_std * outputs + data.porto_center
-
-        # outputs = theano.printing.Print("outputs")(outputs)
-        # y = theano.printing.Print("y")(y)
-
-        outputs.name = 'outputs'
-
-        # Calculate the cost
-        cost = hdist.erdist(outputs, y).mean()
-        cost.name = 'cost'
-        hcost = hdist.hdist(outputs, y).mean()
-        hcost.name = 'hcost'
-
-        # Initialization
-        for tbl in embed_tables:
-            tbl.weights_init = IsotropicGaussian(0.001)
-        mlp.weights_init = IsotropicGaussian(0.01)
-        mlp.biases_init = Constant(0.001)
-
-        for tbl in embed_tables:
-            tbl.initialize()
-        mlp.initialize()
-
-        self.cost = cost
-        self.hcost = hcost
-        self.outputs = outputs
diff --git a/model/simple_mlp_tgtcls.py b/model/simple_mlp_tgtcls.py
deleted file mode 100644
index b2a1a6e..0000000
--- a/model/simple_mlp_tgtcls.py
+++ /dev/null
@@ -1,73 +0,0 @@
-import numpy 
-
-import theano
-from theano import tensor
-
-from blocks.bricks import MLP, Rectifier, Linear, Sigmoid, Identity, Softmax
-from blocks.bricks.lookup import LookupTable
-
-from blocks.initialization import IsotropicGaussian, Constant
-
-import data
-import hdist
-
-class Model(object):
-    def __init__(self, config):
-        # The input and the targets
-        x_firstk_latitude = (tensor.matrix('first_k_latitude') - data.porto_center[0]) / data.data_std[0]
-        x_firstk_longitude = (tensor.matrix('first_k_longitude') - data.porto_center[1]) / data.data_std[1]
-
-        x_lastk_latitude = (tensor.matrix('last_k_latitude') - data.porto_center[0]) / data.data_std[0]
-        x_lastk_longitude = (tensor.matrix('last_k_longitude') - data.porto_center[1]) / data.data_std[1]
-
-        input_list = [x_firstk_latitude, x_firstk_longitude, x_lastk_latitude, x_lastk_longitude]
-        embed_tables = []
-
-        self.require_inputs = ['first_k_latitude', 'first_k_longitude', 'last_k_latitude', 'last_k_longitude']
-
-        for (varname, num, dim) in config.dim_embeddings:
-            self.require_inputs.append(varname)
-            vardata = tensor.lvector(varname)
-            tbl = LookupTable(length=num, dim=dim, name='%s_lookup'%varname)
-            embed_tables.append(tbl)
-            input_list.append(tbl.apply(vardata))
-
-        y = tensor.concatenate((tensor.vector('destination_latitude')[:, None],
-                                tensor.vector('destination_longitude')[:, None]), axis=1)
-
-        # Define the model
-        mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden] + [Softmax()],
-                           dims=[config.dim_input] + config.dim_hidden + [config.dim_output])
-        classes = theano.shared(numpy.array(config.tgtcls, dtype=theano.config.floatX), name='classes')
-
-        # Create the Theano variables
-        inputs = tensor.concatenate(input_list, axis=1)
-
-        # inputs = theano.printing.Print("inputs")(inputs)
-        cls_probas = mlp.apply(inputs)
-        outputs = tensor.dot(cls_probas, classes)
-
-        # outputs = theano.printing.Print("outputs")(outputs)
-        # y = theano.printing.Print("y")(y)
-
-        outputs.name = 'outputs'
-
-        # Calculate the cost
-        cost = hdist.erdist(outputs, y).mean()
-        cost.name = 'cost'
-        hcost = hdist.hdist(outputs, y).mean()
-        hcost.name = 'hcost'
-
-        # Initialization
-        for tbl in embed_tables:
-            tbl.weights_init = IsotropicGaussian(0.001)
-        mlp.weights_init = IsotropicGaussian(0.01)
-        mlp.biases_init = Constant(0.001)
-
-        for tbl in embed_tables:
-            tbl.initialize()
-        mlp.initialize()
-
-        self.cost = cost
-        self.hcost = hcost
-        self.outputs = outputs
diff --git a/model/time_simple_mlp.py b/model/time_simple_mlp.py
new file mode 100644
index 0000000..1568ed3
--- /dev/null
+++ b/model/time_simple_mlp.py
@@ -0,0 +1,65 @@
+from blocks.bricks import MLP, Rectifier, Linear, Sigmoid, Identity
+from blocks.bricks.lookup import LookupTable
+
+from blocks.initialization import IsotropicGaussian, Constant
+
+from theano import tensor
+
+import data
+import error
+
+class Model(object):
+    def __init__(self, config):  # reads config.dim_embeddings, dim_input, dim_hidden and dim_output
+        # Symbolic inputs: first-k / last-k coordinates, shifted by data.porto_center and divided by data.data_std
+        x_firstk_latitude = (tensor.matrix('first_k_latitude') - data.porto_center[0]) / data.data_std[0]
+        x_firstk_longitude = (tensor.matrix('first_k_longitude') - data.porto_center[1]) / data.data_std[1]
+
+        x_lastk_latitude = (tensor.matrix('last_k_latitude') - data.porto_center[0]) / data.data_std[0]
+        x_lastk_longitude = (tensor.matrix('last_k_longitude') - data.porto_center[1]) / data.data_std[1]
+
+        input_list = [x_firstk_latitude, x_firstk_longitude, x_lastk_latitude, x_lastk_longitude]
+        embed_tables = []  # LookupTable bricks, kept so they can be initialized further down
+
+        self.require_inputs = ['first_k_latitude', 'first_k_longitude', 'last_k_latitude', 'last_k_longitude']  # grows by one name per embedding
+
+        for (varname, num, dim) in config.dim_embeddings:  # (variable name, table length, embedding dimension)
+            self.require_inputs.append(varname)
+            vardata = tensor.lvector(varname)
+            tbl = LookupTable(length=num, dim=dim, name='%s_lookup'%varname)
+            embed_tables.append(tbl)
+            input_list.append(tbl.apply(vardata))  # the embedded variable becomes one more block of MLP inputs
+
+        y = tensor.lvector('time')  # prediction target; same name as the entry in self.pred_vars
+
+        # Define the model: one Rectifier per hidden layer, followed by an Identity output layer
+        mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden] + [Identity()],
+                           dims=[config.dim_input] + config.dim_hidden + [config.dim_output])
+
+        # Build the graph: concatenate every input along axis 1 and feed the result to the MLP
+        inputs = tensor.concatenate(input_list, axis=1)
+        # inputs = theano.printing.Print("inputs")(inputs)
+        outputs = tensor.exp(mlp.apply(inputs) + 2)  # exp keeps predictions positive; +2 presumably sets the initial output scale -- TODO confirm
+
+        # outputs = theano.printing.Print("outputs")(outputs)
+        # y = theano.printing.Print("y")(y)
+
+        outputs.name = 'outputs'
+
+        # Calculate the cost: error.rmsle of the flattened predictions against the flattened targets
+        cost = error.rmsle(outputs.flatten(), y.flatten())
+        cost.name = 'cost'
+
+        # Initialization schemes: IsotropicGaussian weights for the tables and the MLP, Constant biases for the MLP
+        for tbl in embed_tables:
+            tbl.weights_init = IsotropicGaussian(0.001)
+        mlp.weights_init = IsotropicGaussian(0.01)
+        mlp.biases_init = Constant(0.001)
+
+        for tbl in embed_tables:
+            tbl.initialize()
+        mlp.initialize()
+
+        self.cost = cost
+        self.monitor = [cost]  # handed to DataStreamMonitoring by train.py
+        self.outputs = outputs
+        self.pred_vars = ['time']  # train.py appends these to require_inputs and uses them to pick the CSV output format
diff --git a/train.py b/train.py
index 2c9522e..4cbd526 100644
--- a/train.py
+++ b/train.py
@@ -20,7 +20,7 @@ from blocks.model import Model
 from fuel.datasets.hdf5 import H5PYDataset
 from fuel.transformers import Batch
 from fuel.streams import DataStream
-from fuel.schemes import ConstantScheme, SequentialExampleScheme
+from fuel.schemes import ConstantScheme, SequentialExampleScheme, ShuffledExampleScheme
 
 from blocks.algorithms import GradientDescent, Scale, AdaDelta, Momentum
 from blocks.graph import ComputationGraph
@@ -31,7 +31,6 @@ from blocks.extensions.monitoring import DataStreamMonitoring
 
 import data
 import transformers
-import hdist
 import apply_model
 
 if __name__ == "__main__":
@@ -48,7 +47,7 @@ def setup_train_stream(req_vars):
                         which_set='train',
                         subset=slice(0, data.dataset_size),
                         load_in_memory=True)
-    train = DataStream(train, iteration_scheme=SequentialExampleScheme(data.dataset_size - config.n_valid))
+    train = DataStream(train, iteration_scheme=ShuffledExampleScheme(data.dataset_size))
 
     train = transformers.TaxiExcludeTrips(data.valid_trips, train)
     train = transformers.TaxiGenerateSplits(train, max_splits=100)
@@ -91,10 +90,9 @@ def main():
     model = config.model.Model(config)
 
     cost = model.cost
-    hcost = model.hcost
     outputs = model.outputs
 
-    req_vars = model.require_inputs + [ 'destination_latitude', 'destination_longitude' ]
+    req_vars = model.require_inputs + model.pred_vars
     req_vars_test = model.require_inputs + [ 'trip_id' ]
 
     train_stream = setup_train_stream(req_vars)
@@ -109,7 +107,7 @@ def main():
         step_rule=Momentum(learning_rate=config.learning_rate, momentum=config.momentum),
         params=params)
 
-    extensions=[DataStreamMonitoring([cost, hcost], valid_stream,
+    extensions=[DataStreamMonitoring(model.monitor, valid_stream,
                                      prefix='valid',
                                      every_n_batches=1000),
                 Printing(every_n_batches=1000),
@@ -132,11 +130,18 @@ def main():
 
     outfile = open("output/test-output-%s.csv" % model_name, "w")
     outcsv = csv.writer(outfile)
-    outcsv.writerow(["TRIP_ID", "LATITUDE", "LONGITUDE"])
-    for out in apply_model.Apply(outputs=outputs, stream=test_stream, return_vars=['trip_id', 'outputs']):
-        dest = out['outputs']
-        for i, trip in enumerate(out['trip_id']):
-            outcsv.writerow([trip, repr(dest[i, 0]), repr(dest[i, 1])])
+    if model.pred_vars == ['time']:
+        outcsv.writerow(["TRIP_ID", "TRAVEL_TIME"])
+        for out in apply_model.Apply(outputs=outputs, stream=test_stream, return_vars=['trip_id', 'outputs']):
+            time = out['outputs']
+            for i, trip in enumerate(out['trip_id']):
+                outcsv.writerow([trip, int(time[i, 0])])
+    else:
+        outcsv.writerow(["TRIP_ID", "LATITUDE", "LONGITUDE"])
+        for out in apply_model.Apply(outputs=outputs, stream=test_stream, return_vars=['trip_id', 'outputs']):
+            dest = out['outputs']
+            for i, trip in enumerate(out['trip_id']):
+                outcsv.writerow([trip, repr(dest[i, 0]), repr(dest[i, 1])])
     outfile.close()
 
 
diff --git a/transformers.py b/transformers.py
index 6ee0df1..73e3868 100644
--- a/transformers.py
+++ b/transformers.py
@@ -32,7 +32,7 @@ class Select(Transformer):
 class TaxiGenerateSplits(Transformer):
     def __init__(self, data_stream, max_splits=-1):
         super(TaxiGenerateSplits, self).__init__(data_stream)
-        self.sources = data_stream.sources + ('destination_latitude', 'destination_longitude')
+        self.sources = data_stream.sources + ('destination_latitude', 'destination_longitude', 'time')
         self.max_splits = max_splits
         self.data = None
         self.splits = []
@@ -63,7 +63,7 @@ class TaxiGenerateSplits(Transformer):
         dlat = numpy.float32(self.data[self.id_latitude][-1])
         dlon = numpy.float32(self.data[self.id_longitude][-1])
 
-        return tuple(r + [dlat, dlon])
+        return tuple(r + [dlat, dlon, 15 * (len(self.data[self.id_longitude]) - 1)])
 
 class TaxiAddFirstK(Transformer):
     def __init__(self, k, stream):
-- 
cgit v1.2.3