author     Étienne Simon <esimon@esimon.eu>  2015-05-05 21:55:13 -0400
committer  Étienne Simon <esimon@esimon.eu>  2015-05-05 22:05:21 -0400
commit     1f2ff96e6480a62089fcac35154a956c218ed678 (patch)
tree       d0bb7a2a6d7ba6ae512a2ce3729b1ccbdc21c822
parent     54613c1f9cf510ca7a71d6619418f2247515aec6 (diff)
Clean data module and generalize use of hdf5.
-rw-r--r--  config/dest_simple_mlp_2_cs.py                         6
-rw-r--r--  config/dest_simple_mlp_2_cswdt.py                      6
-rw-r--r--  config/dest_simple_mlp_2_noembed.py                    2
-rw-r--r--  config/dest_simple_mlp_tgtcls_0_cs.py                  8
-rw-r--r--  config/dest_simple_mlp_tgtcls_1_cs.py                  8
-rw-r--r--  config/dest_simple_mlp_tgtcls_1_cswdt.py               8
-rw-r--r--  config/dest_simple_mlp_tgtcls_1_cswdtx.py              8
-rw-r--r--  data/__init__.py                                      31
-rw-r--r--  data/csv.py (renamed from data.py)                   121
-rwxr-xr-x  data/csv_to_hdf5.py (renamed from convert_data.py)   38
-rw-r--r--  data/cuts/__init__.py                                  0
-rw-r--r--  data/cuts/test_times_0.py                              8
-rw-r--r--  data/hdf5.py                                          61
-rwxr-xr-x  data/init_valid.py                                    61
-rwxr-xr-x  data/make_valid_cut.py                                72
-rw-r--r--  data/transformers.py (renamed from transformers.py)   8
-rw-r--r--  make_valid.py                                         37
-rw-r--r--  make_valid_cut.py                                     40
-rw-r--r--  model/dest_simple_mlp.py                              10
-rw-r--r--  model/dest_simple_mlp_tgtcls.py                        8
-rwxr-xr-x [-rw-r--r--]  train.py                                 51
21 files changed, 334 insertions, 258 deletions
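
The commit replaces the flat data.py with a data/ package (csv.py, hdf5.py, transformers.py, cuts/) and moves the dataset constants into data/__init__.py. A minimal sketch of how the new HDF5 path is meant to be used, assuming data.hdf5 was produced by data/csv_to_hdf5.py in the directory $TAXI_PATH points to:

import data
from data.hdf5 import TaxiDataset, taxi_it

print data.path                  # directory holding data.hdf5 (from $TAXI_PATH)

train = TaxiDataset('train')     # H5PYDataset over the 'train' split, in memory by default
print train.num_examples         # should equal data.train_size, i.e. 1710670

for trip in taxi_it('train'):    # per-example iterator yielding dicts
    print trip['trip_id'], trip['timestamp']
    break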
diff --git a/config/dest_simple_mlp_2_cs.py b/config/dest_simple_mlp_2_cs.py
index 2cec78d..0dd2704 100644
--- a/config/dest_simple_mlp_2_cs.py
+++ b/config/dest_simple_mlp_2_cs.py
@@ -8,8 +8,8 @@ n_end_pts = 5
n_valid = 1000
dim_embeddings = [
- ('origin_call', data.n_train_clients+1, 10),
- ('origin_stand', data.n_stands+1, 10)
+ ('origin_call', data.origin_call_train_size, 10),
+ ('origin_stand', data.stands_size, 10)
]
dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
@@ -19,3 +19,5 @@ dim_output = 2
learning_rate = 0.0001
momentum = 0.99
batch_size = 32
+
+valid_set = 'cuts/test_times_0'
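
For reference: with the two 10-dimensional embeddings above, and assuming n_begin_end_pts = 5 like the neighboring n_end_pts, this gives dim_input = 5 * 2 * 2 + (10 + 10) = 40 (first-k and last-k points with two coordinates each, plus the embedding widths). The same arithmetic applies to the other configs below.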
diff --git a/config/dest_simple_mlp_2_cswdt.py b/config/dest_simple_mlp_2_cswdt.py
index f6ddf34..1011488 100644
--- a/config/dest_simple_mlp_2_cswdt.py
+++ b/config/dest_simple_mlp_2_cswdt.py
@@ -8,8 +8,8 @@ n_end_pts = 5
n_valid = 1000
dim_embeddings = [
- ('origin_call', data.n_train_clients+1, 10),
- ('origin_stand', data.n_stands+1, 10),
+ ('origin_call', data.origin_call_train_size, 10),
+ ('origin_stand', data.stands_size, 10),
('week_of_year', 52, 10),
('day_of_week', 7, 10),
('qhour_of_day', 24 * 4, 10),
@@ -23,3 +23,5 @@ dim_output = 2
learning_rate = 0.0001
momentum = 0.99
batch_size = 32
+
+valid_set = 'cuts/test_times_0'
diff --git a/config/dest_simple_mlp_2_noembed.py b/config/dest_simple_mlp_2_noembed.py
index 3832146..3cddcb9 100644
--- a/config/dest_simple_mlp_2_noembed.py
+++ b/config/dest_simple_mlp_2_noembed.py
@@ -16,3 +16,5 @@ dim_output = 2
learning_rate = 0.0001
momentum = 0.99
batch_size = 32
+
+valid_set = 'cuts/test_times_0'
diff --git a/config/dest_simple_mlp_tgtcls_0_cs.py b/config/dest_simple_mlp_tgtcls_0_cs.py
index a8a5a0e..031cd12 100644
--- a/config/dest_simple_mlp_tgtcls_0_cs.py
+++ b/config/dest_simple_mlp_tgtcls_0_cs.py
@@ -9,11 +9,11 @@ n_end_pts = 5
n_valid = 1000
-with open(data.DATA_PATH + "/arrival-clusters.pkl") as f: tgtcls = cPickle.load(f)
+with open("%s/arrival-clusters.pkl" % data.path) as f: tgtcls = cPickle.load(f)
dim_embeddings = [
- ('origin_call', data.n_train_clients+1, 10),
- ('origin_stand', data.n_stands+1, 10)
+ ('origin_call', data.origin_call_train_size, 10),
+ ('origin_stand', data.stands_size, 10)
]
dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
@@ -23,3 +23,5 @@ dim_output = tgtcls.shape[0]
learning_rate = 0.0001
momentum = 0.99
batch_size = 32
+
+valid_set = 'cuts/test_times_0'
diff --git a/config/dest_simple_mlp_tgtcls_1_cs.py b/config/dest_simple_mlp_tgtcls_1_cs.py
index 8136f10..48d9fa0 100644
--- a/config/dest_simple_mlp_tgtcls_1_cs.py
+++ b/config/dest_simple_mlp_tgtcls_1_cs.py
@@ -9,11 +9,11 @@ n_end_pts = 5
n_valid = 1000
-with open(data.DATA_PATH + "/arrival-clusters.pkl") as f: tgtcls = cPickle.load(f)
+with open("%s/arrival-clusters.pkl" % data.path) as f: tgtcls = cPickle.load(f)
dim_embeddings = [
- ('origin_call', data.n_train_clients+1, 10),
- ('origin_stand', data.n_stands+1, 10)
+ ('origin_call', data.origin_call_train_size, 10),
+ ('origin_stand', data.stands_size, 10)
]
dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings)
@@ -23,3 +23,5 @@ dim_output = tgtcls.shape[0]
learning_rate = 0.0001
momentum = 0.99
batch_size = 32
+
+valid_set = 'cuts/test_times_0'
diff --git a/config/dest_simple_mlp_tgtcls_1_cswdt.py b/config/dest_simple_mlp_tgtcls_1_cswdt.py
index af7b2a3..6aa2a03 100644
--- a/config/dest_simple_mlp_tgtcls_1_cswdt.py
+++ b/config/dest_simple_mlp_tgtcls_1_cswdt.py
@@ -9,11 +9,11 @@ n_end_pts = 5
n_valid = 1000
-with open(data.DATA_PATH + "/arrival-clusters.pkl") as f: tgtcls = cPickle.load(f)
+with open("%s/arrival-clusters.pkl" % data.path) as f: tgtcls = cPickle.load(f)
dim_embeddings = [
- ('origin_call', data.n_train_clients+1, 10),
- ('origin_stand', data.n_stands+1, 10),
+ ('origin_call', data.origin_call_train_size, 10),
+ ('origin_stand', data.stands_size, 10),
('week_of_year', 52, 10),
('day_of_week', 7, 10),
('qhour_of_day', 24 * 4, 10),
@@ -27,3 +27,5 @@ dim_output = tgtcls.shape[0]
learning_rate = 0.0001
momentum = 0.99
batch_size = 32
+
+valid_set = 'cuts/test_times_0'
diff --git a/config/dest_simple_mlp_tgtcls_1_cswdtx.py b/config/dest_simple_mlp_tgtcls_1_cswdtx.py
index b9832df..7918242 100644
--- a/config/dest_simple_mlp_tgtcls_1_cswdtx.py
+++ b/config/dest_simple_mlp_tgtcls_1_cswdtx.py
@@ -9,11 +9,11 @@ n_end_pts = 5
n_valid = 1000
-with open(data.DATA_PATH + "/arrival-clusters.pkl") as f: tgtcls = cPickle.load(f)
+with open("%s/arrival-clusters.pkl" % data.path) as f: tgtcls = cPickle.load(f)
dim_embeddings = [
- ('origin_call', data.n_train_clients+1, 10),
- ('origin_stand', data.n_stands+1, 10),
+ ('origin_call', data.origin_call_train_size, 10),
+ ('origin_stand', data.stands_size, 10),
('week_of_year', 52, 10),
('day_of_week', 7, 10),
('qhour_of_day', 24 * 4, 10),
@@ -28,3 +28,5 @@ dim_output = tgtcls.shape[0]
learning_rate = 0.0001
momentum = 0.99
batch_size = 32
+
+valid_set = 'cuts/test_times_0'
diff --git a/data/__init__.py b/data/__init__.py
new file mode 100644
index 0000000..1278e0b
--- /dev/null
+++ b/data/__init__.py
@@ -0,0 +1,31 @@
+import os
+
+import h5py
+import numpy
+import theano
+
+
+path = os.environ.get('TAXI_PATH', '/data/lisatmp3/auvolat/taxikaggle')
+Polyline = h5py.special_dtype(vlen=theano.config.floatX)
+
+
+# `wc -l test.csv` - 1 # Minus 1 to ignore the header
+test_size = 320
+
+# `wc -l train.csv` - 1
+train_size = 1710670
+
+# `wc -l metaData_taxistandsID_name_GPSlocation.csv`
+stands_size = 64 # include 0 ("no origin_stands")
+
+# `cut -d, -f 5 train.csv test.csv | sort -u | wc -l` - 1
+taxi_id_size = 448
+
+# `cut -d, -f 3 train.csv test.csv | sort -u | wc -l` - 2
+origin_call_size = 57125 # include 0 ("no origin_call")
+
+# As printed by csv_to_hdf5.py
+origin_call_train_size = 57106
+
+train_gps_mean = numpy.array([41.1573, -8.61612], dtype=theano.config.floatX)
+train_gps_std = numpy.sqrt(numpy.array([0.00549598, 0.00333233], dtype=theano.config.floatX))
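
train_gps_mean and train_gps_std replace the old porto_center / data_std pair from data.py. A sketch of the normalization the models apply with them (the sample point is made up for illustration):

import numpy
import data

point = numpy.array([41.15, -8.62])                    # hypothetical raw (lat, lon)
normalized = (point - data.train_gps_mean) / data.train_gps_std
recovered = data.train_gps_std * normalized + data.train_gps_mean   # round-trips to point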
diff --git a/data.py b/data/csv.py
index 42ebe1c..b6fe5b1 100644
--- a/data.py
+++ b/data/csv.py
@@ -1,103 +1,14 @@
-import ast, csv
-import socket
-import fuel
+import ast
+import csv
import numpy
-import h5py
-from enum import Enum
+
from fuel.datasets import Dataset
from fuel.streams import DataStream
from fuel.iterator import DataIterator
-import theano
-
-if socket.gethostname() == "adeb.laptop":
- DATA_PATH = "/Users/adeb/data/taxi"
-else:
- DATA_PATH="/data/lisatmp3/auvolat/taxikaggle"
-
-H5DATA_PATH = '/data/lisatmp3/simonet/taxi/data.hdf5'
-
-porto_center = numpy.array([41.1573, -8.61612], dtype=theano.config.floatX)
-data_std = numpy.sqrt(numpy.array([0.00549598, 0.00333233], dtype=theano.config.floatX))
-
-n_clients = 57124
-n_train_clients = 57105
-n_stands = 63
-
-dataset_size = 1710670
-
-# ---- Read client IDs and create reverse dictionnary
-
-def make_client_ids():
- f = h5py.File(H5DATA_PATH, "r")
- l = f['unique_origin_call']
- r = {l[i]: i for i in range(l.shape[0])}
- return r
-
-client_ids = make_client_ids()
-
-def get_client_id(n):
- if n in client_ids and client_ids[n] <= n_train_clients:
- return client_ids[n]
- else:
- return 0
-
-# ---- Read taxi IDs and create reverse dictionnary
-
-def make_taxi_ids():
- f = h5py.File(H5DATA_PATH, "r")
- l = f['unique_taxi_id']
- r = {l[i]: i for i in range(l.shape[0])}
- return r
-
-taxi_ids = make_taxi_ids()
-
-# ---- Enum types
-
-class CallType(Enum):
- CENTRAL = 0
- STAND = 1
- STREET = 2
-
- @classmethod
- def from_data(cls, val):
- if val=='A':
- return cls.CENTRAL
- elif val=='B':
- return cls.STAND
- elif val=='C':
- return cls.STREET
-
- @classmethod
- def to_data(cls, val):
- if val==cls.CENTRAL:
- return 'A'
- elif val==cls.STAND:
- return 'B'
- elif val==cls.STREET:
- return 'C'
-
-class DayType(Enum):
- NORMAL = 0
- HOLIDAY = 1
- HOLIDAY_EVE = 2
-
- @classmethod
- def from_data(cls, val):
- if val=='A':
- return cls.NORMAL
- elif val=='B':
- return cls.HOLIDAY
- elif val=='C':
- return cls.HOLIDAY_EVE
-
- @classmethod
- def to_data(cls, val):
- if val==cls.NORMAL:
- return 'A'
- elif val==cls.HOLIDAY:
- return 'B'
- elif val==cls.HOLIDAY_EVE:
- return 'C'
+
+import data
+from data.hdf5 import origin_call_normalize, taxi_id_normalize
+
class TaxiData(Dataset):
example_iteration_scheme=None
@@ -161,10 +72,10 @@ class TaxiData(Dataset):
taxi_columns = [
("trip_id", lambda l: l[0]),
- ("call_type", lambda l: CallType.from_data(l[1])),
- ("origin_call", lambda l: 0 if l[2] == '' or l[2] == 'NA' else get_client_id(int(l[2]))),
+ ("call_type", lambda l: ord(l[1])-ord('A')),
+ ("origin_call", lambda l: 0 if l[2] == '' or l[2] == 'NA' else origin_call_normalize(int(l[2]))),
("origin_stand", lambda l: 0 if l[3] == '' or l[3] == 'NA' else int(l[3])),
- ("taxi_id", lambda l: taxi_ids[int(l[4])]),
+ ("taxi_id", lambda l: taxi_id_normalize(int(l[4]))),
("timestamp", lambda l: int(l[5])),
("day_type", lambda l: ord(l[6])-ord('A')),
("missing_data", lambda l: l[7][0] == 'T'),
@@ -179,18 +90,18 @@ taxi_columns_valid = taxi_columns + [
("time", lambda l: int(l[11])),
]
-valid_files=["%s/valid2-cut.csv" % (DATA_PATH,)]
-test_file="%s/test.csv" % (DATA_PATH,)
+train_file="%s/train.csv" % data.path
+valid_file="%s/valid2-cut.csv" % data.path
+test_file="%s/test.csv" % data.path
-valid_data = TaxiData(valid_files, taxi_columns_valid)
+train_data=TaxiData(train_file, taxi_columns, has_header=True)
+valid_data = TaxiData(valid_file, taxi_columns_valid)
test_data = TaxiData(test_file, taxi_columns, has_header=True)
-valid_trips = [l for l in open(DATA_PATH + "/valid2-cut-ids.txt")]
+valid_trips = [l for l in open("%s/valid2-cut-ids.txt" % data.path)]
def train_it():
return DataIterator(DataStream(train_data))
def test_it():
return DataIterator(DataStream(valid_data))
-
-
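
The CallType/DayType enums are dropped in favour of direct ordinal arithmetic on the CSV letter. A quick sketch of the resulting call_type mapping, with labels taken from the removed enum:

for letter in ['A', 'B', 'C']:
    print letter, ord(letter) - ord('A')   # A -> 0 (CENTRAL), B -> 1 (STAND), C -> 2 (STREET)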
diff --git a/convert_data.py b/data/csv_to_hdf5.py
index ca66786..17217f3 100755
--- a/convert_data.py
+++ b/data/csv_to_hdf5.py
@@ -1,15 +1,17 @@
#!/usr/bin/env python
-import os, h5py, csv, sys, numpy, theano, ast
-from fuel.converters.base import fill_hdf5_file
-test_size = 320 # `wc -l test.csv` - 1 # Minus 1 to ignore the header
-train_size = 1710670 # `wc -l train.csv` - 1
+import ast
+import csv
+import os
+import sys
+
+import h5py
+import numpy
+import theano
+from fuel.converters.base import fill_hdf5_file
-stands_size = 63 # `wc -l metaData_taxistandsID_name_GPSlocation.csv` - 1
-taxi_id_size = 448 # `cut -d, -f 5 train.csv test.csv | sort -u | wc -l` - 1
-origin_call_size = 57124 # `cut -d, -f 3 train.csv test.csv | sort -u | wc -l` - 3 # Minus 3 to ignore "NA", "" and the header
+import data
-Polyline = h5py.special_dtype(vlen=theano.config.floatX)
taxi_id_dict = {}
origin_call_dict = {0: 0}
@@ -29,9 +31,9 @@ def get_unique_origin_call(val):
return len(origin_call_dict) - 1
def read_stands(input_directory, h5file):
- stands_name = numpy.empty(shape=(stands_size+1,), dtype=('a', 24))
- stands_latitude = numpy.empty(shape=(stands_size+1,), dtype=theano.config.floatX)
- stands_longitude = numpy.empty(shape=(stands_size+1,), dtype=theano.config.floatX)
+ stands_name = numpy.empty(shape=(data.stands_size,), dtype=('a', 24))
+ stands_latitude = numpy.empty(shape=(data.stands_size,), dtype=theano.config.floatX)
+ stands_longitude = numpy.empty(shape=(data.stands_size,), dtype=theano.config.floatX)
stands_name[0] = 'None'
stands_latitude[0] = stands_longitude[0] = 0
with open(os.path.join(input_directory, 'metaData_taxistandsID_name_GPSlocation.csv'), 'r') as f:
@@ -48,7 +50,7 @@ def read_stands(input_directory, h5file):
def read_taxis(input_directory, h5file, dataset):
print >> sys.stderr, 'read %s: begin' % dataset
- size=globals()['%s_size'%dataset]
+ size=getattr(data, '%s_size'%dataset)
trip_id = numpy.empty(shape=(size,), dtype='S19')
call_type = numpy.empty(shape=(size,), dtype=numpy.uint8)
origin_call = numpy.empty(shape=(size,), dtype=numpy.uint32)
@@ -57,8 +59,8 @@ def read_taxis(input_directory, h5file, dataset):
timestamp = numpy.empty(shape=(size,), dtype=numpy.uint32)
day_type = numpy.empty(shape=(size,), dtype=numpy.uint8)
missing_data = numpy.empty(shape=(size,), dtype=numpy.bool)
- latitude = numpy.empty(shape=(size,), dtype=Polyline)
- longitude = numpy.empty(shape=(size,), dtype=Polyline)
+ latitude = numpy.empty(shape=(size,), dtype=data.Polyline)
+ longitude = numpy.empty(shape=(size,), dtype=data.Polyline)
with open(os.path.join(input_directory, '%s.csv'%dataset), 'r') as f:
reader = csv.reader(f)
reader.next() # header
@@ -86,13 +88,13 @@ def read_taxis(input_directory, h5file, dataset):
return splits
def unique(h5file):
- unique_taxi_id = numpy.empty(shape=(taxi_id_size,), dtype=numpy.uint32)
- assert len(taxi_id_dict) == taxi_id_size
+ unique_taxi_id = numpy.empty(shape=(data.taxi_id_size,), dtype=numpy.uint32)
+ assert len(taxi_id_dict) == data.taxi_id_size
for k, v in taxi_id_dict.items():
unique_taxi_id[v] = k
- unique_origin_call = numpy.empty(shape=(origin_call_size+1,), dtype=numpy.uint32)
- assert len(origin_call_dict) == origin_call_size+1
+ unique_origin_call = numpy.empty(shape=(data.origin_call_size,), dtype=numpy.uint32)
+ assert len(origin_call_dict) == data.origin_call_size
for k, v in origin_call_dict.items():
unique_origin_call[v] = k
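
A sketch of the invariant the two loops above establish: the unique_* arrays invert the conversion-time dictionaries, and data/hdf5.py later rebuilds the reverse mappings from them.

for raw, idx in taxi_id_dict.items():
    assert unique_taxi_id[idx] == raw          # the array inverts the dict
for raw, idx in origin_call_dict.items():
    assert unique_origin_call[idx] == raw      # likewise for origin_call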
diff --git a/data/cuts/__init__.py b/data/cuts/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/data/cuts/__init__.py
diff --git a/data/cuts/test_times_0.py b/data/cuts/test_times_0.py
new file mode 100644
index 0000000..b590072
--- /dev/null
+++ b/data/cuts/test_times_0.py
@@ -0,0 +1,8 @@
+# Cuts of the test set minus 1 year
+cuts = [
+ 1376503200, # 2013-08-14 18:00
+ 1380616200, # 2013-10-01 08:30
+ 1381167900, # 2013-10-07 17:45
+ 1383364800, # 2013-11-02 04:00
+ 1387722600 # 2013-12-22 14:30
+]
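
The date comments can be reproduced from the timestamps, which are UTC:

import datetime
for ts in cuts:
    print datetime.datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %H:%M')
# prints 2013-08-14 18:00 through 2013-12-22 14:30, one year before the test-set times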
diff --git a/data/hdf5.py b/data/hdf5.py
new file mode 100644
index 0000000..d848023
--- /dev/null
+++ b/data/hdf5.py
@@ -0,0 +1,61 @@
+import os
+
+import h5py
+from fuel.datasets import H5PYDataset
+from fuel.iterator import DataIterator
+from fuel.schemes import SequentialExampleScheme
+from fuel.streams import DataStream
+
+import data
+
+
+class TaxiDataset(H5PYDataset):
+ def __init__(self, which_set, filename='data.hdf5', **kwargs):
+ self.filename = filename
+ kwargs.setdefault('load_in_memory', True)
+ super(TaxiDataset, self).__init__(self.data_path, which_set, **kwargs)
+
+ @property
+ def data_path(self):
+ return os.path.join(data.path, self.filename)
+
+class TaxiStream(DataStream):
+ def __init__(self, which_set, filename='data.hdf5', iteration_scheme=None, **kwargs):
+ dataset = TaxiDataset(which_set, filename, **kwargs)
+ if iteration_scheme is None:
+ iteration_scheme = SequentialExampleScheme(dataset.num_examples)
+ super(TaxiStream, self).__init__(dataset, iteration_scheme=iteration_scheme)
+
+_origin_calls = None
+_reverse_origin_calls = None
+
+def origin_call_unnormalize(x):
+ if _origin_calls is None:
+ _origin_calls = h5py.File(os.path.join(data.path, 'data.hdf5'), 'r')['unique_origin_call']
+ return _origin_calls[x]
+
+def origin_call_normalize(x):
+ if _reverse_origin_calls is None:
+ origin_call_unnormalize(0)
+ _reverse_origin_calls = { _origin_calls[i]: i for i in range(_origin_calls.shape[0]) }
+ return _reverse_origin_calls[x]
+
+_taxi_ids = None
+_reverse_taxi_ids = None
+
+def taxi_id_unnormalize(x):
+ if _taxi_ids is None:
+ _taxi_ids = h5py.File(os.path.join(data.path, 'data.hdf5'), 'r')['unique_taxi_id']
+ return _taxi_ids[x]
+
+def taxi_id_normalize(x):
+ if _reverse_taxi_ids is None:
+ taxi_id_unnormalize(0)
+ _reverse_taxi_ids = { _taxi_ids[i]: i for i in range(_taxi_ids.shape[0]) }
+ return _reverse_taxi_ids[x]
+
+def taxi_it(which_set, filename='data.hdf5', sub=None, as_dict=True):
+ dataset = TaxiDataset(which_set, filename)
+ if sub is None:
+ sub = xrange(dataset.num_examples)
+ return DataIterator(DataStream(dataset), iter(sub), as_dict)
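
Note that, as committed, the four helpers above rebind their module-level caches without a global declaration, so Python treats the names as locals and the first call raises UnboundLocalError. A corrected sketch of the intended lazy memoization, shown for the origin_call pair:

def origin_call_unnormalize(x):
    global _origin_calls
    if _origin_calls is None:      # open the HDF5 lookup table once
        _origin_calls = h5py.File(os.path.join(data.path, 'data.hdf5'), 'r')['unique_origin_call']
    return _origin_calls[x]

def origin_call_normalize(x):
    global _reverse_origin_calls
    if _reverse_origin_calls is None:   # build the reverse mapping once
        origin_call_unnormalize(0)
        _reverse_origin_calls = {_origin_calls[i]: i for i in range(_origin_calls.shape[0])}
    return _reverse_origin_calls[x]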
diff --git a/data/init_valid.py b/data/init_valid.py
new file mode 100755
index 0000000..14a854c
--- /dev/null
+++ b/data/init_valid.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+# Initialize the valid hdf5
+
+import os
+import sys
+
+import h5py
+import numpy
+import theano
+
+import data
+
+
+_fields = {
+ 'trip_id': 'S19',
+ 'call_type': numpy.uint8,
+ 'origin_call': numpy.uint32,
+ 'origin_stand': numpy.uint8,
+ 'taxi_id': numpy.uint16,
+ 'timestamp': numpy.uint32,
+ 'day_type': numpy.uint8,
+ 'missing_data': numpy.bool,
+ 'latitude': data.Polyline,
+ 'longitude': data.Polyline,
+ 'destination_latitude': theano.config.floatX,
+ 'destination_longitude': theano.config.floatX,
+ 'travel_time': numpy.uint32,
+}
+
+
+def init_valid(path):
+ h5file = h5py.File(path, 'w')
+
+ for k, v in _fields.items():
+ h5file.create_dataset(k, (0,), dtype=v, maxshape=(None,))
+
+ split_array = numpy.empty(len(_fields), dtype=numpy.dtype([
+ ('split', 'a', 64),
+ ('source', 'a', 21),
+ ('start', numpy.int64, 1),
+ ('stop', numpy.int64, 1),
+ ('available', numpy.bool, 1),
+ ('comment', 'a', 1)]))
+
+ split_array[:]['split'] = 'dummy'.encode('utf8')
+ for (i, k) in enumerate(_fields.keys()):
+ split_array[i]['source'] = k.encode('utf8')
+ split_array[:]['start'] = 0
+ split_array[:]['stop'] = 0
+ split_array[:]['available'] = False
+ split_array[:]['comment'] = '.'.encode('utf8')
+ h5file.attrs['split'] = split_array
+
+ h5file.flush()
+ h5file.close()
+
+if __name__ == '__main__':
+ if len(sys.argv) > 2:
+ print >> sys.stderr, 'Usage: %s [file]' % sys.argv[0]
+ sys.exit(1)
+ init_valid(sys.argv[1] if len(sys.argv) == 2 else os.path.join(data.path, 'valid.hdf5'))
diff --git a/data/make_valid_cut.py b/data/make_valid_cut.py
new file mode 100755
index 0000000..d5be083
--- /dev/null
+++ b/data/make_valid_cut.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+# Make a valid dataset by cutting the training set at specified timestamps
+
+import os
+import sys
+import importlib
+
+import h5py
+import numpy
+
+import data
+from data.hdf5 import taxi_it
+
+
+_fields = ['trip_id', 'call_type', 'origin_call', 'origin_stand', 'taxi_id', 'timestamp', 'day_type', 'missing_data', 'latitude', 'longitude', 'destination_latitude', 'destination_longitude', 'travel_time']
+
+def make_valid(cutfile, outpath):
+ cuts = importlib.import_module('.%s' % cutfile, 'data.cuts').cuts
+
+ valid = []
+
+ for line in taxi_it('train'):
+ time = line['timestamp']
+ latitude = line['latitude']
+ longitude = line['longitude']
+
+ if len(latitude) == 0:
+ continue
+
+ for ts in cuts:
+ if time <= ts and time + 15 * (len(latitude) - 1) >= ts:
+ # keep it
+ n = (ts - time) / 15 + 1
+ line.update({
+ 'latitude': latitude[:n],
+ 'longitude': longitude[:n],
+ 'destination_latitude': latitude[-1],
+ 'destination_longitude': longitude[-1],
+ 'travel_time': 15 * (len(latitude)-1)
+ })
+ valid.append(line)
+
+ file = h5py.File(outpath, 'a')
+ clen = file['trip_id'].shape[0]
+ alen = len(valid)
+ for field in _fields:
+ dset = file[field]
+ dset.resize((clen + alen,))
+ for i in xrange(alen):
+ dset[clen + i] = valid[i][field]
+
+ splits = file.attrs['split']
+ slen = splits.shape[0]
+ splits = numpy.resize(splits, (slen+len(_fields),))
+ for (i, field) in enumerate(_fields):
+ splits[slen+i]['split'] = ('cuts/%s' % cutfile).encode('utf8')
+ splits[slen+i]['source'] = field.encode('utf8')
+ splits[slen+i]['start'] = clen
+ splits[slen+i]['stop'] = alen
+ splits[slen+i]['available'] = True
+ splits[slen+i]['comment'] = '.'
+ file.attrs['split'] = splits
+
+ file.flush()
+ file.close()
+
+if __name__ == '__main__':
+ if len(sys.argv) < 2 or len(sys.argv) > 3:
+ print >> sys.stderr, 'Usage: %s cutfile [outfile]' % sys.argv[0]
+ sys.exit(1)
+ outpath = os.path.join(data.path, 'valid.hdf5') if len(sys.argv) < 3 else sys.argv[2]
+ make_valid(sys.argv[1], outpath)
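
Together with data/init_valid.py this defines the validation-set workflow. A sketch of the equivalent calls, assuming TAXI_PATH is set and data.hdf5 already exists:

import os

import data
from data.init_valid import init_valid
from data.make_valid_cut import make_valid

outpath = os.path.join(data.path, 'valid.hdf5')
init_valid(outpath)                   # create the empty datasets and split table
make_valid('test_times_0', outpath)   # append the 'cuts/test_times_0' split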
diff --git a/transformers.py b/data/transformers.py
index 73e3868..1cc4834 100644
--- a/transformers.py
+++ b/data/transformers.py
@@ -1,10 +1,12 @@
-from fuel.transformers import Transformer, Filter, Mapping
+import datetime
+import random
+
import numpy
import theano
-import random
+from fuel.transformers import Transformer
+
import data
-import datetime
def at_least_k(k, v, pad_at_begin, is_longitude):
if len(v) == 0:
diff --git a/make_valid.py b/make_valid.py
deleted file mode 100644
index d5e147d..0000000
--- a/make_valid.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Takes valid-full.csv which is a subset of the lines of train.csv, formatted in the
-# exact same way
-# Outputs valid.csv which contains the polylines cut at an arbitrary location, and three
-# new columns containing the destination point and the length in seconds of the original polyline
-# (see contest definition for the time taken by a taxi along a polyline)
-
-import random
-import csv
-import ast
-
-with open("valid-full.csv") as f:
- vlines = [l for l in csv.reader(f)]
-
-def make_valid_item(l):
- polyline = ast.literal_eval(l[-1])
- last = polyline[-1]
- cut_idx = random.randrange(len(polyline)+1)
- cut = polyline[:cut_idx]
- return l[:-1] + [
- cut.__str__(),
- last[0],
- last[1],
- 15 * (len(polyline)-1),
- ]
-
-vlines = map(make_valid_item, filter(lambda l: (len(ast.literal_eval(l[-1])) > 0), vlines))
-
-with open("valid.csv", "w") as f:
- wr = csv.writer(f)
- for r in vlines:
- wr.writerow(r)
-
-with open("valid-solution.csv", "w") as f:
- wr = csv.writer(f)
- wr.writerow(["TRIP_ID", "LATITUDE", "LONGITUDE"])
- for r in vlines:
- wr.writerow([r[0], r[-2], r[-3]])
diff --git a/make_valid_cut.py b/make_valid_cut.py
deleted file mode 100644
index 2698af8..0000000
--- a/make_valid_cut.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Cuts the training dataset at the following timestamps :
-
-cuts = [
- 1376503200,
- 1380616200,
- 1381167900,
- 1383364800,
- 1387722600,
-]
-
-import random
-import csv
-import ast
-
-f = open("train.csv")
-fr = csv.reader(f)
-_skip_header = fr.next()
-g = open("cutvalid.csv", "w")
-gw = csv.writer(g)
-
-for l in fr:
- polyline = ast.literal_eval(l[-1])
- if len(polyline) == 0: continue
- time = int(l[5])
- for ts in cuts:
- if time <= ts and time + 15 * (len(polyline) - 1) >= ts:
- # keep it
- n = (ts - time) / 15 + 1
- cut = polyline[:n]
- row = l[:-1] + [
- cut.__str__(),
- polyline[-1][0],
- polyline[-1][1],
- 15 * (len(polyline)-1)
- ]
- print row
- gw.writerow(row)
-
-f.close()
-g.close()
diff --git a/model/dest_simple_mlp.py b/model/dest_simple_mlp.py
index 896f219..f422f11 100644
--- a/model/dest_simple_mlp.py
+++ b/model/dest_simple_mlp.py
@@ -11,11 +11,11 @@ import error
class Model(object):
def __init__(self, config):
# The input and the targets
- x_firstk_latitude = (tensor.matrix('first_k_latitude') - data.porto_center[0]) / data.data_std[0]
- x_firstk_longitude = (tensor.matrix('first_k_longitude') - data.porto_center[1]) / data.data_std[1]
+ x_firstk_latitude = (tensor.matrix('first_k_latitude') - data.train_gps_mean[0]) / data.train_gps_std[0]
+ x_firstk_longitude = (tensor.matrix('first_k_longitude') - data.train_gps_mean[1]) / data.train_gps_std[1]
- x_lastk_latitude = (tensor.matrix('last_k_latitude') - data.porto_center[0]) / data.data_std[0]
- x_lastk_longitude = (tensor.matrix('last_k_longitude') - data.porto_center[1]) / data.data_std[1]
+ x_lastk_latitude = (tensor.matrix('last_k_latitude') - data.train_gps_mean[0]) / data.train_gps_std[0]
+ x_lastk_longitude = (tensor.matrix('last_k_longitude') - data.train_gps_mean[1]) / data.train_gps_std[1]
input_list = [x_firstk_latitude, x_firstk_longitude, x_lastk_latitude, x_lastk_longitude]
embed_tables = []
@@ -43,7 +43,7 @@ class Model(object):
# Normalize & Center
# outputs = theano.printing.Print("normal_outputs")(outputs)
- outputs = data.data_std * outputs + data.porto_center
+ outputs = data.train_gps_std * outputs + data.train_gps_mean
# outputs = theano.printing.Print("outputs")(outputs)
# y = theano.printing.Print("y")(y)
diff --git a/model/dest_simple_mlp_tgtcls.py b/model/dest_simple_mlp_tgtcls.py
index d8fdeb3..a7b6f9b 100644
--- a/model/dest_simple_mlp_tgtcls.py
+++ b/model/dest_simple_mlp_tgtcls.py
@@ -14,11 +14,11 @@ import error
class Model(object):
def __init__(self, config):
# The input and the targets
- x_firstk_latitude = (tensor.matrix('first_k_latitude') - data.porto_center[0]) / data.data_std[0]
- x_firstk_longitude = (tensor.matrix('first_k_longitude') - data.porto_center[1]) / data.data_std[1]
+ x_firstk_latitude = (tensor.matrix('first_k_latitude') - data.train_gps_mean[0]) / data.train_gps_std[0]
+ x_firstk_longitude = (tensor.matrix('first_k_longitude') - data.train_gps_mean[1]) / data.train_gps_std[1]
- x_lastk_latitude = (tensor.matrix('last_k_latitude') - data.porto_center[0]) / data.data_std[0]
- x_lastk_longitude = (tensor.matrix('last_k_longitude') - data.porto_center[1]) / data.data_std[1]
+ x_lastk_latitude = (tensor.matrix('last_k_latitude') - data.train_gps_mean[0]) / data.train_gps_std[0]
+ x_lastk_longitude = (tensor.matrix('last_k_longitude') - data.train_gps_mean[1]) / data.train_gps_std[1]
input_list = [x_firstk_latitude, x_firstk_longitude, x_lastk_latitude, x_lastk_longitude]
embed_tables = []
diff --git a/train.py b/train.py
index 4cbd526..9e915ed 100644..100755
--- a/train.py
+++ b/train.py
@@ -1,36 +1,26 @@
-import logging
-import os
+#!/usr/bin/env python
+
import sys
+import logging
import importlib
-from argparse import ArgumentParser
import csv
-import numpy
-
-import theano
-from theano import printing
-from theano import tensor
-from theano.ifelse import ifelse
-
-from blocks.filter import VariableFilter
-
from blocks.model import Model
-from fuel.datasets.hdf5 import H5PYDataset
from fuel.transformers import Batch
from fuel.streams import DataStream
-from fuel.schemes import ConstantScheme, SequentialExampleScheme, ShuffledExampleScheme
+from fuel.schemes import ConstantScheme, ShuffledExampleScheme
-from blocks.algorithms import GradientDescent, Scale, AdaDelta, Momentum
+from blocks.algorithms import GradientDescent, AdaDelta, Momentum
from blocks.graph import ComputationGraph
from blocks.main_loop import MainLoop
from blocks.extensions import Printing, FinishAfter
from blocks.extensions.saveload import Dump, LoadFromDump, Checkpoint
from blocks.extensions.monitoring import DataStreamMonitoring
-import data
-import transformers
+from data import transformers
+from data.hdf5 import TaxiDataset, TaxiStream
import apply_model
if __name__ == "__main__":
@@ -38,18 +28,18 @@ if __name__ == "__main__":
print >> sys.stderr, 'Usage: %s config' % sys.argv[0]
sys.exit(1)
model_name = sys.argv[1]
- config = importlib.import_module(model_name)
+ config = importlib.import_module('.%s' % model_name, 'config')
+def compile_valid_trip_ids():
+ valid = TaxiDataset(config.valid_set, 'valid.hdf5', sources=('trip_id',))
+ ids = valid.get_data(None, slice(0, valid.num_examples))
+ return set(ids[0])
-def setup_train_stream(req_vars):
- # Load the training and test data
- train = H5PYDataset(data.H5DATA_PATH,
- which_set='train',
- subset=slice(0, data.dataset_size),
- load_in_memory=True)
- train = DataStream(train, iteration_scheme=ShuffledExampleScheme(data.dataset_size))
+def setup_train_stream(req_vars, valid_trips_ids):
+ train = TaxiDataset('train')
+ train = DataStream(train, iteration_scheme=ShuffledExampleScheme(train.num_examples))
- train = transformers.TaxiExcludeTrips(data.valid_trips, train)
+ train = transformers.TaxiExcludeTrips(valid_trips_ids, train)
train = transformers.TaxiGenerateSplits(train, max_splits=100)
train = transformers.TaxiAddDateTime(train)
@@ -62,7 +52,7 @@ def setup_train_stream(req_vars):
return train_stream
def setup_valid_stream(req_vars):
- valid = DataStream(data.valid_data)
+ valid = TaxiStream(config.valid_set, 'valid.hdf5')
valid = transformers.TaxiAddDateTime(valid)
valid = transformers.TaxiAddFirstK(config.n_begin_end_pts, valid)
@@ -74,7 +64,7 @@ def setup_valid_stream(req_vars):
return valid_stream
def setup_test_stream(req_vars):
- test = DataStream(data.test_data)
+ test = TaxiStream('test')
test = transformers.TaxiAddDateTime(test)
test = transformers.TaxiAddFirstK(config.n_begin_end_pts, test)
@@ -95,12 +85,13 @@ def main():
req_vars = model.require_inputs + model.pred_vars
req_vars_test = model.require_inputs + [ 'trip_id' ]
- train_stream = setup_train_stream(req_vars)
+ valid_trips_ids = compile_valid_trip_ids()
+ train_stream = setup_train_stream(req_vars, valid_trips_ids)
valid_stream = setup_valid_stream(req_vars)
# Training
cg = ComputationGraph(cost)
- params = cg.parameters # VariableFilter(bricks=[Linear])(cg.parameters)
+ params = cg.parameters
algorithm = GradientDescent(
cost=cost,
# step_rule=AdaDelta(decay_rate=0.5),
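
Since config/ is now a package, train.py resolves the configuration by module name relative to it. Equivalently, as a sketch:

import importlib
config = importlib.import_module('.dest_simple_mlp_2_cs', 'config')
print config.valid_set   # 'cuts/test_times_0', consumed by compile_valid_trip_ids()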