aboutsummaryrefslogtreecommitdiff
path: root/data/__init__.py
diff options
context:
space:
mode:
authorÉtienne Simon <esimon@esimon.eu>2015-05-05 21:55:13 -0400
committerÉtienne Simon <esimon@esimon.eu>2015-05-05 22:05:21 -0400
commit1f2ff96e6480a62089fcac35154a956c218ed678 (patch)
treed0bb7a2a6d7ba6ae512a2ce3729b1ccbdc21c822 /data/__init__.py
parent54613c1f9cf510ca7a71d6619418f2247515aec6 (diff)
downloadtaxi-1f2ff96e6480a62089fcac35154a956c218ed678.tar.gz
taxi-1f2ff96e6480a62089fcac35154a956c218ed678.zip
Clean data module and generalize use of hdf5.
Diffstat (limited to 'data/__init__.py')
-rw-r--r--data/__init__.py31
1 files changed, 31 insertions, 0 deletions
diff --git a/data/__init__.py b/data/__init__.py
new file mode 100644
index 0000000..1278e0b
--- /dev/null
+++ b/data/__init__.py
@@ -0,0 +1,31 @@
+import os
+
+import h5py
+import numpy
+import theano
+
+
+path = os.environ.get('TAXI_PATH', '/data/lisatmp3/auvolat/taxikaggle')  # TAXI_PATH env var overrides the default location
+Polyline = h5py.special_dtype(vlen=theano.config.floatX)  # variable-length HDF5 dtype with floatX elements
+
+
+# `wc -l test.csv` - 1  (minus 1 to ignore the header line)
+test_size = 320
+
+# `wc -l train.csv` - 1  (minus 1 to ignore the header line, as for test.csv)
+train_size = 1710670
+
+# `wc -l metaData_taxistandsID_name_GPSlocation.csv`
+stands_size = 64 # includes 0 ("no origin_stands")
+
+# `cut -d, -f 5 train.csv test.csv | sort -u | wc -l` - 1
+taxi_id_size = 448
+
+# `cut -d, -f 3 train.csv test.csv | sort -u | wc -l` - 2
+origin_call_size = 57125 # includes 0 ("no origin_call")
+
+# As printed by csv_to_hdf5.py
+origin_call_train_size = 57106
+
+train_gps_mean = numpy.array([41.1573, -8.61612], dtype=theano.config.floatX)  # presumably (latitude, longitude) -- TODO confirm
+train_gps_std = numpy.sqrt(numpy.array([0.00549598, 0.00333233], dtype=theano.config.floatX))  # sqrt of what are presumably per-coordinate variances -- TODO confirm