about | summary | refs | log | tree | commit | diff
path: root/data/__init__.py
diff options
context:
space:
mode:
author Étienne Simon <esimon@esimon.eu> 2015-07-23 21:20:32 -0400
committer Étienne Simon <esimon@esimon.eu> 2015-07-23 21:20:32 -0400
commit 13fc171f60ae1981c7ad4f2a302a8a85c29addc5 (patch)
tree abc29e6a877a2f971b0be9715c112d8eee8b0eb4 /data/__init__.py
parent 8d31f9240056ec110cf63bde79d7661321d8ca7a (diff)
download taxi-13fc171f60ae1981c7ad4f2a302a8a85c29addc5.tar.gz
taxi-13fc171f60ae1981c7ad4f2a302a8a85c29addc5.zip
Use new tvt dataset with option --tvt
Diffstat (limited to 'data/__init__.py')
-rw-r--r-- data/__init__.py 48
1 files changed, 36 insertions, 12 deletions
diff --git a/data/__init__.py b/data/__init__.py
index 2121033..9d01d2a 100644
--- a/data/__init__.py
+++ b/data/__init__.py
@@ -1,4 +1,5 @@
import os
+import sys
import h5py
import numpy
@@ -8,23 +9,46 @@ path = os.environ.get('TAXI_PATH', '/data/lisatmp3/auvolat/taxikaggle')
Polyline = h5py.special_dtype(vlen=numpy.float32)
-# `wc -l test.csv` - 1 # Minus 1 to ignore the header
-test_size = 320
-
-# `wc -l train.csv` - 1
-train_size = 1710670
-
# `wc -l metaData_taxistandsID_name_GPSlocation.csv`
stands_size = 64 # include 0 ("no origin_stands")
# `cut -d, -f 5 train.csv test.csv | sort -u | wc -l` - 1
taxi_id_size = 448
-# `cut -d, -f 3 train.csv test.csv | sort -u | wc -l` - 2
-origin_call_size = 57125 # include 0 ("no origin_call")
-
-# As printed by csv_to_hdf5.py
-origin_call_train_size = 57106
-
train_gps_mean = numpy.array([41.1573, -8.61612], dtype=numpy.float32)
train_gps_std = numpy.sqrt(numpy.array([0.00549598, 0.00333233], dtype=numpy.float32))
+
+tvt = '--tvt' in sys.argv
+
+if tvt:
+ test_size = 19770
+ valid_size = 19427
+ train_size = 1671473
+
+ origin_call_size = 57106
+ origin_call_train_size = 57106
+
+ valid_set = 'valid'
+ valid_ds = 'tvt.hdf5'
+ traintest_ds = 'tvt.hdf5'
+
+else:
+ # `wc -l test.csv` - 1 # Minus 1 to ignore the header
+ test_size = 320
+
+ # `wc -l train.csv` - 1
+ train_size = 1710670
+
+ # `cut -d, -f 3 train.csv test.csv | sort -u | wc -l` - 2
+ origin_call_size = 57125 # include 0 ("no origin_call")
+
+ # As printed by csv_to_hdf5.py
+ origin_call_train_size = 57106
+
+ if '--largevalid' in sys.argv:
+ valid_set = 'cuts/large_valid'
+ else:
+ valid_set = 'cuts/test_times_0'
+
+ valid_ds = 'valid.hdf5'
+ traintest_ds = 'data.hdf5'