diff options
author | AdeB <adbrebs@gmail.com> | 2015-05-05 22:15:29 -0400 |
---|---|---|
committer | AdeB <adbrebs@gmail.com> | 2015-05-05 22:15:29 -0400 |
commit | c29a0d3f22134a8d1f5d557b325f6779c5961546 (patch) | |
tree | 6fa431d9b3595b5d2d11089920aa07ea43172d90 /data/__init__.py | |
parent | f4d3ee6449217535bdbe19ac9c5fdd825d71b0d3 (diff) | |
parent | 1f2ff96e6480a62089fcac35154a956c218ed678 (diff) | |
download | taxi-c29a0d3f22134a8d1f5d557b325f6779c5961546.tar.gz taxi-c29a0d3f22134a8d1f5d557b325f6779c5961546.zip |
Merge branch 'master' of github.com:adbrebs/taxi
Diffstat (limited to 'data/__init__.py')
-rw-r--r-- | data/__init__.py | 31 |
1 files changed, 31 insertions, 0 deletions
# Contents of data/__init__.py as added by this commit
# (new file, mode 100644, blob 1278e0b, hunk @@ -0,0 +1,31 @@).
"""Shared constants for the taxi data: storage path, dtypes, sizes, GPS stats."""

import os

import h5py
import numpy
import theano


# Float type configured in Theano; reused by every float-valued constant below.
_floatX = theano.config.floatX

# Root directory of the data; the TAXI_PATH environment variable, when set,
# takes precedence over the hard-coded default.
path = os.getenv('TAXI_PATH', '/data/lisatmp3/auvolat/taxikaggle')

# HDF5 variable-length dtype whose elements use the Theano float type.
Polyline = h5py.special_dtype(vlen=_floatX)


# --- Sizes; each comment gives the shell command the number comes from. ---

# `wc -l test.csv` - 1   (the header line is not counted)
test_size = 320

# `wc -l train.csv` - 1
train_size = 1710670

# `wc -l metaData_taxistandsID_name_GPSlocation.csv`
# Value 0 ("no origin_stands") is included in the count.
stands_size = 64

# `cut -d, -f 5 train.csv test.csv | sort -u | wc -l` - 1
taxi_id_size = 448

# `cut -d, -f 3 train.csv test.csv | sort -u | wc -l` - 2
# Value 0 ("no origin_call") is included in the count.
origin_call_size = 57125

# Reported by csv_to_hdf5.py when it is run.
origin_call_train_size = 57106


# --- GPS statistics of the training set. ---
# NOTE(review): the two components look like (latitude, longitude) -- confirm
# against the script that produced them.
train_gps_mean = numpy.array([41.1573, -8.61612], dtype=_floatX)

# The standard deviation is the element-wise square root of the values below
# (presumably the per-coordinate variances -- TODO confirm).
_train_gps_var = numpy.array([0.00549598, 0.00333233], dtype=_floatX)
train_gps_std = numpy.sqrt(_train_gps_var)