# aboutsummaryrefslogtreecommitdiff
# path: root/data/__init__.py
# blob: 9d01d2a321bd7c651884057d5c91e974d015dc05 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import os
import sys

import h5py
import numpy


# Base path used by this package: the TAXI_PATH environment variable when it
# is set, otherwise the hard-coded default location below.
path = os.getenv('TAXI_PATH', '/data/lisatmp3/auvolat/taxikaggle')

# Variable-length HDF5 dtype whose elements are float32 values, built with
# h5py's special_dtype(vlen=...) helper.
# NOTE(review): presumably the storage type for per-trip GPS coordinate
# sequences -- confirm against the code that writes the HDF5 files.
Polyline = h5py.special_dtype(vlen=numpy.float32)

# Obtained with `wc -l metaData_taxistandsID_name_GPSlocation.csv`.
# The count includes 0, which means "no origin_stands".
stands_size = 64

# Obtained with `cut -d, -f 5 train.csv test.csv | sort -u | wc -l`, minus 1.
taxi_id_size = 448

# Two-component float32 normalisation constants for GPS values.
# NOTE(review): the numbers passed to sqrt look like per-component variances
# of the training set -- confirm how they were computed.
train_gps_mean = numpy.asarray([41.1573, -8.61612], dtype='float32')
train_gps_std = numpy.sqrt(
    numpy.asarray([0.00549598, 0.00333233], dtype='float32')
)

# Command-line switch: when --tvt is present in sys.argv, the sizes and file
# names of the tvt.hdf5 configuration are used instead of the default ones.
tvt = '--tvt' in sys.argv

# Identical in both configurations. For the default configuration this is
# the figure printed by csv_to_hdf5.py.
origin_call_train_size = 57106

if not tvt:
    # `wc -l test.csv` - 1 (minus 1 to ignore the header)
    test_size = 320

    # `wc -l train.csv` - 1
    train_size = 1710670

    # `cut -d, -f 3 train.csv test.csv | sort -u | wc -l` - 2
    # The count includes 0, which means "no origin_call".
    origin_call_size = 57125

    # --largevalid selects the large validation cut instead of the default.
    valid_set = (
        'cuts/large_valid' if '--largevalid' in sys.argv
        else 'cuts/test_times_0'
    )

    valid_ds, traintest_ds = 'valid.hdf5', 'data.hdf5'
else:
    # valid_size is only defined in this (--tvt) configuration.
    test_size, valid_size, train_size = 19770, 19427, 1671473

    origin_call_size = 57106

    valid_set = 'valid'
    # Validation and train/test data both come from the same file here.
    valid_ds = traintest_ds = 'tvt.hdf5'