convert_data.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125

#!/usr/bin/env python
import os, h5py, csv, sys, numpy, theano, ast
from fuel.converters.base import fill_hdf5_file

test_size = 320 # `wc -l test.csv` - 1 # Minus 1 to ignore the header
train_size = 1710670 # `wc -l train.csv` - 1

stands_size = 63 # `wc -l metaData_taxistandsID_name_GPSlocation.csv` - 1
taxi_id_size = 448 # `cut -d, -f 5 train.csv test.csv | sort -u | wc -l` - 1
origin_call_size = 57124 # `cut -d, -f 3 train.csv test.csv | sort -u | wc -l` - 3 # Minus 3 to ignore "NA", "" and the header

Polyline = h5py.special_dtype(vlen=theano.config.floatX)

taxi_id_dict = {}
origin_call_dict = {0: 0}

def get_unique_taxi_id(val):
    if val in taxi_id_dict:
        return taxi_id_dict[val]
    else:
        taxi_id_dict[val] = len(taxi_id_dict)
        return len(taxi_id_dict) - 1

def get_unique_origin_call(val):
    if val in origin_call_dict:
        return origin_call_dict[val]
    else:
        origin_call_dict[val] = len(origin_call_dict)
        return len(origin_call_dict) - 1

def read_stands(input_directory, h5file):
    stands_name = numpy.empty(shape=(stands_size+1,), dtype=('a', 24))
    stands_latitude = numpy.empty(shape=(stands_size+1,), dtype=theano.config.floatX)
    stands_longitude = numpy.empty(shape=(stands_size+1,), dtype=theano.config.floatX)
    stands_name[0] = 'None'
    stands_latitude[0] = stands_longitude[0] = 0
    with open(os.path.join(input_directory, 'metaData_taxistandsID_name_GPSlocation.csv'), 'r') as f:
        reader = csv.reader(f)
        reader.next() # header
        for line in reader:
            id = int(line[0])
            stands_name[id] = line[1]
            stands_latitude[id] = float(line[2])
            stands_longitude[id] = float(line[3])
    return (('stands', 'stands_name', stands_name),
            ('stands', 'stands_latitude', stands_latitude),
            ('stands', 'stands_longitude', stands_longitude))

def read_taxis(input_directory, h5file, dataset):
    print >> sys.stderr, 'read %s: begin' % dataset
    size=globals()['%s_size'%dataset]
    trip_id = numpy.empty(shape=(size,), dtype='S19')
    call_type = numpy.empty(shape=(size,), dtype=numpy.uint8)
    origin_call = numpy.empty(shape=(size,), dtype=numpy.uint32)
    origin_stand = numpy.empty(shape=(size,), dtype=numpy.uint8)
    taxi_id = numpy.empty(shape=(size,), dtype=numpy.uint16)
    timestamp = numpy.empty(shape=(size,), dtype=numpy.uint32)
    day_type = numpy.empty(shape=(size,), dtype=numpy.uint8)
    missing_data = numpy.empty(shape=(size,), dtype=numpy.bool)
    latitude = numpy.empty(shape=(size,), dtype=Polyline)
    longitude = numpy.empty(shape=(size,), dtype=Polyline)
    with open(os.path.join(input_directory, '%s.csv'%dataset), 'r') as f:
        reader = csv.reader(f)
        reader.next() # header
        id=0
        for line in reader:
            if id%10000==0 and id!=0:
                print >> sys.stderr, 'read %s: %d done' % (dataset, id)
            trip_id[id] = line[0]
            call_type[id] = ord(line[1][0]) - ord('A')
            origin_call[id] = 0 if line[2]=='NA' or line[2]=='' else get_unique_origin_call(int(line[2]))
            origin_stand[id] = 0 if line[3]=='NA' or line[3]=='' else int(line[3])
            taxi_id[id] = get_unique_taxi_id(int(line[4]))
            timestamp[id] = int(line[5])
            day_type[id] = ord(line[6][0]) - ord('A')
            missing_data[id] = line[7][0] == 'T'
            polyline = ast.literal_eval(line[8])
            latitude[id] = numpy.array([point[1] for point in polyline], dtype=theano.config.floatX)
            longitude[id] = numpy.array([point[0] for point in polyline], dtype=theano.config.floatX)
            id+=1
    splits = ()
    print >> sys.stderr, 'read %s: writing' % dataset
    for name in ['trip_id', 'call_type', 'origin_call', 'origin_stand', 'taxi_id', 'timestamp', 'day_type', 'missing_data', 'latitude', 'longitude']:
        splits += ((dataset, name, locals()[name]),)
    print >> sys.stderr, 'read %s: end' % dataset
    return splits

def unique(h5file):
    unique_taxi_id = numpy.empty(shape=(taxi_id_size,), dtype=numpy.uint32)
    assert len(taxi_id_dict) == taxi_id_size
    for k, v in taxi_id_dict.items():
        unique_taxi_id[v] = k

    unique_origin_call = numpy.empty(shape=(origin_call_size+1,), dtype=numpy.uint32)
    assert len(origin_call_dict) == origin_call_size+1
    for k, v in origin_call_dict.items():
        unique_origin_call[v] = k

    return (('unique_taxi_id', 'unique_taxi_id', unique_taxi_id),
            ('unique_origin_call', 'unique_origin_call', unique_origin_call))

def convert(input_directory, save_path):
    h5file = h5py.File(save_path, 'w')
    split = ()
    split += read_stands(input_directory, h5file)
    split += read_taxis(input_directory, h5file, 'train')
    print 'First origin_call not present in training set: ', len(origin_call_dict)
    split += read_taxis(input_directory, h5file, 'test')
    split += unique(h5file)

    fill_hdf5_file(h5file, split)

    for name in ['stands_name', 'stands_latitude', 'stands_longitude', 'unique_taxi_id', 'unique_origin_call']:
        h5file[name].dims[0].label = 'index'
    for name in ['trip_id', 'call_type', 'origin_call', 'origin_stand', 'taxi_id', 'timestamp', 'day_type', 'missing_data', 'latitude', 'longitude']:
        h5file[name].dims[0].label = 'batch'

    h5file.flush()
    h5file.close()

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print >> sys.stderr, 'Usage: %s download_dir output_file' % sys.argv[0]
        sys.exit(1)
    convert(sys.argv[1], sys.argv[2])