From 0b4b65cb3d88ac4818e71ccef0bded3ddee0683c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20Simon?= Date: Wed, 6 May 2015 11:54:52 -0400 Subject: Fix floatX!=float32 in hdf5 creation --- data/__init__.py | 7 ++-- data/csv.py | 107 ---------------------------------------------------- data/csv_to_hdf5.py | 9 ++--- data/init_valid.py | 5 +-- data/rfc4180.py | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 116 insertions(+), 119 deletions(-) delete mode 100644 data/csv.py create mode 100644 data/rfc4180.py (limited to 'data') diff --git a/data/__init__.py b/data/__init__.py index 1278e0b..2121033 100644 --- a/data/__init__.py +++ b/data/__init__.py @@ -2,12 +2,11 @@ import os import h5py import numpy -import theano path = os.environ.get('TAXI_PATH', '/data/lisatmp3/auvolat/taxikaggle') -Polyline = h5py.special_dtype(vlen=theano.config.floatX) +Polyline = h5py.special_dtype(vlen=numpy.float32) # `wc -l test.csv` - 1 # Minus 1 to ignore the header test_size = 320 @@ -27,5 +26,5 @@ origin_call_size = 57125 # include 0 ("no origin_call") # As printed by csv_to_hdf5.py origin_call_train_size = 57106 -train_gps_mean = numpy.array([41.1573, -8.61612], dtype=theano.config.floatX) -train_gps_std = numpy.sqrt(numpy.array([0.00549598, 0.00333233], dtype=theano.config.floatX)) +train_gps_mean = numpy.array([41.1573, -8.61612], dtype=numpy.float32) +train_gps_std = numpy.sqrt(numpy.array([0.00549598, 0.00333233], dtype=numpy.float32)) diff --git a/data/csv.py b/data/csv.py deleted file mode 100644 index b6fe5b1..0000000 --- a/data/csv.py +++ /dev/null @@ -1,107 +0,0 @@ -import ast -import csv -import numpy - -from fuel.datasets import Dataset -from fuel.streams import DataStream -from fuel.iterator import DataIterator - -import data -from data.hdf5 import origin_call_normalize, taxi_id_normalize - - -class TaxiData(Dataset): - example_iteration_scheme=None - - class State: - __slots__ = ('file', 'index', 'reader') - - def __init__(self, pathes, columns, has_header=False): - if not isinstance(pathes, list): - pathes=[pathes] - assert len(pathes)>0 - self.columns=columns - self.provides_sources = tuple(map(lambda x: x[0], columns)) - self.pathes=pathes - self.has_header=has_header - super(TaxiData, self).__init__() - - def open(self): - state=self.State() - state.file=open(self.pathes[0]) - state.index=0 - state.reader=csv.reader(state.file) - if self.has_header: - state.reader.next() - return state - - def close(self, state): - state.file.close() - - def reset(self, state): - if state.index==0: - state.file.seek(0) - else: - state.index=0 - state.file.close() - state.file=open(self.pathes[0]) - state.reader=csv.reader(state.file) - return state - - def get_data(self, state, request=None): - if request is not None: - raise ValueError - try: - line=state.reader.next() - except (ValueError, StopIteration): - # print state.index - state.file.close() - state.index+=1 - if state.index>=len(self.pathes): - raise StopIteration - state.file=open(self.pathes[state.index]) - state.reader=csv.reader(state.file) - if self.has_header: - state.reader.next() - return self.get_data(state) - - values = [] - for _, constructor in self.columns: - values.append(constructor(line)) - return tuple(values) - -taxi_columns = [ - ("trip_id", lambda l: l[0]), - ("call_type", lambda l: ord(l[1])-ord('A')), - ("origin_call", lambda l: 0 if l[2] == '' or l[2] == 'NA' else origin_call_normalize(int(l[2]))), - ("origin_stand", lambda l: 0 if l[3] == '' or l[3] == 'NA' else int(l[3])), - ("taxi_id", lambda l: taxi_id_normalize(int(l[4]))), - ("timestamp", lambda l: int(l[5])), - ("day_type", lambda l: ord(l[6])-ord('A')), - ("missing_data", lambda l: l[7][0] == 'T'), - ("polyline", lambda l: map(tuple, ast.literal_eval(l[8]))), - ("longitude", lambda l: map(lambda p: p[0], ast.literal_eval(l[8]))), - ("latitude", lambda l: map(lambda p: p[1], ast.literal_eval(l[8]))), -] - -taxi_columns_valid = taxi_columns + [ - ("destination_longitude", lambda l: numpy.float32(float(l[9]))), - ("destination_latitude", lambda l: numpy.float32(float(l[10]))), - ("time", lambda l: int(l[11])), -] - -train_file="%s/train.csv" % data.path -valid_file="%s/valid2-cut.csv" % data.path -test_file="%s/test.csv" % data.path - -train_data=TaxiData(train_file, taxi_columns, has_header=True) -valid_data = TaxiData(valid_file, taxi_columns_valid) -test_data = TaxiData(test_file, taxi_columns, has_header=True) - -valid_trips = [l for l in open("%s/valid2-cut-ids.txt" % data.path)] - -def train_it(): - return DataIterator(DataStream(train_data)) - -def test_it(): - return DataIterator(DataStream(valid_data)) diff --git a/data/csv_to_hdf5.py b/data/csv_to_hdf5.py index 17217f3..97cf428 100755 --- a/data/csv_to_hdf5.py +++ b/data/csv_to_hdf5.py @@ -7,7 +7,6 @@ import sys import h5py import numpy -import theano from fuel.converters.base import fill_hdf5_file import data @@ -32,8 +31,8 @@ def get_unique_origin_call(val): def read_stands(input_directory, h5file): stands_name = numpy.empty(shape=(data.stands_size,), dtype=('a', 24)) - stands_latitude = numpy.empty(shape=(data.stands_size,), dtype=theano.config.floatX) - stands_longitude = numpy.empty(shape=(data.stands_size,), dtype=theano.config.floatX) + stands_latitude = numpy.empty(shape=(data.stands_size,), dtype=numpy.float32) + stands_longitude = numpy.empty(shape=(data.stands_size,), dtype=numpy.float32) stands_name[0] = 'None' stands_latitude[0] = stands_longitude[0] = 0 with open(os.path.join(input_directory, 'metaData_taxistandsID_name_GPSlocation.csv'), 'r') as f: @@ -77,8 +76,8 @@ def read_taxis(input_directory, h5file, dataset): day_type[id] = ord(line[6][0]) - ord('A') missing_data[id] = line[7][0] == 'T' polyline = ast.literal_eval(line[8]) - latitude[id] = numpy.array([point[1] for point in polyline], dtype=theano.config.floatX) - longitude[id] = numpy.array([point[0] for point in polyline], dtype=theano.config.floatX) + latitude[id] = numpy.array([point[1] for point in polyline], dtype=numpy.float32) + longitude[id] = numpy.array([point[0] for point in polyline], dtype=numpy.float32) id+=1 splits = () print >> sys.stderr, 'read %s: writing' % dataset diff --git a/data/init_valid.py b/data/init_valid.py index 14a854c..eed0059 100755 --- a/data/init_valid.py +++ b/data/init_valid.py @@ -6,7 +6,6 @@ import sys import h5py import numpy -import theano import data @@ -22,8 +21,8 @@ _fields = { 'missing_data': numpy.bool, 'latitude': data.Polyline, 'longitude': data.Polyline, - 'destination_latitude': theano.config.floatX, - 'destination_longitude': theano.config.floatX, + 'destination_latitude': numpy.float32, + 'destination_longitude': numpy.float32, 'travel_time': numpy.uint32, } diff --git a/data/rfc4180.py b/data/rfc4180.py new file mode 100644 index 0000000..b6fe5b1 --- /dev/null +++ b/data/rfc4180.py @@ -0,0 +1,107 @@ +import ast +import csv +import numpy + +from fuel.datasets import Dataset +from fuel.streams import DataStream +from fuel.iterator import DataIterator + +import data +from data.hdf5 import origin_call_normalize, taxi_id_normalize + + +class TaxiData(Dataset): + example_iteration_scheme=None + + class State: + __slots__ = ('file', 'index', 'reader') + + def __init__(self, pathes, columns, has_header=False): + if not isinstance(pathes, list): + pathes=[pathes] + assert len(pathes)>0 + self.columns=columns + self.provides_sources = tuple(map(lambda x: x[0], columns)) + self.pathes=pathes + self.has_header=has_header + super(TaxiData, self).__init__() + + def open(self): + state=self.State() + state.file=open(self.pathes[0]) + state.index=0 + state.reader=csv.reader(state.file) + if self.has_header: + state.reader.next() + return state + + def close(self, state): + state.file.close() + + def reset(self, state): + if state.index==0: + state.file.seek(0) + else: + state.index=0 + state.file.close() + state.file=open(self.pathes[0]) + state.reader=csv.reader(state.file) + return state + + def get_data(self, state, request=None): + if request is not None: + raise ValueError + try: + line=state.reader.next() + except (ValueError, StopIteration): + # print state.index + state.file.close() + state.index+=1 + if state.index>=len(self.pathes): + raise StopIteration + state.file=open(self.pathes[state.index]) + state.reader=csv.reader(state.file) + if self.has_header: + state.reader.next() + return self.get_data(state) + + values = [] + for _, constructor in self.columns: + values.append(constructor(line)) + return tuple(values) + +taxi_columns = [ + ("trip_id", lambda l: l[0]), + ("call_type", lambda l: ord(l[1])-ord('A')), + ("origin_call", lambda l: 0 if l[2] == '' or l[2] == 'NA' else origin_call_normalize(int(l[2]))), + ("origin_stand", lambda l: 0 if l[3] == '' or l[3] == 'NA' else int(l[3])), + ("taxi_id", lambda l: taxi_id_normalize(int(l[4]))), + ("timestamp", lambda l: int(l[5])), + ("day_type", lambda l: ord(l[6])-ord('A')), + ("missing_data", lambda l: l[7][0] == 'T'), + ("polyline", lambda l: map(tuple, ast.literal_eval(l[8]))), + ("longitude", lambda l: map(lambda p: p[0], ast.literal_eval(l[8]))), + ("latitude", lambda l: map(lambda p: p[1], ast.literal_eval(l[8]))), +] + +taxi_columns_valid = taxi_columns + [ + ("destination_longitude", lambda l: numpy.float32(float(l[9]))), + ("destination_latitude", lambda l: numpy.float32(float(l[10]))), + ("time", lambda l: int(l[11])), +] + +train_file="%s/train.csv" % data.path +valid_file="%s/valid2-cut.csv" % data.path +test_file="%s/test.csv" % data.path + +train_data=TaxiData(train_file, taxi_columns, has_header=True) +valid_data = TaxiData(valid_file, taxi_columns_valid) +test_data = TaxiData(test_file, taxi_columns, has_header=True) + +valid_trips = [l for l in open("%s/valid2-cut-ids.txt" % data.path)] + +def train_it(): + return DataIterator(DataStream(train_data)) + +def test_it(): + return DataIterator(DataStream(valid_data)) -- cgit v1.2.3