#!/usr/bin/env python # Make a valid dataset by cutting the training set at specified timestamps import os import sys import importlib import h5py import numpy import data from data.hdf5 import taxi_it _fields = ['trip_id', 'call_type', 'origin_call', 'origin_stand', 'taxi_id', 'timestamp', 'day_type', 'missing_data', 'latitude', 'longitude', 'destination_latitude', 'destination_longitude', 'travel_time'] def make_valid(cutfile, outpath): cuts = importlib.import_module('.%s' % cutfile, 'data.cuts').cuts valid = [] for line in taxi_it('train'): time = line['timestamp'] latitude = line['latitude'] longitude = line['longitude'] if len(latitude) == 0: continue for ts in cuts: if time <= ts and time + 15 * (len(latitude) - 1) >= ts: # keep it n = (ts - time) / 15 + 1 line.update({ 'latitude': latitude[:n], 'longitude': longitude[:n], 'destination_latitude': latitude[-1], 'destination_longitude': longitude[-1], 'travel_time': 15 * (len(latitude)-1) }) valid.append(line) file = h5py.File(outpath, 'a') clen = file['trip_id'].shape[0] alen = len(valid) for field in _fields: dset = file[field] dset.resize((clen + alen,)) for i in xrange(alen): dset[clen + i] = valid[i][field] splits = file.attrs['split'] slen = splits.shape[0] splits = numpy.resize(splits, (slen+len(_fields),)) for (i, field) in enumerate(_fields): splits[slen+i]['split'] = ('cuts/%s' % cutfile).encode('utf8') splits[slen+i]['source'] = field.encode('utf8') splits[slen+i]['start'] = clen splits[slen+i]['stop'] = alen splits[slen+i]['available'] = True splits[slen+i]['comment'] = '.' file.attrs['split'] = splits file.flush() file.close() if __name__ == '__main__': if len(sys.argv) < 2 or len(sys.argv) > 3: print >> sys.stderr, 'Usage: %s cutfile [outfile]' % sys.argv[0] sys.exit(1) outpath = os.path.join(data.path, 'valid.hdf5') if len(sys.argv) < 3 else sys.argv[2] make_valid(sys.argv[1], outpath)