diff options
author | Étienne Simon <esimon@esimon.eu> | 2015-05-18 16:22:00 -0400 |
---|---|---|
committer | Étienne Simon <esimon@esimon.eu> | 2015-05-18 16:22:00 -0400 |
commit | 6d946f29f7548c75e97f30c4356dbac200ee6cce (patch) | |
tree | 387e586c7ad0c1a0167d21451c9a8c877cf3ef0e | |
parent | 1e6d08b0c9ac5983691b182631c71e9d46ee71cc (diff) | |
download | taxi-6d946f29f7548c75e97f30c4356dbac200ee6cce.tar.gz taxi-6d946f29f7548c75e97f30c4356dbac200ee6cce.zip |
Refactor models, clean the code and separate training from testing.
25 files changed, 475 insertions, 469 deletions
diff --git a/config/dest_simple_mlp_2_cs.py b/config/dest_simple_mlp_2_cs.py index accb611..be93427 100644 --- a/config/dest_simple_mlp_2_cs.py +++ b/config/dest_simple_mlp_2_cs.py @@ -1,8 +1,8 @@ from blocks.initialization import IsotropicGaussian, Constant -import model.dest_simple_mlp as model - import data +from model.dest_simple_mlp import Model, Stream + n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory n_end_pts = 5 diff --git a/config/dest_simple_mlp_2_cswdt.py b/config/dest_simple_mlp_2_cswdt.py index 62d0db4..a86977c 100644 --- a/config/dest_simple_mlp_2_cswdt.py +++ b/config/dest_simple_mlp_2_cswdt.py @@ -1,8 +1,8 @@ -import model.dest_simple_mlp as model - from blocks.initialization import IsotropicGaussian, Constant import data +from model.dest_simple_mlp import Model, Stream + n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory n_end_pts = 5 diff --git a/config/dest_simple_mlp_2_noembed.py b/config/dest_simple_mlp_2_noembed.py index bbe7798..5615a5c 100644 --- a/config/dest_simple_mlp_2_noembed.py +++ b/config/dest_simple_mlp_2_noembed.py @@ -1,8 +1,8 @@ -import model.dest_simple_mlp as model - from blocks.initialization import IsotropicGaussian, Constant import data +from model.dest_simple_mlp import Model, Stream + n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory n_end_pts = 5 diff --git a/config/dest_simple_mlp_tgtcls_0_cs.py b/config/dest_simple_mlp_tgtcls_0_cs.py index 704e62c..f5b9a1e 100644 --- a/config/dest_simple_mlp_tgtcls_0_cs.py +++ b/config/dest_simple_mlp_tgtcls_0_cs.py @@ -3,8 +3,8 @@ import cPickle from blocks.initialization import IsotropicGaussian, Constant import data +from model.dest_simple_mlp_tgtcls import Model, Stream -import model.dest_simple_mlp_tgtcls as model n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory n_end_pts = 5 diff --git a/config/dest_simple_mlp_tgtcls_1_cs.py b/config/dest_simple_mlp_tgtcls_1_cs.py index f2a22a5..54457e7 100644 --- a/config/dest_simple_mlp_tgtcls_1_cs.py +++ b/config/dest_simple_mlp_tgtcls_1_cs.py @@ -3,8 +3,8 @@ import cPickle from blocks.initialization import IsotropicGaussian, Constant import data +from model.dest_simple_mlp_tgtcls import Model, Stream -import model.dest_simple_mlp_tgtcls as model n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory n_end_pts = 5 diff --git a/config/dest_simple_mlp_tgtcls_1_cswdt.py b/config/dest_simple_mlp_tgtcls_1_cswdt.py index a3ae654..486b7a6 100644 --- a/config/dest_simple_mlp_tgtcls_1_cswdt.py +++ b/config/dest_simple_mlp_tgtcls_1_cswdt.py @@ -3,8 +3,8 @@ import cPickle from blocks.initialization import IsotropicGaussian, Constant import data +from model.dest_simple_mlp_tgtcls import Model, Stream -import model.dest_simple_mlp_tgtcls as model n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory n_end_pts = 5 diff --git a/config/dest_simple_mlp_tgtcls_1_cswdtx.py b/config/dest_simple_mlp_tgtcls_1_cswdtx.py index 6306c15..713581d 100644 --- a/config/dest_simple_mlp_tgtcls_1_cswdtx.py +++ b/config/dest_simple_mlp_tgtcls_1_cswdtx.py @@ -3,8 +3,8 @@ import cPickle from blocks.initialization import IsotropicGaussian, Constant import data +from model.dest_simple_mlp_tgtcls import Model, Stream -import model.dest_simple_mlp_tgtcls as model n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory n_end_pts = 5 diff --git a/config/dest_simple_mlp_tgtcls_1_cswdtx_alexandre.py b/config/dest_simple_mlp_tgtcls_1_cswdtx_alexandre.py index 8c090c7..c0ba36c 100644 --- a/config/dest_simple_mlp_tgtcls_1_cswdtx_alexandre.py +++ b/config/dest_simple_mlp_tgtcls_1_cswdtx_alexandre.py @@ -3,8 +3,8 @@ import cPickle from blocks.initialization import IsotropicGaussian, Constant import data +from model.dest_simple_mlp_tgtcls import Model, Stream -import model.dest_simple_mlp_tgtcls as model n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory n_end_pts = 5 diff --git a/config/joint_simple_mlp_tgtcls_111_cswdtx.py b/config/joint_simple_mlp_tgtcls_111_cswdtx.py index deb6eba..a7e0415 100644 --- a/config/joint_simple_mlp_tgtcls_111_cswdtx.py +++ b/config/joint_simple_mlp_tgtcls_111_cswdtx.py @@ -1,10 +1,10 @@ import cPickle -import model.joint_simple_mlp_tgtcls as model - from blocks.initialization import IsotropicGaussian, Constant import data +from model.joint_simple_mlp_tgtcls import Model, Stream + n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory n_end_pts = 5 diff --git a/config/joint_simple_mlp_tgtcls_111_cswdtx_noise_dout.py b/config/joint_simple_mlp_tgtcls_111_cswdtx_noise_dout.py new file mode 100644 index 0000000..1faea15 --- /dev/null +++ b/config/joint_simple_mlp_tgtcls_111_cswdtx_noise_dout.py @@ -0,0 +1,62 @@ +import cPickle + +from blocks import roles +from blocks.bricks import Rectifier +from blocks.filter import VariableFilter +from blocks.initialization import IsotropicGaussian, Constant + +import data +from model.joint_simple_mlp_tgtcls import Model, Stream + + +n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory +n_end_pts = 5 + +n_valid = 1000 + +with open("%s/arrival-clusters.pkl" % data.path) as f: + dest_tgtcls = cPickle.load(f) + +# generate target classes for time prediction as a Fibonacci sequence +time_tgtcls = [1, 2] +for i in range(22): + time_tgtcls.append(time_tgtcls[-1] + time_tgtcls[-2]) + +dim_embeddings = [ + ('origin_call', data.origin_call_size+1, 10), + ('origin_stand', data.stands_size+1, 10), + ('week_of_year', 52, 10), + ('day_of_week', 7, 10), + ('qhour_of_day', 24 * 4, 10), + ('day_type', 3, 10), + ('taxi_id', 448, 10), +] + +# Common network part +dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings) +dim_hidden = [500] + +# Destination prediction part +dim_hidden_dest = [100] +dim_output_dest = len(dest_tgtcls) + +# Time prediction part +dim_hidden_time = [100] +dim_output_time = len(time_tgtcls) + +# Cost ratio between distance cost and time cost +time_cost_factor = 4 + +embed_weights_init = IsotropicGaussian(0.001) +mlp_weights_init = IsotropicGaussian(0.01) +mlp_biases_init = Constant(0.001) + +batch_size = 200 + +dropout = 0.5 +dropout_inputs = VariableFilter(bricks=[Rectifier], name='output') + +noise = 0.01 +noise_inputs = VariableFilter(roles=[roles.PARAMETER]) + +valid_set = 'cuts/test_times_0' diff --git a/config/joint_simple_mlp_tgtcls_1_cswdtx.py b/config/joint_simple_mlp_tgtcls_1_cswdtx.py index 74b3c75..c4bc2fb 100644 --- a/config/joint_simple_mlp_tgtcls_1_cswdtx.py +++ b/config/joint_simple_mlp_tgtcls_1_cswdtx.py @@ -1,10 +1,10 @@ import cPickle -import model.joint_simple_mlp_tgtcls as model - from blocks.initialization import IsotropicGaussian, Constant import data +from model.joint_simple_mlp_tgtcls import Model, Stream + n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory n_end_pts = 5 diff --git a/config/time_simple_mlp_1.py b/config/time_simple_mlp_1.py index bf3699d..c9203c8 100644 --- a/config/time_simple_mlp_1.py +++ b/config/time_simple_mlp_1.py @@ -1,8 +1,8 @@ -import model.time_simple_mlp as model - from blocks.initialization import IsotropicGaussian, Constant import data +from model.time_simple_mlp import Model, Stream + n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory n_end_pts = 5 diff --git a/config/time_simple_mlp_2_cswdtx.py b/config/time_simple_mlp_2_cswdtx.py index 98467e3..dac585f 100644 --- a/config/time_simple_mlp_2_cswdtx.py +++ b/config/time_simple_mlp_2_cswdtx.py @@ -1,8 +1,8 @@ -import model.time_simple_mlp as model - from blocks.initialization import IsotropicGaussian, Constant import data +from model.time_simple_mlp import Model, Stream + n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory n_end_pts = 5 diff --git a/config/time_simple_mlp_tgtcls_2_cswdtx.py b/config/time_simple_mlp_tgtcls_2_cswdtx.py index eb69714..eb06334 100644 --- a/config/time_simple_mlp_tgtcls_2_cswdtx.py +++ b/config/time_simple_mlp_tgtcls_2_cswdtx.py @@ -1,8 +1,8 @@ -import model.time_simple_mlp_tgtcls as model - from blocks.initialization import IsotropicGaussian, Constant import data +from model.time_simple_mlp_tgtcls import Model, Stream + n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory n_end_pts = 5 diff --git a/data/transformers.py b/data/transformers.py index 57747fc..1b82dae 100644 --- a/data/transformers.py +++ b/data/transformers.py @@ -64,8 +64,9 @@ class TaxiGenerateSplits(Transformer): dlat = numpy.float32(self.data[self.id_latitude][-1]) dlon = numpy.float32(self.data[self.id_longitude][-1]) + ttime = numpy.int32(15 * (len(self.data[self.id_longitude]) - 1)) - return tuple(r + [dlat, dlon, 15 * (len(self.data[self.id_longitude]) - 1)]) + return tuple(r + [dlat, dlon, ttime]) class TaxiAddFirstLastLen(Transformer): def __init__(self, k, stream): @@ -87,7 +88,7 @@ class TaxiAddFirstLastLen(Transformer): dtype=theano.config.floatX), numpy.array(at_least_k(self.k, data[self.id_longitude], True, True)[-self.k:], dtype=theano.config.floatX)) - input_time = (15 * (len(data[self.id_latitude]) - 1),) + input_time = (numpy.int32(15 * (len(data[self.id_latitude]) - 1)),) return data + first_k + last_k + input_time class TaxiAddDateTime(Transformer): @@ -101,7 +102,9 @@ class TaxiAddDateTime(Transformer): ts = data[self.id_timestamp] date = datetime.datetime.utcfromtimestamp(ts) yearweek = date.isocalendar()[1] - 1 - info = ((51 if yearweek == 52 else yearweek), date.weekday(), date.hour * 4 + date.minute / 15) + info = (numpy.int8(51 if yearweek == 52 else yearweek), + numpy.int8(date.weekday()), + numpy.int8(date.hour * 4 + date.minute / 15)) return data + info class TaxiExcludeTrips(Transformer): diff --git a/model/__init__.py b/model/__init__.py index e69de29..5c051f4 100644 --- a/model/__init__.py +++ b/model/__init__.py @@ -0,0 +1,36 @@ +from blocks.bricks import application, Initializable +from blocks.bricks.lookup import LookupTable + + +class ContextEmbedder(Initializable): + def __init__(self, config, **kwargs): + super(ContextEmbedder, self).__init__(**kwargs) + self.dim_embeddings = config.dim_embeddings + self.embed_weights_init = config.embed_weights_init + + self.inputs = [ name for (name, _, _) in self.dim_embeddings ] + self.outputs = [ '%s_embedded' % name for name in self.inputs ] + + self.lookups = { name: LookupTable(name='%s_lookup' % name) for name in self.inputs } + self.children = self.lookups.values() + + def _push_allocation_config(self): + for (name, num, dim) in self.dim_embeddings: + self.lookups[name].length = num + self.lookups[name].dim = dim + + def _push_initialization_config(self): + for name in self.inputs: + self.lookups[name].weights_init = self.embed_weights_init + + @application + def apply(self, **kwargs): + return tuple(self.lookups[name].apply(kwargs[name]) for name in self.inputs) + + @apply.property('inputs') + def apply_inputs(self): + return self.inputs + + @apply.property('outputs') + def apply_outputs(self): + return self.outputs diff --git a/model/dest_simple_mlp.py b/model/dest_simple_mlp.py index a9e97cb..78d7131 100644 --- a/model/dest_simple_mlp.py +++ b/model/dest_simple_mlp.py @@ -1,71 +1,32 @@ -from blocks.bricks import MLP, Rectifier, Linear, Sigmoid, Identity -from blocks.bricks.lookup import LookupTable - from theano import tensor +from blocks.bricks import application, Identity import data import error +from model.mlp import FFMLP, Stream -class Model(object): - def __init__(self, config): - # The input and the targets - x_firstk_latitude = (tensor.matrix('first_k_latitude') - data.train_gps_mean[0]) / data.train_gps_std[0] - x_firstk_longitude = (tensor.matrix('first_k_longitude') - data.train_gps_mean[1]) / data.train_gps_std[1] - - x_lastk_latitude = (tensor.matrix('last_k_latitude') - data.train_gps_mean[0]) / data.train_gps_std[0] - x_lastk_longitude = (tensor.matrix('last_k_longitude') - data.train_gps_mean[1]) / data.train_gps_std[1] - - input_list = [x_firstk_latitude, x_firstk_longitude, x_lastk_latitude, x_lastk_longitude] - embed_tables = [] - - self.require_inputs = ['first_k_latitude', 'first_k_longitude', 'last_k_latitude', 'last_k_longitude'] - - for (varname, num, dim) in config.dim_embeddings: - self.require_inputs.append(varname) - vardata = tensor.lvector(varname) - tbl = LookupTable(length=num, dim=dim, name='%s_lookup'%varname) - embed_tables.append(tbl) - input_list.append(tbl.apply(vardata)) - - y = tensor.concatenate((tensor.vector('destination_latitude')[:, None], - tensor.vector('destination_longitude')[:, None]), axis=1) - - # Define the model - mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden] + [Identity()], - dims=[config.dim_input] + config.dim_hidden + [config.dim_output]) - - # Create the Theano variables - inputs = tensor.concatenate(input_list, axis=1) - # inputs = theano.printing.Print("inputs")(inputs) - outputs = mlp.apply(inputs) - - # Normalize & Center - # outputs = theano.printing.Print("normal_outputs")(outputs) - outputs = data.train_gps_std * outputs + data.train_gps_mean - - # outputs = theano.printing.Print("outputs")(outputs) - # y = theano.printing.Print("y")(y) - outputs.name = 'outputs' +class Model(FFMLP): + def __init__(self, config, **kwargs): + super(Model, self).__init__(config, output_layer=Identity, **kwargs) - # Calculate the cost - cost = error.erdist(outputs, y).mean() - cost.name = 'cost' - hcost = error.hdist(outputs, y).mean() - hcost.name = 'hcost' + @application(outputs=['destination']) + def predict(self, **kwargs): + outputs = super(Model, self).predict(**kwargs) + return data.train_gps_std * outputs + data.train_gps_mean - # Initialization - for tbl in embed_tables: - tbl.weights_init = config.embed_weights_init - mlp.weights_init = config.mlp_weights_init - mlp.biases_init = config.mlp_biases_init + @predict.property('inputs') + def predict_inputs(self): + return self.inputs - for tbl in embed_tables: - tbl.initialize() - mlp.initialize() + @application(outputs=['cost']) + def cost(self, **kwargs): + y_hat = self.predict(**kwargs) + y = tensor.concatenate((kwargs['destination_latitude'][:, None], + kwargs['destination_longitude'][:, None]), axis=1) - self.cost = cost - self.monitor = [cost, hcost] - self.outputs = outputs - self.pred_vars = ['destination_latitude', 'destination_longitude'] + return error.erdist(y_hat, y).mean() + @cost.property('inputs') + def cost_inputs(self): + return self.inputs + ['destination_latitude', 'destination_longitude'] diff --git a/model/dest_simple_mlp_tgtcls.py b/model/dest_simple_mlp_tgtcls.py index 1381d4c..2d65097 100644 --- a/model/dest_simple_mlp_tgtcls.py +++ b/model/dest_simple_mlp_tgtcls.py @@ -1,73 +1,34 @@ -import numpy - +import numpy import theano from theano import tensor +from blocks.bricks import application, Softmax -from blocks.bricks import MLP, Rectifier, Linear, Sigmoid, Identity, Softmax -from blocks.bricks.lookup import LookupTable - -import data import error +from model.mlp import FFMLP, Stream -class Model(object): - def __init__(self, config): - # The input and the targets - x_firstk_latitude = (tensor.matrix('first_k_latitude') - data.train_gps_mean[0]) / data.train_gps_std[0] - x_firstk_longitude = (tensor.matrix('first_k_longitude') - data.train_gps_mean[1]) / data.train_gps_std[1] - - x_lastk_latitude = (tensor.matrix('last_k_latitude') - data.train_gps_mean[0]) / data.train_gps_std[0] - x_lastk_longitude = (tensor.matrix('last_k_longitude') - data.train_gps_mean[1]) / data.train_gps_std[1] - - input_list = [x_firstk_latitude, x_firstk_longitude, x_lastk_latitude, x_lastk_longitude] - embed_tables = [] - - self.require_inputs = ['first_k_latitude', 'first_k_longitude', 'last_k_latitude', 'last_k_longitude'] - - for (varname, num, dim) in config.dim_embeddings: - self.require_inputs.append(varname) - vardata = tensor.lvector(varname) - tbl = LookupTable(length=num, dim=dim, name='%s_lookup'%varname) - embed_tables.append(tbl) - input_list.append(tbl.apply(vardata)) - - y = tensor.concatenate((tensor.vector('destination_latitude')[:, None], - tensor.vector('destination_longitude')[:, None]), axis=1) - - # Define the model - mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden] + [Softmax()], - dims=[config.dim_input] + config.dim_hidden + [config.dim_output]) - classes = theano.shared(numpy.array(config.tgtcls, dtype=theano.config.floatX), name='classes') - - # Create the Theano variables - inputs = tensor.concatenate(input_list, axis=1) - - # inputs = theano.printing.Print("inputs")(inputs) - cls_probas = mlp.apply(inputs) - outputs = tensor.dot(cls_probas, classes) - - # outputs = theano.printing.Print("outputs")(outputs) - # y = theano.printing.Print("y")(y) - outputs.name = 'outputs' +class Model(FFMLP): + def __init__(self, config, **kwargs): + super(Model, self, output_layer=Softmax).__init__(config, **kwargs) + self.classes = theano.shared(numpy.array(config.tgtcls, dtype=theano.config.floatX), name='classes') - # Calculate the cost - cost = error.erdist(outputs, y).mean() - cost.name = 'cost' - hcost = error.hdist(outputs, y).mean() - hcost.name = 'hcost' + @application(outputs=['destination']) + def predict(self, **kwargs): + cls_probas = super(Model, self).predict(**kwargs) + return tensor.dot(cls_probas, self.classes) - # Initialization - for tbl in embed_tables: - tbl.weights_init = config.embed_weights_init - mlp.weights_init = config.mlp_weights_init - mlp.biases_init = config.mlp_biases_init + @predict.property('inputs') + def predict_inputs(self): + return self.inputs - for tbl in embed_tables: - tbl.initialize() - mlp.initialize() + @application(outputs=['cost']) + def cost(self, **kwargs): + y_hat = self.predict(**kwargs) + y = tensor.concatenate((kwargs['destination_latitude'][:, None], + kwargs['destination_longitude'][:, None]), axis=1) - self.cost = cost - self.monitor = [cost, hcost] - self.outputs = outputs - self.pred_vars = ['destination_latitude', 'destination_longitude'] + return error.erdist(y_hat, y).mean() + @cost.property('inputs') + def cost_inputs(self): + return self.inputs + ['destination_latitude', 'destination_longitude'] diff --git a/model/joint_simple_mlp_tgtcls.py b/model/joint_simple_mlp_tgtcls.py index 834afbf..d6d4e49 100644 --- a/model/joint_simple_mlp_tgtcls.py +++ b/model/joint_simple_mlp_tgtcls.py @@ -1,109 +1,71 @@ -from blocks.bricks import MLP, Rectifier, Linear, Sigmoid, Identity, Softmax -from blocks.bricks.lookup import LookupTable - -from blocks.filter import VariableFilter -from blocks.graph import ComputationGraph, apply_dropout - import numpy import theano from theano import tensor +from blocks import roles +from blocks.bricks import application, MLP, Rectifier, Softmax -import data import error +from model.mlp import FFMLP, Stream -class Model(object): - def __init__(self, config): - # The input and the targets - x_firstk_latitude = (tensor.matrix('first_k_latitude') - data.train_gps_mean[0]) / data.train_gps_std[0] - x_firstk_longitude = (tensor.matrix('first_k_longitude') - data.train_gps_mean[1]) / data.train_gps_std[1] - x_lastk_latitude = (tensor.matrix('last_k_latitude') - data.train_gps_mean[0]) / data.train_gps_std[0] - x_lastk_longitude = (tensor.matrix('last_k_longitude') - data.train_gps_mean[1]) / data.train_gps_std[1] +class Model(FFMLP): + def __init__(self, config, **kwargs): + super(Model, self).__init__(config, **kwargs) + + self.dest_mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden_dest] + [Softmax()], + dims=[config.dim_hidden[-1]] + config.dim_hidden_dest + [config.dim_output_dest], + name='dest_mlp') + self.time_mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden_time] + [Softmax()], + dims=[config.dim_hidden[-1]] + config.dim_hidden_time + [config.dim_output_time], + name='time_mlp') - x_input_time = tensor.lvector('input_time') + self.dest_classes = theano.shared(numpy.array(config.dest_tgtcls, dtype=theano.config.floatX), name='dest_classes') + self.time_classes = theano.shared(numpy.array(config.time_tgtcls, dtype=theano.config.floatX), name='time_classes') - input_list = [x_firstk_latitude, x_firstk_longitude, x_lastk_latitude, x_lastk_longitude] - embed_tables = [] + self.inputs.append('input_time') + self.children.extend([self.dest_mlp, self.time_mlp]) - self.require_inputs = ['first_k_latitude', 'first_k_longitude', 'last_k_latitude', 'last_k_longitude', 'input_time'] + def _push_initialization_config(self): + super(Model, self)._push_initialization_config() + for mlp in [self.dest_mlp, self.time_mlp]: + mlp.weights_init = self.config.mlp_weights_init + mlp.biases_init = self.config.mlp_biases_init - for (varname, num, dim) in config.dim_embeddings: - self.require_inputs.append(varname) - vardata = tensor.lvector(varname) - tbl = LookupTable(length=num, dim=dim, name='%s_lookup'%varname) - embed_tables.append(tbl) - input_list.append(tbl.apply(vardata)) + @application(outputs=['destination', 'duration']) + def predict(self, **kwargs): + hidden = super(Model, self).predict(**kwargs) - y_dest = tensor.concatenate((tensor.vector('destination_latitude')[:, None], - tensor.vector('destination_longitude')[:, None]), axis=1) - y_time = tensor.lvector('travel_time') + dest_cls_probas = self.dest_mlp.apply(hidden) + dest_outputs = tensor.dot(dest_cls_probas, self.dest_classes) - # Define the model - common_mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden], - dims=[config.dim_input] + config.dim_hidden) + time_cls_probas = self.time_mlp.apply(hidden) + time_outputs = kwargs['input_time'] + tensor.dot(time_cls_probas, self.time_classes) - dest_mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden_dest] + [Softmax()], - dims=[config.dim_hidden[-1]] + config.dim_hidden_dest + [config.dim_output_dest], - name='dest_mlp') - dest_classes = theano.shared(numpy.array(config.dest_tgtcls, dtype=theano.config.floatX), name='dest_classes') + self.add_auxiliary_variable(dest_cls_probas, name='destination classes ponderations') + self.add_auxiliary_variable(time_cls_probas, name='time classes ponderations') - time_mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden_time] + [Softmax()], - dims=[config.dim_hidden[-1]] + config.dim_hidden_time + [config.dim_output_time], - name='time_mlp') - time_classes = theano.shared(numpy.array(config.time_tgtcls, dtype=theano.config.floatX), name='time_classes') - - # Create the Theano variables - inputs = tensor.concatenate(input_list, axis=1) - # inputs = theano.printing.Print("inputs")(inputs) - hidden = common_mlp.apply(inputs) - - dest_cls_probas = dest_mlp.apply(hidden) - dest_outputs = tensor.dot(dest_cls_probas, dest_classes) - dest_outputs.name = 'dest_outputs' - - time_cls_probas = time_mlp.apply(hidden) - time_outputs = tensor.dot(time_cls_probas, time_classes) + x_input_time - time_outputs.name = 'time_outputs' - - # Calculate the cost - dest_cost = error.erdist(dest_outputs, y_dest).mean() - dest_cost.name = 'dest_cost' - dest_hcost = error.hdist(dest_outputs, y_dest).mean() - dest_hcost.name = 'dest_hcost' - - time_cost = error.rmsle(time_outputs.flatten(), y_time.flatten()) - time_cost.name = 'time_cost' - time_scost = config.time_cost_factor * time_cost - time_scost.name = 'time_scost' - - cost = dest_cost + time_scost - - if hasattr(config, 'dropout_p'): - cg = ComputationGraph(cost) - dropout_inputs = VariableFilter( - bricks=[b for b in list(common_mlp.children) + - list(dest_mlp.children) + - list(time_mlp.children) - if isinstance(b, Rectifier)], - name='output')(cg) - cg = apply_dropout(cg, dropout_inputs, config.dropout_p) - cost = cg.outputs[0] - - cost.name = 'cost' - - # Initialization - for tbl in embed_tables: - tbl.weights_init = config.embed_weights_init - tbl.initialize() - - for mlp in [common_mlp, dest_mlp, time_mlp]: - mlp.weights_init = config.mlp_weights_init - mlp.biases_init = config.mlp_biases_init - mlp.initialize() - - self.cost = cost - self.monitor = [cost, dest_cost, dest_hcost, time_cost, time_scost] - self.outputs = tensor.concatenate([dest_outputs, time_outputs[:, None]], axis=1) - self.outputs.name = 'outputs' - self.pred_vars = ['destination_longitude', 'destination_latitude', 'travel_time'] + return (dest_outputs, time_outputs) + + @predict.property('inputs') + def predict_inputs(self): + return self.inputs + + @application(outputs=['cost']) + def cost(self, **kwargs): + (destination_hat, time_hat) = self.predict(**kwargs) + + destination = tensor.concatenate((kwargs['destination_latitude'][:, None], + kwargs['destination_longitude'][:, None]), axis=1) + time = kwargs['travel_time'] + + destination_cost = error.erdist(destination_hat, destination).mean() + time_cost = error.rmsle(time_hat.flatten(), time.flatten()) + + self.add_auxiliary_variable(destination_cost, [roles.COST], 'destination_cost') + self.add_auxiliary_variable(time_cost, [roles.COST], 'time_cost') + + return destination_cost + self.config.time_cost_factor * time_cost + @cost.property('inputs') + def cost_inputs(self): + return self.inputs + ['destination_latitude', 'destination_longitude', 'travel_time'] diff --git a/model/mlp.py b/model/mlp.py new file mode 100644 index 0000000..9c84ef9 --- /dev/null +++ b/model/mlp.py @@ -0,0 +1,103 @@ +from theano import tensor + +from fuel.transformers import Batch +from fuel.streams import DataStream +from fuel.schemes import ConstantScheme, ShuffledExampleScheme +from blocks.bricks import application, MLP, Rectifier, Initializable + +import data +from data import transformers +from data.hdf5 import TaxiDataset, TaxiStream +from model import ContextEmbedder + + +class FFMLP(Initializable): + def __init__(self, config, output_layer=None, **kwargs): + super(FFMLP, self).__init__(**kwargs) + self.config = config + + self.context_embedder = ContextEmbedder(config) + + output_activation = [] if output_layer is None else [output_layer()] + output_dim = [] if output_layer is None else [config.dim_output] + self.mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden] + output_activation, + dims=[config.dim_input] + config.dim_hidden + output_dim) + + self.extremities = {'%s_k_%s' % (side, ['latitude', 'longitude'][axis]): axis for side in ['first', 'last'] for axis in [0, 1]} + self.inputs = self.context_embedder.inputs + self.extremities.keys() + self.children = [ self.context_embedder, self.mlp ] + + def _push_initialization_config(self): + self.mlp.weights_init = self.config.mlp_weights_init + self.mlp.biases_init = self.config.mlp_biases_init + + @application(outputs=['prediction']) + def predict(self, **kwargs): + embeddings = tuple(self.context_embedder.apply(**{k: kwargs[k] for k in self.context_embedder.inputs })) + extremities = tuple((kwargs[k] - data.train_gps_mean[v]) / data.train_gps_std[v] for k, v in self.extremities.items()) + + inputs = tensor.concatenate(extremities + embeddings, axis=1) + outputs = self.mlp.apply(inputs) + + return outputs + + @predict.property('inputs') + def predict_inputs(self): + return self.inputs + +class Stream(object): + def __init__(self, config): + self.config = config + + def train(self, req_vars): + stream = TaxiDataset('train') + stream = DataStream(stream, iteration_scheme=ShuffledExampleScheme(stream.num_examples)) + + valid = TaxiDataset(self.config.valid_set, 'valid.hdf5', sources=('trip_id',)) + valid_trips_ids = valid.get_data(None, slice(0, valid.num_examples))[0] + + stream = transformers.TaxiExcludeTrips(valid_trips_ids, stream) + stream = transformers.TaxiGenerateSplits(stream, max_splits=100) + + stream = transformers.TaxiAddDateTime(stream) + stream = transformers.TaxiAddFirstLastLen(self.config.n_begin_end_pts, stream) + stream = transformers.Select(stream, tuple(req_vars)) + return Batch(stream, iteration_scheme=ConstantScheme(self.config.batch_size)) + + def valid(self, req_vars): + stream = TaxiStream(self.config.valid_set, 'valid.hdf5') + + stream = transformers.TaxiAddDateTime(stream) + stream = transformers.TaxiAddFirstLastLen(self.config.n_begin_end_pts, stream) + stream = transformers.Select(stream, tuple(req_vars)) + return Batch(stream, iteration_scheme=ConstantScheme(1000)) + + def test(self, req_vars): + stream = TaxiStream('test') + + stream = transformers.TaxiAddDateTime(stream) + stream = transformers.TaxiAddFirstLastLen(self.config.n_begin_end_pts, stream) + + return Batch(stream, iteration_scheme=ConstantScheme(1)) + + def inputs(self): + return {'call_type': tensor.bvector('call_type'), + 'origin_call': tensor.ivector('origin_call'), + 'origin_stand': tensor.bvector('origin_stand'), + 'taxi_id': tensor.wvector('taxi_id'), + 'timestamp': tensor.ivector('timestamp'), + 'day_type': tensor.bvector('day_type'), + 'missing_data': tensor.bvector('missing_data'), + 'latitude': tensor.matrix('latitude'), + 'longitude': tensor.matrix('longitude'), + 'destination_latitude': tensor.vector('destination_latitude'), + 'destination_longitude': tensor.vector('destination_longitude'), + 'travel_time': tensor.ivector('travel_time'), + 'first_k_latitude': tensor.matrix('first_k_latitude'), + 'first_k_longitude': tensor.matrix('first_k_longitude'), + 'last_k_latitude': tensor.matrix('last_k_latitude'), + 'last_k_longitude': tensor.matrix('last_k_longitude'), + 'input_time': tensor.ivector('input_time'), + 'week_of_year': tensor.bvector('week_of_year'), + 'day_of_week': tensor.bvector('day_of_week'), + 'qhour_of_day': tensor.bvector('qhour_of_day')} diff --git a/model/time_simple_mlp.py b/model/time_simple_mlp.py index 9e1b10a..0ce6b29 100644 --- a/model/time_simple_mlp.py +++ b/model/time_simple_mlp.py @@ -1,63 +1,29 @@ -from blocks.bricks import MLP, Rectifier, Linear, Sigmoid, Identity -from blocks.bricks.lookup import LookupTable +from blocks.bricks import application, Identity -from theano import tensor - -import data import error +from model.mlp import FFMLP, Stream -class Model(object): - def __init__(self, config): - # The input and the targets - x_firstk_latitude = (tensor.matrix('first_k_latitude') - data.train_gps_mean[0]) / data.train_gps_std[0] - x_firstk_longitude = (tensor.matrix('first_k_longitude') - data.train_gps_mean[1]) / data.train_gps_std[1] - - x_lastk_latitude = (tensor.matrix('last_k_latitude') - data.train_gps_mean[0]) / data.train_gps_std[0] - x_lastk_longitude = (tensor.matrix('last_k_longitude') - data.train_gps_mean[1]) / data.train_gps_std[1] - - input_list = [x_firstk_latitude, x_firstk_longitude, x_lastk_latitude, x_lastk_longitude] - embed_tables = [] - - self.require_inputs = ['first_k_latitude', 'first_k_longitude', 'last_k_latitude', 'last_k_longitude'] - - for (varname, num, dim) in config.dim_embeddings: - self.require_inputs.append(varname) - vardata = tensor.lvector(varname) - tbl = LookupTable(length=num, dim=dim, name='%s_lookup'%varname) - embed_tables.append(tbl) - input_list.append(tbl.apply(vardata)) - - y = tensor.lvector('travel_time') - - # Define the model - mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden] + [Identity()], - dims=[config.dim_input] + config.dim_hidden + [config.dim_output]) - - # Create the Theano variables - inputs = tensor.concatenate(input_list, axis=1) - # inputs = theano.printing.Print("inputs")(inputs) - outputs = config.exp_base ** mlp.apply(inputs) - - # outputs = theano.printing.Print("outputs")(outputs) - # y = theano.printing.Print("y")(y) - outputs.name = 'outputs' +class Model(FFMLP): + def __init__(self, config, **kwargs): + super(Model, self).__init__(config, output_layer=Identity, **kwargs) + self.inputs.append('input_time') - # Calculate the cost - cost = error.rmsle(outputs.flatten(), y.flatten()) - cost.name = 'cost' + @application(outputs=['duration']) + def predict(self, **kwargs): + outputs = super(Model, self).predict(**kwargs) + return kwargs['input_time'] + self.config.exp_base ** outputs - # Initialization - for tbl in embed_tables: - tbl.weights_init = config.embed_weights_init - mlp.weights_init = config.mlp_weights_init - mlp.biases_init = config.mlp_biases_init + @predict.property('inputs') + def predict_inputs(self): + return self.inputs - for tbl in embed_tables: - tbl.initialize() - mlp.initialize() + @application(outputs=['cost']) + def cost(self, **kwargs): + y_hat = self.predict(**kwargs) + y = kwargs['travel_time'] + return error.rmsle(y_hat.flatten(), y.flatten()) - self.cost = cost - self.monitor = [cost] - self.outputs = outputs - self.pred_vars = ['travel_time'] + @cost.property('inputs') + def cost_inputs(self): + return self.inputs + ['travel_time'] diff --git a/model/time_simple_mlp_tgtcls.py b/model/time_simple_mlp_tgtcls.py index 1f1eab7..35c8d8a 100644 --- a/model/time_simple_mlp_tgtcls.py +++ b/model/time_simple_mlp_tgtcls.py @@ -1,67 +1,33 @@ -from blocks.bricks import MLP, Rectifier, Linear, Sigmoid, Identity, Softmax -from blocks.bricks.lookup import LookupTable - import numpy import theano from theano import tensor +from blocks.bricks import application, Softmax -import data import error +from model.mlp import FFMLP, Stream -class Model(object): - def __init__(self, config): - # The input and the targets - x_firstk_latitude = (tensor.matrix('first_k_latitude') - data.train_gps_mean[0]) / data.train_gps_std[0] - x_firstk_longitude = (tensor.matrix('first_k_longitude') - data.train_gps_mean[1]) / data.train_gps_std[1] - - x_lastk_latitude = (tensor.matrix('last_k_latitude') - data.train_gps_mean[0]) / data.train_gps_std[0] - x_lastk_longitude = (tensor.matrix('last_k_longitude') - data.train_gps_mean[1]) / data.train_gps_std[1] - - input_list = [x_firstk_latitude, x_firstk_longitude, x_lastk_latitude, x_lastk_longitude] - embed_tables = [] - - self.require_inputs = ['first_k_latitude', 'first_k_longitude', 'last_k_latitude', 'last_k_longitude'] - - for (varname, num, dim) in config.dim_embeddings: - self.require_inputs.append(varname) - vardata = tensor.lvector(varname) - tbl = LookupTable(length=num, dim=dim, name='%s_lookup'%varname) - embed_tables.append(tbl) - input_list.append(tbl.apply(vardata)) - - y = tensor.lvector('travel_time') - - # Define the model - mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden] + [Softmax()], - dims=[config.dim_input] + config.dim_hidden + [config.dim_output]) - classes = theano.shared(numpy.array(config.tgtcls, dtype=theano.config.floatX), name='classes') - - # Create the Theano variables - inputs = tensor.concatenate(input_list, axis=1) - # inputs = theano.printing.Print("inputs")(inputs) - cls_probas = mlp.apply(inputs) - outputs = tensor.dot(cls_probas, classes) - - # outputs = theano.printing.Print("outputs")(outputs) - # y = theano.printing.Print("y")(y) - outputs.name = 'outputs' +class Model(FFMLP): + def __init__(self, config, **kwargs): + super(Model, self, output_layer=Softmax).__init__(config, **kwargs) + self.classes = theano.shared(numpy.array(config.tgtcls, dtype=theano.config.floatX), name='classes') + self.inputs.append('input_time') - # Calculate the cost - cost = error.rmsle(outputs.flatten(), y.flatten()) - cost.name = 'cost' + @application(outputs=['duration']) + def predict(self, **kwargs): + cls_probas = super(Model, self).predict(**kwargs) + return kwargs['input_time'] + tensor.dot(cls_probas, self.classes) - # Initialization - for tbl in embed_tables: - tbl.weights_init = config.embed_weights_init - mlp.weights_init = config.mlp_weights_init - mlp.biases_init = config.mlp_biases_init + @predict.property('inputs') + def predict_inputs(self): + return self.inputs - for tbl in embed_tables: - tbl.initialize() - mlp.initialize() + @application(outputs=['cost']) + def cost(self, **kwargs): + y_hat = self.predict(**kwargs) + y = kwargs['travel_time'] + return error.rmsle(y_hat.flatten(), y.flatten()) - self.cost = cost - self.monitor = [cost] - self.outputs = outputs - self.pred_vars = ['travel_time'] + @cost.property('inputs') + def cost_inputs(self): + return self.inputs + ['travel_time'] @@ -0,0 +1,48 @@ +#!/usr/bin/env python + +import sys +import os +import importlib +import csv + +from blocks.dump import load_parameter_values +from blocks.model import Model + + +if __name__ == "__main__": + if len(sys.argv) != 2: + print >> sys.stderr, 'Usage: %s config' % sys.argv[0] + sys.exit(1) + model_name = sys.argv[1] + config = importlib.import_module('.%s' % model_name, 'config') + model_config = config.Model(config) + + stream = config.Stream(config) + inputs = stream.inputs() + outputs = model_config.predict.outputs + req_vars_test = model_config.predict.inputs + ['trip_id'] + test_stream = stream.test(req_vars_test) + + model = Model(model_config.predict(**inputs)) + parameters = load_parameter_values(os.path.join('model_data', model_name, 'params.npz')) + model.set_param_values(parameters) + + if 'destination' in outputs: + dest_outfile = open("output/test-dest-output-%s.csv" % model_name, "w") + dest_outcsv = csv.writer(dest_outfile) + dest_outcsv.writerow(["TRIP_ID", "LATITUDE", "LONGITUDE"]) + if 'duration' in outputs: + time_outfile = open("output/test-time-output-%s.csv" % model_name, "w") + time_outcsv = csv.writer(time_outfile) + time_outcsv.writerow(["TRIP_ID", "TRAVEL_TIME"]) + + function = model.get_theano_function() + for d in test_stream.get_epoch_iterator(as_dict=True): + input_values = [d[k.name] for k in model.inputs] + output_values = function(*input_values) + if 'destination' in outputs: + destination = output_values[outputs.index('destination')] + dest_outcsv.writerow([d['trip_id'][0], destination[0, 0], destination[0, 1]]) + if 'duration' in outputs: + duration = output_values[outputs.index('duration')] + time_outcsv.writerow([d['trip_id'][0], duration[0]]) @@ -1,32 +1,25 @@ #!/usr/bin/env python -import sys -import logging import importlib +import logging +import operator +import os +import sys +from functools import reduce -import csv - -from picklable_itertools.extras import equizip - -from blocks.model import Model - -from fuel.transformers import Batch -from fuel.streams import DataStream -from fuel.schemes import ConstantScheme, ShuffledExampleScheme - -from blocks.algorithms import CompositeRule, RemoveNotFinite, GradientDescent, AdaDelta, Momentum -from blocks.graph import ComputationGraph, apply_dropout -from blocks.main_loop import MainLoop +from blocks import roles +from blocks.algorithms import AdaDelta, CompositeRule, GradientDescent, RemoveNotFinite from blocks.extensions import Printing, FinishAfter -from blocks.extensions.saveload import Dump, LoadFromDump, Checkpoint from blocks.extensions.monitoring import DataStreamMonitoring, TrainingDataMonitoring from blocks.extensions.plot import Plot +from blocks.extensions.saveload import Dump, LoadFromDump +from blocks.filter import VariableFilter +from blocks.graph import ComputationGraph, apply_dropout, apply_noise +from blocks.main_loop import MainLoop +from blocks.model import Model -from theano import tensor -from data import transformers -from data.hdf5 import TaxiDataset, TaxiStream -import apply_model +logger = logging.getLogger(__name__) if __name__ == "__main__": if len(sys.argv) != 2: @@ -35,123 +28,70 @@ if __name__ == "__main__": model_name = sys.argv[1] config = importlib.import_module('.%s' % model_name, 'config') -def compile_valid_trip_ids(): - valid = TaxiDataset(config.valid_set, 'valid.hdf5', sources=('trip_id',)) - ids = valid.get_data(None, slice(0, valid.num_examples)) - return set(ids[0]) - -def setup_train_stream(req_vars, valid_trips_ids): - train = TaxiDataset('train') - train = DataStream(train, iteration_scheme=ShuffledExampleScheme(train.num_examples)) - - train = transformers.TaxiExcludeTrips(valid_trips_ids, train) - train = transformers.TaxiGenerateSplits(train, max_splits=100) - - train = transformers.TaxiAddDateTime(train) - train = transformers.TaxiAddFirstLastLen(config.n_begin_end_pts, train) - train = transformers.Select(train, tuple(req_vars)) - - train_stream = Batch(train, iteration_scheme=ConstantScheme(config.batch_size)) - - return train_stream - -def setup_valid_stream(req_vars): - valid = TaxiStream(config.valid_set, 'valid.hdf5') + logger.info('# Configuration: %s' % config.__name__) + for key in dir(config): + if not key.startswith('__') and isinstance(getattr(config, key), (int, str, list, tuple)): + logger.info(' %20s %s' % (key, str(getattr(config, key)))) - valid = transformers.TaxiAddDateTime(valid) - valid = transformers.TaxiAddFirstLastLen(config.n_begin_end_pts, valid) - valid = transformers.Select(valid, tuple(req_vars)) + model = config.Model(config) + model.initialize() - valid_stream = Batch(valid, iteration_scheme=ConstantScheme(1000)) - - return valid_stream - -def setup_test_stream(req_vars): - test = TaxiStream('test') - - test = transformers.TaxiAddDateTime(test) - test = transformers.TaxiAddFirstLastLen(config.n_begin_end_pts, test) - test = transformers.Select(test, tuple(req_vars)) - - test_stream = Batch(test, iteration_scheme=ConstantScheme(1000)) - - return test_stream - - -def main(): - model = config.model.Model(config) - - cost = model.cost - outputs = model.outputs - - req_vars = model.require_inputs + model.pred_vars - req_vars_test = model.require_inputs + [ 'trip_id' ] + stream = config.Stream(config) + inputs = stream.inputs() + req_vars = model.cost.inputs - valid_trips_ids = compile_valid_trip_ids() - train_stream = setup_train_stream(req_vars, valid_trips_ids) - valid_stream = setup_valid_stream(req_vars) + train_stream = stream.train(req_vars) + valid_stream = stream.valid(req_vars) - # Training + cost = model.cost(**inputs) cg = ComputationGraph(cost) + unmonitor = set() + if hasattr(config, 'dropout') and config.dropout < 1.0: + unmonitor.update(VariableFilter(roles=[roles.COST])(cg.variables)) + cg = apply_dropout(cg, config.dropout_inputs(cg), config.dropout) + if hasattr(config, 'noise') and config.noise > 0.0: + unmonitor.update(VariableFilter(roles=[roles.COST])(cg.variables)) + cg = apply_noise(cg, config.noise_inputs(cg), config.noise) + cost = cg.outputs[0] + cg = Model(cost) + + logger.info('# Parameter shapes:') + parameters_size = 0 + for key, value in cg.get_params().iteritems(): + logger.info(' %20s %s' % (value.get_value().shape, key)) + parameters_size += reduce(operator.mul, value.get_value().shape, 1) + logger.info('Total number of parameters: %d in %d matrices' % (parameters_size, len(cg.get_params()))) params = cg.parameters - algorithm = GradientDescent( cost=cost, step_rule=CompositeRule([ RemoveNotFinite(), - #AdaDelta(decay_rate=0.95), - Momentum(learning_rate=config.learning_rate, momentum=config.momentum), - ]), + AdaDelta(), + ]), params=params) - plot_vars = [['valid_' + x.name for x in model.monitor]] - print "Plot: ", plot_vars - - extensions=[TrainingDataMonitoring(model.monitor, prefix='train', every_n_batches=1000), - DataStreamMonitoring(model.monitor, valid_stream, + monitored = set([cost] + VariableFilter(roles=[roles.COST])(cg.variables)) - unmonitor + plot_vars = [['valid_' + x.name for x in monitored]] + logger.info('Plotted variables: %s' % str(plot_vars)) + + dump_path = os.path.join('model_data', model_name) + logger.info('Dump path: %s' % dump_path) + extensions=[TrainingDataMonitoring(monitored, prefix='train', every_n_batches=1000), + DataStreamMonitoring(monitored, valid_stream, prefix='valid', - every_n_batches=500), - Printing(every_n_batches=500), + every_n_batches=1000), + Printing(every_n_batches=1000), Plot(model_name, channels=plot_vars, every_n_batches=500), - # Checkpoint('model.pkl', every_n_batches=100), - Dump('model_data/' + model_name, every_n_batches=500), - LoadFromDump('model_data/' + model_name), - # FinishAfter(after_epoch=4), + Dump(dump_path, every_n_batches=5000), + LoadFromDump(dump_path), + #FinishAfter(after_n_batches=2), ] main_loop = MainLoop( - model=Model([cost]), + model=cg, data_stream=train_stream, algorithm=algorithm, extensions=extensions) main_loop.run() main_loop.profile.report() - - # Produce an output on the test data - test_stream = setup_test_stream(req_vars_test) - - if 'destination_longitude' in model.pred_vars: - dest_outfile = open("output/test-dest-output-%s.csv" % model_name, "w") - dest_outcsv = csv.writer(dest_outfile) - dest_outcsv.writerow(["TRIP_ID", "LATITUDE", "LONGITUDE"]) - if 'travel_time' in model.pred_vars: - time_outfile = open("output/test-time-output-%s.csv" % model_name, "w") - time_outcsv = csv.writer(time_outfile) - time_outcsv.writerow(["TRIP_ID", "TRAVEL_TIME"]) - - for out in apply_model.Apply(outputs=outputs, stream=test_stream, return_vars=['trip_id', 'outputs']): - outputs = out['outputs'] - for i, trip in enumerate(out['trip_id']): - if model.pred_vars == ['travel_time']: - time_outcsv.writerow([trip, int(outputs[i])]) - else: - dest_outcsv.writerow([trip, repr(outputs[i, 0]), repr(outputs[i, 1])]) - if 'travel_time' in model.pred_vars: - time_outcsv.writerow([trip, int(outputs[i, 2])]) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - main() - diff --git a/visualizer/HTTPServer.py b/visualizer/HTTPServer.py index e71bef3..52b2531 100755 --- a/visualizer/HTTPServer.py +++ b/visualizer/HTTPServer.py @@ -7,8 +7,6 @@ import SimpleHTTPServer import SocketServer from cStringIO import StringIO -import h5py - import data from data.hdf5 import TaxiDataset from visualizer import Vlist, Path |