From 0fd3b1497ffa1bb625bf593c845e28901bc640b7 Mon Sep 17 00:00:00 2001
From: AdeB
Date: Sun, 21 Jun 2015 17:00:25 -0400
Subject: Model with only embeddings.

---
Review notes (text in this position is ignored by git-am and is not part
of the commit message):

* model/mlp_emb.py, Model.predict: the second branch compared
  config.dim_output (an integer layer size) with "clusters", so it could
  never match and predict() fell through returning None.  It now tests
  config.output_mode and raises ValueError for unknown modes.
* data/__init__.py: the added assignment overrides the default directly
  above it with a per-user directory.  This looks like a local setting
  that should not be committed -- confirm before applying.
* Docstrings and comments were added to the two new files; the hunk sizes
  and the diffstat were updated accordingly, but the "index" blob ids
  below still name the originally committed contents.

 config/dest_simple_mlp_emb_only.py |  36 +++++++++
 data/__init__.py                   |   1 +
 data/transformers.py               |   2 +-
 model/mlp.py                       |   1 +
 model/mlp_emb.py                   | 149 +++++++++++++++++++++++++++++++++++++
 5 files changed, 188 insertions(+), 1 deletion(-)
 create mode 100644 config/dest_simple_mlp_emb_only.py
 create mode 100644 model/mlp_emb.py

diff --git a/config/dest_simple_mlp_emb_only.py b/config/dest_simple_mlp_emb_only.py
new file mode 100644
index 0000000..e5c91b8
--- /dev/null
+++ b/config/dest_simple_mlp_emb_only.py
@@ -0,0 +1,36 @@
+"""Configuration for the embeddings-only destination MLP (model.mlp_emb)."""
+from blocks.initialization import IsotropicGaussian, Constant
+
+import data
+from model.mlp_emb import Model, Stream
+
+use_cuts_for_training = True
+
+# (input name, vocabulary size, embedding width) -- presumably; the field
+# meanings are read off the names and dim_input below, confirm against
+# model.ContextEmbedder.
+dim_embeddings = [
+    ('origin_call', data.origin_call_train_size, 10),
+    ('origin_stand', data.stands_size, 10),
+    ('week_of_year', 52, 10),
+    ('day_of_week', 7, 10),
+    ('qhour_of_day', 24 * 4, 10),
+    ('day_type', 3, 10),
+]
+
+# The MLP input is the concatenation of all embeddings.
+dim_input = sum(x for (_, _, x) in dim_embeddings)
+dim_hidden = [200, 100]
+output_mode = "destination"
+dim_output = 2
+
+embed_weights_init = IsotropicGaussian(0.001)
+mlp_weights_init = IsotropicGaussian(0.01)
+mlp_biases_init = Constant(0.001)
+
+learning_rate = 0.0001
+momentum = 0.99
+batch_size = 32
+
+valid_set = 'cuts/test_times_0'
+max_splits = 100
diff --git a/data/__init__.py b/data/__init__.py
index 2121033..604809c 100644
--- a/data/__init__.py
+++ b/data/__init__.py
@@ -5,6 +5,7 @@ import numpy
 
 
 path = os.environ.get('TAXI_PATH', '/data/lisatmp3/auvolat/taxikaggle')
+path = os.environ.get('TAXI_PATH', '/Users/adeb/data/taxi')
 
 Polyline = h5py.special_dtype(vlen=numpy.float32)
 
diff --git a/data/transformers.py b/data/transformers.py
index 1bed887..e6806cc 100644
--- a/data/transformers.py
+++ b/data/transformers.py
@@ -94,7 +94,7 @@ class TaxiGenerateSplits(Transformer):
 
 
 class _taxi_add_first_last_len_helper(object):
-    def __init__(self, k, latitude, longitude):
+    def __init__(self, k, id_latitude, id_longitude):
         self.k = k
         self.id_latitude = id_latitude
         self.id_longitude = id_longitude
diff --git a/model/mlp.py b/model/mlp.py
index 6abc86f..a6f3991 100644
--- a/model/mlp.py
+++ b/model/mlp.py
@@ -63,6 +63,7 @@ class Stream(object):
 
         stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids)
         stream = transformers.TaxiGenerateSplits(stream, max_splits=self.config.max_splits)
+        stream = transformers.add_destination(stream)
 
         stream = transformers.taxi_add_datetime(stream)
         stream = transformers.taxi_add_first_last_len(stream, self.config.n_begin_end_pts)
diff --git a/model/mlp_emb.py b/model/mlp_emb.py
new file mode 100644
index 0000000..f34541b
--- /dev/null
+++ b/model/mlp_emb.py
@@ -0,0 +1,149 @@
+from theano import tensor
+
+from fuel.transformers import Batch, MultiProcessing
+from fuel.streams import DataStream
+from fuel.schemes import ConstantScheme, ShuffledExampleScheme
+from blocks.bricks import application, MLP, Rectifier, Initializable, Identity
+
+import error
+import data
+from data import transformers
+from data.hdf5 import TaxiDataset, TaxiStream
+from data.cut import TaxiTimeCutScheme
+from model import ContextEmbedder
+
+
+class Model(Initializable):
+    """MLP that predicts from the context embeddings only.
+
+    The outputs of ``ContextEmbedder`` are concatenated and passed through
+    an MLP with rectifier hidden layers and an identity output layer.
+    """
+
+    def __init__(self, config, **kwargs):
+        super(Model, self).__init__(**kwargs)
+        self.config = config
+
+        self.context_embedder = ContextEmbedder(config)
+        self.mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden] + [Identity()],
+                       dims=[config.dim_input] + config.dim_hidden + [config.dim_output])
+
+        self.inputs = self.context_embedder.inputs # + self.extremities.keys()
+        self.children = [ self.context_embedder, self.mlp ]
+
+    def _push_initialization_config(self):
+        # Only the MLP is set here; ContextEmbedder presumably handles its own init.
+        self.mlp.weights_init = self.config.mlp_weights_init
+        self.mlp.biases_init = self.config.mlp_biases_init
+
+    @application(outputs=['destination'])
+    def predict(self, **kwargs):
+        """Return the model output for the inputs named in ``self.inputs``.
+
+        Raises ValueError when ``config.output_mode`` is not supported.
+        """
+        embeddings = tuple(self.context_embedder.apply(**{k: kwargs[k] for k in self.context_embedder.inputs }))
+
+        inputs = tensor.concatenate(embeddings, axis=1)
+        outputs = self.mlp.apply(inputs)
+
+        if self.config.output_mode == "destination":
+            # Scale and shift the raw output by the GPS std / mean from data.
+            return data.train_gps_std * outputs + data.train_gps_mean
+        elif self.config.output_mode == "clusters":
+            # NOTE(review): self.classes is not set anywhere in this class;
+            # confirm where it is expected to come from.
+            return tensor.dot(outputs, self.classes)
+        raise ValueError("unsupported output_mode: %r" % (self.config.output_mode,))
+
+    @predict.property('inputs')
+    def predict_inputs(self):
+        return self.inputs
+
+    @application(outputs=['cost'])
+    def cost(self, **kwargs):
+        """Return the mean ``error.erdist`` between prediction and target."""
+        y_hat = self.predict(**kwargs)
+        y = tensor.concatenate((kwargs['destination_latitude'][:, None],
+                                kwargs['destination_longitude'][:, None]), axis=1)
+
+        return error.erdist(y_hat, y).mean()
+
+    @cost.property('inputs')
+    def cost_inputs(self):
+        return self.inputs + ['destination_latitude', 'destination_longitude']
+
+
+class Stream(object):
+    """Builds the train / valid / test data streams for ``Model``."""
+
+    def __init__(self, config):
+        self.config = config
+
+    def train(self, req_vars):
+        """Return the batched training stream restricted to ``req_vars``."""
+        valid = TaxiDataset(self.config.valid_set, 'valid.hdf5', sources=('trip_id',))
+        valid_trips_ids = valid.get_data(None, slice(0, valid.num_examples))[0]
+
+        stream = TaxiDataset('train')
+
+        if hasattr(self.config, 'use_cuts_for_training') and self.config.use_cuts_for_training:
+            stream = DataStream(stream, iteration_scheme=TaxiTimeCutScheme())
+        else:
+            stream = DataStream(stream, iteration_scheme=ShuffledExampleScheme(stream.num_examples))
+
+        # Exclude the trips whose ids were read from the validation set.
+        stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids)
+        stream = transformers.TaxiGenerateSplits(stream, max_splits=self.config.max_splits)
+
+        stream = transformers.taxi_add_datetime(stream)
+        # stream = transformers.taxi_add_first_last_len(stream, self.config.n_begin_end_pts)
+        stream = transformers.Select(stream, tuple(req_vars))
+
+        stream = Batch(stream, iteration_scheme=ConstantScheme(self.config.batch_size))
+
+        stream = MultiProcessing(stream)
+
+        return stream
+
+    def valid(self, req_vars):
+        """Return the validation stream in batches of 1000 examples."""
+        stream = TaxiStream(self.config.valid_set, 'valid.hdf5')
+
+        stream = transformers.taxi_add_datetime(stream)
+        # stream = transformers.taxi_add_first_last_len(stream, self.config.n_begin_end_pts)
+        stream = transformers.Select(stream, tuple(req_vars))
+        return Batch(stream, iteration_scheme=ConstantScheme(1000))
+
+    def test(self, req_vars):
+        """Return the test stream in batches of one; ``req_vars`` is unused."""
+        stream = TaxiStream('test')
+
+        stream = transformers.taxi_add_datetime(stream)
+        # stream = transformers.taxi_add_first_last_len(stream, self.config.n_begin_end_pts)
+        stream = transformers.taxi_remove_test_only_clients(stream)
+
+        return Batch(stream, iteration_scheme=ConstantScheme(1))
+
+    def inputs(self):
+        """Return a dict mapping each source name to its Theano variable."""
+        return {'call_type': tensor.bvector('call_type'),
+                'origin_call': tensor.ivector('origin_call'),
+                'origin_stand': tensor.bvector('origin_stand'),
+                'taxi_id': tensor.wvector('taxi_id'),
+                'timestamp': tensor.ivector('timestamp'),
+                'day_type': tensor.bvector('day_type'),
+                'missing_data': tensor.bvector('missing_data'),
+                'latitude': tensor.matrix('latitude'),
+                'longitude': tensor.matrix('longitude'),
+                'destination_latitude': tensor.vector('destination_latitude'),
+                'destination_longitude': tensor.vector('destination_longitude'),
+                'travel_time': tensor.ivector('travel_time'),
+                'first_k_latitude': tensor.matrix('first_k_latitude'),
+                'first_k_longitude': tensor.matrix('first_k_longitude'),
+                'last_k_latitude': tensor.matrix('last_k_latitude'),
+                'last_k_longitude': tensor.matrix('last_k_longitude'),
+                'input_time': tensor.ivector('input_time'),
+                'week_of_year': tensor.bvector('week_of_year'),
+                'day_of_week': tensor.bvector('day_of_week'),
+                'qhour_of_day': tensor.bvector('qhour_of_day')}
-- 
cgit v1.2.3