From 8879521b13536fa9f0bde159f4bb4a112fcc5dbf Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 15 Jul 2015 17:06:01 -0400 Subject: s/FAST_RUN/fast_run --- prepare.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prepare.sh b/prepare.sh index 7559167..ccaca1b 100755 --- a/prepare.sh +++ b/prepare.sh @@ -121,4 +121,4 @@ echo -n "${YELLOW}mkdir output... $RESET"; mkdir output; echo "${GREEN}ok" echo -e "\n$GREEN${BOLD}The data was successfully prepared" echo "${YELLOW}To train the winning model on gpu, you can now run the following command:" -echo "${YELLOW}THEANO_FLAGS=floatX=float32,device=gpu,optimizer=FAST_RUN python2 train.py dest_mlp_tgtcls_1_cswdtx_alexandre" +echo "${YELLOW}THEANO_FLAGS=floatX=float32,device=gpu,optimizer=fast_run python2 train.py dest_mlp_tgtcls_1_cswdtx_alexandre" -- cgit v1.2.3 From fc59592e58412f70ad3b3d47dc3732e7be6dc0c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20Simon?= Date: Thu, 16 Jul 2015 08:41:55 +0000 Subject: s/FAST_RUN/fast_run --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 05cd7ed..ef46106 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,6 @@ Note that some script expect the repository to be in your PYTHONPATH (go to the 6. Create a folder `model_data` and a folder `output` (next to the training script), which will receive respectively a regular save of the model parameters and many submission files generated from the model at a regular interval. 7. Run `./train.py dest_mlp_tgtcls_1_cswdtx_alexandre` to train the model. Output solutions are generated in `output/` every 1000 iterations. Interrupt the model with three consecutive Ctrl+C at any times. The training script is set to stop training after 10 000 000 iterations, but a result file produced after less than 2 000 000 iterations is already the winning solution. We trained our model on a GeForce GTX 680 card and it took about an afternoon to generate the winning solution. When running the training script, set the following Theano flags environment variable to exploit GPU parallelism: - `THEANO_FLAGS=floatX=float32,device=gpu,optimizer=FAST_RUN` + `THEANO_FLAGS=floatX=float32,device=gpu,optimizer=fast_run` *More information in this pdf: https://github.com/adbrebs/taxi/blob/master/doc/short_report.pdf* -- cgit v1.2.3 From 404250df425df4f89d9edfe2357fc0cb7b8b77e6 Mon Sep 17 00:00:00 2001 From: AdeB Date: Thu, 16 Jul 2015 10:36:11 -0400 Subject: Update the heatmap script. --- data_analysis/maps.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/data_analysis/maps.py b/data_analysis/maps.py index 991f279..e951f23 100644 --- a/data_analysis/maps.py +++ b/data_analysis/maps.py @@ -1,13 +1,17 @@ import cPickle -import scipy import numpy as np import matplotlib.pyplot as plt +from fuel.schemes import ConstantScheme +from fuel.streams import DataStream + import data +from data.hdf5 import TaxiDataset, TaxiStream def compute_number_coordinates(): - train_it = data.train_it() + stream = TaxiDataset('train').get_example_stream() + train_it = stream.get_epoch_iterator() # Count the number of coordinates n_coordinates = 0 @@ -24,16 +28,20 @@ def extract_coordinates(n_coordinates=None): if n_coordinates is None: n_coordinates = compute_number_coordinates() + dataset = TaxiDataset('train') + stream = DataStream(dataset, iteration_scheme=ConstantScheme(1, dataset.num_examples)) + coordinates = np.zeros((n_coordinates, 2), dtype="float32") - train_it = data.train_it() + train_it = stream.get_epoch_iterator() c = 0 for ride in train_it: - for point in ride[-1]: + for point in zip(ride[2], ride[3]): coordinates[c] = point c += 1 + print c - cPickle.dump(coordinates, open(data.DATA_PATH + "/coordinates_array.pkl", "wb")) + cPickle.dump(coordinates, open(data.path + "/coordinates_array.pkl", "wb")) def draw_map(coordinates, xrg, yrg): @@ -47,9 +55,9 @@ def draw_map(coordinates, xrg, yrg): if __name__ == "__main__": - # extract_coordinates(n_coordinates=83360928) + extract_coordinates(n_coordinates=32502730) - coordinates = cPickle.load(open(data.DATA_PATH + "/coordinates_array.pkl", "rb")) + coordinates = cPickle.load(open(data.path + "/coordinates_array.pkl", "rb")) xrg = [-8.75, -8.55] yrg = [41.05, 41.25] draw_map(coordinates, xrg, yrg) -- cgit v1.2.3 From 97e9ac0e15b890076af0cf469efbead89f8eb804 Mon Sep 17 00:00:00 2001 From: AdeB Date: Thu, 16 Jul 2015 10:38:45 -0400 Subject: Update number of coordinates function --- data_analysis/maps.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/data_analysis/maps.py b/data_analysis/maps.py index e951f23..d5db182 100644 --- a/data_analysis/maps.py +++ b/data_analysis/maps.py @@ -10,13 +10,14 @@ from data.hdf5 import TaxiDataset, TaxiStream def compute_number_coordinates(): - stream = TaxiDataset('train').get_example_stream() + dataset = TaxiDataset('train') + stream = DataStream(dataset, iteration_scheme=ConstantScheme(1, dataset.num_examples)) train_it = stream.get_epoch_iterator() # Count the number of coordinates n_coordinates = 0 for ride in train_it: - n_coordinates += len(ride[-1]) + n_coordinates += len(ride[2]) print n_coordinates return n_coordinates @@ -51,7 +52,7 @@ def draw_map(coordinates, xrg, yrg): hist, xx, yy = np.histogram2d(coordinates[:, 0], coordinates[:, 1], bins=2000, range=[xrg, yrg]) plt.imshow(np.log(hist)) - plt.savefig(data.DATA_PATH + "/analysis/xyhmap2.png") + plt.savefig(data.path + "/analysis/xyhmap2.png") if __name__ == "__main__": -- cgit v1.2.3 From b6566c010be7c871a5b6c199feaf1dfda0910ade Mon Sep 17 00:00:00 2001 From: AdeB Date: Thu, 16 Jul 2015 18:59:50 -0400 Subject: Fix a bug in the heatmap generation --- data_analysis/maps.py | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/data_analysis/maps.py b/data_analysis/maps.py index d5db182..2912c8d 100644 --- a/data_analysis/maps.py +++ b/data_analysis/maps.py @@ -2,22 +2,16 @@ import cPickle import numpy as np import matplotlib.pyplot as plt -from fuel.schemes import ConstantScheme -from fuel.streams import DataStream - import data -from data.hdf5 import TaxiDataset, TaxiStream +from data.hdf5 import taxi_it def compute_number_coordinates(): - dataset = TaxiDataset('train') - stream = DataStream(dataset, iteration_scheme=ConstantScheme(1, dataset.num_examples)) - train_it = stream.get_epoch_iterator() # Count the number of coordinates n_coordinates = 0 - for ride in train_it: - n_coordinates += len(ride[2]) + for ride in taxi_it('train'): + n_coordinates += len(ride['latitude']) print n_coordinates return n_coordinates @@ -29,17 +23,14 @@ def extract_coordinates(n_coordinates=None): if n_coordinates is None: n_coordinates = compute_number_coordinates() - dataset = TaxiDataset('train') - stream = DataStream(dataset, iteration_scheme=ConstantScheme(1, dataset.num_examples)) - coordinates = np.zeros((n_coordinates, 2), dtype="float32") - train_it = stream.get_epoch_iterator() c = 0 - for ride in train_it: - for point in zip(ride[2], ride[3]): + for ride in taxi_it('train'): + for point in zip(ride['latitude'], ride['longitude']): coordinates[c] = point c += 1 + print c cPickle.dump(coordinates, open(data.path + "/coordinates_array.pkl", "wb")) @@ -52,13 +43,14 @@ def draw_map(coordinates, xrg, yrg): hist, xx, yy = np.histogram2d(coordinates[:, 0], coordinates[:, 1], bins=2000, range=[xrg, yrg]) plt.imshow(np.log(hist)) + plt.gca().invert_yaxis() plt.savefig(data.path + "/analysis/xyhmap2.png") if __name__ == "__main__": - extract_coordinates(n_coordinates=32502730) + extract_coordinates(n_coordinates=83409386) coordinates = cPickle.load(open(data.path + "/coordinates_array.pkl", "rb")) - xrg = [-8.75, -8.55] - yrg = [41.05, 41.25] + xrg = [41.05, 41.25] + yrg = [-8.75, -8.55] draw_map(coordinates, xrg, yrg) -- cgit v1.2.3