diff options
-rw-r--r-- | README.md | 2 | ||||
-rw-r--r-- | data_analysis/maps.py | 27 | ||||
-rwxr-xr-x | prepare.sh | 2 |
3 files changed, 16 insertions, 15 deletions
@@ -51,6 +51,6 @@ Note that some script expect the repository to be in your PYTHONPATH (go to the 6. Create a folder `model_data` and a folder `output` (next to the training script), which will receive respectively a regular save of the model parameters and many submission files generated from the model at a regular interval. 7. Run `./train.py dest_mlp_tgtcls_1_cswdtx_alexandre` to train the model. Output solutions are generated in `output/` every 1000 iterations. Interrupt the model with three consecutive Ctrl+C at any times. The training script is set to stop training after 10 000 000 iterations, but a result file produced after less than 2 000 000 iterations is already the winning solution. We trained our model on a GeForce GTX 680 card and it took about an afternoon to generate the winning solution. When running the training script, set the following Theano flags environment variable to exploit GPU parallelism: - `THEANO_FLAGS=floatX=float32,device=gpu,optimizer=FAST_RUN` + `THEANO_FLAGS=floatX=float32,device=gpu,optimizer=fast_run` *More information in this pdf: https://github.com/adbrebs/taxi/blob/master/doc/short_report.pdf* diff --git a/data_analysis/maps.py b/data_analysis/maps.py index 991f279..2912c8d 100644 --- a/data_analysis/maps.py +++ b/data_analysis/maps.py @@ -1,18 +1,17 @@ import cPickle -import scipy import numpy as np import matplotlib.pyplot as plt import data +from data.hdf5 import taxi_it def compute_number_coordinates(): - train_it = data.train_it() # Count the number of coordinates n_coordinates = 0 - for ride in train_it: - n_coordinates += len(ride[-1]) + for ride in taxi_it('train'): + n_coordinates += len(ride['latitude']) print n_coordinates return n_coordinates @@ -25,15 +24,16 @@ def extract_coordinates(n_coordinates=None): n_coordinates = compute_number_coordinates() coordinates = np.zeros((n_coordinates, 2), dtype="float32") - train_it = data.train_it() c = 0 - for ride in train_it: - for point in ride[-1]: + for ride in taxi_it('train'): + for point in zip(ride['latitude'], ride['longitude']): coordinates[c] = point c += 1 - cPickle.dump(coordinates, open(data.DATA_PATH + "/coordinates_array.pkl", "wb")) + print c + + cPickle.dump(coordinates, open(data.path + "/coordinates_array.pkl", "wb")) def draw_map(coordinates, xrg, yrg): @@ -43,13 +43,14 @@ def draw_map(coordinates, xrg, yrg): hist, xx, yy = np.histogram2d(coordinates[:, 0], coordinates[:, 1], bins=2000, range=[xrg, yrg]) plt.imshow(np.log(hist)) - plt.savefig(data.DATA_PATH + "/analysis/xyhmap2.png") + plt.gca().invert_yaxis() + plt.savefig(data.path + "/analysis/xyhmap2.png") if __name__ == "__main__": - # extract_coordinates(n_coordinates=83360928) + extract_coordinates(n_coordinates=83409386) - coordinates = cPickle.load(open(data.DATA_PATH + "/coordinates_array.pkl", "rb")) - xrg = [-8.75, -8.55] - yrg = [41.05, 41.25] + coordinates = cPickle.load(open(data.path + "/coordinates_array.pkl", "rb")) + xrg = [41.05, 41.25] + yrg = [-8.75, -8.55] draw_map(coordinates, xrg, yrg) @@ -121,4 +121,4 @@ echo -n "${YELLOW}mkdir output... $RESET"; mkdir output; echo "${GREEN}ok" echo -e "\n$GREEN${BOLD}The data was successfully prepared" echo "${YELLOW}To train the winning model on gpu, you can now run the following command:" -echo "${YELLOW}THEANO_FLAGS=floatX=float32,device=gpu,optimizer=FAST_RUN python2 train.py dest_mlp_tgtcls_1_cswdtx_alexandre" +echo "${YELLOW}THEANO_FLAGS=floatX=float32,device=gpu,optimizer=fast_run python2 train.py dest_mlp_tgtcls_1_cswdtx_alexandre" |