aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.md2
-rw-r--r--data_analysis/maps.py27
-rwxr-xr-xprepare.sh2
3 files changed, 16 insertions, 15 deletions
diff --git a/README.md b/README.md
index 05cd7ed..ef46106 100644
--- a/README.md
+++ b/README.md
@@ -51,6 +51,6 @@ Note that some scripts expect the repository to be in your PYTHONPATH (go to the
6. Create a folder `model_data` and a folder `output` (next to the training script), which will receive respectively a regular save of the model parameters and many submission files generated from the model at a regular interval.
7. Run `./train.py dest_mlp_tgtcls_1_cswdtx_alexandre` to train the model. Output solutions are generated in `output/` every 1000 iterations. Interrupt the model with three consecutive Ctrl+C at any time. The training script is set to stop training after 10 000 000 iterations, but a result file produced after fewer than 2 000 000 iterations is already the winning solution. We trained our model on a GeForce GTX 680 card and it took about an afternoon to generate the winning solution.
When running the training script, set the following Theano flags environment variable to exploit GPU parallelism:
- `THEANO_FLAGS=floatX=float32,device=gpu,optimizer=FAST_RUN`
+ `THEANO_FLAGS=floatX=float32,device=gpu,optimizer=fast_run`
*More information in this pdf: https://github.com/adbrebs/taxi/blob/master/doc/short_report.pdf*
diff --git a/data_analysis/maps.py b/data_analysis/maps.py
index 991f279..2912c8d 100644
--- a/data_analysis/maps.py
+++ b/data_analysis/maps.py
@@ -1,18 +1,17 @@
import cPickle
-import scipy
import numpy as np
import matplotlib.pyplot as plt
import data
+from data.hdf5 import taxi_it
def compute_number_coordinates():
- train_it = data.train_it()
# Count the number of coordinates
n_coordinates = 0
- for ride in train_it:
- n_coordinates += len(ride[-1])
+ for ride in taxi_it('train'):
+ n_coordinates += len(ride['latitude'])
print n_coordinates
return n_coordinates
@@ -25,15 +24,16 @@ def extract_coordinates(n_coordinates=None):
n_coordinates = compute_number_coordinates()
coordinates = np.zeros((n_coordinates, 2), dtype="float32")
- train_it = data.train_it()
c = 0
- for ride in train_it:
- for point in ride[-1]:
+ for ride in taxi_it('train'):
+ for point in zip(ride['latitude'], ride['longitude']):
coordinates[c] = point
c += 1
- cPickle.dump(coordinates, open(data.DATA_PATH + "/coordinates_array.pkl", "wb"))
+ print c
+
+ cPickle.dump(coordinates, open(data.path + "/coordinates_array.pkl", "wb"))
def draw_map(coordinates, xrg, yrg):
@@ -43,13 +43,14 @@ def draw_map(coordinates, xrg, yrg):
hist, xx, yy = np.histogram2d(coordinates[:, 0], coordinates[:, 1], bins=2000, range=[xrg, yrg])
plt.imshow(np.log(hist))
- plt.savefig(data.DATA_PATH + "/analysis/xyhmap2.png")
+ plt.gca().invert_yaxis()
+ plt.savefig(data.path + "/analysis/xyhmap2.png")
if __name__ == "__main__":
- # extract_coordinates(n_coordinates=83360928)
+ extract_coordinates(n_coordinates=83409386)
- coordinates = cPickle.load(open(data.DATA_PATH + "/coordinates_array.pkl", "rb"))
- xrg = [-8.75, -8.55]
- yrg = [41.05, 41.25]
+ coordinates = cPickle.load(open(data.path + "/coordinates_array.pkl", "rb"))
+ xrg = [41.05, 41.25]
+ yrg = [-8.75, -8.55]
draw_map(coordinates, xrg, yrg)
diff --git a/prepare.sh b/prepare.sh
index 7559167..ccaca1b 100755
--- a/prepare.sh
+++ b/prepare.sh
@@ -121,4 +121,4 @@ echo -n "${YELLOW}mkdir output... $RESET"; mkdir output; echo "${GREEN}ok"
echo -e "\n$GREEN${BOLD}The data was successfully prepared"
echo "${YELLOW}To train the winning model on gpu, you can now run the following command:"
-echo "${YELLOW}THEANO_FLAGS=floatX=float32,device=gpu,optimizer=FAST_RUN python2 train.py dest_mlp_tgtcls_1_cswdtx_alexandre"
+echo "${YELLOW}THEANO_FLAGS=floatX=float32,device=gpu,optimizer=fast_run python2 train.py dest_mlp_tgtcls_1_cswdtx_alexandre"