aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorÉtienne Simon <esimon@esimon.eu>2015-07-14 07:53:03 -0400
committerÉtienne Simon <esimon@esimon.eu>2015-07-14 07:53:03 -0400
commitc97af300b17ac042c52cfc54f43d4f01fd61fbe9 (patch)
treead4e847a8942f2b6e120e7f811d472b93d7766cf
parentdc430951d6cb660ab804c7e6250aea1acc2dcd9d (diff)
downloadtaxi-c97af300b17ac042c52cfc54f43d4f01fd61fbe9.tar.gz
taxi-c97af300b17ac042c52cfc54f43d4f01fd61fbe9.zip
Add prepare.sh to prepare the kaggle data
-rw-r--r--README.md6
-rw-r--r--prepare.sh106
2 files changed, 110 insertions, 2 deletions
diff --git a/README.md b/README.md
index b35f35f..0aa5f99 100644
--- a/README.md
+++ b/README.md
@@ -38,10 +38,12 @@ Here is a brief description of the Python files in the archive:
* `train.py` contains the main code for the training and testing
## How to reproduce the winning results?
+
+There is a helper script `prepare.sh` which might help you (by performing steps 1-6 and some other checks), but if you encounter an error and run it again, the script will re-execute all the steps from the beginning (before the actual training, steps 2, 4 and 5 are quite long).
1. Set the `TAXI_PATH` environment variable to the path of the folder containing the CSV files.
-2. Run `data/csv_to_hdf5.py` to generate the HDF5 file (which is generated in `TAXI_PATH`, along the CSV files). This takes around 20 minutes on our machines.
-3. Run `data/init_valid.py` to initialize the validation set HDF5 file.
+2. Run `data/csv_to_hdf5.py "$TAXI_PATH" "$TAXI_PATH/data.hdf5"` to generate the HDF5 file (which is generated in `TAXI_PATH`, along the CSV files). This takes around 20 minutes on our machines.
+3. Run `data/init_valid.py valid.hdf5` to initialize the validation set HDF5 file.
4. Run `data/make_valid_cut.py test_times_0` to generate the validation set. This can take a few minutes.
5. Run `data_analysis/cluster_arrival.py` to generate the arrival point clustering. This can take a few minutes.
6. Create a folder `model_data` and a folder `output` (next to the training script), which will receive respectively a regular save of the model parameters and many submission files generated from the model at a regular interval.
diff --git a/prepare.sh b/prepare.sh
new file mode 100644
index 0000000..addc3df
--- /dev/null
+++ b/prepare.sh
@@ -0,0 +1,106 @@
+#!/bin/sh
+
+RESET=`tput sgr0`
+BOLD="`tput bold`"
+RED="$RESET`tput setaf 1`$BOLD"
+GREEN="$RESET`tput setaf 2`"
+YELLOW="$RESET`tput setaf 3`"
+BLUE="$RESET`tput setaf 4`$BOLD"
+
+export PYTHONPATH="$PWD:$PYTHONPATH"
+
+echo "${YELLOW}This script will prepare the data."
+echo "${YELLOW}You should run it from inside the repository."
+echo "${YELLOW}You should set the TAXI_PATH variable to where the data downloaded from kaggle is."
+echo "${YELLOW}Three data files are needed: ${BOLD}train.csv${YELLOW}, ${BOLD}test.csv${YELLOW} and ${BOLD}metaData_taxistandsID_name_GPSlocation.csv.zip${YELLOW}. They can be found at the following url: ${BOLD}https://www.kaggle.com/c/pkdd-15-predict-taxi-service-trajectory-i/data"
+if [ ! -e train.py ]; then
+ echo "${RED}train.py not found, you are not inside the taxi repository."
+ exit 1
+fi
+
+
+echo -e "\n$BLUE# Checking dependencies"
+
+# Check that a Python module can be imported and print its version.
+#   $1 - name of the module to import
+#   $2 - version used by the authors, printed for comparison only
+# Reads the colour variables YELLOW, GREEN, RED and RESET.
+# Exits the script with status 1 when the python command fails.
+python_import(){
+  # printf rather than `echo -n`, which is not portable under #!/bin/sh.
+  printf '%s' "${YELLOW}$1... $RESET"
+  # print(<format> % <tuple>) gives the same output under Python 2 and 3,
+  # whereas the print statement is a syntax error for a Python 3 interpreter.
+  if ! python -c "import $1; print('${GREEN}version %s ${YELLOW}(we used version %s)' % ($1.__version__, '$2'))"; then
+    printf '%s\n' "${RED}failed, $1 is not installed"
+    exit 1
+  fi
+}
+
+# Modules to check; the second argument is the version the authors used and is
+# only displayed next to the installed one.
+python_import h5py 2.5.0
+python_import theano 0.7.0.dev
+python_import fuel 0.0.1
+python_import blocks 0.0.1
+python_import sklearn 0.16.1
+
+
+# printf rather than `echo -e`, which is not portable under #!/bin/sh.
+printf '\n%s\n' "$BLUE# Checking data"
+
+# '%s' keeps any backslash contained in TAXI_PATH untouched.
+printf '%s\n' "${YELLOW}TAXI_PATH is set to $TAXI_PATH"
+
+# Compare the MD5 digest of a file located in TAXI_PATH with an expected value.
+#   $1 - file name, relative to $TAXI_PATH
+#   $2 - expected digest
+# Reads the colour variables YELLOW, GREEN, RED and RESET; sets the global md5.
+# Exits the script with status 1 if the file is missing or the digest differs.
+md5_check(){
+  # printf rather than `echo -n`, which is not portable under #!/bin/sh.
+  printf '%s' "${YELLOW}md5sum $1... $RESET"
+  if [ ! -e "$TAXI_PATH/$1" ]; then
+    printf '%s\n' "${RED}file not found, are you sure you set the TAXI_PATH variable correctly?"
+    exit 1
+  fi
+  # Keep only what precedes the first space of the md5sum output (the digest).
+  md5=$(md5sum "$TAXI_PATH/$1" | sed -e 's/ .*//')
+  # Both operands are quoted so that an empty digest (md5sum failure) is
+  # reported as a mismatch instead of making `[` fail with a syntax error.
+  if [ "$md5" = "$2" ]; then
+    printf '%s\n' "$GREEN$md5 ok"
+  else
+    printf '%s\n' "$RED$md5 failed"
+    exit 1
+  fi
+}
+
+md5_check train.csv 68cc499ac4937a3079ebf69e69e73971
+md5_check test.csv f2ceffde9d98e3c49046c7d998308e71
+md5_check metaData_taxistandsID_name_GPSlocation.csv.zip fecec7286191af868ce8fb208f5c7643
+
+
+echo -e "\n$BLUE# Extracting metadata"
+
+echo -n "${YELLOW}unziping... $RESET"
+unzip -o "$TAXI_PATH/metaData_taxistandsID_name_GPSlocation.csv.zip" -d "$TAXI_PATH"
+echo "${GREEN}ok"
+
+echo -n "${YELLOW}patching error in metadata csv... $RESET"
+sed -e 's/41,Nevogilde,41.163066654-8.67598304213/41,Nevogilde,41.163066654,-8.67598304213/' -i "$TAXI_PATH/metaData_taxistandsID_name_GPSlocation.csv"
+echo "${GREEN}ok"
+
+md5_check metaData_taxistandsID_name_GPSlocation.csv 724805b0b1385eb3efc02e8bdfe9c1df
+
+
+echo -e "\n$BLUE# Conversion of training set to HDF5"
+echo "${YELLOW}This might take some time$RESET"
+data/csv_to_hdf5.py "$TAXI_PATH" "$TAXI_PATH/data.hdf5"
+
+
+echo -e "\n$BLUE# Generation of validation set"
+echo "${YELLOW}This might take some time$RESET"
+
+echo -n "${YELLOW}initialization... $RESET"
+data/init_valid.py
+echo "${GREEN}ok"
+
+echo -n "${YELLOW}cutting... $RESET"
+data/make_valid_cut.py test_times_0
+echo "${GREEN}ok"
+
+
+echo -e "\n$BLUE# Generation of destination cluster"
+echo "${YELLOW}This might take some time$RESET"
+echo -n "${YELLOW}generating... $RESET"
+data_analysis/cluster_arrival.py
+echo "${GREEN}ok"
+
+
+echo -e "\n$BLUE# Creating output folders"
+echo -n "${YELLOW}mkdir model_data... $RESET"; mkdir model_data; echo "${GREEN}ok"
+echo -n "${YELLOW}mkdir output... $RESET"; mkdir output; echo "${GREEN}ok"
+
+echo -e "\n$GREEN${BOLD}The data was successfully prepared"
+echo "${YELLOW}To train the winning model on gpu, you can now run the following command:"
+echo "${YELLOW}THEANO_FLAGS=floatX=float32,device=gpu,optimizer=FAST_RUN ./train.py dest_mlp_tgtcls_1_cswdtx_alexandre"