diff options
author | Étienne Simon <esimon@esimon.eu> | 2015-07-14 07:53:03 -0400 |
---|---|---|
committer | Étienne Simon <esimon@esimon.eu> | 2015-07-14 07:53:03 -0400 |
commit | c97af300b17ac042c52cfc54f43d4f01fd61fbe9 (patch) | |
tree | ad4e847a8942f2b6e120e7f811d472b93d7766cf /prepare.sh | |
parent | dc430951d6cb660ab804c7e6250aea1acc2dcd9d (diff) | |
download | taxi-c97af300b17ac042c52cfc54f43d4f01fd61fbe9.tar.gz taxi-c97af300b17ac042c52cfc54f43d4f01fd61fbe9.zip |
Add prepare.sh to prepare the kaggle data
Diffstat (limited to 'prepare.sh')
-rw-r--r-- | prepare.sh | 106 |
1 files changed, 106 insertions, 0 deletions
#!/bin/sh
#
# prepare.sh - prepare the Kaggle taxi data for training.
#
# Provenance: contents of the diff "a/prepare.sh b/prepare.sh" (new file,
# mode 100644), reformatted one statement per line.
#
# Usage (from inside the repository, next to train.py):
#   TAXI_PATH=/path/to/kaggle/data ./prepare.sh
#
# Environment:
#   TAXI_PATH   directory holding train.csv, test.csv and
#               metaData_taxistandsID_name_GPSlocation.csv.zip
#   PYTHONPATH  extended with the current directory for the python helpers
#
# Steps: check python dependencies, verify md5 sums of the input files,
# unzip and patch the metadata csv, convert to HDF5, build the validation
# set, compute destination clusters, and create the output folders.

# Abort on the first failing step and on unset variables, so that a failed
# step is never followed by a misleading "ok".
set -eu

# Terminal colours. tput can fail when no usable terminal is attached; fall
# back to empty strings instead of aborting under `set -e`.
RESET=$(tput sgr0 2>/dev/null || true)
BOLD=$(tput bold 2>/dev/null || true)
RED="$RESET$(tput setaf 1 2>/dev/null || true)$BOLD"
GREEN="$RESET$(tput setaf 2 2>/dev/null || true)"
YELLOW="$RESET$(tput setaf 3 2>/dev/null || true)"
BLUE="$RESET$(tput setaf 4 2>/dev/null || true)$BOLD"

# Never leave the caller's terminal coloured, whatever the exit path.
trap 'printf "%s" "$RESET"' EXIT

# Normalise so the rest of the script can reference it under `set -u`.
TAXI_PATH=${TAXI_PATH:-}

export PYTHONPATH="$PWD${PYTHONPATH:+:$PYTHONPATH}"

# Print one line. printf is used instead of `echo -e`/`echo -n`, whose
# behaviour is not portable across /bin/sh implementations.
say() {
  printf '%s\n' "$*"
}

# Print a progress label without a trailing newline.
#   $1 - label text
progress() {
  printf '%s' "${YELLOW}$1... $RESET"
}

# Print an error in red on stderr and abort with status 1.
die() {
  printf '%s\n' "${RED}$*" >&2
  exit 1
}

# Print a section header preceded by a blank line.
#   $1 - section title
section() {
  printf '\n%s\n' "$BLUE# $1"
}

# Show a progress label, run a command, and print "ok" only if it succeeded.
#   $1  - label
#   $2… - command and its arguments
run_step() {
  step_label=$1
  shift
  progress "$step_label"
  "$@" || die "failed: $*"
  say "${GREEN}ok"
}

# Check that a python module can be imported and print its version.
#   $1 - module name
#   $2 - version the authors used (informational only)
python_import() {
  progress "$1"
  # Single-argument print(...) is valid under both Python 2 and Python 3.
  if ! python -c "import $1; print('${GREEN}version ' + str($1.__version__) + ' ${YELLOW}(we used version $2)')"; then
    die "failed, $1 is not installed"
  fi
}

# Verify the md5 sum of a file located in $TAXI_PATH.
#   $1 - file name relative to $TAXI_PATH
#   $2 - expected md5 hex digest
md5_check() {
  progress "md5sum $1"
  if [ ! -e "$TAXI_PATH/$1" ]; then
    die "file not found, are you sure you set the TAXI_PATH variable correctly?"
  fi
  md5=$(md5sum "$TAXI_PATH/$1" | sed -e 's/ .*//')
  if [ "$md5" = "$2" ]; then
    say "$GREEN$md5 ok"
  else
    die "$md5 failed"
  fi
}

# Insert the comma missing between the two coordinates of stand 41 in the
# metadata csv. A temporary file is used because `sed -i` is not portable.
patch_metadata() {
  metadata_csv="$TAXI_PATH/metaData_taxistandsID_name_GPSlocation.csv"
  sed -e 's/41,Nevogilde,41.163066654-8.67598304213/41,Nevogilde,41.163066654,-8.67598304213/' \
    "$metadata_csv" > "$metadata_csv.tmp"
  mv -- "$metadata_csv.tmp" "$metadata_csv"
}

say "${YELLOW}This script will prepare the data."
say "${YELLOW}You should run it from inside the repository."
say "${YELLOW}You should set the TAXI_PATH variable to where the data downloaded from kaggle is."
say "${YELLOW}Three data files are needed: ${BOLD}train.csv${YELLOW}, ${BOLD}test.csv${YELLOW} and ${BOLD}metaData_taxistandsID_name_GPSlocation.csv.zip${YELLOW}. They can be found at the following url: ${BOLD}https://www.kaggle.com/c/pkdd-15-predict-taxi-service-trajectory-i/data"
if [ ! -e train.py ]; then
  die "train.py not found, you are not inside the taxi repository."
fi


section "Checking dependencies"

python_import h5py 2.5.0
python_import theano 0.7.0.dev
python_import fuel 0.0.1
python_import blocks 0.0.1
python_import sklearn 0.16.1


section "Checking data"

say "${YELLOW}TAXI_PATH is set to $TAXI_PATH"
if [ -z "$TAXI_PATH" ]; then
  die "TAXI_PATH is not set, please point it to the directory containing the kaggle data."
fi

md5_check train.csv 68cc499ac4937a3079ebf69e69e73971
md5_check test.csv f2ceffde9d98e3c49046c7d998308e71
md5_check metaData_taxistandsID_name_GPSlocation.csv.zip fecec7286191af868ce8fb208f5c7643


section "Extracting metadata"

run_step "unziping" \
  unzip -o "$TAXI_PATH/metaData_taxistandsID_name_GPSlocation.csv.zip" -d "$TAXI_PATH"
run_step "patching error in metadata csv" patch_metadata

md5_check metaData_taxistandsID_name_GPSlocation.csv 724805b0b1385eb3efc02e8bdfe9c1df


section "Conversion of training set to HDF5"
say "${YELLOW}This might take some time$RESET"
data/csv_to_hdf5.py "$TAXI_PATH" "$TAXI_PATH/data.hdf5"


section "Generation of validation set"
say "${YELLOW}This might take some time$RESET"

run_step "initialization" data/init_valid.py
run_step "cutting" data/make_valid_cut.py test_times_0


section "Generation of destination cluster"
say "${YELLOW}This might take some time$RESET"
run_step "generating" data_analysis/cluster_arrival.py


section "Creating output folders"
# -p keeps re-runs from failing when the folders already exist.
run_step "mkdir model_data" mkdir -p model_data
run_step "mkdir output" mkdir -p output

printf '\n%s\n' "$GREEN${BOLD}The data was successfully prepared"
say "${YELLOW}To train the winning model on gpu, you can now run the following command:"
say "${YELLOW}THEANO_FLAGS=floatX=float32,device=gpu,optimizer=FAST_RUN ./train.py dest_mlp_tgtcls_1_cswdtx_alexandre"