aboutsummaryrefslogtreecommitdiff
path: root/prepare.sh
blob: 3fd18fb3f72db74cb82af789ca8733055556946f (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/bin/sh

# Terminal control sequences used to decorate the messages printed below.
# Each colour is prefixed with the reset sequence so attributes set by the
# previous message (for instance bold) do not carry over.
RESET=$(tput sgr0)
BOLD=$(tput bold)
RED="${RESET}$(tput setaf 1)${BOLD}"
GREEN="${RESET}$(tput setaf 2)"
YELLOW="${RESET}$(tput setaf 3)"
BLUE="${RESET}$(tput setaf 4)${BOLD}"

# Put the current directory (the script is meant to be run from the repository
# root) at the front of Python's module search path.
# The ':' separator is only added when PYTHONPATH already has a value, so an
# unset/empty PYTHONPATH does not leave a trailing empty path entry.
export PYTHONPATH="$PWD${PYTHONPATH:+:$PYTHONPATH}"

# Tell the user what the script does and which inputs it expects.
echo "${YELLOW}This script will prepare the data."
echo "${YELLOW}You should run it from inside the repository."
echo "${YELLOW}You should set the TAXI_PATH variable to where the data downloaded from kaggle is."
echo "${YELLOW}Three data files are needed: ${BOLD}train.csv.zip${YELLOW}, ${BOLD}test.csv.zip${YELLOW} and ${BOLD}metaData_taxistandsID_name_GPSlocation.csv.zip${YELLOW}. They can be found at the following url: ${BOLD}https://www.kaggle.com/c/pkdd-15-predict-taxi-service-trajectory-i/data"

# train.py serves as the marker of the repository root: everything below uses
# paths relative to the current directory.
if [ ! -e train.py ]; then
    # Reset the terminal attributes so the user's prompt is not left red.
    echo "${RED}train.py not found, you are not inside the taxi repository.${RESET}"
    exit 1
fi


# printf instead of `echo -e`: under #!/bin/sh the -e flag is not portable
# (several sh implementations print a literal "-e").
printf '\n%s\n' "${BLUE}# Checking dependencies"

# Check that a Python 2 module can be imported and print its version.
# Arguments:
#   $1 - module name
#   $2 - version used by the authors (only displayed, never compared)
# Exits the whole script with status 1 when the python2 command fails, i.e.
# when python2 is missing, the import fails or the module has no __version__.
python_import(){
    # printf instead of `echo -n`: under #!/bin/sh the -n flag is not portable
    # (some sh implementations print a literal "-n").
    printf '%s' "${YELLOW}$1... $RESET"
    if ! python2 -c "import $1; print '${GREEN}version', $1.__version__, '${YELLOW}(we used version $2)'"; then
        # Reset the terminal attributes so the user's prompt is not left red.
        echo "${RED}failed, $1 is not installed${RESET}"
        exit 1
    fi
}

# Python modules the project needs. The second argument is the version the
# authors used; it is shown to the user for reference only.
python_import h5py 2.5.0
python_import theano 0.7.0.dev
python_import fuel 0.0.1
python_import blocks 0.0.1
python_import sklearn 0.16.1


# printf instead of `echo -e`: under #!/bin/sh the -e flag is not portable
# (several sh implementations print a literal "-e").
printf '\n%s\n' "${BLUE}# Checking data"

echo "${YELLOW}TAXI_PATH is set to $TAXI_PATH"

# Verify the MD5 digest of a file stored in $TAXI_PATH.
# Arguments:
#   $1 - file name, relative to $TAXI_PATH
#   $2 - expected digest, in the form printed by md5/md5sum
# Exits the whole script with status 1 when the file does not exist or its
# digest differs from $2. When neither md5 nor md5sum is available the check
# is skipped: a message is printed and the function returns normally.
md5_check(){
    # printf instead of `echo -n`: under #!/bin/sh the -n flag is not portable.
    printf '%s' "${YELLOW}md5sum $1... $RESET"
    if [ ! -e "$TAXI_PATH/$1" ]; then
        echo "${RED}file not found, are you sure you set the TAXI_PATH variable correctly?${RESET}"
        exit 1
    fi
    if command -v md5 >/dev/null 2>&1; then
        md5cmd=md5
    elif command -v md5sum >/dev/null 2>&1; then
        md5cmd=md5sum
    else
        echo "${RED} no md5 utility"
        return
    fi
    # Feed the file on stdin so both tools start their output with the digest:
    # BSD/macOS md5 prints "MD5 (name) = digest" when given a file name (the
    # sed below would then keep "MD5"), but only the digest when reading
    # stdin; md5sum prints "digest  -".
    md5=$("$md5cmd" < "$TAXI_PATH/$1" | sed -e 's/ .*//')
    # Quoted so an empty digest gives a clean mismatch, not a `[` syntax error.
    if [ "$md5" = "$2" ]; then
        echo "$GREEN$md5 ok"
    else
        echo "$RED$md5 failed${RESET}"
        exit 1
    fi
}

# Make sure the downloaded archives are the expected ones before extracting.
md5_check train.csv.zip 87a1b75adfde321dc163160b495964e8
md5_check test.csv.zip 47133bf7349cb80cc668fa56af8ce743
md5_check metaData_taxistandsID_name_GPSlocation.csv.zip fecec7286191af868ce8fb208f5c7643


# printf instead of `echo -e`: under #!/bin/sh the -e flag is not portable
# (several sh implementations print a literal "-e").
printf '\n%s\n' "${BLUE}# Extracting data"

# Extract an archive stored in $TAXI_PATH into $TAXI_PATH, overwriting
# existing files (unzip -o).
# Arguments:
#   $1 - archive file name, relative to $TAXI_PATH
# Exits the whole script with status 1 when unzip reports an error.
zipextract(){
    # printf instead of `echo -n`: under #!/bin/sh the -n flag is not portable.
    printf '%s' "${YELLOW}unziping $1... $RESET"
    unzip -o "$TAXI_PATH/$1" -d "$TAXI_PATH"
    unzip_status=$?
    # unzip uses status 1 for warnings where processing still completed; any
    # higher status is a real error and must not be reported as "ok".
    if [ "$unzip_status" -gt 1 ]; then
        echo "${RED}failed to extract $1${RESET}"
        exit 1
    fi
    echo "${GREEN}ok"
}

# Extract each archive and verify the checksum of the extracted file.
zipextract train.csv.zip
md5_check train.csv 68cc499ac4937a3079ebf69e69e73971

zipextract test.csv.zip
md5_check test.csv f2ceffde9d98e3c49046c7d998308e71

zipextract metaData_taxistandsID_name_GPSlocation.csv.zip

# The Nevogilde row of the metadata file lacks the comma before
# "-8.67598304213"; insert it.
# The result is written to a temporary file and moved into place instead of
# using `sed -i`: GNU sed takes -i without an argument, while BSD/macOS sed
# would consume the file name as the backup suffix and then wait on stdin.
printf '%s' "${YELLOW}patching error in metadata csv... $RESET"
metadata_csv="$TAXI_PATH/metaData_taxistandsID_name_GPSlocation.csv"
if sed -e 's/41,Nevogilde,41.163066654-8.67598304213/41,Nevogilde,41.163066654,-8.67598304213/' "$metadata_csv" > "$metadata_csv.tmp" \
    && mv "$metadata_csv.tmp" "$metadata_csv"; then
    echo "${GREEN}ok"
else
    rm -f "$metadata_csv.tmp"
    echo "${RED}failed${RESET}"
    exit 1
fi

# The checksum is the one of the patched file.
md5_check metaData_taxistandsID_name_GPSlocation.csv 724805b0b1385eb3efc02e8bdfe9c1df

# printf instead of `echo -e`: under #!/bin/sh the -e flag is not portable
# (several sh implementations print a literal "-e").
printf '\n%s\n' "${BLUE}# Conversion of training set to HDF5"
echo "${YELLOW}This might take some time$RESET"
# Stop on failure instead of carrying on and eventually announcing that the
# data was successfully prepared.
if ! data/csv_to_hdf5.py "$TAXI_PATH" "$TAXI_PATH/data.hdf5"; then
    echo "${RED}conversion failed${RESET}"
    exit 1
fi


# printf instead of `echo -e` / `echo -n`: under #!/bin/sh those flags are
# not portable (several sh implementations print them literally).
printf '\n%s\n' "${BLUE}# Generation of validation set"
echo "${YELLOW}This might take some time$RESET"

# Each step only reports "ok" when its helper script exited successfully;
# otherwise the whole script stops.
printf '%s' "${YELLOW}initialization... $RESET"
if ! data/init_valid.py; then
    echo "${RED}failed${RESET}"
    exit 1
fi
echo "${GREEN}ok"

printf '%s' "${YELLOW}cutting... $RESET"
if ! data/make_valid_cut.py test_times_0; then
    echo "${RED}failed${RESET}"
    exit 1
fi
echo "${GREEN}ok"


# printf instead of `echo -e` / `echo -n`: under #!/bin/sh those flags are
# not portable (several sh implementations print them literally).
printf '\n%s\n' "${BLUE}# Generation of destination cluster"
echo "${YELLOW}This might take some time$RESET"
printf '%s' "${YELLOW}generating... $RESET"
# Only report "ok" when the helper script exited successfully.
if ! data_analysis/cluster_arrival.py; then
    echo "${RED}failed${RESET}"
    exit 1
fi
echo "${GREEN}ok"


# printf instead of `echo -e` / `echo -n`: under #!/bin/sh those flags are
# not portable (several sh implementations print them literally).
printf '\n%s\n' "${BLUE}# Creating output folders"
for dir in model_data output; do
    printf '%s' "${YELLOW}mkdir $dir... $RESET"
    # -p: a folder left by a previous run is not an error. Any other failure
    # stops the script instead of being reported as "ok".
    if ! mkdir -p "$dir"; then
        echo "${RED}failed${RESET}"
        exit 1
    fi
    echo "${GREEN}ok"
done

# printf instead of `echo -e`: under #!/bin/sh the -e flag is not portable
# (several sh implementations print a literal "-e").
printf '\n%s\n' "$GREEN${BOLD}The data was successfully prepared"
echo "${YELLOW}To train the winning model on gpu, you can now run the following command:"
# The last line resets the terminal attributes so the user's prompt is not
# left yellow once the script has finished.
echo "${YELLOW}THEANO_FLAGS=floatX=float32,device=gpu,optimizer=FAST_RUN ./train.py dest_mlp_tgtcls_1_cswdtx_alexandre${RESET}"