aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlex Auvolat <alex.auvolat@ens.fr>2015-07-29 12:06:00 -0400
committerAlex Auvolat <alex.auvolat@ens.fr>2015-07-29 12:06:00 -0400
commitdefab74395f2ddb2641bba6ab8d18bdedde7a334 (patch)
tree17821ee924e2bdcd2927ed1e61cbf410dac40108
parentca40e5c81d385e1422cebe40e009d7e93b95bfbb (diff)
downloadtaxi-defab74395f2ddb2641bba6ab8d18bdedde7a334.tar.gz
taxi-defab74395f2ddb2641bba6ab8d18bdedde7a334.zip
p-value caluculation script
-rwxr-xr-xdata/make_reference_output.py28
-rw-r--r--data/transformers.py2
-rwxr-xr-xpvalue.py60
3 files changed, 89 insertions, 1 deletions
diff --git a/data/make_reference_output.py b/data/make_reference_output.py
new file mode 100755
index 0000000..1cd31ae
--- /dev/null
+++ b/data/make_reference_output.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+
+import csv
+import os
+
+from fuel.iterator import DataIterator
+from fuel.schemes import SequentialExampleScheme
+from fuel.streams import DataStream
+
+from data.hdf5 import TaxiDataset
+import data
+
+dest_outfile = open(os.path.join(data.path, 'test_answer.csv'), 'w')
+dest_outcsv = csv.writer(dest_outfile)
+dest_outcsv.writerow(["TRIP_ID", "LATITUDE", "LONGITUDE"])
+
+dataset = TaxiDataset('test', 'tvt.hdf5',
+ sources=('trip_id', 'longitude', 'latitude',
+ 'destination_longitude', 'destination_latitude'))
+it = DataIterator(DataStream(dataset), iter(xrange(dataset.num_examples)), as_dict=True)
+
+for v in it:
+ # print v
+ dest_outcsv.writerow([v['trip_id'], v['destination_latitude'],
+ v['destination_longitude']])
+
+dest_outfile.close()
+
diff --git a/data/transformers.py b/data/transformers.py
index f0ed44a..479afc5 100644
--- a/data/transformers.py
+++ b/data/transformers.py
@@ -187,7 +187,7 @@ class _window_helper(object):
if x.shape[0] < self.window_len:
x = numpy.concatenate(
- [x, numpy.full((self.window_len - x.shape[0],), x[-1])])
+ [numpy.full((self.window_len - x.shape[0],), x[0]), x])
y = [x[i: i+x.shape[0]-self.window_len+1][:, None]
for i in range(self.window_len)]
diff --git a/pvalue.py b/pvalue.py
new file mode 100755
index 0000000..15c6e10
--- /dev/null
+++ b/pvalue.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+import os
+import sys
+
+import math
+import numpy
+
+import data
+
+# Haversine distance calculation
+# --------- -------- -----------
+
+rearth = 6371.
+deg2rad = 3.141592653589793 / 180.
+
+def hdist(a, b):
+ lat1 = a[:, 0] * deg2rad
+ lon1 = a[:, 1] * deg2rad
+ lat2 = b[:, 0] * deg2rad
+ lon2 = b[:, 1] * deg2rad
+
+ dlat = abs(lat1-lat2)
+ dlon = abs(lon1-lon2)
+
+ al = numpy.sin(dlat/2)**2 + numpy.cos(lat1) * numpy.cos(lat2) * (numpy.sin(dlon/2)**2)
+ d = numpy.arctan2(numpy.sqrt(al), numpy.sqrt(1.-al))
+
+ hd = 2. * rearth * d
+
+ return hd
+
+
+# Read the inputs
+# ---- --- ------
+
+def readcsv(f):
+ return numpy.genfromtxt(f, delimiter=',', skip_header=1)[:, 1:3]
+
+answer = readcsv(os.path.join(data.path, 'test_answer.csv'))
+
+tables = [readcsv(f) for f in sys.argv if '.csv' in f]
+etables = [hdist(t, answer) for t in tables]
+
+# Calculate p-values
+# --------- --------
+
+pvalue = numpy.zeros((len(tables), len(tables)))
+
+for i, a in enumerate(etables):
+ for j, b in enumerate(etables):
+ if i == j:
+ continue
+ d = b - a
+ var = (numpy.mean((a - numpy.mean(a))**2)
+ + numpy.mean((b - numpy.mean(b))**2)) / 2.
+ pv = 1 - .5 * (1 + math.erf(numpy.mean(d) / numpy.sqrt(2 * var)))
+ pvalue[i, j] = pv
+
+print pvalue