diff options
author | Alex Auvolat <alex.auvolat@ens.fr> | 2015-07-29 12:06:00 -0400 |
---|---|---|
committer | Alex Auvolat <alex.auvolat@ens.fr> | 2015-07-29 12:06:00 -0400 |
commit | defab74395f2ddb2641bba6ab8d18bdedde7a334 (patch) | |
tree | 17821ee924e2bdcd2927ed1e61cbf410dac40108 | |
parent | ca40e5c81d385e1422cebe40e009d7e93b95bfbb (diff) | |
download | taxi-defab74395f2ddb2641bba6ab8d18bdedde7a334.tar.gz taxi-defab74395f2ddb2641bba6ab8d18bdedde7a334.zip |
p-value caluculation script
-rwxr-xr-x | data/make_reference_output.py | 28 | ||||
-rw-r--r-- | data/transformers.py | 2 | ||||
-rwxr-xr-x | pvalue.py | 60 |
3 files changed, 89 insertions, 1 deletions
diff --git a/data/make_reference_output.py b/data/make_reference_output.py new file mode 100755 index 0000000..1cd31ae --- /dev/null +++ b/data/make_reference_output.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python + +import csv +import os + +from fuel.iterator import DataIterator +from fuel.schemes import SequentialExampleScheme +from fuel.streams import DataStream + +from data.hdf5 import TaxiDataset +import data + +dest_outfile = open(os.path.join(data.path, 'test_answer.csv'), 'w') +dest_outcsv = csv.writer(dest_outfile) +dest_outcsv.writerow(["TRIP_ID", "LATITUDE", "LONGITUDE"]) + +dataset = TaxiDataset('test', 'tvt.hdf5', + sources=('trip_id', 'longitude', 'latitude', + 'destination_longitude', 'destination_latitude')) +it = DataIterator(DataStream(dataset), iter(xrange(dataset.num_examples)), as_dict=True) + +for v in it: + # print v + dest_outcsv.writerow([v['trip_id'], v['destination_latitude'], + v['destination_longitude']]) + +dest_outfile.close() + diff --git a/data/transformers.py b/data/transformers.py index f0ed44a..479afc5 100644 --- a/data/transformers.py +++ b/data/transformers.py @@ -187,7 +187,7 @@ class _window_helper(object): if x.shape[0] < self.window_len: x = numpy.concatenate( - [x, numpy.full((self.window_len - x.shape[0],), x[-1])]) + [numpy.full((self.window_len - x.shape[0],), x[0]), x]) y = [x[i: i+x.shape[0]-self.window_len+1][:, None] for i in range(self.window_len)] diff --git a/pvalue.py b/pvalue.py new file mode 100755 index 0000000..15c6e10 --- /dev/null +++ b/pvalue.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python + +import os +import sys + +import math +import numpy + +import data + +# Haversine distance calculation +# --------- -------- ----------- + +rearth = 6371. +deg2rad = 3.141592653589793 / 180. + +def hdist(a, b): + lat1 = a[:, 0] * deg2rad + lon1 = a[:, 1] * deg2rad + lat2 = b[:, 0] * deg2rad + lon2 = b[:, 1] * deg2rad + + dlat = abs(lat1-lat2) + dlon = abs(lon1-lon2) + + al = numpy.sin(dlat/2)**2 + numpy.cos(lat1) * numpy.cos(lat2) * (numpy.sin(dlon/2)**2) + d = numpy.arctan2(numpy.sqrt(al), numpy.sqrt(1.-al)) + + hd = 2. * rearth * d + + return hd + + +# Read the inputs +# ---- --- ------ + +def readcsv(f): + return numpy.genfromtxt(f, delimiter=',', skip_header=1)[:, 1:3] + +answer = readcsv(os.path.join(data.path, 'test_answer.csv')) + +tables = [readcsv(f) for f in sys.argv if '.csv' in f] +etables = [hdist(t, answer) for t in tables] + +# Calculate p-values +# --------- -------- + +pvalue = numpy.zeros((len(tables), len(tables))) + +for i, a in enumerate(etables): + for j, b in enumerate(etables): + if i == j: + continue + d = b - a + var = (numpy.mean((a - numpy.mean(a))**2) + + numpy.mean((b - numpy.mean(b))**2)) / 2. + pv = 1 - .5 * (1 + math.erf(numpy.mean(d) / numpy.sqrt(2 * var))) + pvalue[i, j] = pv + +print pvalue |