diff options
author | Alex Auvolat <alex@adnab.me> | 2015-07-10 19:21:04 -0400 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2015-07-10 19:21:04 -0400 |
commit | 8442ba13a442d509a407b5913b95d9f1b00989d2 (patch) | |
tree | 821568570dcc277c1638c72ad06460b2a7b03b05 /data_analysis/cluster_arrival.py | |
parent | c5187418bc93c34e3fdce4fdc1a3b5316812b69a (diff) | |
parent | 97b9672860b97a397ae48b24287741922e7fcb8a (diff) | |
download | taxi-8442ba13a442d509a407b5913b95d9f1b00989d2.tar.gz taxi-8442ba13a442d509a407b5913b95d9f1b00989d2.zip |
Merge branch 'master' of github.com:adbrebs/taxi
Diffstat (limited to 'data_analysis/cluster_arrival.py')
-rwxr-xr-x[-rw-r--r--] | data_analysis/cluster_arrival.py | 23 |
1 files changed, 17 insertions, 6 deletions
```diff
diff --git a/data_analysis/cluster_arrival.py b/data_analysis/cluster_arrival.py
index fd4ea04..5e990cd 100644..100755
--- a/data_analysis/cluster_arrival.py
+++ b/data_analysis/cluster_arrival.py
@@ -1,20 +1,31 @@
-import matplotlib.pyplot as plt
+#!/usr/bin/env python
 import numpy
 import cPickle
 import scipy.misc
+import os
 
 from sklearn.cluster import MeanShift, estimate_bandwidth
 from sklearn.datasets.samples_generator import make_blobs
 from itertools import cycle
 
-print "Reading arrival point list"
-with open("arrivals.pkl") as f:
-    pts = cPickle.load(f)
+import data
+from data.hdf5 import taxi_it
+from data.transformers import add_destination
+
+print "Generating arrival point list"
+dests = []
+for v in taxi_it("train"):
+    if len(v['latitude']) == 0: continue
+    dests.append([v['latitude'][-1], v['longitude'][-1]])
+pts = numpy.array(dests)
+
+with open(os.path.join(data.path, "arrivals.pkl"), "w") as f:
+    cPickle.dump(pts, f, protocol=cPickle.HIGHEST_PROTOCOL)
 
 print "Doing clustering"
 bw = estimate_bandwidth(pts, quantile=.1, n_samples=1000)
 print bw
-bw = 0.001
+bw = 0.001 # (
 
 ms = MeanShift(bandwidth=bw, bin_seeding=True, min_bin_freq=5)
 ms.fit(pts)
@@ -22,6 +33,6 @@ cluster_centers = ms.cluster_centers_
 
 print "Clusters shape: ", cluster_centers.shape
 
-with open("arrival-cluters.pkl", "w") as f:
+with open(os.path.join(data.path, "arrival-clusters.pkl"), "w") as f:
     cPickle.dump(cluster_centers, f, protocol=cPickle.HIGHEST_PROTOCOL)
 
```