diff options
author | Alex Auvolat <alex.auvolat@ens.fr> | 2015-05-04 16:43:48 -0400 |
---|---|---|
committer | Alex Auvolat <alex.auvolat@ens.fr> | 2015-05-04 16:58:31 -0400 |
commit | 80d3ea67a845484d119cb88f0a0412f981ab344c (patch) | |
tree | 37b8130b6d761bcda48c8c0f74114498b85dad97 /data_analysis | |
parent | f9a31bd246e3c4736d3f532b566b7437eba6b4de (diff) | |
download | taxi-80d3ea67a845484d119cb88f0a0412f981ab344c.tar.gz taxi-80d3ea67a845484d119cb88f0a0412f981ab344c.zip |
New data analysis tool: clustering of arrival points.
Diffstat (limited to 'data_analysis')
-rw-r--r-- | data_analysis/cluster_arrival.py | 27 | ||||
-rw-r--r-- | data_analysis/destmaps.py | 20 |
2 files changed, 37 insertions, 10 deletions
diff --git a/data_analysis/cluster_arrival.py b/data_analysis/cluster_arrival.py new file mode 100644 index 0000000..fd4ea04 --- /dev/null +++ b/data_analysis/cluster_arrival.py @@ -0,0 +1,27 @@ +import matplotlib.pyplot as plt +import numpy +import cPickle +import scipy.misc + +from sklearn.cluster import MeanShift, estimate_bandwidth +from sklearn.datasets.samples_generator import make_blobs +from itertools import cycle + +print "Reading arrival point list" +with open("arrivals.pkl") as f: + pts = cPickle.load(f) + +print "Doing clustering" +bw = estimate_bandwidth(pts, quantile=.1, n_samples=1000) +print bw +bw = 0.001 + +ms = MeanShift(bandwidth=bw, bin_seeding=True, min_bin_freq=5) +ms.fit(pts) +cluster_centers = ms.cluster_centers_ + +print "Clusters shape: ", cluster_centers.shape + +with open("arrival-cluters.pkl", "w") as f: + cPickle.dump(cluster_centers, f, protocol=cPickle.HIGHEST_PROTOCOL) + diff --git a/data_analysis/destmaps.py b/data_analysis/destmaps.py index 0725ca7..6d08096 100644 --- a/data_analysis/destmaps.py +++ b/data_analysis/destmaps.py @@ -4,7 +4,7 @@ import cPickle import scipy.misc print "Loading data..." 
-with open("train_normal.pkl") as f: normal = cPickle.load(f) +with open("train.pkl") as f: normal = cPickle.load(f) print "Extracting x and y" # xes = [c[0] for l in normal for c in l[-1]] @@ -12,21 +12,21 @@ print "Extracting x and y" xes = [l[-1][-1][0] for l in normal if len(l[-1]) > 0] yes = [l[-1][-1][1] for l in normal if len(l[-1]) > 0] -xrg = [-8.75, -8.55] -yrg = [41.05, 41.25] +xrg = [-8.80, -8.50] +yrg = [41.00, 41.30] -print "Doing 1d x histogram" -plt.clf(); plt.hist(xes, bins=1000, range=xrg); plt.savefig("xhist_dest.pdf") -print "Doing 1d y histogram" -plt.clf(); plt.hist(yes, bins=1000, range=yrg); plt.savefig("yhist_dest.pdf") +#print "Doing 1d x histogram" +#plt.clf(); plt.hist(xes, bins=2000, range=xrg); plt.savefig("xhist_dest.pdf") +#print "Doing 1d y histogram" +#plt.clf(); plt.hist(yes, bins=2000, range=yrg); plt.savefig("yhist_dest.pdf") print "Doing 2d histogram" -hist, xx, yy = numpy.histogram2d(xes, yes, bins=2000, range=[xrg, yrg]) +hist, xx, yy = numpy.histogram2d(xes, yes, bins=4000, range=[xrg, yrg]) # import ipdb; ipdb.set_trace() print "Imshow" -plt.clf(); plt.imshow(numpy.log(hist)); plt.savefig("xyhmap_dest.png", dpi=600) +plt.clf(); plt.imshow(numpy.log(hist)); plt.savefig("xyhmap_dest_x.png", dpi=600) print "Imsave" -scipy.misc.imsave("xymap_dest_2.png", numpy.log(hist + 1)) +scipy.misc.imsave("xymap_dest_2_x.png", numpy.log(hist + 1)) |