aboutsummaryrefslogtreecommitdiff
path: root/data_analysis
diff options
context:
space:
mode:
authorAlex Auvolat <alex.auvolat@ens.fr>2015-05-04 16:43:48 -0400
committerAlex Auvolat <alex.auvolat@ens.fr>2015-05-04 16:58:31 -0400
commit80d3ea67a845484d119cb88f0a0412f981ab344c (patch)
tree37b8130b6d761bcda48c8c0f74114498b85dad97 /data_analysis
parentf9a31bd246e3c4736d3f532b566b7437eba6b4de (diff)
downloadtaxi-80d3ea67a845484d119cb88f0a0412f981ab344c.tar.gz
taxi-80d3ea67a845484d119cb88f0a0412f981ab344c.zip
New data analysis tool: clustering of arrival points.
Diffstat (limited to 'data_analysis')
-rw-r--r--data_analysis/cluster_arrival.py27
-rw-r--r--data_analysis/destmaps.py20
2 files changed, 37 insertions, 10 deletions
diff --git a/data_analysis/cluster_arrival.py b/data_analysis/cluster_arrival.py
new file mode 100644
index 0000000..fd4ea04
--- /dev/null
+++ b/data_analysis/cluster_arrival.py
@@ -0,0 +1,27 @@
+# Cluster the arrival points in arrivals.pkl with MeanShift and pickle the cluster centers.
+import matplotlib.pyplot as plt
+import numpy
+import cPickle
+import scipy.misc
+
+from sklearn.cluster import MeanShift, estimate_bandwidth
+from sklearn.datasets.samples_generator import make_blobs
+from itertools import cycle
+
+print "Reading arrival point list"
+with open("arrivals.pkl", "rb") as f:  # binary mode: pickle data is not text
+    pts = cPickle.load(f)
+
+print "Doing clustering"
+bw = estimate_bandwidth(pts, quantile=.1, n_samples=1000)
+print bw  # the estimate is only printed for reference ...
+bw = 0.001  # ... the clustering itself uses this fixed bandwidth instead
+
+ms = MeanShift(bandwidth=bw, bin_seeding=True, min_bin_freq=5)
+ms.fit(pts)
+cluster_centers = ms.cluster_centers_
+
+print "Clusters shape: ", cluster_centers.shape
+
+with open("arrival-cluters.pkl", "wb") as f:  # HIGHEST_PROTOCOL is binary: needs "wb"
+    cPickle.dump(cluster_centers, f, protocol=cPickle.HIGHEST_PROTOCOL)
diff --git a/data_analysis/destmaps.py b/data_analysis/destmaps.py
index 0725ca7..6d08096 100644
--- a/data_analysis/destmaps.py
+++ b/data_analysis/destmaps.py
@@ -4,7 +4,7 @@ import cPickle
import scipy.misc
print "Loading data..."
-with open("train_normal.pkl") as f: normal = cPickle.load(f)
+with open("train.pkl") as f: normal = cPickle.load(f)
print "Extracting x and y"
# xes = [c[0] for l in normal for c in l[-1]]
@@ -12,21 +12,21 @@ print "Extracting x and y"
xes = [l[-1][-1][0] for l in normal if len(l[-1]) > 0]
yes = [l[-1][-1][1] for l in normal if len(l[-1]) > 0]
-xrg = [-8.75, -8.55]
-yrg = [41.05, 41.25]
+xrg = [-8.80, -8.50]
+yrg = [41.00, 41.30]
-print "Doing 1d x histogram"
-plt.clf(); plt.hist(xes, bins=1000, range=xrg); plt.savefig("xhist_dest.pdf")
-print "Doing 1d y histogram"
-plt.clf(); plt.hist(yes, bins=1000, range=yrg); plt.savefig("yhist_dest.pdf")
+#print "Doing 1d x histogram"
+#plt.clf(); plt.hist(xes, bins=2000, range=xrg); plt.savefig("xhist_dest.pdf")
+#print "Doing 1d y histogram"
+#plt.clf(); plt.hist(yes, bins=2000, range=yrg); plt.savefig("yhist_dest.pdf")
print "Doing 2d histogram"
-hist, xx, yy = numpy.histogram2d(xes, yes, bins=2000, range=[xrg, yrg])
+hist, xx, yy = numpy.histogram2d(xes, yes, bins=4000, range=[xrg, yrg])
# import ipdb; ipdb.set_trace()
print "Imshow"
-plt.clf(); plt.imshow(numpy.log(hist)); plt.savefig("xyhmap_dest.png", dpi=600)
+plt.clf(); plt.imshow(numpy.log(hist)); plt.savefig("xyhmap_dest_x.png", dpi=600)
print "Imsave"
-scipy.misc.imsave("xymap_dest_2.png", numpy.log(hist + 1))
+scipy.misc.imsave("xymap_dest_2_x.png", numpy.log(hist + 1))