iRoCS Toolbox  1.1.0
kmeans-inl.hh
Go to the documentation of this file.
1 /**************************************************************************
2  *
3  * Copyright (C) 2015 Margret Keuper, Thorsten Falk
4  *
5  * Image Analysis Lab, University of Freiburg, Germany
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  *
21  **************************************************************************/
22 
23 #ifndef LIBMARGRET_SRC_KMEANS_INL_HH
24 #define LIBMARGRET_SRC_KMEANS_INL_HH
25 
26 #ifdef HAVE_CONFIG_H
27 #include <config.hh>
28 #endif
29 
30 #include "kmeans.hh"
31 
32 #include <time.h>
33 
34 #include <blitz/array.h>
35 #include <vector>
36 #include <set>
37 
38 #include "segmentation_error.hh"
39 
40 namespace segmentation
41 {
42 
43  template<class DataT>
45  {}
46 
47  template<class DataT>
49  const PointT& p1, const PointT& p2) const
50  {
51  PointT diff(p1.shape());
52  diff = p1 - p2;
53  return blitz::sum(diff*diff);
54  }
55 
56  template<class DataT>
57  _KMeans<DataT>::_KMeans(unsigned int k) :
58  k_(k), max_iterations_(50), max_restarts_(100),
59  variance_threshold_(0),
60  variance_iter_no_improvement_(10),
61  all(blitz::Range::all())
62  {
63  distance_ = new EuclideanDistance();
64  }
65 
66 
67  template<class DataT>
69  {
70  delete distance_;
71  }
72 
73  template<class DataT>
74  double _KMeans<DataT>::cluster(const ArrayPointT &points,
75  ArrayPointT &best_means,
76  std::vector<int> &labels)
77  {
78  num_points_ = points.extent(0);
79  point_dim_ = points.extent(1);
80  // throw exception if number of data point is lower than k
81  if (num_points_ < k_)
82  {
83  std::stringstream s;
84  s << "KMeans: Number of data points " << num_points_
85  << " is less than k=" << k_;
86  throw LabellingError(s.str().c_str());
87  }
88  points_.reference(points);
89 
90  means_.resize(k_, point_dim_);
91  labels_.resize(num_points_, 0);
92 
93  //srand(static_cast<unsigned int>(time(NULL)));
94  // compute boundaries
95  //boundaries(points_, lbound_, ubound_);
96 
97  // restart multiple times and return lowest variance result
98  best_means.resize(k_, point_dim_);
99  std::vector<int> &best_labels = labels;
100  best_labels.resize(k_);
101  double min_variance = std::numeric_limits<double>::max();
102  double last_variance = min_variance;
103  // iterate till variance threshold or maximum number of restarts is reached
104  for (int i = 0; i < max_restarts_ && min_variance > variance_threshold_; ++i)
105  {
106  initializeMeans();
107  iterateUntilConvergence();
108  double variance = totalVariance();
109  if (variance < min_variance)
110  {
111  min_variance = variance;
112  best_means = means_;
113  best_labels = labels_;
114  }
115  // stop if variance didn't drop for x iterations
116  if (i % variance_iter_no_improvement_ == 0)
117  {
118  if (last_variance == min_variance)
119  {
120  break;
121  }
122  last_variance = min_variance;
123  }
124  }
125  return min_variance;
126  }
127 
128  template<class DataT>
130  {
131  for (int i = 0; i < max_iterations_; ++i)
132  {
133  //stop if labels don't change
134  if (!assignLabels()) break;
135  updateMeans();
136  }
137  }
138 
139 
140  template<class DataT>
142  {
143  std::set<ptrdiff_t> sampled_indices;
144  while (sampled_indices.size() < k_)
145  {
146  ptrdiff_t rand_i = static_cast<ptrdiff_t>(
147  static_cast<double>(rand()) / (RAND_MAX - 1) *
148  static_cast<double>(num_points_));
149  sampled_indices.insert(rand_i);
150  }
151  int i = 0;
152  for (std::set<ptrdiff_t>::const_iterator it = sampled_indices.begin();
153  it != sampled_indices.end(); ++it, ++i)
154  means_(i, all) = points_(*it, all);
155  }
156 
157  template<class DataT>
159  {
160  means_ = 0.;
161  blitz::Array<double, 1> tt(4);
162  blitz::Array<DataT, 1> count_points(k_);
163  count_points = 0;
164  for (ptrdiff_t i_p = 0; i_p < num_points_; ++i_p)
165  {
166  ptrdiff_t mean_idx = labels_[i_p];
167  means_(mean_idx, all) += points_(i_p, all);
168  ++count_points(mean_idx);
169  }
170  blitz::firstIndex i;
171  blitz::secondIndex j;
172  means_ = means_(i, j) / count_points(i);
173  // means_(i, all) /= i; //count_points(i);
174 
175  }
176 
177  template<class DataT>
179  {
180  bool changed = false;
181  for (ptrdiff_t point_idx = 0; point_idx < num_points_; ++point_idx)
182  {
183  const PointT& point = points_(point_idx, all);
184  double min_dist = std::numeric_limits<DataT>::max();
185  int old_label = labels_[point_idx];
186  // find nearest mean
187  for (ptrdiff_t mean_idx = 0; mean_idx < k_; ++mean_idx)
188  {
189  const PointT &mean = means_(mean_idx, all);
190  double dist = (*distance_)(mean, point);
191  if (dist < min_dist)
192  {
193  min_dist = dist;
194  labels_[point_idx] = mean_idx;
195  }
196  }
197  if (!changed && old_label != labels_[point_idx])
198  changed = true;
199  }
200  return changed;
201  }
202 
203  template<class DataT>
205  {
206  double variance = 0;
207  for (ptrdiff_t i = 0; i < num_points_; ++i)
208  {
209  double dist = (*distance_)(points_(i, all), means_(labels_[i], all));
210  variance += blitz::pow2(dist);
211  }
212  return variance / static_cast<double>(num_points_);
213  }
214 
215  template<class DataT>
216  double kmeans(const unsigned int k,
217  const blitz::Array<DataT, 2 > &points,
218  blitz::Array<DataT, 2 > &means,
219  std::vector<int> &labels
220  )
221  {
222  // shortcut for point
224  return kmeans.cluster(points, means, labels);
225  //kmeans.cluster();
226 
227  }
228 
229 }
230 #endif
double cluster(const ArrayPointT &points, ArrayPointT &means, std::vector< int > &labels)
Definition: kmeans-inl.hh:74
_KMeans(const unsigned int k, const Distance &distance)
virtual double operator()(const PointT &p1, const PointT &p2) const
Definition: kmeans-inl.hh:48
double kmeans(const unsigned int k, const blitz::Array< DataT, 2 > &points, blitz::Array< DataT, 2 > &means, std::vector< int > &labels)
Definition: kmeans-inl.hh:216