Pick centers in KMeans++ with a probability proportional to the squared distance (distance^2) to the closest previous center, instead of the plain distance
This commit is contained in:
parent
459e7d4a80
commit
45e0e5f8e9
@ -210,8 +210,11 @@ private:
|
||||
assert(index >=0 && index < n);
|
||||
centers[0] = dsindices[index];
|
||||
|
||||
// Using distance^2 gives an even higher probability of picking new centers
|
||||
// far from previous centers (and this complies with the "k-means++: The Advantages of Careful Seeding" article)
|
||||
for (int i = 0; i < n; i++) {
|
||||
closestDistSq[i] = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols);
|
||||
closestDistSq[i] *= closestDistSq[i];
|
||||
currentPot += closestDistSq[i];
|
||||
}
|
||||
|
||||
@ -237,7 +240,10 @@ private:
|
||||
|
||||
// Compute the new potential
|
||||
double newPot = 0;
|
||||
for (int i = 0; i < n; i++) newPot += std::min( distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols), closestDistSq[i] );
|
||||
for (int i = 0; i < n; i++) {
|
||||
DistanceType dist = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols);
|
||||
newPot += std::min( dist*dist, closestDistSq[i] );
|
||||
}
|
||||
|
||||
// Store the best result
|
||||
if ((bestNewPot < 0)||(newPot < bestNewPot)) {
|
||||
@ -249,7 +255,10 @@ private:
|
||||
// Add the appropriate center
|
||||
centers[centerCount] = dsindices[bestNewIndex];
|
||||
currentPot = bestNewPot;
|
||||
for (int i = 0; i < n; i++) closestDistSq[i] = std::min( distance(dataset[dsindices[i]], dataset[dsindices[bestNewIndex]], dataset.cols), closestDistSq[i] );
|
||||
for (int i = 0; i < n; i++) {
|
||||
DistanceType dist = distance(dataset[dsindices[i]], dataset[dsindices[bestNewIndex]], dataset.cols);
|
||||
closestDistSq[i] = std::min( dist*dist, closestDistSq[i] );
|
||||
}
|
||||
}
|
||||
|
||||
centers_length = centerCount;
|
||||
|
Loading…
Reference in New Issue
Block a user