From edbf641c82456eabe8f777436a52f6cf2b8c1b5a Mon Sep 17 00:00:00 2001 From: CHERIBET CHERIF CHOUAIB Date: Sun, 15 Feb 2026 18:33:16 +0100 Subject: [PATCH 1/2] Add KMedoids clusterer, docs, tests and benchmark Introduce a new KMedoids clusterer implementation (src/Clusterers/KMedoids.php) implementing Estimator, Learner, Online, Probabilistic, Verbose and Persistable interfaces. Adds full training/partial training, predict/proba, inertia/loss tracking, medoids/sizes accessors, serialization and parameter validation. Also add documentation (docs/clusterers/k-medoids.md), unit tests (tests/Clusterers/KMedoidsTest.php) and a benchmark (benchmarks/Clusterers/KMedoidsBench.php). Tests and code use a seeder, distance kernel, and basic logging; invalid inputs and untrained prediction are guarded by exceptions. --- benchmarks/Clusterers/KMedoidsBench.php | 60 +++ docs/clusterers/k-medoids.md | 61 +++ src/Clusterers/KMedoids.php | 588 ++++++++++++++++++++++++ tests/Clusterers/KMedoidsTest.php | 220 +++++++++ 4 files changed, 929 insertions(+) create mode 100644 benchmarks/Clusterers/KMedoidsBench.php create mode 100644 docs/clusterers/k-medoids.md create mode 100644 src/Clusterers/KMedoids.php create mode 100644 tests/Clusterers/KMedoidsTest.php diff --git a/benchmarks/Clusterers/KMedoidsBench.php b/benchmarks/Clusterers/KMedoidsBench.php new file mode 100644 index 000000000..5fa9176e1 --- /dev/null +++ b/benchmarks/Clusterers/KMedoidsBench.php @@ -0,0 +1,60 @@ + new Blob([5.0, 3.42, 1.46, 0.24], [0.35, 0.38, 0.17, 0.1]), + 'Iris-versicolor' => new Blob([5.94, 2.77, 4.26, 1.33], [0.51, 0.31, 0.47, 0.2]), + 'Iris-virginica' => new Blob([6.59, 2.97, 5.55, 2.03], [0.63, 0.32, 0.55, 0.27]), + ]); + + $this->training = $generator->generate(self::TRAINING_SIZE); + + $this->testing = $generator->generate(self::TESTING_SIZE); + + $this->estimator = new KMedoids(3); + } + + /** + * @Subject + * @Iterations(5) + * @OutputTimeUnit("seconds", precision=3) + */ + public function 
trainPredict() : void + { + $this->estimator->train($this->training); + + $this->estimator->predict($this->testing); + } +} diff --git a/docs/clusterers/k-medoids.md b/docs/clusterers/k-medoids.md new file mode 100644 index 000000000..0629dd3ab --- /dev/null +++ b/docs/clusterers/k-medoids.md @@ -0,0 +1,61 @@ +[source] + +# K Medoids +A robust centroid-based hard clustering algorithm that uses actual data points (medoids) as cluster centers instead of computed means. K Medoids is more resistant to outliers than K Means and is suitable for clustering with arbitrary distance metrics. The algorithm minimizes the sum of dissimilarities between samples and their nearest medoid by alternating between assigning every sample to its nearest medoid and re-selecting each cluster's medoid, a simpler alternative to the swap-based Partitioning Around Medoids (PAM) procedure. + +**Interfaces:** [Estimator](../estimator.md), [Learner](../learner.md), [Online](../online.md), [Probabilistic](../probabilistic.md), [Persistable](../persistable.md), [Verbose](../verbose.md) + +**Data Type Compatibility:** Continuous + +## Parameters +| # | Name | Default | Type | Description | +|---|---|---|---|---| +| 1 | k | | int | The number of target clusters. | +| 2 | batch size | 128 | int | The size of each mini batch in samples. | +| 3 | epochs | 1000 | int | The maximum number of training rounds to execute. | +| 4 | min change | 1e-4 | float | The minimum change in the inertia for training to continue. | +| 5 | window | 5 | int | The number of epochs without improvement in the training loss to wait before considering an early stop. | +| 6 | kernel | Euclidean | Distance | The distance kernel used to compute the distance between sample points. | +| 7 | seeder | PlusPlus | Seeder | The seeder used to initialize the cluster medoids. 
| + +## Example +```php +use Rubix\ML\Clusterers\KMedoids; +use Rubix\ML\Kernels\Distance\Euclidean; +use Rubix\ML\Clusterers\Seeders\PlusPlus; + +$estimator = new KMedoids(3, 128, 300, 10.0, 10, new Euclidean(), new PlusPlus()); +``` + +## Additional Methods +Return the *k* computed medoids of the training set: +```php +public medoids() : array[] +``` + +Return the number of training samples that each medoid is responsible for: +```php +public sizes() : int[] +``` + +Return an iterable progress table with the steps from the last training session: +```php +public steps() : iterable +``` + +```php +use Rubix\ML\Extractors\CSV; + +$extractor = new CSV('progress.csv', true); + +$extractor->export($estimator->steps()); +``` + +Return the loss for each epoch from the last training session: +```php +public losses() : float[]|null +``` + +## References +[^1]: L. Kaufman et al. (1987). Clustering by means of Medoids. +[^2]: H. S. Park et al. (2009). A simple and fast algorithm for K-medoids clustering. diff --git a/src/Clusterers/KMedoids.php b/src/Clusterers/KMedoids.php new file mode 100644 index 000000000..d9004e4f3 --- /dev/null +++ b/src/Clusterers/KMedoids.php @@ -0,0 +1,588 @@ + + */ + protected int $k; + + /** + * The size of each mini batch in samples. + * + * @var positive-int + */ + protected int $batchSize; + + /** + * The maximum number of iterations to run until the algorithm terminates. + * + * @var int + */ + protected int $epochs; + + /** + * The minimum change in the inertia for training to continue. + * + * @var float + */ + protected float $minChange; + + /** + * The number of epochs without improvement in the training loss to wait before considering an early stop. + * + * @var int + */ + protected int $window; + + /** + * The distance function to use when computing the distances. + * + * @var Distance + */ + protected Distance $kernel; + + /** + * The cluster medoid seeder. 
+ * + * @var Seeder + */ + protected Seeder $seeder; + + /** + * The medoid vectors of the training data (actual data points). + * + * @var list> + */ + protected array $medoids = [ + // + ]; + + /** + * The complete dataset samples stored for medoid updates. + * + * @var list> + */ + protected array $samples = [ + // + ]; + + /** + * The number of training samples contained within each cluster. + * + * @var int[] + */ + protected array $sizes = [ + // + ]; + + /** + * The loss at each epoch from the last training session. + * + * @var float[]|null + */ + protected ?array $losses = null; + + /** + * @param int $k + * @param int $batchSize + * @param int $epochs + * @param float $minChange + * @param int $window + * @param Distance|null $kernel + * @param Seeder|null $seeder + * @throws InvalidArgumentException + */ + public function __construct( + int $k, + int $batchSize = 128, + int $epochs = 1000, + float $minChange = 1e-4, + int $window = 5, + ?Distance $kernel = null, + ?Seeder $seeder = null + ) { + if ($k < 1) { + throw new InvalidArgumentException('K must be greater' + . " than 0, $k given."); + } + + if ($batchSize < 1) { + throw new InvalidArgumentException('Batch size must be' + . " greater than 0, $batchSize given."); + } + + if ($epochs < 0) { + throw new InvalidArgumentException('Number of epochs' + . " must be greater than 0, $epochs given."); + } + + if ($minChange < 0.0) { + throw new InvalidArgumentException('Minimum change must be' + . " greater than 0, $minChange given."); + } + + if ($window < 1) { + throw new InvalidArgumentException('Window must be' + . " greater than 0, $window given."); + } + + $this->k = $k; + $this->batchSize = $batchSize; + $this->epochs = $epochs; + $this->minChange = $minChange; + $this->window = $window; + $this->kernel = $kernel ?? new Euclidean(); + $this->seeder = $seeder ?? new PlusPlus($kernel); + } + + /** + * Return the estimator type. 
+ * + * @return EstimatorType + */ + public function type() : EstimatorType + { + return EstimatorType::clusterer(); + } + + /** + * Return the data types that the estimator is compatible with. + * + * @return list + */ + public function compatibility() : array + { + return [ + DataType::continuous(), + ]; + } + + /** + * Return the settings of the hyper-parameters in an associative array. + * + * @return mixed[] + */ + public function params() : array + { + return [ + 'k' => $this->k, + 'batch size' => $this->batchSize, + 'epochs' => $this->epochs, + 'min change' => $this->minChange, + 'window' => $this->window, + 'kernel' => $this->kernel, + 'seeder' => $this->seeder, + ]; + } + + /** + * Has the learner been trained? + * + * @return bool + */ + public function trained() : bool + { + return !empty($this->medoids); + } + + /** + * Return the computed cluster medoids of the training data. + * + * @return list> + */ + public function medoids() : array + { + return $this->medoids; + } + + /** + * Return the number of training samples each medoid is responsible for. + * + * @return int[] + */ + public function sizes() : array + { + return $this->sizes; + } + + /** + * Return an iterable progress table with the steps from the last training session. + * + * @return Generator + */ + public function steps() : Generator + { + if (!$this->losses) { + return; + } + + foreach ($this->losses as $epoch => $loss) { + yield [ + 'epoch' => $epoch, + 'loss' => $loss, + ]; + } + } + + /** + * Return the loss for each epoch from the last training session. + * + * @return float[]|null + */ + public function losses() : ?array + { + return $this->losses; + } + + /** + * Train the learner with a dataset. 
+ * + * @param Dataset $dataset + */ + public function train(Dataset $dataset) : void + { + SpecificationChain::with([ + new DatasetIsNotEmpty($dataset), + new SamplesAreCompatibleWithEstimator($dataset, $this), + ])->check(); + + /** @var list> $seeds */ + $seeds = $this->seeder->seed($dataset, $this->k); + + $this->medoids = $seeds; + $this->samples = []; // partial() appends this dataset's samples itself; pre-filling here stored every sample twice + + $sizes = array_fill(0, $this->k, 0); + $sizes[0] = $dataset->numSamples(); + + $this->sizes = $sizes; + + $this->partial($dataset); + } + + /** + * Perform a partial train on the learner. + * + * @param Dataset $dataset + */ + public function partial(Dataset $dataset) : void + { + if (empty($this->medoids) or empty($this->sizes)) { + $this->train($dataset); + + return; + } + + SpecificationChain::with([ + new DatasetIsNotEmpty($dataset), + new SamplesAreCompatibleWithEstimator($dataset, $this), + new DatasetHasDimensionality($dataset, count(current($this->medoids))), + ])->check(); + + // Merge new samples with existing samples + $newSamples = $dataset->samples(); + $this->samples = array_merge($this->samples, $newSamples); + + if ($this->logger) { + $this->logger->info("Training $this"); + } + + $allSamples = $this->samples; + $labels = array_fill(0, count($allSamples), 0); + + $fullDataset = Labeled::quick($allSamples, $labels); + + $prevLoss = $bestLoss = INF; + $numWorseEpochs = 0; + + $this->losses = []; + + for ($epoch = 1; $epoch <= $this->epochs; ++$epoch) { + // Assign all samples to nearest medoid + $assignments = array_map([$this, 'predictSample'], $allSamples); + + // Group samples by cluster + $clusters = array_fill(0, $this->k, []); + + foreach ($assignments as $i => $cluster) { + $clusters[$cluster][] = $i; + } + + // Update medoids - find the sample that minimizes total distance within each cluster + foreach ($clusters as $cluster => $indices) { + if (empty($indices)) { + continue; + } + + $bestMedoidIndex = $indices[0]; + $bestTotalDistance = INF; + + // Try each point in 
the cluster as a potential medoid + foreach ($indices as $candidateIndex) { + $totalDistance = 0.0; + + foreach ($indices as $pointIndex) { + $totalDistance += $this->kernel->compute( + $allSamples[$candidateIndex], + $allSamples[$pointIndex] + ); + } + + if ($totalDistance < $bestTotalDistance) { + $bestTotalDistance = $totalDistance; + $bestMedoidIndex = $candidateIndex; + } + } + + $this->medoids[$cluster] = $allSamples[$bestMedoidIndex]; + $this->sizes[$cluster] = count($indices); + } + + // Calculate loss (total cost) + $loss = $this->inertia($allSamples, $assignments); + $loss /= count($allSamples); + + $lossChange = abs($prevLoss - $loss); + + $this->losses[$epoch] = $loss; + + if ($this->logger) { + $lossDirection = $loss < $prevLoss ? '↓' : '↑'; + + $message = "Epoch: $epoch, " + . "Inertia: $loss, " + . "Loss Change: {$lossDirection}{$lossChange}"; + + $this->logger->info($message); + } + + if (is_nan($loss)) { + if ($this->logger) { + $this->logger->warning('Numerical instability detected'); + } + + break; + } + + if ($loss <= 0.0) { + break; + } + + if ($lossChange < $this->minChange) { + break; + } + + if ($loss < $bestLoss) { + $bestLoss = $loss; + + $numWorseEpochs = 0; + } else { + ++$numWorseEpochs; + } + + if ($numWorseEpochs >= $this->window) { + break; + } + + $prevLoss = $loss; + } + + if ($this->logger) { + $this->logger->info('Training complete'); + } + } + + /** + * Cluster the dataset by assigning a label to each sample. + * + * @param Dataset $dataset + * @throws RuntimeException + * @return list + */ + public function predict(Dataset $dataset) : array + { + if (!$this->medoids) { + throw new RuntimeException('Estimator has not been trained.'); + } + + DatasetHasDimensionality::with($dataset, count(current($this->medoids)))->check(); + + return array_map([$this, 'predictSample'], $dataset->samples()); + } + + /** + * Label a given sample based on its distance from a particular medoid. 
+ * + * @internal + * + * @param list $sample + * @return int + */ + public function predictSample(array $sample) : int + { + $bestDistance = INF; + $bestCluster = -1; + + foreach ($this->medoids as $cluster => $medoid) { + $distance = $this->kernel->compute($sample, $medoid); + + if ($distance < $bestDistance) { + $bestDistance = $distance; + $bestCluster = $cluster; + } + } + + return (int) $bestCluster; + } + + /** + * Estimate the joint probabilities for each possible outcome. + * + * @param Dataset $dataset + * @throws RuntimeException + * @return list + */ + public function proba(Dataset $dataset) : array + { + if (!$this->medoids) { + throw new RuntimeException('Estimator has not been trained.'); + } + + DatasetHasDimensionality::with($dataset, count(current($this->medoids)))->check(); + + return array_map([$this, 'probaSample'], $dataset->samples()); + } + + /** + * Return the membership of a sample to each medoid as 1 / sum_j(d_i / d_j), where a zero distance d is replaced by EPSILON. + * + * @internal + * + * @param list $sample + * @return float[] + */ + public function probaSample(array $sample) : array + { + $distances = $dist = []; + + foreach ($this->medoids as $medoid) { + $distances[] = $this->kernel->compute($sample, $medoid) ?: EPSILON; + } + + foreach ($distances as $distanceA) { + $sigma = 0.0; + + foreach ($distances as $distanceB) { + $sigma += $distanceA / $distanceB; + } + + $dist[] = 1.0 / $sigma; + } + + return $dist; + } + + /** + * Calculate the sum of the distances between each sample and the medoid of the + * cluster given by its label. The sum is returned as-is, not averaged. + * + * @param list> $samples + * @param list $labels + * @return float + */ + protected function inertia(array $samples, array $labels) : float + { + $inertia = 0.0; + + foreach ($samples as $i => $sample) { + $medoid = $this->medoids[$labels[$i]]; + + $inertia += $this->kernel->compute($sample, $medoid); + } + + return $inertia; + } + + /** + * Return an associative array containing the data used to serialize the object. 
+ * + * @return mixed[] + */ + public function __serialize() : array + { + $properties = get_object_vars($this); + + unset($properties['losses'], $properties['logger']); + + return $properties; + } + + /** + * Return the string representation of the object. + * + * @internal + * + * @return string + */ + public function __toString() : string + { + return 'K Medoids (' . Params::stringify($this->params()) . ')'; + } +} diff --git a/tests/Clusterers/KMedoidsTest.php b/tests/Clusterers/KMedoidsTest.php new file mode 100644 index 000000000..531b78412 --- /dev/null +++ b/tests/Clusterers/KMedoidsTest.php @@ -0,0 +1,220 @@ +generator = new Agglomerate([ + 'red' => new Blob([255, 32, 0], 50.0), + 'green' => new Blob([0, 128, 0], 10.0), + 'blue' => new Blob([0, 32, 255], 30.0), + ], [0.5, 0.2, 0.3]); + + $this->estimator = new KMedoids(3, 128, 300, 1e-4, 5, new Euclidean(), new PlusPlus()); + + $this->metric = new VMeasure(); + + srand(self::RANDOM_SEED); + } + + protected function assertPreConditions() : void + { + $this->assertFalse($this->estimator->trained()); + } + + /** + * @test + */ + public function build() : void + { + $this->assertInstanceOf(KMedoids::class, $this->estimator); + $this->assertInstanceOf(Learner::class, $this->estimator); + $this->assertInstanceOf(Online::class, $this->estimator); + $this->assertInstanceOf(Probabilistic::class, $this->estimator); + $this->assertInstanceOf(Persistable::class, $this->estimator); + $this->assertInstanceOf(Verbose::class, $this->estimator); + $this->assertInstanceOf(Estimator::class, $this->estimator); + } + + /** + * @test + */ + public function badK() : void + { + $this->expectException(InvalidArgumentException::class); + + new KMedoids(0); + } + + /** + * @test + */ + public function type() : void + { + $this->assertEquals(EstimatorType::clusterer(), $this->estimator->type()); + } + + /** + * @test + */ + public function compatibility() : void + { + $expected = [ + DataType::continuous(), + ]; + + 
$this->assertEquals($expected, $this->estimator->compatibility()); + } + + /** + * @test + */ + public function params() : void + { + $expected = [ + 'k' => 3, + 'batch size' => 128, + 'epochs' => 300, + 'min change' => 1e-4, + 'window' => 5, + 'kernel' => new Euclidean(), + 'seeder' => new PlusPlus(), + ]; + + $this->assertEquals($expected, $this->estimator->params()); + } + + /** + * @test + */ + public function trainPartialPredict() : void + { + $this->estimator->setLogger(new BlackHole()); + + $training = $this->generator->generate(self::TRAIN_SIZE); + $testing = $this->generator->generate(self::TEST_SIZE); + + $folds = $training->stratifiedFold(3); + + $this->estimator->train($folds[0]); + $this->estimator->partial($folds[1]); + $this->estimator->partial($folds[2]); + + $this->assertTrue($this->estimator->trained()); + + $medoids = $this->estimator->medoids(); + + $this->assertIsArray($medoids); + $this->assertCount(3, $medoids); + $this->assertContainsOnly('array', $medoids); + + $sizes = $this->estimator->sizes(); + + $this->assertIsArray($sizes); + $this->assertCount(3, $sizes); + $this->assertContainsOnly('int', $sizes); + + $losses = $this->estimator->losses(); + + $this->assertIsArray($losses); + $this->assertContainsOnly('float', $losses); + + $predictions = $this->estimator->predict($testing); + + $score = $this->metric->score($predictions, $testing->labels()); + + $this->assertGreaterThanOrEqual(self::MIN_SCORE, $score); + } + + /** + * @test + */ + public function trainIncompatible() : void + { + $this->expectException(InvalidArgumentException::class); + + $this->estimator->train(Unlabeled::quick([['bad']])); + } + + /** + * @test + */ + public function predictUntrained() : void + { + $this->expectException(RuntimeException::class); + + $this->estimator->predict(Unlabeled::quick([[1.0]])); + } +} From 75615a28024297dc53cce036a90b2f86b685351e Mon Sep 17 00:00:00 2001 From: CHERIBET CHERIF CHOUAIB Date: Sun, 15 Feb 2026 18:58:37 +0100 Subject: [PATCH 
2/2] +++ . --- CHANGELOG.md | 3 +++ composer.json | 2 +- mkdocs.yml | 1 + test-kmedoids-minimal.php | 46 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 test-kmedoids-minimal.php diff --git a/CHANGELOG.md b/CHANGELOG.md index cde414469..afdb55aae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +- 2.5.3 + - Added K Medoids clusterer + - 2.5.2 - Fix bug in One-class SVM inferencing diff --git a/composer.json b/composer.json index 34235ba7a..d33b8829c 100644 --- a/composer.json +++ b/composer.json @@ -11,7 +11,7 @@ "data science", "data mining", "dbscan", "deep learning", "dimensionality reduction", "ensemble", "estimator", "etl", "feature extraction", "feature selection", "feature importance", "gaussian mixture", "gbm", "gmm", "gradient boost", "grid search", "image recognition", - "imputation", "inference", "isolation forest", "k-means", "kmeans", "k-nearest neighbors", + "imputation", "inference", "isolation forest", "k-means", "kmeans", "k-medoids", "kmedoids", "k-nearest neighbors", "knn", "linear regression", "loda", "local outlier factor", "lof", "logistic regression", "machine learning", "manifold learning", "mean shift", "ml", "mlp", "multilayer perceptron", "naive bayes", "neural network", "natural language processing", "nearest neighbors", "nlp", diff --git a/mkdocs.yml b/mkdocs.yml index a682805f1..b69b28ff4 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -97,6 +97,7 @@ nav: - Fuzzy C Means: clusterers/fuzzy-c-means.md - Gaussian Mixture: clusterers/gaussian-mixture.md - K Means: clusterers/k-means.md + - K Medoids: clusterers/k-medoids.md - Mean Shift: clusterers/mean-shift.md - Anomaly Detectors: - Gaussian MLE: anomaly-detectors/gaussian-mle.md diff --git a/test-kmedoids-minimal.php b/test-kmedoids-minimal.php new file mode 100644 index 000000000..66fca0d67 --- /dev/null +++ b/test-kmedoids-minimal.php @@ -0,0 +1,46 @@ +hasMethod($method)) { + echo "✓ Method '$method' exists\n"; + } 
else { + echo "✗ Method '$method' missing\n"; + } + } + + echo "\n✓ All K-Medoids code structure is valid!\n"; +} else { + echo "✗ KMedoids class not found\n"; +}