Skip to content

Commit 7feb790

Browse files
yanboliang authored and Nick Pentreath committed
[MINOR][DOC][ML] ml.clustering scala & python api doc sync
## What changes were proposed in this pull request? Since we have done the Scala API audit for ml.clustering at #13148, we should also fix and update the corresponding Python API docs to keep them in sync. ## How was this patch tested? Docs change, no tests. Author: Yanbo Liang <[email protected]> Closes #13291 from yanboliang/spark-15361-followup. (cherry picked from commit 594484c) Signed-off-by: Nick Pentreath <[email protected]>
1 parent 20a07e4 commit 7feb790

File tree

1 file changed

+25
-10
lines changed

1 file changed

+25
-10
lines changed

python/pyspark/ml/clustering.py

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,21 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
6464
.. note:: Experimental
6565
6666
GaussianMixture clustering.
67+
This class performs expectation maximization for multivariate Gaussian
68+
Mixture Models (GMMs). A GMM represents a composite distribution of
69+
independent Gaussian distributions with associated "mixing" weights
70+
specifying each's contribution to the composite.
71+
72+
Given a set of sample points, this class will maximize the log-likelihood
73+
for a mixture of k Gaussians, iterating until the log-likelihood changes by
74+
less than convergenceTol, or until it has reached the max number of iterations.
75+
While this process is generally guaranteed to converge, it is not guaranteed
76+
to find a global optimum.
77+
78+
Note: For high-dimensional data (with many features), this algorithm may perform poorly.
79+
This is due to high-dimensional data (a) making it difficult to cluster at all
80+
(based on statistical/theoretical arguments) and (b) numerical issues with
81+
Gaussian distributions.
6782
6883
>>> from pyspark.ml.linalg import Vectors
6984
@@ -118,8 +133,8 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
118133
.. versionadded:: 2.0.0
119134
"""
120135

121-
k = Param(Params._dummy(), "k", "number of clusters to create",
122-
typeConverter=TypeConverters.toInt)
136+
k = Param(Params._dummy(), "k", "Number of independent Gaussians in the mixture model. " +
137+
"Must be > 1.", typeConverter=TypeConverters.toInt)
123138

124139
@keyword_only
125140
def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
@@ -227,15 +242,15 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
227242
.. versionadded:: 1.5.0
228243
"""
229244

230-
k = Param(Params._dummy(), "k", "number of clusters to create",
245+
k = Param(Params._dummy(), "k", "The number of clusters to create. Must be > 1.",
231246
typeConverter=TypeConverters.toInt)
232247
initMode = Param(Params._dummy(), "initMode",
233-
"the initialization algorithm. This can be either \"random\" to " +
248+
"The initialization algorithm. This can be either \"random\" to " +
234249
"choose random points as initial cluster centers, or \"k-means||\" " +
235250
"to use a parallel variant of k-means++",
236251
typeConverter=TypeConverters.toString)
237-
initSteps = Param(Params._dummy(), "initSteps", "steps for k-means initialization mode",
238-
typeConverter=TypeConverters.toInt)
252+
initSteps = Param(Params._dummy(), "initSteps", "The number of steps for k-means|| " +
253+
"initialization mode. Must be > 0.", typeConverter=TypeConverters.toInt)
239254

240255
@keyword_only
241256
def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
@@ -380,11 +395,11 @@ class BisectingKMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
380395
.. versionadded:: 2.0.0
381396
"""
382397

383-
k = Param(Params._dummy(), "k", "number of clusters to create",
398+
k = Param(Params._dummy(), "k", "The desired number of leaf clusters. Must be > 1.",
384399
typeConverter=TypeConverters.toInt)
385400
minDivisibleClusterSize = Param(Params._dummy(), "minDivisibleClusterSize",
386-
"the minimum number of points (if >= 1.0) " +
387-
"or the minimum proportion",
401+
"The minimum number of points (if >= 1.0) or the minimum " +
402+
"proportion of points (if < 1.0) of a divisible cluster.",
388403
typeConverter=TypeConverters.toFloat)
389404

390405
@keyword_only
@@ -661,7 +676,7 @@ class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInter
661676
.. versionadded:: 2.0.0
662677
"""
663678

664-
k = Param(Params._dummy(), "k", "number of topics (clusters) to infer",
679+
k = Param(Params._dummy(), "k", "The number of topics (clusters) to infer. Must be > 1.",
665680
typeConverter=TypeConverters.toInt)
666681
optimizer = Param(Params._dummy(), "optimizer",
667682
"Optimizer or inference algorithm used to estimate the LDA model. "

0 commit comments

Comments
 (0)