@@ -64,6 +64,21 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
6464 .. note:: Experimental
6565
6666 GaussianMixture clustering.
67+ This class performs expectation maximization for multivariate Gaussian
68+ Mixture Models (GMMs). A GMM represents a composite distribution of
69+ independent Gaussian distributions with associated "mixing" weights
70+ specifying the contribution of each to the composite.
71+
72+ Given a set of sample points, this class will maximize the log-likelihood
73+ for a mixture of k Gaussians, iterating until the log-likelihood changes by
74+ less than convergenceTol, or until it has reached the max number of iterations.
75+ While this process is generally guaranteed to converge, it is not guaranteed
76+ to find a global optimum.
77+
78+ Note: For high-dimensional data (with many features), this algorithm may perform poorly.
79+ This is due to high-dimensional data (a) making it difficult to cluster at all
80+ (based on statistical/theoretical arguments) and (b) causing numerical issues with
81+ Gaussian distributions.
6782
6883 >>> from pyspark.ml.linalg import Vectors
6984
@@ -118,8 +133,8 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
118133 .. versionadded:: 2.0.0
119134 """
120135
121- k = Param (Params ._dummy (), "k" , "number of clusters to create" ,
122- typeConverter = TypeConverters .toInt )
136+ k = Param (Params ._dummy (), "k" , "Number of independent Gaussians in the mixture model. " +
137+ "Must be > 1." , typeConverter = TypeConverters .toInt )
123138
124139 @keyword_only
125140 def __init__ (self , featuresCol = "features" , predictionCol = "prediction" , k = 2 ,
@@ -227,15 +242,15 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
227242 .. versionadded:: 1.5.0
228243 """
229244
230- k = Param (Params ._dummy (), "k" , "number of clusters to create" ,
245+ k = Param (Params ._dummy (), "k" , "The number of clusters to create. Must be > 1. " ,
231246 typeConverter = TypeConverters .toInt )
232247 initMode = Param (Params ._dummy (), "initMode" ,
233- "the initialization algorithm. This can be either \" random\" to " +
248+ "The initialization algorithm. This can be either \" random\" to " +
234249 "choose random points as initial cluster centers, or \" k-means||\" " +
235250 "to use a parallel variant of k-means++" ,
236251 typeConverter = TypeConverters .toString )
237- initSteps = Param (Params ._dummy (), "initSteps" , "steps for k-means initialization mode" ,
238- typeConverter = TypeConverters .toInt )
252+ initSteps = Param (Params ._dummy (), "initSteps" , "The number of steps for k-means|| " +
253+ "initialization mode. Must be > 0." , typeConverter = TypeConverters .toInt )
239254
240255 @keyword_only
241256 def __init__ (self , featuresCol = "features" , predictionCol = "prediction" , k = 2 ,
@@ -380,11 +395,11 @@ class BisectingKMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
380395 .. versionadded:: 2.0.0
381396 """
382397
383- k = Param (Params ._dummy (), "k" , "number of clusters to create " ,
398+ k = Param (Params ._dummy (), "k" , "The desired number of leaf clusters. Must be > 1. " ,
384399 typeConverter = TypeConverters .toInt )
385400 minDivisibleClusterSize = Param (Params ._dummy (), "minDivisibleClusterSize" ,
386- "the minimum number of points (if >= 1.0) " +
387- "or the minimum proportion " ,
401+ "The minimum number of points (if >= 1.0) or the minimum " +
402+ "proportion of points (if < 1.0) of a divisible cluster. " ,
388403 typeConverter = TypeConverters .toFloat )
389404
390405 @keyword_only
@@ -661,7 +676,7 @@ class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInter
661676 .. versionadded:: 2.0.0
662677 """
663678
664- k = Param (Params ._dummy (), "k" , "number of topics (clusters) to infer" ,
679+ k = Param (Params ._dummy (), "k" , "The number of topics (clusters) to infer. Must be > 1. " ,
665680 typeConverter = TypeConverters .toInt )
666681 optimizer = Param (Params ._dummy (), "optimizer" ,
667682 "Optimizer or inference algorithm used to estimate the LDA model. "
0 commit comments