[minor doc] Add exploratory data analysis warning for DataFrame.stat.freqItems API

rxin · jeanlyn · commit 2a37bfb3a2b6 · 2015-06-12T13:18:05.000+08:00
Author: Reynold Xin <rxin@databricks.com>. Closes apache#6569 from rxin/freqItemsWarning and squashes the following commits: 7eec145 [Reynold Xin] [minor doc] Add exploratory data analysis warning for DataFrame.stat.freqItems API.
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
@@ -1170,6 +1170,9 @@ def freqItems(self, cols, support=None):
         "http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou".
         :func:`DataFrame.freqItems` and :func:`DataFrameStatFunctions.freqItems` are aliases.
 
+        This function is meant for exploratory data analysis, as we make no guarantee about the
+        backward compatibility of the schema of the resulting DataFrame.
+
         :param cols: Names of the columns to calculate frequent items for as a list or tuple of
             strings.
         :param support: The frequency with which to consider an item 'frequent'. Default is 1%.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -97,6 +97,9 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
    * The `support` should be greater than 1e-4.
    *
+   * This function is meant for exploratory data analysis, as we make no guarantee about the
+   * backward compatibility of the schema of the resulting [[DataFrame]].
+   *
    * @param cols the names of the columns to search frequent items in.
    * @param support The minimum frequency for an item to be considered `frequent`. Should be greater
    *                than 1e-4.
@@ -114,6 +117,9 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
    * Uses a `default` support of 1%.
    *
+   * This function is meant for exploratory data analysis, as we make no guarantee about the
+   * backward compatibility of the schema of the resulting [[DataFrame]].
+   *
    * @param cols the names of the columns to search frequent items in.
    * @return A Local DataFrame with the Array of frequent items for each column.
    *
@@ -128,6 +134,9 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * frequent element count algorithm described in
    * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
    *
+   * This function is meant for exploratory data analysis, as we make no guarantee about the
+   * backward compatibility of the schema of the resulting [[DataFrame]].
+   *
    * @param cols the names of the columns to search frequent items in.
    * @return A Local DataFrame with the Array of frequent items for each column.
    *
@@ -143,6 +152,9 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
    * Uses a `default` support of 1%.
    *
+   * This function is meant for exploratory data analysis, as we make no guarantee about the
+   * backward compatibility of the schema of the resulting [[DataFrame]].
+   *
    * @param cols the names of the columns to search frequent items in.
    * @return A Local DataFrame with the Array of frequent items for each column.
    *

Original file line number	Diff line number	Diff line change
`@@ -97,6 +97,9 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {`
`97`	`97`	`* [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].`
`98`	`98`	* The `support` should be greater than 1e-4.
`99`	`99`	`*`
	`100`	`+ * This function is meant for exploratory data analysis, as we make no guarantee about the`
	`101`	`+ * backward compatibility of the schema of the resulting [[DataFrame]].`
	`102`	`+ *`
`100`	`103`	`* @param cols the names of the columns to search frequent items in.`
`101`	`104`	* @param support The minimum frequency for an item to be considered `frequent`. Should be greater
`102`	`105`	`* than 1e-4.`
`@@ -114,6 +117,9 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {`
`114`	`117`	`* [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].`
`115`	`118`	* Uses a `default` support of 1%.
`116`	`119`	`*`
	`120`	`+ * This function is meant for exploratory data analysis, as we make no guarantee about the`
	`121`	`+ * backward compatibility of the schema of the resulting [[DataFrame]].`
	`122`	`+ *`
`117`	`123`	`* @param cols the names of the columns to search frequent items in.`
`118`	`124`	`* @return A Local DataFrame with the Array of frequent items for each column.`
`119`	`125`	`*`
`@@ -128,6 +134,9 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {`
`128`	`134`	`* frequent element count algorithm described in`
`129`	`135`	`* [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].`
`130`	`136`	`*`
	`137`	`+ * This function is meant for exploratory data analysis, as we make no guarantee about the`
	`138`	`+ * backward compatibility of the schema of the resulting [[DataFrame]].`
	`139`	`+ *`
`131`	`140`	`* @param cols the names of the columns to search frequent items in.`
`132`	`141`	`* @return A Local DataFrame with the Array of frequent items for each column.`
`133`	`142`	`*`
`@@ -143,6 +152,9 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {`
`143`	`152`	`* [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].`
`144`	`153`	* Uses a `default` support of 1%.
`145`	`154`	`*`
	`155`	`+ * This function is meant for exploratory data analysis, as we make no guarantee about the`
	`156`	`+ * backward compatibility of the schema of the resulting [[DataFrame]].`
	`157`	`+ *`
`146`	`158`	`* @param cols the names of the columns to search frequent items in.`
`147`	`159`	`* @return A Local DataFrame with the Array of frequent items for each column.`
`148`	`160`	`*`