@@ -67,24 +67,24 @@ private[sql] object Dataset {
6767 *
6868 * Operations available on Datasets are divided into transformations and actions. Transformations
6969 * are the ones that produce new Datasets, and actions are the ones that trigger computation and
70- * return results. Example transformations include map, filter, select, aggregate (groupBy).
70+ * return results. Example transformations include map, filter, select, and aggregate (`groupBy`).
7171 * Example actions include count, show, or writing data out to file systems.
7272 *
7373 * Datasets are "lazy", i.e. computations are only triggered when an action is invoked. Internally,
7474 * a Dataset represents a logical plan that describes the computation required to produce the data.
7575 * When an action is invoked, Spark's query optimizer optimizes the logical plan and generates a
76- * physical plan for efficient execution in a parallel or distributed manner. To explore the
76+ * physical plan for efficient execution in a parallel and distributed manner. To explore the
7777 * logical plan as well as optimized physical plan, use the `explain` function.
7878 *
7979 * To efficiently support domain-specific objects, an [[Encoder]] is required. The encoder maps
80- * the domain specific type T to Spark's internal type system. For example, given a class Person
81- * with two fields, name (string) and age (int), an encoder is used to tell Spark to generate code
82- * at runtime to serialize the Person object into a binary structure. This binary structure often
83- * has much lower memory footprint as well as are optimized for efficiency in data processing
80+ * the domain specific type `T` to Spark's internal type system. For example, given a class `Person`
81+ * with two fields, `name` (string) and `age` (int), an encoder is used to tell Spark to generate
82+ * code at runtime to serialize the `Person` object into a binary structure. This binary structure
83+ * often has a much lower memory footprint and is optimized for efficiency in data processing
8484 * (e.g. in a columnar format). To understand the internal binary representation for data, use the
8585 * `schema` function.
8686 *
87- * There are typically two ways to create a Dataset. The most common way to by pointing Spark
87+ * There are typically two ways to create a Dataset. The most common way is by pointing Spark
8888 * to some files on storage systems, using the `read` function available on a `SparkSession`.
8989 * {{{
9090 * val people = session.read.parquet("...").as[Person] // Scala
@@ -98,7 +98,7 @@ private[sql] object Dataset {
9898 * Dataset<String> names = people.map((Person p) -> p.name, Encoders.STRING) // in Java 8
9999 * }}}
100100 *
101- * Dataset operations can also be untyped, through the various domain-specific-language (DSL)
101+ * Dataset operations can also be untyped, through various domain-specific-language (DSL)
102102 * functions defined in: [[Dataset]] (this class), [[Column]], and [[functions]]. These operations
103103 * are very similar to the operations available in the data frame abstraction in R or Python.
104104 *
@@ -118,8 +118,8 @@ private[sql] object Dataset {
118118 * A more concrete example in Scala:
119119 * {{{
120120 * // To create Dataset[Row] using SparkSession
121- * val people = sqlContext.read.parquet("...")
122- * val department = sqlContext.read.parquet("...")
121+ * val people = session.read.parquet("...")
122+ * val department = session.read.parquet("...")
123123 *
124124 * people.filter("age > 30")
125125 * .join(department, people("deptId") === department("id"))
@@ -130,8 +130,8 @@ private[sql] object Dataset {
130130 * and in Java:
131131 * {{{
132132 * // To create Dataset<Row> using SparkSession
133- * Dataset<Row> people = sqlContext.read().parquet("...");
134- * Dataset<Row> department = sqlContext.read().parquet("...");
133+ * Dataset<Row> people = session.read().parquet("...");
134+ * Dataset<Row> department = session.read().parquet("...");
135135 *
136136 * people.filter("age".gt(30))
137137 * .join(department, people.col("deptId").equalTo(department("id")))
@@ -1106,7 +1106,7 @@ class Dataset[T] private[sql](
11061106 }
11071107
11081108 /**
1109- * Groups the [[Dataset]] using the specified columns, so we can run aggregation on them.
1109+ * Groups the [[Dataset]] using the specified columns, so that we can run aggregation on them.
11101110 * See [[RelationalGroupedDataset]] for all the available aggregate functions.
11111111 *
11121112 * This is a variant of groupBy that can only group by existing columns using column names
@@ -1341,7 +1341,7 @@ class Dataset[T] private[sql](
13411341 }
13421342
13431343 /**
1344- * Returns a new [[Dataset]] containing union of rows in this frame and another frame.
1344+ * Returns a new [[Dataset]] containing union of rows in this Dataset and another Dataset.
13451345 * This is equivalent to `UNION ALL` in SQL.
13461346 *
13471347 * To do a SQL-style set union (that does deduplication of elements), use this function followed
@@ -1357,7 +1357,7 @@ class Dataset[T] private[sql](
13571357 }
13581358
13591359 /**
1360- * Returns a new [[Dataset]] containing union of rows in this frame and another frame.
1360+ * Returns a new [[Dataset]] containing union of rows in this Dataset and another Dataset.
13611361 * This is equivalent to `UNION ALL` in SQL.
13621362 *
13631363 * @group typedrel
@@ -1366,7 +1366,7 @@ class Dataset[T] private[sql](
13661366 def union(other: Dataset[T]): Dataset[T] = unionAll(other)
13671367
13681368 /**
1369- * Returns a new [[Dataset]] containing rows only in both this frame and another frame.
1369+ * Returns a new [[Dataset]] containing rows only in both this Dataset and another Dataset.
13701370 * This is equivalent to `INTERSECT` in SQL.
13711371 *
13721372 * Note that, equality checking is performed directly on the encoded representation of the data
@@ -1380,7 +1380,7 @@ class Dataset[T] private[sql](
13801380 }
13811381
13821382 /**
1383- * Returns a new [[Dataset]] containing rows in this frame but not in another frame.
1383+ * Returns a new [[Dataset]] containing rows in this Dataset but not in another Dataset.
13841384 * This is equivalent to `EXCEPT` in SQL.
13851385 *
13861386 * Note that, equality checking is performed directly on the encoded representation of the data
@@ -1394,9 +1394,12 @@ class Dataset[T] private[sql](
13941394 }
13951395
13961396 /**
1397- * Returns a new [[Dataset]] containing rows in this frame but not in another frame.
1397+ * Returns a new [[Dataset]] containing rows in this Dataset but not in another Dataset.
13981398 * This is equivalent to `EXCEPT` in SQL.
13991399 *
1400+ * Note that, equality checking is performed directly on the encoded representation of the data
1401+ * and thus is not affected by a custom `equals` function defined on `T`.
1402+ *
14001403 * @group typedrel
14011404 * @since 2.0.0
14021405 */
0 commit comments