@@ -687,9 +687,10 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
    * @param minPartitions Minimum number of Hadoop Splits to generate.
    *
    * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each
-   * record, directly caching the returned RDD will create many references to the same object.
-   * If you plan to directly cache Hadoop writable objects, you should first copy them using
-   * a `map` function.
+   * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle
+   * operation will create many references to the same object.
+   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
+   * copy them using a `map` function.
    */
   def hadoopRDD[K, V](
       conf: JobConf,
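
The note that this patch expands across all of these methods describes one pitfall: the RecordReader hands back the same Writable instance for every record, so any operation that retains records (cache, sort, shuffle, aggregation) ends up holding many references to one mutating object. A minimal sketch of the recommended `map`-copy, using the old-API hadoopFile variant with an illustrative path and a local SparkContext (none of which come from the patch itself):

    import org.apache.hadoop.io.{LongWritable, Text}
    import org.apache.hadoop.mapred.TextInputFormat
    import org.apache.spark.{SparkConf, SparkContext}

    val sc = new SparkContext(new SparkConf().setAppName("writable-copy-sketch").setMaster("local[*]"))

    // The RecordReader re-uses one LongWritable and one Text object for every record it returns.
    val raw = sc.hadoopFile[LongWritable, Text, TextInputFormat]("hdfs:///tmp/input.txt")

    // Caching raw directly would store many references to that single, mutating Text object.
    // Copy each record into immutable values first, then cache (or sort/aggregate) safely.
    val copied = raw.map { case (offset, line) => (offset.get(), line.toString) }
    copied.cache()
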
@@ -705,12 +706,13 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   }

   /** Get an RDD for a Hadoop file with an arbitrary InputFormat
-   *
-   * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each
-   * record, directly caching the returned RDD will create many references to the same object.
-   * If you plan to directly cache Hadoop writable objects, you should first copy them using
-   * a `map` function.
-   * */
+   *
+   * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each
+   * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle
+   * operation will create many references to the same object.
+   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
+   * copy them using a `map` function.
+   */
   def hadoopFile[K, V](
       path: String,
       inputFormatClass: Class[_ <: InputFormat[K, V]],
@@ -741,9 +743,10 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
    * }}}
    *
    * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each
-   * record, directly caching the returned RDD will create many references to the same object.
-   * If you plan to directly cache Hadoop writable objects, you should first copy them using
-   * a `map` function.
+   * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle
+   * operation will create many references to the same object.
+   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
+   * copy them using a `map` function.
    */
   def hadoopFile[K, V, F <: InputFormat[K, V]]
       (path: String, minPartitions: Int)
@@ -764,9 +767,10 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
    * }}}
    *
    * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each
-   * record, directly caching the returned RDD will create many references to the same object.
-   * If you plan to directly cache Hadoop writable objects, you should first copy them using
-   * a `map` function.
+   * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle
+   * operation will create many references to the same object.
+   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
+   * copy them using a `map` function.
    */
   def hadoopFile[K, V, F <: InputFormat[K, V]](path: String)
       (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] =
@@ -788,9 +792,10 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
    * and extra configuration options to pass to the input format.
    *
    * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each
-   * record, directly caching the returned RDD will create many references to the same object.
-   * If you plan to directly cache Hadoop writable objects, you should first copy them using
-   * a `map` function.
+   * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle
+   * operation will create many references to the same object.
+   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
+   * copy them using a `map` function.
    */
   def newAPIHadoopFile[K, V, F <: NewInputFormat[K, V]](
       path: String,
@@ -810,9 +815,10 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
    * and extra configuration options to pass to the input format.
    *
    * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each
-   * record, directly caching the returned RDD will create many references to the same object.
-   * If you plan to directly cache Hadoop writable objects, you should first copy them using
-   * a `map` function.
+   * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle
+   * operation will create many references to the same object.
+   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
+   * copy them using a `map` function.
    */
   def newAPIHadoopRDD[K, V, F <: NewInputFormat[K, V]](
       conf: Configuration = hadoopConfiguration,
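
The same copy-before-shuffle advice applies to the new-API variants touched just above; a hedged counterpart using newAPIHadoopFile (again with a placeholder path and the sc from the earlier sketch), this time feeding the copied records into an aggregation rather than a cache:

    import org.apache.hadoop.io.{LongWritable, Text}
    import org.apache.hadoop.mapreduce.lib.input.{TextInputFormat => NewTextInputFormat}
    import org.apache.spark.SparkContext._  // pair-RDD operations such as reduceByKey (pre-1.3 style)

    // New-API input format; the returned Writables are still re-used per record.
    val newApi = sc.newAPIHadoopFile[LongWritable, Text, NewTextInputFormat]("hdfs:///tmp/new-api-input")

    // Copy into plain Strings before the shuffle so the aggregation does not see
    // many references to the one re-used Text object.
    val lineCounts = newApi.map { case (_, line) => (line.toString, 1L) }.reduceByKey(_ + _)
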
@@ -826,9 +832,10 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   /** Get an RDD for a Hadoop SequenceFile with given key and value types.
    *
    * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each
-   * record, directly caching the returned RDD will create many references to the same object.
-   * If you plan to directly cache Hadoop writable objects, you should first copy them using
-   * a `map` function.
+   * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle
+   * operation will create many references to the same object.
+   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
+   * copy them using a `map` function.
    */
   def sequenceFile[K, V](path: String,
       keyClass: Class[K],
@@ -843,9 +850,10 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   /** Get an RDD for a Hadoop SequenceFile with given key and value types.
    *
    * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each
-   * record, directly caching the returned RDD will create many references to the same object.
-   * If you plan to directly cache Hadoop writable objects, you should first copy them using
-   * a `map` function.
+   * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle
+   * operation will create many references to the same object.
+   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
+   * copy them using a `map` function.
    * */
   def sequenceFile[K, V](path: String, keyClass: Class[K], valueClass: Class[V]): RDD[(K, V)] = {
     assertNotStopped()
@@ -869,9 +877,10 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
    * allow it to figure out the Writable class to use in the subclass case.
    *
    * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each
-   * record, directly caching the returned RDD will create many references to the same object.
-   * If you plan to directly cache Hadoop writable objects, you should first copy them using
-   * a `map` function.
+   * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle
+   * operation will create many references to the same object.
+   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
+   * copy them using a `map` function.
    */
   def sequenceFile[K, V]
       (path: String, minPartitions: Int = defaultMinPartitions)
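
Finally, for the sequenceFile overloads above, a sketch of the same pattern with explicit key and value classes (the path and types are illustrative, and sc is assumed to be an existing SparkContext):

    import org.apache.hadoop.io.{IntWritable, Text}
    import org.apache.spark.SparkContext._  // pair-RDD operations such as reduceByKey (pre-1.3 style)

    // SequenceFile of (Text, IntWritable) pairs; copy out of the re-used Writables
    // before aggregating, exactly as the updated scaladoc advises.
    val totals = sc.sequenceFile("hdfs:///tmp/counts.seq", classOf[Text], classOf[IntWritable])
      .map { case (word, n) => (word.toString, n.get()) }
      .reduceByKey(_ + _)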