Skip to content

Commit 2444616

Browse files
devin-petersohn and kunalgosar
authored and committed
Updating implementation (ray-project#2)
1 parent fd9cd40 commit 2444616

File tree

3 files changed

+156
-50
lines changed

3 files changed

+156
-50
lines changed

python/ray/dataframe/dataframe.py

Lines changed: 85 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -64,30 +64,32 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
6464

6565
pd_df = pd.DataFrame(data=data, index=index, columns=columns,
6666
dtype=dtype, copy=copy)
67-
row_partitions = \
67+
row_partitions, col_partitions = \
6868
_partition_pandas_dataframe(pd_df,
69-
npartitions=get_npartitions())
69+
num_partitions=get_npartitions())
7070
columns = pd_df.columns
7171
index = pd_df.index
7272

73-
if col_partitions is None:
73+
elif col_partitions is None:
7474
col_partitions = _rebuild_cols.remote(row_partitions,
7575
index,
7676
columns)
77-
if row_partitions is None:
77+
78+
elif row_partitions is None:
7879
row_partitions = _rebuild_rows.remote(col_partitions,
7980
index,
8081
columns)
8182

82-
self._col_partitions = col_partitions
83-
self._row_partitions = row_partitions
8483

8584
# this _index object is a pd.DataFrame
8685
# and we use that DataFrame's Index to index the rows.
8786
self._row_lengths, self._row_index = \
88-
_compute_length_and_index.remote(self._row_partitions)
87+
_compute_length_and_index.remote(row_partitions, index)
8988
self._col_lengths, self._col_index = \
90-
_compute_width_and_index.remote(self._col_partitions)
89+
_compute_width_and_index.remote(col_partitions, columns)
90+
91+
self._col_partitions = col_partitions
92+
self._row_partitions = row_partitions
9193

9294
# TODO: Remove testing print code comments
9395
# print("row partitions")
@@ -100,11 +102,12 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
100102
# print(self._row_lengths, "\n", self._row_index)
101103
# print("col items")
102104
# print(self._col_lengths, "\n", self._col_index)
103-
104-
if index is not None:
105-
self.index = index
106-
107-
self.columns = columns
105+
#
106+
# if index is not None:
107+
# self.index = index
108+
#
109+
# if columns is not None:
110+
# self.columns = columns
108111

109112
def __str__(self):
110113
return repr(self)
@@ -114,8 +117,68 @@ def __repr__(self):
114117
result = repr(to_pandas(self))
115118
return result
116119

117-
head = repr(to_pandas(self.head(20)))
118-
tail = repr(to_pandas(self.tail(20)))
120+
def head(df, n):
121+
"""Compute the head for this without creating a new DataFrame"""
122+
123+
sizes = df._row_lengths
124+
cumulative = np.cumsum(np.array(sizes))
125+
new_dfs = [df._row_partitions[i]
126+
for i in range(len(cumulative))
127+
if cumulative[i] < n]
128+
129+
last_index = len(new_dfs)
130+
131+
# this happens when we only need from the first partition
132+
if last_index == 0:
133+
num_to_transfer = n
134+
else:
135+
num_to_transfer = n - cumulative[last_index - 1]
136+
137+
new_dfs.append(_deploy_func.remote(lambda df: df.head(num_to_transfer),
138+
df._row_partitions[last_index]))
139+
140+
index = df._row_index.head(n).index
141+
pd_head = pd.concat(ray.get(new_dfs))
142+
pd_head.index = index
143+
pd_head.columns = df.columns
144+
return pd_head
145+
146+
def tail(df, n):
147+
"""Compute the tail for this without creating a new DataFrame"""
148+
149+
sizes = self._row_lengths
150+
151+
if n >= sum(sizes):
152+
return self
153+
154+
cumulative = np.cumsum(np.array(sizes[::-1]))
155+
156+
reverse_dfs = self._row_partitions[::-1]
157+
new_dfs = [reverse_dfs[i]
158+
for i in range(len(cumulative))
159+
if cumulative[i] < n]
160+
161+
last_index = len(new_dfs)
162+
163+
# this happens when we only need from the last partition
164+
if last_index == 0:
165+
num_to_transfer = n
166+
else:
167+
num_to_transfer = n - cumulative[last_index - 1]
168+
169+
new_dfs.append(_deploy_func.remote(lambda df: df.tail(num_to_transfer),
170+
reverse_dfs[last_index]))
171+
172+
new_dfs.reverse()
173+
174+
index = self._row_index.tail(n).index
175+
pd_tail = pd.concat(ray.get(new_dfs))
176+
pd_tail.index = index
177+
pd_tail.columns = df.columns
178+
return pd_tail
179+
180+
head = repr(head(self, 20))
181+
tail = repr(tail(self, 20))
119182

120183
result = head + "\n...\n" + tail
121184

@@ -444,6 +507,7 @@ def _update_inplace(self, row_partitions=None, col_partitions=None,
444507
# Perform rollback if length mismatch, since operations occur inplace here
445508
# Since we want to keep the existing DF in a usable state, we need to carefully catch
446509
# and handle this error
510+
# TODO Make this asynchronous
447511
if index is not None and len(index) != len(new_row_index):
448512
self._row_partitions = old_row_parts
449513
self._col_partitions = old_col_parts
@@ -586,6 +650,12 @@ def sum(self, axis=None, skipna=True, level=None, numeric_only=None):
586650
The sum of the DataFrame.
587651
"""
588652
# TODO: Fix this function - it does not work.
653+
return ray.get(
654+
_map_partitions(lambda df: df.sum(axis=axis,
655+
skipna=skipna,
656+
level=level,
657+
numeric_only=numeric_only),
658+
self._col_partitions))
589659

590660
# # TODO: We don't support `level` right now, so df.sum always returns a pd.Series
591661
# # Generalize when we support MultiIndexes, and distributed Series (?)

python/ray/dataframe/shuffle.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ def assign_and_send_data(self, index, partition_assignments,
6767
A value to be used with ray.get() to ensure proper scheduling
6868
order.
6969
"""
70+
if len(self.partition_data) == 0:
71+
return None
7072

7173
def calc_send(i, indices_to_send, data_to_send):
7274
"""Separates the data to send into a list based on assignments.

python/ray/dataframe/utils.py

Lines changed: 69 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,9 @@ def assign_partitions(index, num_partitions):
3232
raise TypeError("Unexpected value of type {0} assigned to ShuffleActor"
3333
.format(type(index).__name__))
3434

35-
if len(uniques) % num_partitions == 0:
36-
chunksize = int(len(uniques) / num_partitions)
37-
else:
38-
chunksize = int(len(uniques) / num_partitions) + 1
35+
chunksize = len(uniques) // num_partitions \
36+
if len(uniques) % num_partitions == 0 \
37+
else len(uniques) // num_partitions + 1
3938

4039
assignments = []
4140

@@ -81,55 +80,83 @@ def _get_widths(df):
8180
return 0
8281

8382

84-
def _partition_pandas_dataframe(df, npartitions=None, chunksize=None):
83+
def _partition_pandas_dataframe(df, num_partitions=None, row_chunksize=None,
84+
col_chunksize=None):
8585
"""Partitions a Pandas DataFrame object.
8686
Args:
8787
df (pandas.DataFrame): The pandas DataFrame to convert.
8888
npartitions (int): The number of partitions to split the DataFrame
8989
into. Has priority over chunksize.
90-
chunksize (int): The number of rows to put in each partition.
90+
row_chunksize (int): The number of rows to put in each partition.
9191
Returns:
9292
[ObjectID]: A list of object IDs corresponding to the dataframe
9393
partitions
9494
"""
95-
if npartitions is not None:
96-
chunksize = len(df) // npartitions + 1
97-
elif chunksize is None:
98-
raise ValueError("The number of partitions or chunksize must be set.")
95+
if num_partitions is not None:
96+
row_chunksize = len(df) // num_partitions \
97+
if len(df) % num_partitions == 0 \
98+
else len(df) // num_partitions + 1
99+
100+
col_chunksize = len(df.columns) // num_partitions \
101+
if len(df.columns) % num_partitions == 0 \
102+
else len(df.columns) // num_partitions + 1
103+
else:
104+
assert row_chunksize is not None and col_chunksize is not None
105+
106+
temp_df = df
107+
108+
row_partitions = []
109+
while len(temp_df) > row_chunksize:
110+
t_df = temp_df[:row_chunksize]
111+
# reset_index here because we want a pd.RangeIndex
112+
# within the partitions. It is smaller and sometimes faster.
113+
t_df.reset_index(drop=True, inplace=True)
114+
t_df.columns = pd.RangeIndex(0, len(t_df.columns))
115+
top = ray.put(t_df)
116+
row_partitions.append(top)
117+
temp_df = temp_df[row_chunksize:]
118+
else:
119+
temp_df.reset_index(drop=True, inplace=True)
120+
temp_df.columns = pd.RangeIndex(0, len(temp_df.columns))
121+
row_partitions.append(ray.put(temp_df))
99122

100123
temp_df = df
101124

102-
dataframes = []
103-
while len(temp_df) > chunksize:
104-
t_df = temp_df[:chunksize]
125+
col_partitions = []
126+
while len(temp_df.columns) > col_chunksize:
127+
t_df = temp_df.iloc[:, 0:col_chunksize]
105128
# reset_index here because we want a pd.RangeIndex
106129
# within the partitions. It is smaller and sometimes faster.
107-
t_df = t_df.reset_index(drop=True)
130+
t_df.reset_index(drop=True, inplace=True)
131+
t_df.columns = pd.RangeIndex(0, len(t_df.columns))
108132
top = ray.put(t_df)
109-
dataframes.append(top)
110-
temp_df = temp_df[chunksize:]
133+
col_partitions.append(top)
134+
temp_df = temp_df.iloc[:, col_chunksize:]
111135
else:
112-
temp_df = temp_df.reset_index(drop=True)
113-
dataframes.append(ray.put(temp_df))
136+
temp_df.reset_index(drop=True, inplace=True)
137+
temp_df.columns = pd.RangeIndex(0, len(temp_df.columns))
138+
col_partitions.append(ray.put(temp_df))
114139

115-
return dataframes
140+
return row_partitions, col_partitions
116141

117142

118-
def from_pandas(df, npartitions=None, chunksize=None):
143+
def from_pandas(df, num_partitions=None, chunksize=None):
119144
"""Converts a pandas DataFrame to a Ray DataFrame.
120145
Args:
121146
df (pandas.DataFrame): The pandas DataFrame to convert.
122-
npartitions (int): The number of partitions to split the DataFrame
147+
num_partitions (int): The number of partitions to split the DataFrame
123148
into. Has priority over chunksize.
124149
chunksize (int): The number of rows to put in each partition.
125150
Returns:
126151
A new Ray DataFrame object.
127152
"""
128153
from .dataframe import DataFrame
129154

130-
dataframes = _partition_pandas_dataframe(df, npartitions, chunksize)
155+
row_partitions, col_partitions = \
156+
_partition_pandas_dataframe(df, num_partitions, chunksize)
131157

132-
return DataFrame(row_partitions=dataframes,
158+
return DataFrame(row_partitions=row_partitions,
159+
col_partitions=col_partitions,
133160
columns=df.columns,
134161
index=df.index)
135162

@@ -158,10 +185,13 @@ def _rebuild_cols(row_partitions, index, columns):
158185
[ObjectID]: List of new column partitions.
159186
"""
160187
# NOTE: Reexamine if this is the correct number of columns solution
161-
n_cols = min(get_npartitions(), len(row_partitions), len(columns))
188+
n_cols = min(max(get_npartitions(), len(row_partitions)), len(columns))
162189
partition_assignments = assign_partitions.remote(columns, n_cols)
163-
shufflers = [ShuffleActor.remote(x, partition_axis=0, shuffle_axis=1)
164-
for x in row_partitions]
190+
shufflers = [ShuffleActor.remote(
191+
row_partitions[i] if i < len(row_partitions) else pd.DataFrame(),
192+
partition_axis=0,
193+
shuffle_axis=1)
194+
for i in range(n_cols)]
165195

166196
shufflers_done = \
167197
[shufflers[i].shuffle.remote(
@@ -176,11 +206,12 @@ def _rebuild_cols(row_partitions, index, columns):
176206

177207
# TODO: Determine if this is the right place to reset the index
178208
def fix_indexes(df):
179-
df.index = index
180-
df.columns = np.arange(len(df.columns))
209+
df.columns = pd.RangeIndex(0, len(df.columns))
210+
df.reset_index(drop=True, inplace=True)
181211
return df
182212

183-
return [shuffler.apply_func.remote(fix_indexes) for shuffler in shufflers[:n_cols]]
213+
return [shuffler.apply_func.remote(fix_indexes)
214+
for shuffler in shufflers[:n_cols]]
184215

185216

186217
@ray.remote
@@ -212,8 +243,9 @@ def _rebuild_rows(col_partitions, index, columns):
212243
# TODO: Determine if this is the right place to reset the index
213244
# TODO: Determine if this needs the same changes as above
214245
def fix_indexes(df):
215-
df.columns = columns
216-
return df.reset_index(drop=True)
246+
df.columns = pd.RangeIndex(0, len(df.columns))
247+
df.reset_index(drop=True, inplace=True)
248+
return df
217249

218250
return [shuffler.apply_func.remote(fix_indexes) for shuffler in shufflers[:n_rows]]
219251

@@ -265,7 +297,7 @@ def _map_partitions(func, partitions, *argslists):
265297

266298

267299
@ray.remote(num_return_vals=2)
268-
def _compute_length_and_index(dfs):
300+
def _compute_length_and_index(dfs, index):
269301
"""Create a default index, which is a RangeIndex
270302
Returns:
271303
The pd.RangeIndex object that represents this DataFrame.
@@ -278,11 +310,12 @@ def _compute_length_and_index(dfs):
278310

279311
idx_df_col_names = ("partition", "index_within_partition")
280312

281-
return lengths, pd.DataFrame(dest_indices, columns=idx_df_col_names)
313+
return lengths, pd.DataFrame(dest_indices, index=index,
314+
columns=idx_df_col_names)
282315

283316

284317
@ray.remote(num_return_vals=2)
285-
def _compute_width_and_index(dfs):
318+
def _compute_width_and_index(dfs, columns):
286319
"""Create a default index, which is a RangeIndex
287320
Returns:
288321
The pd.RangeIndex object that represents this DataFrame.
@@ -295,4 +328,5 @@ def _compute_width_and_index(dfs):
295328

296329
idx_df_col_names = ("partition", "index_within_partition")
297330

298-
return widths, pd.DataFrame(dest_indices, columns=idx_df_col_names)
331+
return widths, pd.DataFrame(dest_indices, index=columns,
332+
columns=idx_df_col_names)

0 commit comments

Comments (0)