diff --git a/neo4j/_async/work/result.py b/neo4j/_async/work/result.py
index ebca415be..a0bd7ec36 100644
--- a/neo4j/_async/work/result.py
+++ b/neo4j/_async/work/result.py
@@ -29,6 +29,10 @@
     ResultNotSingleError,
 )
 from ...meta import experimental
+from ...time import (
+    Date,
+    DateTime,
+)
 from ...work import ResultSummary
 from ..io import ConnectionErrorHandler
 
@@ -527,7 +531,7 @@ async def data(self, *keys):
 
     @experimental("pandas support is experimental and might be changed or "
                   "removed in future versions")
-    async def to_df(self, expand=False):
+    async def to_df(self, expand=False, parse_dates=False):
         r"""Convert (the rest of) the result to a pandas DataFrame.
 
         This method is only available if the `pandas` library is installed.
@@ -540,7 +544,7 @@ async def to_df(self, expand=False):
         for instance will return a DataFrame with two columns: ``n`` and ``m``
         and 10 rows.
 
-        :param expand: if :const:`True`, some structures in the result will be
+        :param expand: If :const:`True`, some structures in the result will be
             recursively expanded (flattened out into multiple columns) like so
             (everything inside ``<...>`` is a placeholder):
 
@@ -604,6 +608,11 @@ async def to_df(self, expand=False):
             :const:`dict` keys and variable names that contain ``.`` or ``\``
             will be escaped with a backslash (``\.`` and ``\\`` respectively).
         :type expand: bool
+        :param parse_dates:
+            If :const:`True`, columns that exclusively contain
+            :class:`time.DateTime` objects, :class:`time.Date` objects, or
+            :const:`None` will be converted to :class:`pandas.Timestamp`.
+        :type parse_dates: bool
 
         :rtype: :py:class:`pandas.DataFrame`
         :raises ImportError: if `pandas` library is not available.
@@ -618,7 +627,7 @@ async def to_df(self, expand=False):
         import pandas as pd
 
         if not expand:
-            return pd.DataFrame(await self.values(), columns=self._keys)
+            df = pd.DataFrame(await self.values(), columns=self._keys)
         else:
             df_keys = None
             rows = []
@@ -638,13 +647,29 @@ async def to_df(self, expand=False):
                     df_keys = False
                 rows.append(row)
             if df_keys is False:
-                return pd.DataFrame(rows)
+                df = pd.DataFrame(rows)
             else:
                 columns = df_keys or [
                     k.replace(".", "\\.").replace("\\", "\\\\")
                     for k in self._keys
                 ]
-                return pd.DataFrame(rows, columns=columns)
+                df = pd.DataFrame(rows, columns=columns)
+        if not parse_dates:
+            return df
+        dt_columns = df.columns[df.apply(
+            lambda col: pd.api.types.infer_dtype(col) == "mixed" and col.map(
+                lambda x: isinstance(x, (DateTime, Date, type(None)))
+            ).all()
+        )]
+        df[dt_columns] = df[dt_columns].apply(
+            lambda col: col.map(
+                lambda x:
+                    pd.Timestamp(x.iso_format())
+                    .replace(tzinfo=getattr(x, "tzinfo", None))
+                    if x else pd.NaT
+            )
+        )
+        return df
 
     def closed(self):
         """Return True if the result has been closed.
diff --git a/neo4j/_sync/work/result.py b/neo4j/_sync/work/result.py
index e2a4b4482..c13385524 100644
--- a/neo4j/_sync/work/result.py
+++ b/neo4j/_sync/work/result.py
@@ -29,6 +29,10 @@
     ResultNotSingleError,
 )
 from ...meta import experimental
+from ...time import (
+    Date,
+    DateTime,
+)
 from ...work import ResultSummary
 from ..io import ConnectionErrorHandler
 
@@ -527,7 +531,7 @@ def data(self, *keys):
 
     @experimental("pandas support is experimental and might be changed or "
                   "removed in future versions")
-    def to_df(self, expand=False):
+    def to_df(self, expand=False, parse_dates=False):
         r"""Convert (the rest of) the result to a pandas DataFrame.
 
         This method is only available if the `pandas` library is installed.
@@ -540,7 +544,7 @@ def to_df(self, expand=False):
         for instance will return a DataFrame with two columns: ``n`` and ``m``
         and 10 rows.
 
-        :param expand: if :const:`True`, some structures in the result will be
+        :param expand: If :const:`True`, some structures in the result will be
            recursively expanded (flattened out into multiple columns) like so
            (everything inside ``<...>`` is a placeholder):
 
@@ -604,6 +608,11 @@ def to_df(self, expand=False):
            :const:`dict` keys and variable names that contain ``.`` or ``\``
            will be escaped with a backslash (``\.`` and ``\\`` respectively).
        :type expand: bool
+        :param parse_dates:
+            If :const:`True`, columns that exclusively contain
+            :class:`time.DateTime` objects, :class:`time.Date` objects, or
+            :const:`None` will be converted to :class:`pandas.Timestamp`.
+        :type parse_dates: bool
 
        :rtype: :py:class:`pandas.DataFrame`
        :raises ImportError: if `pandas` library is not available.
@@ -618,7 +627,7 @@ def to_df(self, expand=False):
        import pandas as pd
 
        if not expand:
-            return pd.DataFrame(self.values(), columns=self._keys)
+            df = pd.DataFrame(self.values(), columns=self._keys)
        else:
            df_keys = None
            rows = []
@@ -638,13 +647,29 @@ def to_df(self, expand=False):
                    df_keys = False
                rows.append(row)
            if df_keys is False:
-                return pd.DataFrame(rows)
+                df = pd.DataFrame(rows)
            else:
                columns = df_keys or [
                    k.replace(".", "\\.").replace("\\", "\\\\")
                    for k in self._keys
                ]
-                return pd.DataFrame(rows, columns=columns)
+                df = pd.DataFrame(rows, columns=columns)
+        if not parse_dates:
+            return df
+        dt_columns = df.columns[df.apply(
+            lambda col: pd.api.types.infer_dtype(col) == "mixed" and col.map(
+                lambda x: isinstance(x, (DateTime, Date, type(None)))
+            ).all()
+        )]
+        df[dt_columns] = df[dt_columns].apply(
+            lambda col: col.map(
+                lambda x:
+                    pd.Timestamp(x.iso_format())
+                    .replace(tzinfo=getattr(x, "tzinfo", None))
+                    if x else pd.NaT
+            )
+        )
+        return df
 
    def closed(self):
        """Return True if the result has been closed.
diff --git a/setup.py b/setup.py
index f22a18420..fc6a1d4db 100644
--- a/setup.py
+++ b/setup.py
@@ -34,6 +34,9 @@
 install_requires = [
    "pytz",
 ]
+extras_require = {
+    "pandas": ["pandas>=1.0.0"],
+}
 classifiers = [
    "Intended Audience :: Developers",
    "License :: OSI Approved :: Apache Software License",
@@ -67,6 +70,7 @@
    "keywords": "neo4j graph database",
    "url": "https://github.com/neo4j/neo4j-python-driver",
    "install_requires": install_requires,
+    "extras_require": extras_require,
    "classifiers": classifiers,
    "packages": packages,
    "entry_points": entry_points,
diff --git a/tests/unit/async_/work/test_result.py b/tests/unit/async_/work/test_result.py
index efdc2395a..229cf7f7e 100644
--- a/tests/unit/async_/work/test_result.py
+++ b/tests/unit/async_/work/test_result.py
@@ -14,13 +14,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-
+from re import match
 from unittest import mock
 import warnings
 
 import pandas as pd
 import pytest
+import pytz
 
 from neo4j import (
     Address,
@@ -29,10 +29,12 @@
     ResultSummary,
     ServerInfo,
     SummaryCounters,
+    time as neo4j_time,
     Version,
 )
 from neo4j._async_compat.util import AsyncUtil
 from neo4j.data import (
+    DataDehydrator,
     DataHydrator,
     Node,
     Relationship,
@@ -844,6 +846,17 @@ async def test_to_df(keys, values, types, instances, test_default_expand):
             ],
             ["object", "object", "object", "object", "float64", "bool"],
         ),
+        (
+            ["dt"],
+            [
+                DataDehydrator().dehydrate((
+                    neo4j_time.DateTime(2022, 1, 2, 3, 4, 5, 6),
+                )),
+            ],
+            ["dt"],
+            [[neo4j_time.DateTime(2022, 1, 2, 3, 4, 5, 6)]],
+            ["object"],
+        ),
     )
 )
 @mark_async_test
@@ -873,3 +886,202 @@ async def test_to_df_expand(keys, values, expected_columns, expected_rows,
 
     expected_df = pd.DataFrame(expected_rows, columns=expected_columns)
     assert df.equals(expected_df)
+
+
+@pytest.mark.parametrize(
+    ("keys", "values", "expected_df"),
+    (
+        # DateTime
+        (
+            ["dt"],
+            [
+                DataDehydrator().dehydrate((
+                    neo4j_time.DateTime(2022, 1, 2, 3, 4, 5, 6),
+                )),
+            ],
+            pd.DataFrame(
+                [[pd.Timestamp("2022-01-02 03:04:05.000000006")]],
+                columns=["dt"],
+            )
+        ),
+        # Date
+        (
+            ["d"],
+            [
+                DataDehydrator().dehydrate((
+                    neo4j_time.Date(2222, 2, 22),
+                )),
+            ],
+            pd.DataFrame(
+                [[pd.Timestamp("2222-02-22")]],
+                columns=["d"],
+            )
+        ),
+        # DateTime with timezone
+        (
+            ["dt_tz"],
+            [
+                DataDehydrator().dehydrate((
+                    pytz.timezone("Europe/Stockholm").localize(
+                        neo4j_time.DateTime(1970, 1, 1, 0, 0, 0, 0)
+                    ),
+                )),
+            ],
+            pd.DataFrame(
+                [[
+                    pytz.timezone("Europe/Stockholm").localize(
+                        pd.Timestamp("1970-01-01")
+                    )
+                ]],
+                columns=["dt_tz"],
+            )
+        ),
+        # DateTime, Date, DateTime with timezone, and None
+        (
+            ["mixed"],
+            [
+                [None],
+                DataDehydrator().dehydrate((
+                    neo4j_time.DateTime(2022, 1, 2, 3, 4, 5, 6),
+                )),
+                DataDehydrator().dehydrate((
+                    neo4j_time.Date(2222, 2, 22),
+                )),
+                DataDehydrator().dehydrate((
+                    pytz.timezone("Europe/Stockholm").localize(
+                        neo4j_time.DateTime(1970, 1, 1, 0, 0, 0, 0)
+                    ),
+                )),
+            ],
+            pd.DataFrame(
+                [
+                    [pd.NaT],
+                    [pd.Timestamp("2022-01-02 03:04:05.000000006")],
+                    [pd.Timestamp("2222-02-22")],
+                    [
+                        pytz.timezone("Europe/Stockholm").localize(
+                            pd.Timestamp("1970-01-01")
+                        )
+                    ],
+                ],
+                columns=["mixed"],
+            )
+        ),
+        # DateTime, Date, DateTime with timezone, and None in the middle
+        (
+            ["mixed"],
+            [
+                DataDehydrator().dehydrate((
+                    neo4j_time.DateTime(2022, 1, 2, 3, 4, 5, 6),
+                )),
+                DataDehydrator().dehydrate((
+                    neo4j_time.Date(2222, 2, 22),
+                )),
+                [None],
+                DataDehydrator().dehydrate((
+                    pytz.timezone("Europe/Stockholm").localize(
+                        neo4j_time.DateTime(1970, 1, 1, 0, 0, 0, 0)
+                    ),
+                )),
+            ],
+            pd.DataFrame(
+                [
+                    [pd.Timestamp("2022-01-02 03:04:05.000000006")],
+                    [pd.Timestamp("2222-02-22")],
+                    [pd.NaT],
+                    [
+                        pytz.timezone("Europe/Stockholm").localize(
+                            pd.Timestamp("1970-01-01")
+                        )
+                    ],
+                ],
+                columns=["mixed"],
+            )
+        ),
+        # DateTime, Date, DateTime with timezone, and None at the end
+        (
+            ["mixed"],
+            [
+                DataDehydrator().dehydrate((
+                    neo4j_time.DateTime(2022, 1, 2, 3, 4, 5, 6),
+                )),
+                DataDehydrator().dehydrate((
+                    neo4j_time.Date(2222, 2, 22),
+                )),
+                DataDehydrator().dehydrate((
+                    pytz.timezone("Europe/Stockholm").localize(
+                        neo4j_time.DateTime(1970, 1, 1, 0, 0, 0, 0)
+                    ),
+                )),
+                [None],
+            ],
+            pd.DataFrame(
+                [
+                    [pd.Timestamp("2022-01-02 03:04:05.000000006")],
+                    [pd.Timestamp("2222-02-22")],
+                    [
+                        pytz.timezone("Europe/Stockholm").localize(
+                            pd.Timestamp("1970-01-01")
+                        )
+                    ],
+                    [pd.NaT],
+                ],
+                columns=["mixed"],
+            )
+        ),
+        # Column with only None (should not be transformed to NaT)
+        (
+            ["all_none"],
+            [
+                [None],
+                [None],
+            ],
+            pd.DataFrame(
+                [[None], [None]],
+                columns=["all_none"],
+            )
+        ),
+        # Multiple columns
+        (
+            ["all_none", "mixed", "n"],
+            [
+                [
+                    None,
+                    None,
+                    1,
+                ],
+                [
+                    None,
+                    *DataDehydrator().dehydrate((
+                        neo4j_time.DateTime(2022, 1, 2, 3, 4, 5, 6),
+                    )),
+                    1.234,
+                ],
+            ],
+            pd.DataFrame(
+                [
+                    [
+                        None,
+                        pd.NaT,
+                        1.0,
+                    ],
+                    [
+                        None,
+                        pd.Timestamp("2022-01-02 03:04:05.000000006"),
+                        1.234
+                    ],
+                ],
+                columns=["all_none", "mixed", "n"],
+            )
+        ),
+    ),
+)
+@pytest.mark.parametrize("expand", [True, False])
+@mark_async_test
+async def test_to_df_parse_dates(keys, values, expected_df, expand):
+    connection = AsyncConnectionStub(records=Records(keys, values))
+    result = AsyncResult(connection, DataHydrator(), 1, noop, noop)
+    await result._run("CYPHER", {}, None, None, "r", None)
+    df = await result.to_df(expand=expand, parse_dates=True)
+
+    pd.testing.assert_frame_equal(df, expected_df)
diff --git a/tests/unit/sync/work/test_result.py b/tests/unit/sync/work/test_result.py
index b8ccb695d..b813c4812 100644
--- a/tests/unit/sync/work/test_result.py
+++ b/tests/unit/sync/work/test_result.py
@@ -14,13 +14,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-
+from re import match
 from unittest import mock
 import warnings
 
 import pandas as pd
 import pytest
+import pytz
 
 from neo4j import (
     Address,
@@ -29,10 +29,12 @@
     ResultSummary,
     ServerInfo,
     SummaryCounters,
+    time as neo4j_time,
     Version,
 )
 from neo4j._async_compat.util import Util
 from neo4j.data import (
+    DataDehydrator,
     DataHydrator,
     Node,
     Relationship,
@@ -844,6 +846,17 @@ def test_to_df(keys, values, types, instances, test_default_expand):
             ],
             ["object", "object", "object", "object", "float64", "bool"],
         ),
+        (
+            ["dt"],
+            [
+                DataDehydrator().dehydrate((
+                    neo4j_time.DateTime(2022, 1, 2, 3, 4, 5, 6),
+                )),
+            ],
+            ["dt"],
+            [[neo4j_time.DateTime(2022, 1, 2, 3, 4, 5, 6)]],
+            ["object"],
+        ),
     )
 )
 @mark_sync_test
@@ -873,3 +886,202 @@ def test_to_df_expand(keys, values, expected_columns, expected_rows,
 
     expected_df = pd.DataFrame(expected_rows, columns=expected_columns)
     assert df.equals(expected_df)
+
+
+@pytest.mark.parametrize(
+    ("keys", "values", "expected_df"),
+    (
+        # DateTime
+        (
+            ["dt"],
+            [
+                DataDehydrator().dehydrate((
+                    neo4j_time.DateTime(2022, 1, 2, 3, 4, 5, 6),
+                )),
+            ],
+            pd.DataFrame(
+                [[pd.Timestamp("2022-01-02 03:04:05.000000006")]],
+                columns=["dt"],
+            )
+        ),
+        # Date
+        (
+            ["d"],
+            [
+                DataDehydrator().dehydrate((
+                    neo4j_time.Date(2222, 2, 22),
+                )),
+            ],
+            pd.DataFrame(
+                [[pd.Timestamp("2222-02-22")]],
+                columns=["d"],
+            )
+        ),
+        # DateTime with timezone
+        (
+            ["dt_tz"],
+            [
+                DataDehydrator().dehydrate((
+                    pytz.timezone("Europe/Stockholm").localize(
+                        neo4j_time.DateTime(1970, 1, 1, 0, 0, 0, 0)
+                    ),
+                )),
+            ],
+            pd.DataFrame(
+                [[
+                    pytz.timezone("Europe/Stockholm").localize(
+                        pd.Timestamp("1970-01-01")
+                    )
+                ]],
+                columns=["dt_tz"],
+            )
+        ),
+        # DateTime, Date, DateTime with timezone, and None
+        (
+            ["mixed"],
+            [
+                [None],
+                DataDehydrator().dehydrate((
+                    neo4j_time.DateTime(2022, 1, 2, 3, 4, 5, 6),
+                )),
+                DataDehydrator().dehydrate((
+                    neo4j_time.Date(2222, 2, 22),
+                )),
+                DataDehydrator().dehydrate((
+                    pytz.timezone("Europe/Stockholm").localize(
+                        neo4j_time.DateTime(1970, 1, 1, 0, 0, 0, 0)
+                    ),
+                )),
+            ],
+            pd.DataFrame(
+                [
+                    [pd.NaT],
+                    [pd.Timestamp("2022-01-02 03:04:05.000000006")],
+                    [pd.Timestamp("2222-02-22")],
+                    [
+                        pytz.timezone("Europe/Stockholm").localize(
+                            pd.Timestamp("1970-01-01")
+                        )
+                    ],
+                ],
+                columns=["mixed"],
+            )
+        ),
+        # DateTime, Date, DateTime with timezone, and None in the middle
+        (
+            ["mixed"],
+            [
+                DataDehydrator().dehydrate((
+                    neo4j_time.DateTime(2022, 1, 2, 3, 4, 5, 6),
+                )),
+                DataDehydrator().dehydrate((
+                    neo4j_time.Date(2222, 2, 22),
+                )),
+                [None],
+                DataDehydrator().dehydrate((
+                    pytz.timezone("Europe/Stockholm").localize(
+                        neo4j_time.DateTime(1970, 1, 1, 0, 0, 0, 0)
+                    ),
+                )),
+            ],
+            pd.DataFrame(
+                [
+                    [pd.Timestamp("2022-01-02 03:04:05.000000006")],
+                    [pd.Timestamp("2222-02-22")],
+                    [pd.NaT],
+                    [
+                        pytz.timezone("Europe/Stockholm").localize(
+                            pd.Timestamp("1970-01-01")
+                        )
+                    ],
+                ],
+                columns=["mixed"],
+            )
+        ),
+        # DateTime, Date, DateTime with timezone, and None at the end
+        (
+            ["mixed"],
+            [
+                DataDehydrator().dehydrate((
+                    neo4j_time.DateTime(2022, 1, 2, 3, 4, 5, 6),
+                )),
+                DataDehydrator().dehydrate((
+                    neo4j_time.Date(2222, 2, 22),
+                )),
+                DataDehydrator().dehydrate((
+                    pytz.timezone("Europe/Stockholm").localize(
+                        neo4j_time.DateTime(1970, 1, 1, 0, 0, 0, 0)
+                    ),
+                )),
+                [None],
+            ],
+            pd.DataFrame(
+                [
+                    [pd.Timestamp("2022-01-02 03:04:05.000000006")],
+                    [pd.Timestamp("2222-02-22")],
+                    [
+                        pytz.timezone("Europe/Stockholm").localize(
+                            pd.Timestamp("1970-01-01")
+                        )
+                    ],
+                    [pd.NaT],
+                ],
+                columns=["mixed"],
+            )
+        ),
+        # Column with only None (should not be transformed to NaT)
+        (
+            ["all_none"],
+            [
+                [None],
+                [None],
+            ],
+            pd.DataFrame(
+                [[None], [None]],
+                columns=["all_none"],
+            )
+        ),
+        # Multiple columns
+        (
+            ["all_none", "mixed", "n"],
+            [
+                [
+                    None,
+                    None,
+                    1,
+                ],
+                [
+                    None,
+                    *DataDehydrator().dehydrate((
+                        neo4j_time.DateTime(2022, 1, 2, 3, 4, 5, 6),
+                    )),
+                    1.234,
+                ],
+            ],
+            pd.DataFrame(
+                [
+                    [
+                        None,
+                        pd.NaT,
+                        1.0,
+                    ],
+                    [
+                        None,
+                        pd.Timestamp("2022-01-02 03:04:05.000000006"),
+                        1.234
+                    ],
+                ],
+                columns=["all_none", "mixed", "n"],
+            )
+        ),
+    ),
+)
+@pytest.mark.parametrize("expand", [True, False])
+@mark_sync_test
+def test_to_df_parse_dates(keys, values, expected_df, expand):
+    connection = ConnectionStub(records=Records(keys, values))
+    result = Result(connection, DataHydrator(), 1, noop, noop)
+    result._run("CYPHER", {}, None, None, "r", None)
+    df = result.to_df(expand=expand, parse_dates=True)
+
+    pd.testing.assert_frame_equal(df, expected_df)
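
For reviewers, a minimal usage sketch of the new flag follows (not part of the patch). It assumes the driver is installed together with pandas, for example via the "pandas" extra added in setup.py above (`pip install neo4j[pandas]` once released); the URI, credentials, and query are placeholders.

    import neo4j

    # Placeholder connection details -- adjust for your own deployment.
    driver = neo4j.GraphDatabase.driver(
        "neo4j://localhost:7687", auth=("neo4j", "password")
    )

    with driver.session() as session:
        result = session.run(
            "UNWIND range(1, 3) AS i "
            "RETURN i, date('2022-01-02') + duration({days: i}) AS d"
        )
        # Without parse_dates, "d" is an object column of neo4j.time.Date
        # values; with parse_dates=True it is converted to pandas.Timestamp.
        df = result.to_df(parse_dates=True)

    print(df.dtypes)
    driver.close()

Note that only columns holding nothing but DateTime, Date, or None values are converted; a column that is entirely None is left untouched rather than turned into NaT, as covered by the `all_none` test case above.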