|
1 | 1 | # -*- coding: utf-8 -*- |
2 | 2 | # pylint: disable=E1101,E1103,W0232 |
3 | 3 |
|
| 4 | +import pytest |
4 | 5 | import sys |
5 | 6 | from datetime import datetime |
6 | 7 | from distutils.version import LooseVersion |
|
17 | 18 | import pandas.compat as compat |
18 | 19 | import pandas.util.testing as tm |
19 | 20 | from pandas import (Categorical, Index, Series, DataFrame, PeriodIndex, |
20 | | - Timestamp, CategoricalIndex, isnull) |
| 21 | + Timestamp, CategoricalIndex, DatetimeIndex, |
| 22 | + isnull, NaT) |
21 | 23 | from pandas.compat import range, lrange, u, PY3 |
22 | 24 | from pandas.core.config import option_context |
23 | 25 |
|
@@ -160,12 +162,6 @@ def f(): |
160 | 162 |
|
161 | 163 | self.assertRaises(ValueError, f) |
162 | 164 |
|
163 | | - def f(): |
164 | | - with tm.assert_produces_warning(FutureWarning): |
165 | | - Categorical([1, 2], [1, 2, np.nan, np.nan]) |
166 | | - |
167 | | - self.assertRaises(ValueError, f) |
168 | | - |
169 | 165 | # The default should be unordered |
170 | 166 | c1 = Categorical(["a", "b", "c", "a"]) |
171 | 167 | self.assertFalse(c1.ordered) |
@@ -222,29 +218,12 @@ def f(): |
222 | 218 | cat = pd.Categorical([np.nan, 1., 2., 3.]) |
223 | 219 | self.assertTrue(is_float_dtype(cat.categories)) |
224 | 220 |
|
225 | | - # Deprecating NaNs in categoires (GH #10748) |
226 | | - # preserve int as far as possible by converting to object if NaN is in |
227 | | - # categories |
228 | | - with tm.assert_produces_warning(FutureWarning): |
229 | | - cat = pd.Categorical([np.nan, 1, 2, 3], |
230 | | - categories=[np.nan, 1, 2, 3]) |
231 | | - self.assertTrue(is_object_dtype(cat.categories)) |
232 | | - |
233 | 221 | # This doesn't work -> this would probably need some kind of "remember |
234 | 222 | # the original type" feature to try to cast the array interface result |
235 | 223 | # to... |
236 | 224 |
|
237 | 225 | # vals = np.asarray(cat[cat.notnull()]) |
238 | 226 | # self.assertTrue(is_integer_dtype(vals)) |
239 | | - with tm.assert_produces_warning(FutureWarning): |
240 | | - cat = pd.Categorical([np.nan, "a", "b", "c"], |
241 | | - categories=[np.nan, "a", "b", "c"]) |
242 | | - self.assertTrue(is_object_dtype(cat.categories)) |
243 | | - # but don't do it for floats |
244 | | - with tm.assert_produces_warning(FutureWarning): |
245 | | - cat = pd.Categorical([np.nan, 1., 2., 3.], |
246 | | - categories=[np.nan, 1., 2., 3.]) |
247 | | - self.assertTrue(is_float_dtype(cat.categories)) |
248 | 227 |
|
249 | 228 | # corner cases |
250 | 229 | cat = pd.Categorical([1]) |
@@ -295,6 +274,22 @@ def f(): |
295 | 274 | c = Categorical(np.array([], dtype='int64'), # noqa |
296 | 275 | categories=[3, 2, 1], ordered=True) |
297 | 276 |
|
| 277 | + def test_constructor_with_null(self): |
| 278 | + |
| 279 | + # Null values (NaN, None, NaT) are not allowed in categories |
| 280 | + with pytest.raises(ValueError): |
| 281 | + pd.Categorical([np.nan, "a", "b", "c"], |
| 282 | + categories=[np.nan, "a", "b", "c"]) |
| 283 | + |
| 284 | + with pytest.raises(ValueError): |
| 285 | + pd.Categorical([None, "a", "b", "c"], |
| 286 | + categories=[None, "a", "b", "c"]) |
| 287 | + |
| 288 | + with pytest.raises(ValueError): |
| 289 | + pd.Categorical(DatetimeIndex(['nat', '20160101']), |
| 290 | + categories=[NaT, Timestamp('20160101')]) |
| 291 | + |
| 292 | + |
298 | 293 | def test_constructor_with_index(self): |
299 | 294 | ci = CategoricalIndex(list('aabbca'), categories=list('cab')) |
300 | 295 | tm.assert_categorical_equal(ci.values, Categorical(ci)) |
@@ -418,6 +413,12 @@ def f(): |
418 | 413 |
|
419 | 414 | self.assertRaises(ValueError, f) |
420 | 415 |
|
| 416 | + # NaN is not allowed in categories |
| 417 | + def f(): |
| 418 | + Categorical.from_codes([0, 1, 2], ["a", "b", np.nan]) |
| 419 | + |
| 420 | + self.assertRaises(ValueError, f) |
| 421 | + |
421 | 422 | # too negative |
422 | 423 | def f(): |
423 | 424 | Categorical.from_codes([-2, 1, 2], ["a", "b", "c"]) |
@@ -649,30 +650,6 @@ def test_describe(self): |
649 | 650 | name='categories')) |
650 | 651 | tm.assert_frame_equal(desc, expected) |
651 | 652 |
|
652 | | - # NA as a category |
653 | | - with tm.assert_produces_warning(FutureWarning): |
654 | | - cat = pd.Categorical(["a", "c", "c", np.nan], |
655 | | - categories=["b", "a", "c", np.nan]) |
656 | | - result = cat.describe() |
657 | | - |
658 | | - expected = DataFrame([[0, 0], [1, 0.25], [2, 0.5], [1, 0.25]], |
659 | | - columns=['counts', 'freqs'], |
660 | | - index=pd.CategoricalIndex(['b', 'a', 'c', np.nan], |
661 | | - name='categories')) |
662 | | - tm.assert_frame_equal(result, expected, check_categorical=False) |
663 | | - |
664 | | - # NA as an unused category |
665 | | - with tm.assert_produces_warning(FutureWarning): |
666 | | - cat = pd.Categorical(["a", "c", "c"], |
667 | | - categories=["b", "a", "c", np.nan]) |
668 | | - result = cat.describe() |
669 | | - |
670 | | - exp_idx = pd.CategoricalIndex( |
671 | | - ['b', 'a', 'c', np.nan], name='categories') |
672 | | - expected = DataFrame([[0, 0], [1, 1 / 3.], [2, 2 / 3.], [0, 0]], |
673 | | - columns=['counts', 'freqs'], index=exp_idx) |
674 | | - tm.assert_frame_equal(result, expected, check_categorical=False) |
675 | | - |
676 | 653 | def test_print(self): |
677 | 654 | expected = ["[a, b, b, a, a, c, c, c]", |
678 | 655 | "Categories (3, object): [a < b < c]"] |
@@ -1119,90 +1096,18 @@ def test_nan_handling(self): |
1119 | 1096 | self.assert_numpy_array_equal(c._codes, |
1120 | 1097 | np.array([0, -1, -1, 0], dtype=np.int8)) |
1121 | 1098 |
|
1122 | | - # If categories have nan included, the code should point to that |
1123 | | - # instead |
1124 | | - with tm.assert_produces_warning(FutureWarning): |
1125 | | - c = Categorical(["a", "b", np.nan, "a"], |
1126 | | - categories=["a", "b", np.nan]) |
1127 | | - self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) |
1128 | | - self.assert_numpy_array_equal(c._codes, |
1129 | | - np.array([0, 1, 2, 0], dtype=np.int8)) |
1130 | | - c[1] = np.nan |
1131 | | - self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) |
1132 | | - self.assert_numpy_array_equal(c._codes, |
1133 | | - np.array([0, 2, 2, 0], dtype=np.int8)) |
1134 | | - |
1135 | | - # Changing categories should also make the replaced category np.nan |
1136 | | - c = Categorical(["a", "b", "c", "a"]) |
1137 | | - with tm.assert_produces_warning(FutureWarning): |
1138 | | - c.categories = ["a", "b", np.nan] # noqa |
1139 | | - |
1140 | | - self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) |
1141 | | - self.assert_numpy_array_equal(c._codes, |
1142 | | - np.array([0, 1, 2, 0], dtype=np.int8)) |
1143 | | - |
1144 | 1099 | # Adding nan to categories should make assigned nan point to the |
1145 | 1100 | # category! |
1146 | 1101 | c = Categorical(["a", "b", np.nan, "a"]) |
1147 | 1102 | self.assert_index_equal(c.categories, Index(["a", "b"])) |
1148 | 1103 | self.assert_numpy_array_equal(c._codes, |
1149 | 1104 | np.array([0, 1, -1, 0], dtype=np.int8)) |
1150 | | - with tm.assert_produces_warning(FutureWarning): |
1151 | | - c.set_categories(["a", "b", np.nan], rename=True, inplace=True) |
1152 | | - |
1153 | | - self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) |
1154 | | - self.assert_numpy_array_equal(c._codes, |
1155 | | - np.array([0, 1, -1, 0], dtype=np.int8)) |
1156 | | - c[1] = np.nan |
1157 | | - self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) |
1158 | | - self.assert_numpy_array_equal(c._codes, |
1159 | | - np.array([0, 2, -1, 0], dtype=np.int8)) |
1160 | | - |
1161 | | - # Remove null categories (GH 10156) |
1162 | | - cases = [([1.0, 2.0, np.nan], [1.0, 2.0]), |
1163 | | - (['a', 'b', None], ['a', 'b']), |
1164 | | - ([pd.Timestamp('2012-05-01'), pd.NaT], |
1165 | | - [pd.Timestamp('2012-05-01')])] |
1166 | | - |
1167 | | - null_values = [np.nan, None, pd.NaT] |
1168 | | - |
1169 | | - for with_null, without in cases: |
1170 | | - with tm.assert_produces_warning(FutureWarning): |
1171 | | - base = Categorical([], with_null) |
1172 | | - expected = Categorical([], without) |
1173 | | - |
1174 | | - for nullval in null_values: |
1175 | | - result = base.remove_categories(nullval) |
1176 | | - self.assert_categorical_equal(result, expected) |
1177 | | - |
1178 | | - # Different null values are indistinguishable |
1179 | | - for i, j in [(0, 1), (0, 2), (1, 2)]: |
1180 | | - nulls = [null_values[i], null_values[j]] |
1181 | | - |
1182 | | - def f(): |
1183 | | - with tm.assert_produces_warning(FutureWarning): |
1184 | | - Categorical([], categories=nulls) |
1185 | | - |
1186 | | - self.assertRaises(ValueError, f) |
1187 | 1105 |
|
1188 | 1106 | def test_isnull(self): |
1189 | 1107 | exp = np.array([False, False, True]) |
1190 | 1108 | c = Categorical(["a", "b", np.nan]) |
1191 | 1109 | res = c.isnull() |
1192 | | - self.assert_numpy_array_equal(res, exp) |
1193 | 1110 |
|
1194 | | - with tm.assert_produces_warning(FutureWarning): |
1195 | | - c = Categorical(["a", "b", np.nan], categories=["a", "b", np.nan]) |
1196 | | - res = c.isnull() |
1197 | | - self.assert_numpy_array_equal(res, exp) |
1198 | | - |
1199 | | - # test both nan in categories and as -1 |
1200 | | - exp = np.array([True, False, True]) |
1201 | | - c = Categorical(["a", "b", np.nan]) |
1202 | | - with tm.assert_produces_warning(FutureWarning): |
1203 | | - c.set_categories(["a", "b", np.nan], rename=True, inplace=True) |
1204 | | - c[0] = np.nan |
1205 | | - res = c.isnull() |
1206 | 1111 | self.assert_numpy_array_equal(res, exp) |
1207 | 1112 |
|
1208 | 1113 | def test_codes_immutable(self): |
@@ -1487,45 +1392,10 @@ def test_slicing_directly(self): |
1487 | 1392 |
|
1488 | 1393 | def test_set_item_nan(self): |
1489 | 1394 | cat = pd.Categorical([1, 2, 3]) |
1490 | | - exp = pd.Categorical([1, np.nan, 3], categories=[1, 2, 3]) |
1491 | 1395 | cat[1] = np.nan |
1492 | | - tm.assert_categorical_equal(cat, exp) |
1493 | 1396 |
|
1494 | | - # if nan in categories, the proper code should be set! |
1495 | | - cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) |
1496 | | - with tm.assert_produces_warning(FutureWarning): |
1497 | | - cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True) |
1498 | | - cat[1] = np.nan |
1499 | | - exp = np.array([0, 3, 2, -1], dtype=np.int8) |
1500 | | - self.assert_numpy_array_equal(cat.codes, exp) |
1501 | | - |
1502 | | - cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) |
1503 | | - with tm.assert_produces_warning(FutureWarning): |
1504 | | - cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True) |
1505 | | - cat[1:3] = np.nan |
1506 | | - exp = np.array([0, 3, 3, -1], dtype=np.int8) |
1507 | | - self.assert_numpy_array_equal(cat.codes, exp) |
1508 | | - |
1509 | | - cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) |
1510 | | - with tm.assert_produces_warning(FutureWarning): |
1511 | | - cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True) |
1512 | | - cat[1:3] = [np.nan, 1] |
1513 | | - exp = np.array([0, 3, 0, -1], dtype=np.int8) |
1514 | | - self.assert_numpy_array_equal(cat.codes, exp) |
1515 | | - |
1516 | | - cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) |
1517 | | - with tm.assert_produces_warning(FutureWarning): |
1518 | | - cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True) |
1519 | | - cat[1:3] = [np.nan, np.nan] |
1520 | | - exp = np.array([0, 3, 3, -1], dtype=np.int8) |
1521 | | - self.assert_numpy_array_equal(cat.codes, exp) |
1522 | | - |
1523 | | - cat = pd.Categorical([1, 2, np.nan, 3], categories=[1, 2, 3]) |
1524 | | - with tm.assert_produces_warning(FutureWarning): |
1525 | | - cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True) |
1526 | | - cat[pd.isnull(cat)] = np.nan |
1527 | | - exp = np.array([0, 1, 3, 2], dtype=np.int8) |
1528 | | - self.assert_numpy_array_equal(cat.codes, exp) |
| 1397 | + exp = pd.Categorical([1, np.nan, 3], categories=[1, 2, 3]) |
| 1398 | + tm.assert_categorical_equal(cat, exp) |
1529 | 1399 |
|
1530 | 1400 | def test_shift(self): |
1531 | 1401 | # GH 9416 |
@@ -2026,33 +1896,12 @@ def test_sideeffects_free(self): |
2026 | 1896 |
|
2027 | 1897 | def test_nan_handling(self): |
2028 | 1898 |
|
2029 | | - # Nans are represented as -1 in labels |
| 1899 | + # NaNs are represented as -1 in codes |
2030 | 1900 | s = Series(Categorical(["a", "b", np.nan, "a"])) |
2031 | 1901 | self.assert_index_equal(s.cat.categories, Index(["a", "b"])) |
2032 | 1902 | self.assert_numpy_array_equal(s.values.codes, |
2033 | 1903 | np.array([0, 1, -1, 0], dtype=np.int8)) |
2034 | 1904 |
|
2035 | | - # If categories have nan included, the label should point to that |
2036 | | - # instead |
2037 | | - with tm.assert_produces_warning(FutureWarning): |
2038 | | - s2 = Series(Categorical(["a", "b", np.nan, "a"], |
2039 | | - categories=["a", "b", np.nan])) |
2040 | | - |
2041 | | - exp_cat = Index(["a", "b", np.nan]) |
2042 | | - self.assert_index_equal(s2.cat.categories, exp_cat) |
2043 | | - self.assert_numpy_array_equal(s2.values.codes, |
2044 | | - np.array([0, 1, 2, 0], dtype=np.int8)) |
2045 | | - |
2046 | | - # Changing categories should also make the replaced category np.nan |
2047 | | - s3 = Series(Categorical(["a", "b", "c", "a"])) |
2048 | | - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): |
2049 | | - s3.cat.categories = ["a", "b", np.nan] |
2050 | | - |
2051 | | - exp_cat = Index(["a", "b", np.nan]) |
2052 | | - self.assert_index_equal(s3.cat.categories, exp_cat) |
2053 | | - self.assert_numpy_array_equal(s3.values.codes, |
2054 | | - np.array([0, 1, 2, 0], dtype=np.int8)) |
2055 | | - |
2056 | 1905 | def test_cat_accessor(self): |
2057 | 1906 | s = Series(Categorical(["a", "b", np.nan, "a"])) |
2058 | 1907 | self.assert_index_equal(s.cat.categories, Index(["a", "b"])) |
|
0 commit comments