From eedda91913983e774fe32e11f10ffab92cf137bf Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan"
Date: Wed, 16 Dec 2020 03:58:39 -0500
Subject: [PATCH 001/478] BUG Fixes fetch_kddcup99 for return_X_y and as_frame
 (#19011)

---
 sklearn/datasets/_kddcup99.py         | 6 +++---
 sklearn/datasets/tests/test_common.py | 8 ++++++++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/sklearn/datasets/_kddcup99.py b/sklearn/datasets/_kddcup99.py
index e5c8bb2f298de..539b7ffaf862e 100644
--- a/sklearn/datasets/_kddcup99.py
+++ b/sklearn/datasets/_kddcup99.py
@@ -199,15 +199,15 @@ def fetch_kddcup99(*, subset=None, data_home=None, shuffle=False,
     with open(join(module_path, 'descr', 'kddcup99.rst')) as rst_file:
         fdescr = rst_file.read()
 
-    if return_X_y:
-        return data, target
-
     frame = None
     if as_frame:
         frame, data, target = _convert_data_dataframe(
            "fetch_kddcup99", data, target, feature_names, target_names
        )
 
+    if return_X_y:
+        return data, target
+
     return Bunch(
         data=data,
         target=target,
diff --git a/sklearn/datasets/tests/test_common.py b/sklearn/datasets/tests/test_common.py
index 073eb2023eedf..2a905b75e94eb 100644
--- a/sklearn/datasets/tests/test_common.py
+++ b/sklearn/datasets/tests/test_common.py
@@ -75,6 +75,14 @@ def check_as_frame(bunch, dataset_func,
     if expected_target_dtype is not None:
         assert np.all(frame_bunch.target.dtypes == expected_target_dtype)
 
+    # Test for return_X_y and as_frame=True
+    frame_X, frame_y = dataset_func(as_frame=True, return_X_y=True)
+    assert isinstance(frame_X, pd.DataFrame)
+    if frame_y.ndim > 1:
+        assert isinstance(frame_X, pd.DataFrame)
+    else:
+        assert isinstance(frame_y, pd.Series)
+
 
 def _skip_network_tests():
     return os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '1'

From 38b7155b11946ac2e97883424db8339ac95e3d93 Mon Sep 17 00:00:00 2001
From: Neal Fultz
Date: Wed, 16 Dec 2020 01:07:42 -0800
Subject: [PATCH 002/478] DOC fix citations for de Leeuw in IsotonicRegression
 (#18952)

Co-authored-by: Thomas J. Fan
---
 sklearn/isotonic.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py
index 35d0004aa4a73..b57ce23f8cc52 100644
--- a/sklearn/isotonic.py
+++ b/sklearn/isotonic.py
@@ -190,7 +190,7 @@ class IsotonicRegression(RegressorMixin, TransformerMixin, BaseEstimator):
 
     Notes
     -----
-    Ties are broken using the secondary method from Leeuw, 1977.
+    Ties are broken using the secondary method from de Leeuw, 1977.
References ---------- @@ -201,11 +201,11 @@ class IsotonicRegression(RegressorMixin, TransformerMixin, BaseEstimator): Isotone Optimization in R : Pool-Adjacent-Violators Algorithm (PAVA) and Active Set Methods - Leeuw, Hornik, Mair + de Leeuw, Hornik, Mair Journal of Statistical Software 2009 Correctness of Kruskal's algorithms for monotone regression with ties - Leeuw, Psychometrica, 1977 + de Leeuw, Psychometrica, 1977 Examples -------- From 6b605584a2232a68f18b2b68536457e1a2118ca3 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 16 Dec 2020 11:35:25 +0100 Subject: [PATCH 003/478] Trigger [cd build] to test #18782 From d304331b450344e4660550b15d8174b15fb616c7 Mon Sep 17 00:00:00 2001 From: RamyaNP <56212418+RamyaNP@users.noreply.github.com> Date: Wed, 16 Dec 2020 22:05:19 +0530 Subject: [PATCH 004/478] DOC fix multiclass AUC formulas in user guide (#18559) --- doc/modules/model_evaluation.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index e092137ab7982..0bc08f24bb19c 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1423,7 +1423,7 @@ uniformly: .. math:: - \frac{2}{c(c-1)}\sum_{j=1}^{c}\sum_{k > j}^c (\text{AUC}(j | k) + + \frac{1}{c(c-1)}\sum_{j=1}^{c}\sum_{k > j}^c (\text{AUC}(j | k) + \text{AUC}(k | j)) where :math:`c` is the number of classes and :math:`\text{AUC}(j | k)` is the @@ -1438,7 +1438,7 @@ prevalence: .. math:: - \frac{2}{c(c-1)}\sum_{j=1}^{c}\sum_{k > j}^c p(j \cup k)( + \frac{1}{c(c-1)}\sum_{j=1}^{c}\sum_{k > j}^c p(j \cup k)( \text{AUC}(j | k) + \text{AUC}(k | j)) where :math:`c` is the number of classes. This algorithm is used by setting From e1408d05c56a9cee22127b2edb9e5ecbd26852bc Mon Sep 17 00:00:00 2001 From: Brian Rice Date: Thu, 17 Dec 2020 02:32:24 -0600 Subject: [PATCH 005/478] MNT better error message in RidgeCV (#19020) --- sklearn/linear_model/_ridge.py | 2 +- sklearn/linear_model/tests/test_ridge.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 199a1cd760660..f3f1074312f60 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -1466,7 +1466,7 @@ def fit(self, X, y, sample_weight=None): if np.any(self.alphas <= 0): raise ValueError( - "alphas must be positive. Got {} containing some " + "alphas must be strictly positive. 
Got {} containing some " "negative or null value instead.".format(self.alphas)) X, y, X_offset, y_offset, X_scale = LinearModel._preprocess_data( diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index 7d52de903aee5..2da9a60fb301e 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -1126,13 +1126,13 @@ def test_ridgecv_negative_alphas(): # Negative integers ridge = RidgeCV(alphas=(-1, -10, -100)) assert_raises_regex(ValueError, - "alphas must be positive", + "alphas must be strictly positive", ridge.fit, X, y) # Negative floats ridge = RidgeCV(alphas=(-0.1, -1.0, -10.0)) assert_raises_regex(ValueError, - "alphas must be positive", + "alphas must be strictly positive", ridge.fit, X, y) From 3bca0412c10b89bb474bcf2f38442e2b1f36e6f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Carlos=20Alfaro=20Jim=C3=A9nez?= Date: Thu, 17 Dec 2020 11:41:01 +0100 Subject: [PATCH 006/478] CI Avoid Travis stuck on failure (#19018) --- .travis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 21fa7789495a7..4702fb63c497c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -88,9 +88,9 @@ jobs: - BUILD_WHEEL=true - CIBW_BUILD=cp39-manylinux_aarch64 -install: source build_tools/travis/install.sh -script: source build_tools/travis/script.sh -after_success: source build_tools/travis/after_success.sh +install: source build_tools/travis/install.sh || travis_terminate 1 +script: source build_tools/travis/script.sh || travis_terminate 1 +after_success: source build_tools/travis/after_success.sh || travis_terminate 1 notifications: webhooks: From e406860586036fae87269cf795497591d1c48827 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 17 Dec 2020 14:20:54 -0500 Subject: [PATCH 007/478] CI Only build wheels on main repo (#19026) --- .github/workflows/wheels.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 3d4244861d53d..ac1d495642049 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -22,6 +22,7 @@ jobs: check_build_trigger: name: Check build trigger runs-on: ubuntu-latest + if: github.repository == 'scikit-learn/scikit-learn' outputs: build: ${{ steps.check_build_trigger.outputs.build }} @@ -86,7 +87,7 @@ jobs: SKLEARN_SKIP_NETWORK_TESTS=1 SKLEARN_BUILD_PARALLEL=3 CIBW_BUILD: cp${{ matrix.python }}-${{ matrix.platform_id }} - CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: bash build_tools/github/repair_windows_wheels.sh {wheel} {dest_dir} ${{ matrix.bitness }} + CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: bash build_tools/github/repair_windows_wheels.sh {wheel} {dest_dir} ${{ matrix.bitness }} CIBW_BEFORE_TEST_WINDOWS: bash build_tools/github/build_minimal_windows_image.sh ${{ matrix.python }} ${{ matrix.bitness }} CIBW_TEST_REQUIRES: pytest pandas threadpoolctl CIBW_TEST_COMMAND: bash {project}/build_tools/github/test_wheels.sh From be4f8a509f1382a9bbd24194bcfd19c6563fcf31 Mon Sep 17 00:00:00 2001 From: Thomas A Caswell Date: Thu, 17 Dec 2020 17:06:10 -0500 Subject: [PATCH 008/478] DOC correct some typo in bug_triaging.rst (#19032) --- doc/developers/bug_triaging.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/developers/bug_triaging.rst b/doc/developers/bug_triaging.rst index 2cd87590ca511..ff96ec9f0faae 100644 --- a/doc/developers/bug_triaging.rst +++ b/doc/developers/bug_triaging.rst @@ -1,7 +1,7 @@ .. 
_bug_triaging: Bug triaging and issue curation -================================ +=============================== The `issue tracker `_ is important to the communication in the project: it helps @@ -10,7 +10,7 @@ priorities. For this reason, it is important to curate it, adding labels to issues and closing issues that are not necessary. Working on issues to improve them --------------------------------------- +--------------------------------- Improving issues increases their chances of being successfully resolved. Guidelines on submitting good issues can be found :ref:`here @@ -36,7 +36,7 @@ The following actions are typically useful: Online discussions may be harder than it seems at first glance, in particular given that a person new to open-source may have a very - different understanding of the process than a seasonned maintainer. + different understanding of the process than a seasoned maintainer. Overall, it is useful to stay positive and assume good will. `The following article @@ -44,14 +44,14 @@ The following actions are typically useful: explores how to lead online discussions in the context of open source. Working on PRs to help review ------------------------------- +----------------------------- Reviewing code is also encouraged. Contributors and users are welcome to participate to the review process following our :ref:`review guidelines `. Triaging operations for members of the core and triage teams -------------------------------------------------------------- +------------------------------------------------------------ In addition to the above, members of the core team and the triage team can do the following important tasks: @@ -91,7 +91,7 @@ See the github description for `roles in the organization should be closed. A typical workflow for triaging issues ----------------------------------------- +-------------------------------------- The following workflow [1]_ is a good way to approach issue triaging: From 105d37a51bbab43613a0d678d7bf0e3d728314e1 Mon Sep 17 00:00:00 2001 From: Amol Deshmukh <34318357+des137@users.noreply.github.com> Date: Fri, 18 Dec 2020 05:16:46 -0500 Subject: [PATCH 009/478] DOC Revises a sentence in the description of RFE (#19033) --- doc/modules/feature_selection.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/feature_selection.rst b/doc/modules/feature_selection.rst index 7a4993520c1d5..0b758bf72bc0c 100644 --- a/doc/modules/feature_selection.rst +++ b/doc/modules/feature_selection.rst @@ -119,12 +119,12 @@ Recursive feature elimination ============================= Given an external estimator that assigns weights to features (e.g., the -coefficients of a linear model), recursive feature elimination (:class:`RFE`) +coefficients of a linear model), the goal of recursive feature elimination (:class:`RFE`) is to select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features and the importance of each feature is obtained either through any specific attribute (such as ``coef_``, ``feature_importances_``) or callable. Then, the least important -features are pruned from current set of features.That procedure is recursively +features are pruned from current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached. 
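The feature_selection.rst paragraph patched above describes RFE's pruning loop in prose. A minimal sketch of that loop using scikit-learn's public RFE API follows; it is an illustration, not part of any patch above, and the toy dataset and the logistic-regression base estimator are arbitrary assumptions rather than anything specified by the patch.

# Illustration only (not part of the patches above): recursive feature
# elimination as described in the feature_selection.rst paragraph.
# The toy dataset and the logistic-regression estimator are arbitrary choices.
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Toy data: 10 features, only 3 of which are informative.
X, y = make_classification(n_samples=200, n_features=10, n_informative=3,
                           random_state=0)

# Train the external estimator, prune the least important feature, and
# repeat on the reduced feature set until 3 features remain.
selector = RFE(LogisticRegression(max_iter=1000), n_features_to_select=3,
               step=1)
selector.fit(X, y)

print(selector.support_)   # boolean mask of the selected features
print(selector.ranking_)   # 1 = selected; larger values were pruned earlier

After fitting, support_ gives the mask of retained features and ranking_ records how early each discarded feature was pruned, which is the recursive behaviour the documentation change describes.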
From 51bd34378f9c9c813c44778b9b03a6925ee6dc2c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 18 Dec 2020 11:17:22 +0100 Subject: [PATCH 010/478] MNT skip preprocessing.rst when pandas is not installed (#19016) --- doc/conftest.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/doc/conftest.py b/doc/conftest.py index c950303acf280..4496bb74152ac 100644 --- a/doc/conftest.py +++ b/doc/conftest.py @@ -3,8 +3,6 @@ from os.path import join import warnings -import numpy as np - from sklearn.utils import IS_PYPY from sklearn.utils._testing import SkipTest from sklearn.utils._testing import check_skip_network @@ -72,6 +70,13 @@ def setup_grid_search(): raise SkipTest("Skipping grid_search.rst, pandas not installed") +def setup_preprocessing(): + try: + import pandas # noqa + except ImportError: + raise SkipTest("Skipping preprocessing.rst, pandas not installed") + + def setup_unsupervised_learning(): try: import skimage # noqa @@ -105,5 +110,7 @@ def pytest_runtest_setup(item): setup_impute() elif fname.endswith('modules/grid_search.rst'): setup_grid_search() + elif fname.endswith('modules/preprocessing.rst'): + setup_preprocessing() elif fname.endswith('statistical_inference/unsupervised_learning.rst'): setup_unsupervised_learning() From 2218ec46227c92301ac6837c4a8ae9b8dc5d3960 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 18 Dec 2020 17:09:19 +0100 Subject: [PATCH 011/478] MNT change 0.25 to 1.0 and 0.26 to 1.1 in deprecation messages (#19005) Co-authored-by: Thomas J. Fan --- doc/developers/develop.rst | 4 +- doc/glossary.rst | 6 +- doc/modules/classes.rst | 4 +- doc/whats_new/v0.23.rst | 4 +- doc/whats_new/v0.24.rst | 41 +++++---- sklearn/base.py | 9 +- sklearn/calibration.py | 8 +- sklearn/cluster/_affinity_propagation.py | 19 ++-- sklearn/cluster/_bicluster.py | 6 +- sklearn/cluster/_kmeans.py | 26 +++--- sklearn/cluster/_spectral.py | 4 +- .../tests/test_affinity_propagation.py | 11 +-- sklearn/cluster/tests/test_bicluster.py | 4 +- sklearn/cluster/tests/test_k_means.py | 12 +-- sklearn/cluster/tests/test_spectral.py | 2 +- sklearn/covariance/_graph_lasso.py | 15 ++-- .../covariance/tests/test_graphical_lasso.py | 7 +- sklearn/cross_decomposition/_pls.py | 90 ++++++++++--------- sklearn/cross_decomposition/tests/test_pls.py | 10 +-- sklearn/decomposition/_dict_learning.py | 8 +- sklearn/decomposition/_kernel_pca.py | 4 +- sklearn/decomposition/_nmf.py | 2 +- .../decomposition/tests/test_dict_learning.py | 2 +- .../decomposition/tests/test_kernel_pca.py | 2 +- sklearn/decomposition/tests/test_nmf.py | 20 ++--- sklearn/ensemble/_forest.py | 16 ++-- sklearn/ensemble/_gb.py | 36 ++++---- .../ensemble/tests/test_gradient_boosting.py | 8 +- sklearn/exceptions.py | 4 +- sklearn/inspection/_partial_dependence.py | 10 +-- .../inspection/_plot/partial_dependence.py | 2 +- .../tests/test_partial_dependence.py | 4 +- sklearn/kernel_ridge.py | 4 +- sklearn/linear_model/_stochastic_gradient.py | 15 ++-- .../tests/test_passive_aggressive.py | 2 +- sklearn/linear_model/tests/test_sgd.py | 2 +- sklearn/manifold/_mds.py | 4 +- sklearn/manifold/_spectral_embedding.py | 4 +- sklearn/manifold/_t_sne.py | 24 ++--- sklearn/manifold/tests/test_mds.py | 4 +- .../manifold/tests/test_spectral_embedding.py | 2 +- sklearn/manifold/tests/test_t_sne.py | 2 +- sklearn/metrics/pairwise.py | 18 ++-- sklearn/metrics/tests/test_pairwise.py | 2 +- sklearn/model_selection/_search.py | 8 +- sklearn/model_selection/tests/test_search.py | 8 +- 
.../model_selection/tests/test_validation.py | 5 +- sklearn/multiclass.py | 26 +++--- sklearn/naive_bayes.py | 16 ++-- sklearn/neighbors/_base.py | 4 +- sklearn/neighbors/_regression.py | 4 +- sklearn/neighbors/tests/test_neighbors.py | 2 +- sklearn/pipeline.py | 4 +- sklearn/preprocessing/_data.py | 4 +- sklearn/preprocessing/tests/test_data.py | 4 +- sklearn/svm/_base.py | 4 +- sklearn/svm/_classes.py | 8 +- sklearn/svm/tests/test_svm.py | 4 +- sklearn/tests/test_base.py | 5 +- sklearn/tests/test_calibration.py | 2 +- sklearn/tests/test_docstring_parameters.py | 8 +- sklearn/tests/test_kernel_ridge.py | 2 +- sklearn/tests/test_multiclass.py | 16 ++-- sklearn/tests/test_naive_bayes.py | 8 +- sklearn/tree/_classes.py | 41 +++++---- sklearn/tree/_criterion.pyx | 2 +- sklearn/tree/_export.py | 6 +- sklearn/tree/tests/test_export.py | 6 +- sklearn/tree/tests/test_tree.py | 2 +- sklearn/utils/estimator_checks.py | 8 +- sklearn/utils/fixes.py | 6 +- sklearn/utils/metaestimators.py | 5 +- sklearn/utils/tests/test_fixes.py | 2 +- sklearn/utils/tests/test_validation.py | 12 +-- sklearn/utils/validation.py | 17 ++-- 75 files changed, 393 insertions(+), 339 deletions(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index 08ce24933dd8e..c68becf18f93c 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -228,8 +228,8 @@ to slice rows and columns. .. deprecated:: 0.24 - The _pairwise attribute is deprecated in 0.24. From 0.26 onward, - the `pairwise` estimator tag should be used instead. + The _pairwise attribute is deprecated in 0.24. From 1.1 (renaming of 0.26) + onward, the `pairwise` estimator tag should be used instead. Universal attributes ^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/glossary.rst b/doc/glossary.rst index cb4bb9e3fd3d6..30e647be1c0f4 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -389,9 +389,9 @@ General Concepts .. deprecated:: 0.24 - The _pairwise attribute is deprecated in 0.24. From 0.26 - onward, the `pairwise` estimator tag should be used - instead. + The _pairwise attribute is deprecated in 0.24. From 1.1 + (renaming of 0.26) onward, the `pairwise` estimator tag + should be used instead. For more detailed info, see :ref:`estimator_tags`. diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 2e9ab3884b1b5..84f8097cbbe9d 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1637,5 +1637,5 @@ Utilities from joblib: Recently deprecated =================== -To be removed in 0.25 ---------------------- +To be removed in 1.0 (renaming of 0.25) +--------------------------------------- diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index da9f2f01d29a2..598d9adc5cef4 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -166,8 +166,8 @@ In an effort to promote clear and non-ambiguous use of the library, most constructor and function parameters are now expected to be passed as keyword arguments (i.e. using the `param=value` syntax) instead of positional. To ease the transition, a `FutureWarning` is raised if a keyword-only parameter -is used as positional. In version 0.25, these parameters will be strictly -keyword-only, and a `TypeError` will be raised. +is used as positional. In version 1.0 (renaming of 0.25), these parameters +will be strictly keyword-only, and a `TypeError` will be raised. :issue:`15005` by `Joel Nothman`_, `Adrin Jalali`_, `Thomas Fan`_, and `Nicolas Hug`_. 
See `SLEP009 `_ diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index a5b0ec36d62aa..7197b74b94faa 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -104,8 +104,8 @@ Changelog initial cluster centroids. :pr:`17937` by :user:`g-walsh` - |API| :class:`cluster.MiniBatchKMeans` attributes, `counts_` and - `init_size_`, are deprecated and will be removed in 0.26. :pr:`17864` by - :user:`Jérémie du Boisberranger `. + `init_size_`, are deprecated and will be removed in 1.1 (renaming of 0.26). + :pr:`17864` by :user:`Jérémie du Boisberranger `. :mod:`sklearn.compose` ...................... @@ -128,7 +128,8 @@ Changelog - |API| Deprecates `cv_alphas_` in favor of `cv_results_['alphas']` and `grid_scores_` in favor of split scores in `cv_results_` in :class:`covariance.GraphicalLassoCV`. `cv_alphas_` and `grid_scores_` will be - removed in version 0.26. :pr:`16392` by `Thomas Fan`_. + removed in version 1.1 (renaming of 0.26). + :pr:`16392` by `Thomas Fan`_. :mod:`sklearn.cross_decomposition` .................................. @@ -149,7 +150,7 @@ Changelog - |API| For :class:`cross_decomposition.NMF`, the `init` value, when 'init=None' and n_components <= min(n_samples, n_features) will be changed from - `'nndsvd'` to `'nndsvda'` in 0.26. + `'nndsvd'` to `'nndsvda'` in 1.1 (renaming of 0.26). :pr:`18525` by :user:`Chiara Marmo `. - |API| The bounds of the `n_components` parameter is now restricted: @@ -159,20 +160,23 @@ Changelog and :class:`cross_decomposition.PLSCanonical`. - into `[1, n_features]` or :class:`cross_decomposition.PLSRegression`. - An error will be raised in 0.26. :pr:`17095` by `Nicolas Hug`_. + An error will be raised in 1.1 (renaming of 0.26). + :pr:`17095` by `Nicolas Hug`_. - |API| For :class:`cross_decomposition.PLSSVD`, :class:`cross_decomposition.CCA`, and :class:`cross_decomposition.PLSCanonical`, the `x_scores_` and `y_scores_` - attributes were deprecated and will be removed in 0.26. They can be - retrieved by calling `transform` on the training data. The `norm_y_weights` - attribute will also be removed. :pr:`17095` by `Nicolas Hug`_. + attributes were deprecated and will be removed in 1.1 (renaming of 0.26). + They can be retrieved by calling `transform` on the training data. + The `norm_y_weights` attribute will also be removed. + :pr:`17095` by `Nicolas Hug`_. - |API| For :class:`cross_decomposition.PLSRegression`, :class:`cross_decomposition.PLSCanonical`, :class:`cross_decomposition.CCA`, and :class:`cross_decomposition.PLSSVD`, the `x_mean_`, `y_mean_`, `x_std_`, and - `y_std_` attributes were deprecated and will be removed in 0.26. + `y_std_` attributes were deprecated and will be removed in 1.1 + (renaming of 0.26). :pr:`18768` by :user:`Maren Westermann `. - |Fix| :class:`decomposition.TruncatedSVD` becomes deterministic by using the @@ -240,8 +244,9 @@ Changelog - |Fix| Fix :class:`decomposition.SparseCoder` such that it follows scikit-learn API and support cloning. The attribute `components_` is - deprecated in 0.24 and will be removed in 0.26. This attribute was - redundant with the `dictionary` attribute and constructor parameter. + deprecated in 0.24 and will be removed in 1.1 (renaming of 0.26). + This attribute was redundant with the `dictionary` attribute and constructor + parameter. :pr:`17679` by :user:`Xavier Dupré `. 
- |Fix| :meth:`TruncatedSVD.fit_transform` consistently returns the same @@ -302,7 +307,8 @@ Changelog - |API| :class:`exceptions.ChangedBehaviorWarning` and :class:`exceptions.NonBLASDotWarning` are deprecated and will be removed in - v0.26, :pr:`17804` by `Adrin Jalali`_. + 1.1 (renaming of 0.26). + :pr:`17804` by `Adrin Jalali`_. :mod:`sklearn.feature_extraction` ................................. @@ -390,7 +396,8 @@ Changelog :user:`Roei Kahny `. - |API| Positional arguments are deprecated in - :meth:`inspection.PartialDependenceDisplay.plot` and will error in 0.26. + :meth:`inspection.PartialDependenceDisplay.plot` and will error in 1.1 + (renaming of 0.26). :pr:`18293` by `Thomas Fan`_. :mod:`sklearn.isotonic` @@ -458,8 +465,8 @@ Changelog - |Enhancement| Add `square_distances` parameter to :class:`manifold.TSNE`, which provides backward compatibility during deprecation of legacy squaring - behavior. Distances will be squared by default in 0.26, and this parameter - will be removed in 0.28. :pr:`17662` by + behavior. Distances will be squared by default in 1.1 (renaming of 0.26), + and this parameter will be removed in 1.3. :pr:`17662` by :user:`Joshua Newton `. - |Fix| :class:`manifold.MDS` now correctly sets its `_pairwise` attribute. @@ -645,8 +652,8 @@ Changelog - |API| The attributes ``coef_`` and ``intercept_`` are now deprecated in :class:`naive_bayes.MultinomialNB`, :class:`naive_bayes.ComplementNB`, :class:`naive_bayes.BernoulliNB` and :class:`naive_bayes.CategoricalNB`, - and will be removed in v0.26. :pr:`17427` by - :user:`Juan Carlos Alfaro Jiménez `. + and will be removed in v1.1 (renaming of 0.26). + :pr:`17427` by :user:`Juan Carlos Alfaro Jiménez `. :mod:`sklearn.neighbors` ........................ diff --git a/sklearn/base.py b/sklearn/base.py index 3d49ec4fe96f6..3626e931aa9cf 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -845,9 +845,12 @@ def _is_pairwise(estimator): if has_pairwise_attribute: if pairwise_attribute != pairwise_tag: - warnings.warn("_pairwise was deprecated in 0.24 and will be " - "removed in 0.26. Set the estimator tags of your " - "estimator instead", FutureWarning) + warnings.warn( + "_pairwise was deprecated in 0.24 and will be removed in 1.1 " + "(renaming of 0.26). Set the estimator tags of your estimator " + "instead", + FutureWarning + ) return pairwise_attribute # use pairwise tag when the attribute is not present diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 46faf680923f5..3c997c906497c 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -599,7 +599,7 @@ class _CalibratedClassifier: .. deprecated:: 0.24 `calibrators_` is deprecated from 0.24 and will be removed in - 0.26. Use `calibrators` instead. + 1.1 (renaming of 0.26). Use `calibrators` instead. """ def __init__(self, base_estimator, calibrators, *, classes, method='sigmoid'): @@ -608,11 +608,11 @@ def __init__(self, base_estimator, calibrators, *, classes, self.classes = classes self.method = method - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated( # type: ignore - "calibrators_ is deprecated in 0.24 and will be removed in 0.26. " - "Use calibrators instead." + "calibrators_ is deprecated in 0.24 and will be removed in 1.1" + "(renaming of 0.26). Use calibrators instead." 
) @property def calibrators_(self): diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 9937962095895..cb9230cd2382f 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -145,13 +145,14 @@ def affinity_propagation(S, *, preference=None, convergence_iter=15, else (np.array([0]), np.array([0] * n_samples))) if random_state == 'warn': - warnings.warn(("'random_state' has been introduced in 0.23. " - "It will be set to None starting from 0.25 which " - "means that results will differ at every function " - "call. Set 'random_state' to None to silence this " - "warning, or to 0 to keep the behavior of versions " - "<0.23."), - FutureWarning) + warnings.warn( + "'random_state' has been introduced in 0.23. It will be set to " + "None starting from 1.0 (renaming of 0.25) which means that " + "results will differ at every function call. Set 'random_state' " + "to None to silence this warning, or to 0 to keep the behavior of " + "versions <0.23.", + FutureWarning + ) random_state = 0 random_state = check_random_state(random_state) @@ -375,10 +376,10 @@ def __init__(self, *, damping=.5, max_iter=200, convergence_iter=15, self.affinity = affinity self.random_state = random_state - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): return self.affinity == "precomputed" diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index d80463f211aba..6d293206bddd8 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -118,7 +118,7 @@ def fit(self, X, y=None): """ if self.n_jobs != 'deprecated': warnings.warn("'n_jobs' was deprecated in version 0.23 and will be" - " removed in 0.25.", FutureWarning) + " removed in 1.0 (renaming of 0.25).", FutureWarning) X = self._validate_data(X, accept_sparse='csr', dtype=np.float64) self._check_parameters() @@ -240,7 +240,7 @@ class SpectralCoclustering(BaseSpectral): .. deprecated:: 0.23 ``n_jobs`` was deprecated in version 0.23 and will be removed in - 0.25. + 1.0 (renaming of 0.25). random_state : int, RandomState instance, default=None Used for randomizing the singular value decomposition and the k-means @@ -392,7 +392,7 @@ class SpectralBiclustering(BaseSpectral): .. deprecated:: 0.23 ``n_jobs`` was deprecated in version 0.23 and will be removed in - 0.25. + 1.0 (renaming of 0.25). random_state : int, RandomState instance, default=None Used for randomizing the singular value decomposition and the k-means diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index f23df27dc8ad5..d10dfba0d08b3 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -212,7 +212,7 @@ def k_means(X, n_clusters, *, sample_weight=None, init='k-means++', .. deprecated:: 0.23 'precompute_distances' was deprecated in version 0.23 and will be - removed in 0.25. It has no effect. + removed in 1.0 (renaming of 0.25). It has no effect. n_init : int, default=10 Number of time the k-means algorithm will be run with different @@ -254,7 +254,7 @@ def k_means(X, n_clusters, *, sample_weight=None, init='k-means++', .. deprecated:: 0.23 ``n_jobs`` was deprecated in version 0.23 and will be removed in - 0.25. + 1.0 (renaming of 0.25). 
algorithm : {"auto", "full", "elkan"}, default="auto" K-means algorithm to use. The classical EM-style algorithm is "full". @@ -657,7 +657,7 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): .. deprecated:: 0.23 'precompute_distances' was deprecated in version 0.22 and will be - removed in 0.25. It has no effect. + removed in 1.0 (renaming of 0.25). It has no effect. verbose : int, default=0 Verbosity mode. @@ -686,7 +686,7 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): .. deprecated:: 0.23 ``n_jobs`` was deprecated in version 0.23 and will be removed in - 0.25. + 1.0 (renaming of 0.25). algorithm : {"auto", "full", "elkan"}, default="auto" K-means algorithm to use. The classical EM-style algorithm is "full". @@ -784,13 +784,13 @@ def _check_params(self, X): # precompute_distances if self.precompute_distances != 'deprecated': warnings.warn("'precompute_distances' was deprecated in version " - "0.23 and will be removed in 0.25. It has no " - "effect", FutureWarning) + "0.23 and will be removed in 1.0 (renaming of 0.25)" + ". It has no effect", FutureWarning) # n_jobs if self.n_jobs != 'deprecated': warnings.warn("'n_jobs' was deprecated in version 0.23 and will be" - " removed in 0.25.", FutureWarning) + " removed in 1.0 (renaming of 0.25).", FutureWarning) self._n_threads = self.n_jobs else: self._n_threads = None @@ -1512,13 +1512,15 @@ class MiniBatchKMeans(KMeans): Weigth sum of each cluster. .. deprecated:: 0.24 - This attribute is deprecated in 0.24 and will be removed in 0.26. + This attribute is deprecated in 0.24 and will be removed in + 1.1 (renaming of 0.26). init_size_ : int The effective number of samples used for the initialization. .. deprecated:: 0.24 - This attribute is deprecated in 0.24 and will be removed in 0.26. + This attribute is deprecated in 0.24 and will be removed in + 1.1 (renaming of 0.26). 
See Also -------- @@ -1577,19 +1579,19 @@ def __init__(self, n_clusters=8, *, init='k-means++', max_iter=100, self.reassignment_ratio = reassignment_ratio @deprecated("The attribute 'counts_' is deprecated in 0.24" # type: ignore - " and will be removed in 0.26.") + " and will be removed in 1.1 (renaming of 0.26).") @property def counts_(self): return self._counts @deprecated("The attribute 'init_size_' is deprecated in " # type: ignore - "0.24 and will be removed in 0.26.") + "0.24 and will be removed in 1.1 (renaming of 0.26).") @property def init_size_(self): return self._init_size @deprecated("The attribute 'random_state_' is deprecated " # type: ignore - "in 0.24 and will be removed in 0.26.") + "in 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def random_state_(self): return getattr(self, "_random_state", None) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index f9a01dc2c00da..79a0b77954028 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -571,10 +571,10 @@ def _more_tags(self): return {'pairwise': self.affinity in ["precomputed", "precomputed_nearest_neighbors"]} - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): return self.affinity in ["precomputed", diff --git a/sklearn/cluster/tests/test_affinity_propagation.py b/sklearn/cluster/tests/test_affinity_propagation.py index f3e367ddf022f..446b0f43c74d9 100644 --- a/sklearn/cluster/tests/test_affinity_propagation.py +++ b/sklearn/cluster/tests/test_affinity_propagation.py @@ -204,16 +204,11 @@ def test_affinity_propagation_random_state(): assert np.mean((centers0 - centers76) ** 2) > 1 -# FIXME: to be removed in 0.25 +# FIXME: to be removed in 1.0 def test_affinity_propagation_random_state_warning(): # test that a warning is raised when random_state is not defined. X = np.array([[0, 0], [1, 1], [-2, -2]]) - match = ("'random_state' has been introduced in 0.23. " - "It will be set to None starting from 0.25 which " - "means that results will differ at every function " - "call. Set 'random_state' to None to silence this " - "warning, or to 0 to keep the behavior of versions " - "<0.23.") + match = "'random_state' has been introduced in 0.23." 
with pytest.warns(FutureWarning, match=match): AffinityPropagation().fit(X) @@ -246,7 +241,7 @@ def test_affinity_propagation_float32(): assert_array_equal(afp.labels_, expected) -# TODO: Remove in 0.26 +# TODO: Remove in 1.1 def test_affinity_propagation_pairwise_is_deprecated(): afp = AffinityPropagation(affinity='precomputed') msg = r"Attribute _pairwise was deprecated in version 0\.24" diff --git a/sklearn/cluster/tests/test_bicluster.py b/sklearn/cluster/tests/test_bicluster.py index 6e3e664c622a8..97ca3db0201b6 100644 --- a/sklearn/cluster/tests/test_bicluster.py +++ b/sklearn/cluster/tests/test_bicluster.py @@ -267,9 +267,9 @@ def test_n_features_in_(est): @pytest.mark.parametrize("klass", [SpectralBiclustering, SpectralCoclustering]) @pytest.mark.parametrize("n_jobs", [None, 1]) def test_n_jobs_deprecated(klass, n_jobs): - # FIXME: remove in 0.25 + # FIXME: remove in 1.0 depr_msg = ("'n_jobs' was deprecated in version 0.23 and will be removed " - "in 0.25.") + "in 1.0") S, _, _ = make_biclusters((30, 30), 3, noise=0.5, random_state=0) est = klass(random_state=0, n_jobs=n_jobs) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 063781bbd6532..341b00c5c137f 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -854,9 +854,9 @@ def test_result_of_kmeans_equal_in_diff_n_threads(): @pytest.mark.parametrize("precompute_distances", ["auto", False, True]) def test_precompute_distance_deprecated(precompute_distances): - # FIXME: remove in 0.25 + # FIXME: remove in 1.0 depr_msg = ("'precompute_distances' was deprecated in version 0.23 and " - "will be removed in 0.25.") + "will be removed in 1.0") X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) kmeans = KMeans(n_clusters=2, n_init=1, init='random', random_state=0, precompute_distances=precompute_distances) @@ -867,9 +867,9 @@ def test_precompute_distance_deprecated(precompute_distances): @pytest.mark.parametrize("n_jobs", [None, 1]) def test_n_jobs_deprecated(n_jobs): - # FIXME: remove in 0.25 + # FIXME: remove in 1.0 depr_msg = ("'n_jobs' was deprecated in version 0.23 and will be removed " - "in 0.25.") + "in 1.0") X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) kmeans = KMeans(n_clusters=2, n_init=1, init='random', random_state=0, n_jobs=n_jobs) @@ -881,9 +881,9 @@ def test_n_jobs_deprecated(n_jobs): @pytest.mark.parametrize("attr", ["counts_", "init_size_", "random_state_"]) def test_minibatch_kmeans_deprecated_attributes(attr): # check that we raise a deprecation warning when accessing `init_size_` - # FIXME: remove in 0.26 + # FIXME: remove in 1.1 depr_msg = (f"The attribute '{attr}' is deprecated in 0.24 and will be " - f"removed in 0.26.") + f"removed in 1.1") km = MiniBatchKMeans(n_clusters=2, n_init=1, init='random', random_state=0) km.fit(X) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 19b1496d2719e..2c0ac67016749 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -268,7 +268,7 @@ def test_verbose(assign_labels, capsys): assert re.search(r"Iteration [0-9]+, inertia", captured.out) -# TODO: Remove in 0.26 +# TODO: Remove in 1.1 @pytest.mark.parametrize("affinity", ["precomputed", "precomputed_nearest_neighbors"]) def test_pairwise_is_deprecated(affinity): diff --git a/sklearn/covariance/_graph_lasso.py b/sklearn/covariance/_graph_lasso.py index c43b465def374..6dc88fb7908fb 100644 --- 
a/sklearn/covariance/_graph_lasso.py +++ b/sklearn/covariance/_graph_lasso.py @@ -607,14 +607,16 @@ class GraphicalLassoCV(GraphicalLasso): .. deprecated:: 0.24 The `cv_alphas_` attribute is deprecated in version 0.24 in favor - of `cv_results_['alphas']` and will be removed in version 0.26. + of `cv_results_['alphas']` and will be removed in version + 1.1 (renaming of 0.26). grid_scores_ : ndarray of shape (n_alphas, n_folds) Log-likelihood score on left-out data across folds. .. deprecated:: 0.24 The `grid_scores_` attribute is deprecated in version 0.24 in favor - of `cv_results_` and will be removed in version 0.26. + of `cv_results_` and will be removed in version + 1.1 (renaming of 0.26). cv_results_ : dict of ndarrays A dict with keys: @@ -828,11 +830,11 @@ def fit(self, X, y=None): verbose=inner_verbose, return_n_iter=True) return self - # TODO: Remove in 0.26 when grid_scores_ is deprecated + # TODO: Remove in 1.1 when grid_scores_ is deprecated # mypy error: Decorated property not supported @deprecated( # type: ignore "The grid_scores_ attribute is deprecated in version 0.24 in favor " - "of cv_results_ and will be removed in version 0.26" + "of cv_results_ and will be removed in version 1.1 (renaming of 0.26)." ) @property def grid_scores_(self): @@ -842,11 +844,12 @@ def grid_scores_(self): [self.cv_results_["split{}_score".format(i)] for i in range(n_alphas)]).T - # TODO: Remove in 0.26 when cv_alphas_ is deprecated + # TODO: Remove in 1.1 when cv_alphas_ is deprecated # mypy error: Decorated property not supported @deprecated( # type: ignore "The cv_alphas_ attribute is deprecated in version 0.24 in favor " - "of cv_results_['alpha'] and will be removed in version 0.26" + "of cv_results_['alpha'] and will be removed in version 1.1 " + "(renaming of 0.26)." 
) @property def cv_alphas_(self): diff --git a/sklearn/covariance/tests/test_graphical_lasso.py b/sklearn/covariance/tests/test_graphical_lasso.py index 2030056f34ab5..9bcce6673dd65 100644 --- a/sklearn/covariance/tests/test_graphical_lasso.py +++ b/sklearn/covariance/tests/test_graphical_lasso.py @@ -152,7 +152,7 @@ def test_graphical_lasso_cv(random_state=1): GraphicalLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1).fit(X) -# TODO: Remove in 0.26 when grid_scores_ is deprecated +# TODO: Remove in 1.1 when grid_scores_ is deprecated def test_graphical_lasso_cv_grid_scores_and_cv_alphas_deprecated(): splits = 4 n_alphas = 5 @@ -168,13 +168,14 @@ def test_graphical_lasso_cv_grid_scores_and_cv_alphas_deprecated(): total_alphas = n_refinements * n_alphas + 1 msg = (r"The grid_scores_ attribute is deprecated in version 0\.24 in " - r"favor of cv_results_ and will be removed in version 0\.26") + r"favor of cv_results_ and will be removed in version 1\.1 " + r"\(renaming of 0\.26\).") with pytest.warns(FutureWarning, match=msg): assert cov.grid_scores_.shape == (total_alphas, splits) msg = (r"The cv_alphas_ attribute is deprecated in version 0\.24 in " r"favor of cv_results_\['alpha'\] and will be removed in version " - r"0\.26") + r"1\.1 \(renaming of 0\.26\)") with pytest.warns(FutureWarning, match=msg): assert len(cov.cv_alphas_) == total_alphas diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index f35c049c37ae8..7c1dc303e361f 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -181,12 +181,13 @@ def fit(self, X, Y): # see Wegelin page 25 rank_upper_bound = p if not 1 <= n_components <= rank_upper_bound: - # TODO: raise an error in 0.26 + # TODO: raise an error in 1.1 warnings.warn( f"As of version 0.24, n_components({n_components}) should " f"be in [1, n_features]." f"n_components={rank_upper_bound} will be used instead. " - f"In version 0.26, an error will be raised.", + f"In version 1.1 (renaming of 0.26), an error will be " + f"raised.", FutureWarning ) n_components = rank_upper_bound @@ -195,13 +196,14 @@ def fit(self, X, Y): # X and the rank of Y: see Wegelin page 12 rank_upper_bound = min(n, p, q) if not 1 <= self.n_components <= rank_upper_bound: - # TODO: raise an error in 0.26 + # TODO: raise an error in 1.1 warnings.warn( f"As of version 0.24, n_components({n_components}) should " f"be in [1, min(n_features, n_samples, n_targets)] = " f"[1, {rank_upper_bound}]. " f"n_components={rank_upper_bound} will be used instead. 
" - f"In version 0.26, an error will be raised.", + f"In version 1.1 (renaming of 0.26), an error will be " + f"raised.", FutureWarning ) n_components = rank_upper_bound @@ -210,7 +212,7 @@ def fit(self, X, Y): raise ValueError("algorithm should be 'svd' or 'nipals', got " f"{self.algorithm}.") - self._norm_y_weights = (self.deflation_mode == 'canonical') # 0.26 + self._norm_y_weights = (self.deflation_mode == 'canonical') # 1.1 norm_y_weights = self._norm_y_weights # Scale (in place) @@ -406,60 +408,60 @@ def fit_transform(self, X, y=None): # mypy error: Decorated property not supported @deprecated( # type: ignore "Attribute norm_y_weights was deprecated in version 0.24 and " - "will be removed in 0.26.") + "will be removed in 1.1 (renaming of 0.26).") @property def norm_y_weights(self): return self._norm_y_weights @deprecated( # type: ignore "Attribute x_mean_ was deprecated in version 0.24 and " - "will be removed in 0.26.") + "will be removed in 1.1 (renaming of 0.26).") @property def x_mean_(self): return self._x_mean @deprecated( # type: ignore "Attribute y_mean_ was deprecated in version 0.24 and " - "will be removed in 0.26.") + "will be removed in 1.1 (renaming of 0.26).") @property def y_mean_(self): return self._y_mean @deprecated( # type: ignore "Attribute x_std_ was deprecated in version 0.24 and " - "will be removed in 0.26.") + "will be removed in 1.1 (renaming of 0.26).") @property def x_std_(self): return self._x_std @deprecated( # type: ignore "Attribute y_std_ was deprecated in version 0.24 and " - "will be removed in 0.26.") + "will be removed in 1.1 (renaming of 0.26).") @property def y_std_(self): return self._y_std @property def x_scores_(self): - # TODO: raise error in 0.26 instead + # TODO: raise error in 1.1 instead if not isinstance(self, PLSRegression): pass warnings.warn( "Attribute x_scores_ was deprecated in version 0.24 and " - "will be removed in 0.26. Use est.transform(X) on the " - "training data instead.", + "will be removed in 1.1 (renaming of 0.26). Use " + "est.transform(X) on the training data instead.", FutureWarning ) return self._x_scores @property def y_scores_(self): - # TODO: raise error in 0.26 instead + # TODO: raise error in 1.1 instead if not isinstance(self, PLSRegression): warnings.warn( "Attribute y_scores_ was deprecated in version 0.24 and " - "will be removed in 0.26. Use est.transform(X) on the " - "training data instead.", + "will be removed in 1.1 (renaming of 0.26). Use " + "est.transform(X) on the training data instead.", FutureWarning ) return self._y_scores @@ -625,15 +627,17 @@ class PLSCanonical(_PLS): The transformed training samples. .. deprecated:: 0.24 - `x_scores_` is deprecated in 0.24 and will be removed in 0.26. You - can just call `transform` on the training data instead. + `x_scores_` is deprecated in 0.24 and will be removed in 1.1 + (renaming of 0.26). You can just call `transform` on the training + data instead. y_scores_ : ndarray of shape (n_samples, n_components) The transformed training targets. .. deprecated:: 0.24 - `y_scores_` is deprecated in 0.24 and will be removed in 0.26. You - can just call `transform` on the training data instead. + `y_scores_` is deprecated in 0.24 and will be removed in 1.1 + (renaming of 0.26). You can just call `transform` on the training + data instead. x_rotations_ : ndarray of shape (n_features, n_components) The projection matrix used to transform `X`. @@ -735,15 +739,17 @@ class CCA(_PLS): The transformed training samples. .. 
deprecated:: 0.24 - `x_scores_` is deprecated in 0.24 and will be removed in 0.26. You - can just call `transform` on the training data instead. + `x_scores_` is deprecated in 0.24 and will be removed in 1.1 + (renaming of 0.26). You can just call `transform` on the training + data instead. y_scores_ : ndarray of shape (n_samples, n_components) The transformed training targets. .. deprecated:: 0.24 - `y_scores_` is deprecated in 0.24 and will be removed in 0.26. You - can just call `transform` on the training data instead. + `y_scores_` is deprecated in 0.24 and will be removed in 1.1 + (renaming of 0.26). You can just call `transform` on the training + data instead. x_rotations_ : ndarray of shape (n_features, n_components) The projection matrix used to transform `X`. @@ -824,15 +830,17 @@ class PLSSVD(TransformerMixin, BaseEstimator): The transformed training samples. .. deprecated:: 0.24 - `x_scores_` is deprecated in 0.24 and will be removed in 0.26. You - can just call `transform` on the training data instead. + `x_scores_` is deprecated in 0.24 and will be removed in 1.1 + (renaming of 0.26). You can just call `transform` on the training + data instead. y_scores_ : ndarray of shape (n_samples, n_components) The transformed training targets. .. deprecated:: 0.24 - `y_scores_` is deprecated in 0.24 and will be removed in 0.26. You - can just call `transform` on the training data instead. + `y_scores_` is deprecated in 0.24 and will be removed in 1.1 + (renaming of 0.26). You can just call `transform` on the training + data instead. Examples -------- @@ -886,13 +894,13 @@ def fit(self, X, Y): n_components = self.n_components rank_upper_bound = min(X.shape[0], X.shape[1], Y.shape[1]) if not 1 <= n_components <= rank_upper_bound: - # TODO: raise an error in 0.26 + # TODO: raise an error in 1.1 warnings.warn( f"As of version 0.24, n_components({n_components}) should be " f"in [1, min(n_features, n_samples, n_targets)] = " f"[1, {rank_upper_bound}]. " f"n_components={rank_upper_bound} will be used instead. " - f"In version 0.26, an error will be raised.", + f"In version 1.1 (renaming of 0.26), an error will be raised.", FutureWarning ) n_components = rank_upper_bound @@ -908,8 +916,8 @@ def fit(self, X, Y): U, Vt = svd_flip(U, Vt) V = Vt.T - self._x_scores = np.dot(X, U) # TODO: remove in 0.26 - self._y_scores = np.dot(Y, V) # TODO: remove in 0.26 + self._x_scores = np.dot(X, U) # TODO: remove in 1.1 + self._y_scores = np.dot(Y, V) # TODO: remove in 1.1 self.x_weights_ = U self.y_weights_ = V return self @@ -917,8 +925,9 @@ def fit(self, X, Y): # mypy error: Decorated property not supported @deprecated( # type: ignore "Attribute x_scores_ was deprecated in version 0.24 and " - "will be removed in 0.26. Use est.transform(X) on the " - "training data instead.") + "will be removed in 1.1 (renaming of 0.26). Use est.transform(X) on " + "the training data instead." + ) @property def x_scores_(self): return self._x_scores @@ -926,36 +935,37 @@ def x_scores_(self): # mypy error: Decorated property not supported @deprecated( # type: ignore "Attribute y_scores_ was deprecated in version 0.24 and " - "will be removed in 0.26. Use est.transform(X, Y) on the " - "training data instead.") + "will be removed in 1.1 (renaming of 0.26). Use est.transform(X, Y) " + "on the training data instead." 
+ ) @property def y_scores_(self): return self._y_scores @deprecated( # type: ignore "Attribute x_mean_ was deprecated in version 0.24 and " - "will be removed in 0.26.") + "will be removed in 1.1 (renaming of 0.26).") @property def x_mean_(self): return self._x_mean @deprecated( # type: ignore "Attribute y_mean_ was deprecated in version 0.24 and " - "will be removed in 0.26.") + "will be removed in 1.1 (renaming of 0.26).") @property def y_mean_(self): return self._y_mean @deprecated( # type: ignore "Attribute x_std_ was deprecated in version 0.24 and " - "will be removed in 0.26.") + "will be removed in 1.1 (renaming of 0.26).") @property def x_std_(self): return self._x_std @deprecated( # type: ignore "Attribute y_std_ was deprecated in version 0.24 and " - "will be removed in 0.26.") + "will be removed in 1.1 (renaming of 0.26).") @property def y_std_(self): return self._y_std diff --git a/sklearn/cross_decomposition/tests/test_pls.py b/sklearn/cross_decomposition/tests/test_pls.py index a36a95ed153cf..c01e790ca1644 100644 --- a/sklearn/cross_decomposition/tests/test_pls.py +++ b/sklearn/cross_decomposition/tests/test_pls.py @@ -315,7 +315,7 @@ def test_convergence_fail(): pls_nipals.fit(X, Y) -@pytest.mark.filterwarnings('ignore:.*scores_ was deprecated') # 0.26 +@pytest.mark.filterwarnings('ignore:.*scores_ was deprecated') # 1.1 @pytest.mark.parametrize('Est', (PLSSVD, PLSRegression, PLSCanonical)) def test_attibutes_shapes(Est): # Make sure attributes are of the correct shape depending on n_components @@ -439,7 +439,7 @@ def test_scale_and_stability(Est, X, Y): @pytest.mark.parametrize('n_components', (0, 4)) def test_n_components_bounds(Est, n_components): # n_components should be in [1, min(n_samples, n_features, n_targets)] - # TODO: catch error instead of warning in 0.26 + # TODO: catch error instead of warning in 1.1 rng = np.random.RandomState(0) X = rng.randn(10, 5) Y = rng.randn(10, 3) @@ -454,7 +454,7 @@ def test_n_components_bounds(Est, n_components): @pytest.mark.parametrize('n_components', (0, 6)) def test_n_components_bounds_pls_regression(n_components): # For PLSRegression, the upper bound for n_components is n_features - # TODO: catch error instead of warning in 0.26 + # TODO: catch error instead of warning in 1.1 rng = np.random.RandomState(0) X = rng.randn(10, 5) Y = rng.randn(10, 3) @@ -471,7 +471,7 @@ def test_scores_deprecations(Est): # Make sure x_scores_ and y_scores_ are deprecated. # It's not deprecated for PLSRegression because y_score_ is different from # transform(Y_train) - # TODO: remove attributes and test in 0.26 + # TODO: remove attributes and test in 1.1 rng = np.random.RandomState(0) X = rng.randn(10, 5) Y = rng.randn(10, 3) @@ -492,7 +492,7 @@ def test_norm_y_weights_deprecation(Est): est.norm_y_weights -# TODO: Remove test in 0.26 +# TODO: Remove test in 1.1 @pytest.mark.parametrize('Estimator', (PLSRegression, PLSCanonical, CCA, PLSSVD)) @pytest.mark.parametrize('attribute', diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 74a4d4f4d17a4..781f288b70351 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -1024,8 +1024,8 @@ class SparseCoder(_BaseSparseCoding, BaseEstimator): The unchanged dictionary atoms. .. deprecated:: 0.24 - This attribute is deprecated in 0.24 and will be removed in 0.26. - Use `dictionary` instead. + This attribute is deprecated in 0.24 and will be removed in + 1.1 (renaming of 0.26). Use `dictionary` instead. 
Examples -------- @@ -1089,8 +1089,8 @@ def fit(self, X, y=None): return self @deprecated("The attribute 'components_' is deprecated " # type: ignore - "in 0.24 and will be removed in 0.26. Use the " - "'dictionary' instead.") + "in 0.24 and will be removed in 1.1 (renaming of 0.26). Use " + "the 'dictionary' instead.") @property def components_(self): return self.dictionary diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index ff1d2d869834f..5655eddb0bf31 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -167,10 +167,10 @@ def __init__(self, n_components=None, *, kernel="linear", self.n_jobs = n_jobs self.copy_X = copy_X - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): return self.kernel == "precomputed" diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index d801b418d5a18..5d01060951ae1 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -312,7 +312,7 @@ def _initialize_nmf(X, n_components, init='warn', eps=1e-6, warnings.warn(("The 'init' value, when 'init=None' and " "n_components is less than n_samples and " "n_features, will be changed from 'nndsvd' to " - "'nndsvda' in 0.26."), FutureWarning) + "'nndsvda' in 1.1 (renaming of 0.26)."), FutureWarning) init = None check_non_negative(X, "NMF initialization") diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py index b1f67a187acd2..c9590f3136678 100644 --- a/sklearn/decomposition/tests/test_dict_learning.py +++ b/sklearn/decomposition/tests/test_dict_learning.py @@ -557,7 +557,7 @@ def test_sparse_coder_common_transformer(): check_transformers_unfitted(sc.__class__.__name__, sc) -# TODO: remove in 0.26 +# TODO: remove in 1.1 def test_sparse_coder_deprecation(): # check that we raise a deprecation warning when accessing `components_` rng = np.random.RandomState(777) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index 2b12314b3c980..2acccb0df6781 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -316,7 +316,7 @@ def test_32_64_decomposition_shape(): kpca.fit_transform(X.astype(np.float32)).shape) -# TODO: Remove in 0.26 +# TODO: Remove in 1.1 def test_kernel_pcc_pairwise_is_deprecated(): kp = KernelPCA(kernel='precomputed') msg = r"Attribute _pairwise was deprecated in version 0\.24" diff --git a/sklearn/decomposition/tests/test_nmf.py b/sklearn/decomposition/tests/test_nmf.py index ff6b4ed8b4245..88c1ba406ad99 100644 --- a/sklearn/decomposition/tests/test_nmf.py +++ b/sklearn/decomposition/tests/test_nmf.py @@ -42,7 +42,7 @@ def test_initialize_nn_output(): def test_parameter_checking(): A = np.ones((2, 2)) name = 'spam' - # FIXME : should be removed in 0.26 + # FIXME : should be removed in 1.1 init = 'nndsvda' msg = "Invalid solver parameter: got 'spam' instead of one of" assert_raise_message(ValueError, msg, NMF(solver=name, init=init).fit, A) @@ -179,7 +179,7 @@ def test_n_components_greater_n_features(): # Smoke test for the case of more components than features. 
rng = np.random.mtrand.RandomState(42) A = np.abs(rng.randn(30, 10)) - # FIXME : should be removed in 0.26 + # FIXME : should be removed in 1.1 init = 'random' NMF(n_components=15, random_state=0, tol=1e-2, init=init).fit(A) @@ -441,7 +441,7 @@ def test_nmf_regularization(): rng = np.random.mtrand.RandomState(42) X = np.abs(rng.randn(n_samples, n_features)) - # FIXME : should be removed in 0.26 + # FIXME : should be removed in 1.1 init = 'nndsvda' # L1 regularization should increase the number of zeros l1_ratio = 1. @@ -552,7 +552,7 @@ def test_nmf_dtype_match(dtype_in, dtype_out, solver, regularization): # Check that NMF preserves dtype (float32 and float64) X = np.random.RandomState(0).randn(20, 15).astype(dtype_in, copy=False) np.abs(X, out=X) - # FIXME : should be removed in 0.26 + # FIXME : should be removed in 1.1 init = 'nndsvda' nmf = NMF(solver=solver, regularization=regularization, init=init) @@ -568,7 +568,7 @@ def test_nmf_float32_float64_consistency(solver, regularization): # Check that the result of NMF is the same between float32 and float64 X = np.random.RandomState(0).randn(50, 7) np.abs(X, out=X) - # FIXME : should be removed in 0.26 + # FIXME : should be removed in 1.1 init = 'nndsvda' nmf32 = NMF(solver=solver, regularization=regularization, random_state=0, init=init) @@ -595,13 +595,13 @@ def test_nmf_custom_init_dtype_error(): non_negative_factorization(X, H=H, update_H=False) -# FIXME : should be removed in 0.26 +# FIXME : should be removed in 1.1 def test_init_default_deprecation(): # Test FutureWarning on init default - msg = ("The 'init' value, when 'init=None' and " - "n_components is less than n_samples and " - "n_features, will be changed from 'nndsvd' to " - "'nndsvda' in 0.26.") + msg = (r"The 'init' value, when 'init=None' and " + r"n_components is less than n_samples and " + r"n_features, will be changed from 'nndsvd' to " + r"'nndsvda' in 1.1 \(renaming of 0.26\).") rng = np.random.mtrand.RandomState(42) A = np.abs(rng.randn(6, 5)) with pytest.warns(FutureWarning, match=msg): diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 81fc319fdfadb..ff1e781f7e166 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -991,8 +991,8 @@ class RandomForestClassifier(ForestClassifier): ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it - will be removed in 0.25. Use ``min_impurity_decrease`` instead. - + will be removed in 1.0 (renaming of 0.25). + Use ``min_impurity_decrease`` instead. bootstrap : bool, default=True Whether bootstrap samples are used when building trees. If False, the @@ -1314,7 +1314,8 @@ class RandomForestRegressor(ForestRegressor): ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it - will be removed in 0.25. Use ``min_impurity_decrease`` instead. + will be removed in 1.0 (renaming of 0.25). + Use ``min_impurity_decrease`` instead. bootstrap : bool, default=True Whether bootstrap samples are used when building trees. If False, the @@ -1596,7 +1597,8 @@ class ExtraTreesClassifier(ForestClassifier): ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it - will be removed in 0.25. Use ``min_impurity_decrease`` instead. 
+ will be removed in 1.0 (renaming of 0.25). + Use ``min_impurity_decrease`` instead. bootstrap : bool, default=False Whether bootstrap samples are used when building trees. If False, the @@ -1914,7 +1916,8 @@ class ExtraTreesRegressor(ForestRegressor): ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it - will be removed in 0.25. Use ``min_impurity_decrease`` instead. + will be removed in 1.0 (renaming of 0.25). + Use ``min_impurity_decrease`` instead. bootstrap : bool, default=False Whether bootstrap samples are used when building trees. If False, the @@ -2173,7 +2176,8 @@ class RandomTreesEmbedding(BaseForest): ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it - will be removed in 0.25. Use ``min_impurity_decrease`` instead. + will be removed in 1.0 (renaming of 0.25). + Use ``min_impurity_decrease`` instead. sparse_output : bool, default=True Whether or not to return a sparse CSR matrix, as default behavior, diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index a25b716edc22b..15f5404f4701c 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -398,7 +398,7 @@ def fit(self, X, y, sample_weight=None, monitor=None): self : object """ if self.criterion == 'mae': - # TODO: This should raise an error from 0.26 + # TODO: This should raise an error from 1.1 self._warn_mae_for_criterion() # if not warmstart - clear the estimator state @@ -812,8 +812,9 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): .. versionadded:: 0.18 .. deprecated:: 0.24 `criterion='mae'` is deprecated and will be removed in version - 0.26. Use `criterion='friedman_mse'` or `'mse'` instead, as trees - should use a least-square criterion in Gradient Boosting. + 1.1 (renaming of 0.26). Use `criterion='friedman_mse'` or `'mse'` + instead, as trees should use a least-square criterion in + Gradient Boosting. min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: @@ -878,7 +879,8 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it - will be removed in 0.25. Use ``min_impurity_decrease`` instead. + will be removed in 1.0 (renaming of 0.25). + Use ``min_impurity_decrease`` instead. init : estimator or 'zero', default=None An estimator object that is used to compute the initial predictions. @@ -1112,9 +1114,9 @@ def _validate_y(self, y, sample_weight): return y def _warn_mae_for_criterion(self): - # TODO: This should raise an error from 0.26 + # TODO: This should raise an error from 1.1 warnings.warn("criterion='mae' was deprecated in version 0.24 and " - "will be removed in version 0.26. Use " + "will be removed in version 1.1 (renaming of 0.26). Use " "criterion='friedman_mse' or 'mse' instead, as trees " "should use a least-square criterion in Gradient " "Boosting.", FutureWarning) @@ -1339,8 +1341,8 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): .. versionadded:: 0.18 .. deprecated:: 0.24 `criterion='mae'` is deprecated and will be removed in version - 0.26. The correct way of minimizing the absolute error is to use - `loss='lad'` instead. 
+ 1.1 (renaming of 0.26). The correct way of minimizing the absolute + error is to use `loss='lad'` instead. min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: @@ -1405,7 +1407,8 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it - will be removed in 0.25. Use ``min_impurity_decrease`` instead. + will be removed in 1.0 (renaming of 0.25). + Use ``min_impurity_decrease`` instead. init : estimator or 'zero', default=None An estimator object that is used to compute the initial predictions. @@ -1535,7 +1538,7 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): .. deprecated:: 0.24 Attribute ``n_classes_`` was deprecated in version 0.24 and - will be removed in 0.26. + will be removed in 1.1 (renaming of 0.26). n_estimators_ : int The number of estimators as selected by early stopping (if @@ -1623,11 +1626,11 @@ def _validate_y(self, y, sample_weight=None): return y def _warn_mae_for_criterion(self): - # TODO: This should raise an error from 0.26 + # TODO: This should raise an error from 1.1 warnings.warn("criterion='mae' was deprecated in version 0.24 and " - "will be removed in version 0.26. The correct way of " - "minimizing the absolute error is to use loss='lad' " - "instead.", FutureWarning) + "will be removed in version 1.1 (renaming of 0.26). The " + "correct way of minimizing the absolute error is to use " + " loss='lad' instead.", FutureWarning) def predict(self, X): """Predict regression target for X. @@ -1692,10 +1695,11 @@ def apply(self, X): leaves = leaves.reshape(X.shape[0], self.estimators_.shape[0]) return leaves - # FIXME: to be removed in 0.26 + # FIXME: to be removed in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute n_classes_ was deprecated " # type: ignore - "in version 0.24 and will be removed in 0.26.") + "in version 0.24 and will be removed in 1.1 " + "(renaming of 0.26).") @property def n_classes_(self): try: diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 812ff16933758..256b79db4865c 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -1308,7 +1308,7 @@ def test_gbr_degenerate_feature_importances(): np.zeros(10, dtype=np.float64)) -# TODO: Remove in 0.26 when `n_classes_` is deprecated +# TODO: Remove in 1.1 when `n_classes_` is deprecated def test_gbr_deprecated_attr(): # check that accessing n_classes_ in GradientBoostingRegressor raises # a deprecation warning @@ -1320,7 +1320,7 @@ def test_gbr_deprecated_attr(): gbr.n_classes_ -# TODO: Remove in 0.26 when `n_classes_` is deprecated +# TODO: Remove in 1.1 when `n_classes_` is deprecated @pytest.mark.filterwarnings("ignore:Attribute n_classes_ was deprecated") def test_attr_error_raised_if_not_fitted(): # check that accessing n_classes_ in not fitted GradientBoostingRegressor @@ -1335,7 +1335,7 @@ def test_attr_error_raised_if_not_fitted(): gbr.n_classes_ -# TODO: Update in 0.26 to check for the error raised +# TODO: Update in 1.1 to check for the error raised @pytest.mark.parametrize('estimator', [ GradientBoostingClassifier(criterion='mae'), GradientBoostingRegressor(criterion='mae') @@ -1344,6 +1344,6 @@ def test_criterion_mae_deprecation(estimator): # checks whether a 
deprecation warning is issued when criterion='mae' # is used. msg = ("criterion='mae' was deprecated in version 0.24 and " - "will be removed in version 0.26.") + "will be removed in version 1.1") with pytest.warns(FutureWarning, match=msg): estimator.fit(X, y) diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index 3a3188be35468..2ab7545705115 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -41,7 +41,7 @@ class NotFittedError(ValueError, AttributeError): @deprecated("ChangedBehaviorWarning is deprecated in 0.24 and will be removed " - "in 0.26") + "in 1.1") class ChangedBehaviorWarning(UserWarning): """Warning class used to notify the user of any change in the behavior. @@ -114,7 +114,7 @@ class FitFailedWarning(RuntimeWarning): @deprecated("NonBLASDotWarning is deprecated in 0.24 and will be removed in " - "0.26") + "1.1") class NonBLASDotWarning(EfficiencyWarning): """Warning used when the dot operation does not use BLAS. diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index 60646c5126ded..1e9c0c9718a51 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -307,7 +307,7 @@ def partial_dependence(estimator, X, features, *, response_method='auto', .. versionadded:: 0.24 .. deprecated:: 0.24 - `kind='legacy'` is deprecated and will be removed in version 0.26. + `kind='legacy'` is deprecated and will be removed in version 1.1. `kind='average'` will be the new default. It is intended to migrate from the ndarray output to :class:`~sklearn.utils.Bunch` output. @@ -504,12 +504,12 @@ def partial_dependence(estimator, X, features, *, response_method='auto', if kind == 'legacy': warnings.warn( "A Bunch will be returned in place of 'predictions' from version" - " 0.26 with partial dependence results accessible via the " - "'average' key. In the meantime, pass kind='average' to get the " - "future behaviour.", + " 1.1 (renaming of 0.26) with partial dependence results " + "accessible via the 'average' key. In the meantime, pass " + "kind='average' to get the future behaviour.", FutureWarning ) - # TODO 0.26: Remove kind == 'legacy' section + # TODO 1.1: Remove kind == 'legacy' section return averaged_predictions, values elif kind == 'average': return Bunch(average=averaged_predictions, values=values) diff --git a/sklearn/inspection/_plot/partial_dependence.py b/sklearn/inspection/_plot/partial_dependence.py index 4790c2bb3842a..d6604d7ae675f 100644 --- a/sklearn/inspection/_plot/partial_dependence.py +++ b/sklearn/inspection/_plot/partial_dependence.py @@ -817,7 +817,7 @@ def _plot_two_way_partial_dependence( ax.set_xlabel(self.feature_names[feature_idx[0]]) ax.set_ylabel(self.feature_names[feature_idx[1]]) - @_deprecate_positional_args(version="0.26") + @_deprecate_positional_args(version="1.1") def plot(self, *, ax=None, n_cols=3, line_kw=None, contour_kw=None): """Plot partial dependence plots. 
diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 9fc1b2683545b..997c61c0e5f8b 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -100,7 +100,7 @@ def test_output_shape(Estimator, method, data, grid_resolution, est, X=X, features=features, method=method, kind=kind, grid_resolution=grid_resolution ) - # FIXME: Remove 'legacy' support in 0.26 + # FIXME: Remove 'legacy' support in 1.1 pdp, axes = result if kind == 'legacy' else (result, result["values"]) expected_pdp_shape = (n_targets, @@ -711,7 +711,7 @@ def test_warning_for_kind_legacy(): est.fit(X, y) err_msg = ("A Bunch will be returned in place of 'predictions' from " - "version 0.26") + "version 1.1") with pytest.warns(FutureWarning, match=err_msg): partial_dependence(est, X=X, features=[1, 2]) diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index 119b27e9084ae..8a27ea572b344 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -136,10 +136,10 @@ def _get_kernel(self, X, Y=None): def _more_tags(self): return {'pairwise': self.kernel == 'precomputed'} - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): return self.kernel == "precomputed" diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index e99116ca4f3e3..7b019e5545534 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -291,7 +291,8 @@ def _make_validation_score_cb(self, validation_mask, X, y, sample_weight, # mypy error: Decorated property not supported @deprecated("Attribute standard_coef_ was deprecated " # type: ignore - "in version 0.23 and will be removed in 0.25.") + "in version 0.23 and will be removed in 1.0 " + "(renaming of 0.25).") @property def standard_coef_(self): return self._standard_coef @@ -299,7 +300,7 @@ def standard_coef_(self): # mypy error: Decorated property not supported @deprecated( # type: ignore "Attribute standard_intercept_ was deprecated " - "in version 0.23 and will be removed in 0.25." + "in version 0.23 and will be removed in 1.0 (renaming of 0.25)." ) @property def standard_intercept_(self): @@ -307,14 +308,16 @@ def standard_intercept_(self): # mypy error: Decorated property not supported @deprecated("Attribute average_coef_ was deprecated " # type: ignore - "in version 0.23 and will be removed in 0.25.") + "in version 0.23 and will be removed in 1.0 " + "(renaming of 0.25).") @property def average_coef_(self): return self._average_coef # mypy error: Decorated property not supported @deprecated("Attribute average_intercept_ was deprecated " # type: ignore - "in version 0.23 and will be removed in 0.25.") + "in version 0.23 and will be removed in 1.0 " + "(renaming of 0.25).") @property def average_intercept_(self): return self._average_intercept @@ -1531,14 +1534,14 @@ class SGDRegressor(BaseSGDRegressor): .. deprecated:: 0.23 Attribute ``average_coef_`` was deprecated - in version 0.23 and will be removed in 0.25. + in version 0.23 and will be removed in 1.0 (renaming of 0.25). average_intercept_ : ndarray of shape (1,) The averaged intercept term. Only available if ``average=True``. .. 
deprecated:: 0.23 Attribute ``average_intercept_`` was deprecated - in version 0.23 and will be removed in 0.25. + in version 0.23 and will be removed in 1.0 (renaming of 0.25). n_iter_ : int The actual number of iterations before reaching the stopping criterion. diff --git a/sklearn/linear_model/tests/test_passive_aggressive.py b/sklearn/linear_model/tests/test_passive_aggressive.py index 27381059eaf33..f67a768844213 100644 --- a/sklearn/linear_model/tests/test_passive_aggressive.py +++ b/sklearn/linear_model/tests/test_passive_aggressive.py @@ -267,7 +267,7 @@ def test_regressor_undefined_methods(): for meth in ("transform",): assert_raises(AttributeError, lambda x: getattr(reg, x), meth) -# TODO: remove in 0.25 +# TODO: remove in 1.0 @pytest.mark.parametrize('klass', [PassiveAggressiveClassifier, PassiveAggressiveRegressor]) def test_passive_aggressive_deprecated_attr(klass): diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 8e8d3f94b6c99..d5063981ff9aa 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -269,7 +269,7 @@ def test_plain_has_no_average_attr(klass): assert not hasattr(clf, '_standard_coef') -# TODO: remove in 0.25 +# TODO: remove in 1.0 @pytest.mark.parametrize('klass', [SGDClassifier, SGDRegressor]) def test_sgd_deprecated_attr(klass): est = klass(average=True, eta0=.01) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index ac749d737c762..6a144e3033e8e 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -389,10 +389,10 @@ def __init__(self, n_components=2, *, metric=True, n_init=4, def _more_tags(self): return {'pairwise': self.dissimilarity == 'precomputed'} - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): return self.dissimilarity == "precomputed" diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index b77da83ecad30..70f817904ac65 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -472,10 +472,10 @@ def _more_tags(self): return {'pairwise': self.affinity in ["precomputed", "precomputed_nearest_neighbors"]} - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): return self.affinity in ["precomputed", diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index d07a9faf62d35..b6072a6e198c4 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -611,8 +611,8 @@ class TSNE(BaseEstimator): legacy squaring behavior. .. deprecated:: 0.24 Legacy squaring behavior was deprecated in 0.24. The ``'legacy'`` - value will be removed in 0.26, at which point the default value will - change to ``True``. + value will be removed in 1.1 (renaming of 0.26), at which point the + default value will change to ``True``. 
Attributes ---------- @@ -675,7 +675,7 @@ def __init__(self, n_components=2, *, perplexity=30.0, self.method = method self.angle = angle self.n_jobs = n_jobs - # TODO Revisit deprecation of square_distances for 0.26-0.28 (#12401) + # TODO Revisit deprecation of square_distances for 1.1-1.3 (#12401) self.square_distances = square_distances def _fit(self, X, skip_num_points=0): @@ -688,14 +688,16 @@ def _fit(self, X, skip_num_points=0): if self.square_distances not in [True, 'legacy']: raise ValueError("'square_distances' must be True or 'legacy'.") if self.metric != "euclidean" and self.square_distances is not True: - warnings.warn(("'square_distances' has been introduced in 0.24" - "to help phase out legacy squaring behavior. The " - "'legacy' setting will be removed in 0.26, and the " - "default setting will be changed to True. In 0.28, " - "'square_distances' will be removed altogether," - "and distances will be squared by default. Set " - "'square_distances'=True to silence this warning."), - FutureWarning) + warnings.warn( + "'square_distances' has been introduced in 0.24 to help phase " + "out legacy squaring behavior. The 'legacy' setting will be " + "removed in 1.1 (renaming of 0.26), and the default setting " + "will be changed to True. In 1.3, 'square_distances' will be " + "removed altogether, and distances will be squared by " + "default. Set 'square_distances'=True to silence this " + "warning.", + FutureWarning + ) if self.method == 'barnes_hut': X = self._validate_data(X, accept_sparse=['csr'], ensure_min_samples=2, diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index 7ca046385a6ed..6e2016c798772 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -65,7 +65,7 @@ def test_MDS(): mds_clf.fit(sim) -# TODO: Remove in 0.26 +# TODO: Remove in 1.1 def test_MDS_pairwise_deprecated(): mds_clf = mds.MDS(metric='precomputed') msg = r"Attribute _pairwise was deprecated in version 0\.24" @@ -73,7 +73,7 @@ def test_MDS_pairwise_deprecated(): mds_clf._pairwise -# TODO: Remove in 0.26 +# TODO: Remove in 1.1 @ignore_warnings(category=FutureWarning) @pytest.mark.parametrize("dissimilarity, expected_pairwise", [ ("precomputed", True), diff --git a/sklearn/manifold/tests/test_spectral_embedding.py b/sklearn/manifold/tests/test_spectral_embedding.py index 02cbd303134e6..8fcf113874927 100644 --- a/sklearn/manifold/tests/test_spectral_embedding.py +++ b/sklearn/manifold/tests/test_spectral_embedding.py @@ -347,7 +347,7 @@ def test_spectral_embedding_first_eigen_vector(): assert np.std(embedding[:, 1]) > 1e-3 -# TODO: Remove in 0.26 +# TODO: Remove in 1.1 @pytest.mark.parametrize("affinity", ["precomputed", "precomputed_nearest_neighbors"]) def test_spectral_embedding_pairwise_deprecated(affinity): diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 97d6ea5ce5933..716c031d4f5bf 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -898,7 +898,7 @@ def test_tsne_with_different_distance_metrics(): @ignore_warnings(category=FutureWarning) def test_tsne_different_square_distances(method, metric, square_distances): # Make sure that TSNE works for different square_distances settings - # FIXME remove test when square_distances=True becomes the default in 0.26 + # FIXME remove test when square_distances=True becomes the default in 1.1 random_state = check_random_state(0) n_components_original = 3 n_components_embedding = 2 diff --git 
a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 5263b593e9594..a3b4accc03655 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1447,18 +1447,24 @@ def _precompute_metric_params(X, Y, metric=None, **kwds): if X is Y: V = np.var(X, axis=0, ddof=1, dtype=dtype) else: - warnings.warn("from version 0.25, pairwise_distances for " - "metric='seuclidean' will require V to be " - "specified if Y is passed.", FutureWarning) + warnings.warn( + "from version 1.0 (renaming of 0.25), pairwise_distances for " + "metric='seuclidean' will require V to be specified if Y is " + "passed.", + FutureWarning + ) V = np.var(np.vstack([X, Y]), axis=0, ddof=1, dtype=dtype) return {'V': V} if metric == "mahalanobis" and 'VI' not in kwds: if X is Y: VI = np.linalg.inv(np.cov(X.T)).T else: - warnings.warn("from version 0.25, pairwise_distances for " - "metric='mahalanobis' will require VI to be " - "specified if Y is passed.", FutureWarning) + warnings.warn( + "from version 1.0 (renaming of 0.25), pairwise_distances for " + "metric='mahalanobis' will require VI to be specified if Y " + "is passed.", + FutureWarning + ) VI = np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T return {'VI': VI} return {} diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 92dea4e791dfe..88c285421fca6 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -1281,7 +1281,7 @@ def test_pairwise_distances_data_derived_params(n_jobs, metric, dist_function, params = {'VI': np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T} expected_dist_explicit_params = cdist(X, Y, metric=metric, **params) - # TODO: Remove warn_checker in 0.25 + # TODO: Remove warn_checker in 1.0 if y_is_x: warn_checker = pytest.warns(None) else: diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 51f43debf78ed..213204b50c2a7 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -314,10 +314,10 @@ def __len__(self): return self.n_iter -# FIXME Remove fit_grid_point in 0.25 +# FIXME Remove fit_grid_point in 1.0 @deprecated( "fit_grid_point is deprecated in version 0.23 " - "and will be removed in version 0.25" + "and will be removed in version 1.0 (renaming of 0.25)" ) def fit_grid_point(X, y, estimator, parameters, train, test, scorer, verbose, error_score=np.nan, **fit_params): @@ -440,10 +440,10 @@ def _more_tags(self): "DataConversionWarning not caught"}, } - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): # allows cross-validation to see 'precomputed' metrics diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index b1194600c530d..af2ca92aee26b 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1266,7 +1266,7 @@ def test_grid_search_correct_score_results(): assert_almost_equal(correct_score, cv_scores[i]) -# FIXME remove test_fit_grid_point as the function will be removed on 0.25 +# FIXME remove test_fit_grid_point as the function will be removed on 1.0 @ignore_warnings(category=FutureWarning) def test_fit_grid_point(): X, y = make_classification(random_state=0) @@ -1297,13 +1297,13 @@ def 
test_fit_grid_point(): # FIXME remove test_fit_grid_point_deprecated as -# fit_grid_point will be removed on 0.25 +# fit_grid_point will be removed on 1.0 def test_fit_grid_point_deprecated(): X, y = make_classification(random_state=0) svc = LinearSVC(random_state=0) scorer = make_scorer(accuracy_score) msg = ("fit_grid_point is deprecated in version 0.23 " - "and will be removed in version 0.25") + "and will be removed in version 1.0") params = {'C': 0.1} train, test = next(StratifiedKFold().split(X, y)) @@ -1963,7 +1963,7 @@ def _more_tags(self): assert pairwise == cv._get_tags()['pairwise'], attr_message -# TODO: Remove in 0.26 +# TODO: Remove in 1.1 @ignore_warnings(category=FutureWarning) def test_search_cv__pairwise_property_delegated_to_base_estimator(): """ diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 4437a7a4cb35c..8405d3b38c452 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -1962,7 +1962,7 @@ def custom_scorer(clf, X, y): assert "test_{}".format(name) in cv_results -# TODO: Remove in 0.26 when the _pairwise attribute is removed +# TODO: Remove in 1.1 when the _pairwise attribute is removed def test_validation_pairwise(): # checks the interactions between the pairwise estimator tag # and the _pairwise attribute @@ -1981,7 +1981,6 @@ def _more_tags(self): return {'pairwise': False} svm = IncorrectTagSVM(kernel='precomputed') - msg = ("_pairwise was deprecated in 0.24 and will be removed in 0.26. " - "Set the estimator tags of your estimator instead") + msg = "_pairwise was deprecated in 0.24 and will be removed in 1.1" with pytest.warns(FutureWarning, match=msg): cross_validate(svm, linear_kernel, y, cv=2) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 182a412f8313f..da29fdd4daf11 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -186,7 +186,7 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin, .. deprecated:: 0.24 This attribute is deprecated in 0.24 and will - be removed in 0.26. If you use this attribute + be removed in 1.1 (renaming of 0.26). If you use this attribute in :class:`~sklearn.feature_selection.RFE` or :class:`~sklearn.feature_selection.SelectFromModel`, you may pass a callable to the `importance_getter` @@ -200,7 +200,7 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin, .. deprecated:: 0.24 This attribute is deprecated in 0.24 and will - be removed in 0.26. If you use this attribute + be removed in 1.1 (renaming of 0.26). If you use this attribute in :class:`~sklearn.feature_selection.RFE` or :class:`~sklearn.feature_selection.SelectFromModel`, you may pass a callable to the `importance_getter` @@ -456,10 +456,10 @@ def multilabel_(self): def n_classes_(self): return len(self.classes_) - # TODO: Remove coef_ attribute in 0.26 + # TODO: Remove coef_ attribute in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute coef_ was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26. " + "version 0.24 and will be removed in 1.1 (renaming of 0.26). 
" "If you observe this warning while using RFE " "or SelectFromModel, use the importance_getter " "parameter instead.") @@ -474,10 +474,10 @@ def coef_(self): return sp.vstack(coefs) return np.vstack(coefs) - # TODO: Remove intercept_ attribute in 0.26 + # TODO: Remove intercept_ attribute in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute intercept_ was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26. " + "version 0.24 and will be removed in 1.1 (renaming of 0.26). " "If you observe this warning while using RFE " "or SelectFromModel, use the importance_getter " "parameter instead.") @@ -489,10 +489,10 @@ def intercept_(self): "Base estimator doesn't have an intercept_ attribute.") return np.array([e.intercept_.ravel() for e in self.estimators_]) - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): """Indicate if wrapped estimator is using a precomputed Gram matrix""" @@ -591,9 +591,9 @@ class OneVsOneClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): .. deprecated:: 0.24 - The _pairwise attribute is deprecated in 0.24. From 0.26 and - onward, `pairwise_indices_` will use the pairwise estimator tag - instead. + The _pairwise attribute is deprecated in 0.24. From 1.1 + (renaming of 0.25) and onward, `pairwise_indices_` will use the + pairwise estimator tag instead. Examples -------- @@ -769,10 +769,10 @@ def decision_function(self, X): def n_classes_(self): return len(self.classes_) - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): """Indicate if wrapped estimator is using a precomputed Gram matrix""" diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 93a2da00549de..bcc7a9d24ce1c 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -648,7 +648,7 @@ def _init_counters(self, n_effective_classes, n_features): # mypy error: Decorated property not supported @deprecated("Attribute coef_ was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def coef_(self): return (self.feature_log_prob_[1:] @@ -656,7 +656,7 @@ def coef_(self): # mypy error: Decorated property not supported @deprecated("Attribute intercept_ was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def intercept_(self): return (self.class_log_prior_[1:] @@ -708,7 +708,8 @@ class MultinomialNB(_BaseDiscreteNB): as a linear model. .. deprecated:: 0.24 - ``coef_`` is deprecated in 0.24 and will be removed in 0.26. + ``coef_`` is deprecated in 0.24 and will be removed in 1.1 + (renaming of 0.26). feature_count_ : ndarray of shape (n_classes, n_features) Number of samples encountered for each (class, feature) @@ -724,7 +725,8 @@ class MultinomialNB(_BaseDiscreteNB): as a linear model. .. deprecated:: 0.24 - ``intercept_`` is deprecated in 0.24 and will be removed in 0.26. 
+ ``intercept_`` is deprecated in 0.24 and will be removed in 1.1 + (renaming of 0.26). n_features_ : int Number of features of each sample. @@ -830,7 +832,8 @@ class ComplementNB(_BaseDiscreteNB): as a linear model. .. deprecated:: 0.24 - ``coef_`` is deprecated in 0.24 and will be removed in 0.26. + ``coef_`` is deprecated in 0.24 and will be removed in 1.1 + (renaming of 0.26). feature_all_ : ndarray of shape (n_features,) Number of samples encountered for each feature during fitting. This @@ -848,7 +851,8 @@ class ComplementNB(_BaseDiscreteNB): as a linear model. .. deprecated:: 0.24 - ``coef_`` is deprecated in 0.24 and will be removed in 0.26. + ``coef_`` is deprecated in 0.24 and will be removed in 1.1 + (renaming of 0.26). n_features_ : int Number of features of each sample. diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 1e666043347cf..54cf473b2ab75 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -528,10 +528,10 @@ def _more_tags(self): # For cross-validation routines to split data correctly return {'pairwise': self.metric == 'precomputed'} - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): # For cross-validation routines to split data correctly diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 49bc199a86ec6..9bf28f037294a 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -160,10 +160,10 @@ def _more_tags(self): # For cross-validation routines to split data correctly return {'pairwise': self.metric == 'precomputed'} - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): # For cross-validation routines to split data correctly diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index ab0b793176b04..513df1edb1bec 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1733,7 +1733,7 @@ def test_auto_algorithm(X, metric, metric_params, expected_algo): assert model._fit_method == expected_algo -# TODO: Remove in 0.26 +# TODO: Remove in 1.1 @pytest.mark.parametrize("NearestNeighbors", [neighbors.KNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.NearestNeighbors]) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 6df8cddc476c4..00aad1a8e5315 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -629,10 +629,10 @@ def _more_tags(self): # check if first estimator expects pairwise input return {'pairwise': _safe_tags(self.steps[0][1], "pairwise")} - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): # check if first estimator expects pairwise input diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 3f6cbb7546439..478d41ecc768a 100644 --- 
a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -2312,10 +2312,10 @@ def transform(self, K, copy=True): def _more_tags(self): return {'pairwise': True} - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1.") @property def _pairwise(self): return True diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 4fef462b9d849..b0fbee8db9455 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2246,7 +2246,7 @@ def test_cv_pipeline_precomputed(): # did the pipeline set the pairwise attribute? assert pipeline._get_tags()['pairwise'] - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 msg = r"Attribute _pairwise was deprecated in version 0\.24" with pytest.warns(FutureWarning, match=msg): assert pipeline._pairwise @@ -2258,7 +2258,7 @@ def test_cv_pipeline_precomputed(): assert_array_almost_equal(y_true, y_pred) -# TODO: Remove in 0.26 +# TODO: Remove in 1.1 def test_pairwise_deprecated(): kcent = KernelCenterer() msg = r"Attribute _pairwise was deprecated in version 0\.24" diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index c5196a5801607..fa09badf64691 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -107,10 +107,10 @@ def _more_tags(self): # Used by cross_val_score. return {'pairwise': self.kernel == 'precomputed'} - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): # Used by cross_val_score. 
diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 295ff577b642e..908ece408bb1d 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1048,7 +1048,7 @@ def __init__(self, *, kernel='rbf', degree=3, gamma='scale', # mypy error: Decorated property not supported @deprecated( # type: ignore "The probA_ attribute is deprecated in version 0.23 and will be " - "removed in version 0.25.") + "removed in version 1.0 (renaming of 0.25).") @property def probA_(self): return self._probA @@ -1056,7 +1056,7 @@ def probA_(self): # mypy error: Decorated property not supported @deprecated( # type: ignore "The probB_ attribute is deprecated in version 0.23 and will be " - "removed in version 0.25.") + "removed in version 1.0 (renaming of 0.25).") @property def probB_(self): return self._probB @@ -1434,7 +1434,7 @@ def predict(self, X): # mypy error: Decorated property not supported @deprecated( # type: ignore "The probA_ attribute is deprecated in version 0.23 and will be " - "removed in version 0.25.") + "removed in version 1.0.") @property def probA_(self): return self._probA @@ -1442,7 +1442,7 @@ def probA_(self): # mypy error: Decorated property not supported @deprecated( # type: ignore "The probB_ attribute is deprecated in version 0.23 and will be " - "removed in version 0.25.") + "removed in version 1.0.") @property def probB_(self): return self._probB diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index ad8402e5bbd18..4d57f4f7da450 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -1235,7 +1235,7 @@ def test_n_support_oneclass_svr(): assert reg.n_support_ == 4 -# TODO: Remove in 0.25 when probA_ and probB_ are deprecated +# TODO: Remove in 1.0 when probA_ and probB_ are deprecated @pytest.mark.parametrize("SVMClass, data", [ (svm.OneClassSVM, (X, )), (svm.SVR, (X, Y)) @@ -1245,7 +1245,7 @@ def test_svm_probA_proB_deprecated(SVMClass, data, deprecated_prob): clf = SVMClass().fit(*data) msg = ("The {} attribute is deprecated in version 0.23 and will be " - "removed in version 0.25.").format(deprecated_prob) + "removed in version 1.0").format(deprecated_prob) with pytest.warns(FutureWarning, match=msg): getattr(clf, deprecated_prob) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 0c07db459d128..7dd8d02f3c0bf 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -540,7 +540,7 @@ def test_repr_html_wraps(): assert "' - f'
') + container_id = "sk-" + str(uuid.uuid4()) + style_template = Template(_STYLE) + style_with_id = style_template.substitute(id=container_id) + out.write(f'' + f'
' + '
') _write_estimator_html(out, estimator, estimator.__class__.__name__, str(estimator), first_call=True) out.write('
') From 4b72b579606ef50524e9c24b1304d4dfc11defbc Mon Sep 17 00:00:00 2001 From: Benjamin Pedigo Date: Wed, 17 Feb 2021 03:49:46 -0500 Subject: [PATCH 174/478] DOC Fix typo in LDA User Guide (#19468) --- doc/modules/lda_qda.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/lda_qda.rst b/doc/modules/lda_qda.rst index e8f25d2c66930..962d65705f75a 100644 --- a/doc/modules/lda_qda.rst +++ b/doc/modules/lda_qda.rst @@ -136,7 +136,7 @@ Mathematical formulation of LDA dimensionality reduction First note that the K means :math:`\mu_k` are vectors in :math:`\mathcal{R}^d`, and they lie in an affine subspace :math:`H` of -dimension at least :math:`K - 1` (2 points lie on a line, 3 points lie on a +dimension at most :math:`K - 1` (2 points lie on a line, 3 points lie on a plane, etc). As mentioned above, we can interpret LDA as assigning :math:`x` to the class From 80d674e3cd2288f8c8c331d44bffa18006db7f46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Carlos=20Alfaro=20Jim=C3=A9nez?= Date: Wed, 17 Feb 2021 11:10:06 +0100 Subject: [PATCH 175/478] DOC Fix closing backtick in IterativeImputer (#19476) Co-authored-by: Nicolas Hug --- sklearn/impute/_iterative.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 81ae946459a3a..f5688fa96d238 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -68,8 +68,8 @@ class IterativeImputer(_BaseImputer): Maximum number of imputation rounds to perform before returning the imputations computed during the final round. A round is a single imputation of each feature with missing values. The stopping criterion - is met once `max(abs(X_t - X_{t-1}))/max(abs(X[known_vals]))` < tol, - where `X_t` is `X` at iteration `t. Note that early stopping is only + is met once `max(abs(X_t - X_{t-1}))/max(abs(X[known_vals])) < tol`, + where `X_t` is `X` at iteration `t`. Note that early stopping is only applied if ``sample_posterior=False``. tol : float, default=1e-3 From 4d60a815d84531ba91bf097e9c814460113a7b72 Mon Sep 17 00:00:00 2001 From: Nodar Okroshiashvili Date: Wed, 17 Feb 2021 17:41:20 +0400 Subject: [PATCH 176/478] DOC Remove extra word from LOF docstring (#19477) --- sklearn/neighbors/_lof.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py index 8ba39b315f891..29bf1a5e73f91 100644 --- a/sklearn/neighbors/_lof.py +++ b/sklearn/neighbors/_lof.py @@ -114,7 +114,7 @@ class LocalOutlierFactor(KNeighborsMixin, By default, LocalOutlierFactor is only meant to be used for outlier detection (novelty=False). Set novelty to True if you want to use LocalOutlierFactor for novelty detection. In this case be aware that - that you should only use predict, decision_function and score_samples + you should only use predict, decision_function and score_samples on new unseen data and not on the training set. .. 
versionadded:: 0.20 From 66f67dd9362983a1d446ccf752b54c72c30fc090 Mon Sep 17 00:00:00 2001 From: Nigel Bosch Date: Thu, 18 Feb 2021 02:06:52 -0600 Subject: [PATCH 177/478] DOC Fix broken link to wikipedia in semi-supervised UG (#19481) --- doc/modules/semi_supervised.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/semi_supervised.rst b/doc/modules/semi_supervised.rst index 484484fca5a78..7c1ea8f296a49 100644 --- a/doc/modules/semi_supervised.rst +++ b/doc/modules/semi_supervised.rst @@ -27,7 +27,7 @@ labeled points and a large amount of unlabeled points. Semi-supervised algorithms need to make assumptions about the distribution of the dataset in order to achieve performance gains. See `here - `_ + `_ for more details. .. _self_training: From cc13313b26852169dff3fdf80c40008c233ce40f Mon Sep 17 00:00:00 2001 From: Zito Relova Date: Thu, 18 Feb 2021 07:43:54 -0800 Subject: [PATCH 178/478] TST remove assert_warns in feature_extraction/tests/ module (#19439) --- sklearn/feature_extraction/tests/test_text.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index ebe13cc0c240a..767b04ddb5d95 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -29,8 +29,7 @@ from numpy.testing import assert_array_equal from sklearn.utils import IS_PYPY from sklearn.utils._testing import (assert_almost_equal, - assert_warns_message, assert_raise_message, - assert_no_warnings, + assert_raise_message, fails_if_pypy, assert_allclose_dense_sparse, skip_if_32bit) @@ -386,8 +385,8 @@ def test_countvectorizer_uppercase_in_vocab(): " be matched with any documents") vectorizer = CountVectorizer(lowercase=True, vocabulary=vocabulary) - assert_warns_message(UserWarning, message, - vectorizer.fit_transform, vocabulary) + with pytest.warns(UserWarning, match=message): + vectorizer.fit_transform(vocabulary) def test_tf_idf_smoothing(): @@ -429,8 +428,8 @@ def test_tfidf_no_smoothing(): tr = TfidfTransformer(smooth_idf=False, norm='l2') in_warning_message = 'divide by zero' - assert_warns_message(RuntimeWarning, in_warning_message, - tr.fit_transform, X).toarray() + with pytest.warns(RuntimeWarning, match=in_warning_message): + tr.fit_transform(X).toarray() def test_sublinear_tf(): @@ -1213,27 +1212,29 @@ def _check_stop_words_consistency(estimator): @fails_if_pypy def test_vectorizer_stop_words_inconsistent(): - lstr = "['and', 'll', 've']" + lstr = r"\['and', 'll', 've'\]" message = ('Your stop_words may be inconsistent with your ' 'preprocessing. Tokenizing the stop words generated ' 'tokens %s not in stop_words.' 
% lstr) for vec in [CountVectorizer(), TfidfVectorizer(), HashingVectorizer()]: vec.set_params(stop_words=["you've", "you", "you'll", 'AND']) - assert_warns_message(UserWarning, message, vec.fit_transform, - ['hello world']) + with pytest.warns(UserWarning, match=message): + vec.fit_transform(['hello world']) # reset stop word validation del vec._stop_words_id assert _check_stop_words_consistency(vec) is False # Only one warning per stop list - assert_no_warnings(vec.fit_transform, ['hello world']) + with pytest.warns(None) as record: + vec.fit_transform(['hello world']) + assert not len(record) assert _check_stop_words_consistency(vec) is None # Test caching of inconsistency assessment vec.set_params(stop_words=["you've", "you", "you'll", 'blah', 'AND']) - assert_warns_message(UserWarning, message, vec.fit_transform, - ['hello world']) + with pytest.warns(UserWarning, match=message): + vec.fit_transform(['hello world']) @skip_if_32bit From 321799971be8ede64d4603c93687becd5701d30f Mon Sep 17 00:00:00 2001 From: Zito Relova Date: Thu, 18 Feb 2021 07:50:27 -0800 Subject: [PATCH 179/478] TST Replace the use of assert_warns messages in cluster/tests/ module (#19437) --- .../tests/test_affinity_propagation.py | 33 +++++++++++-------- sklearn/cluster/tests/test_birch.py | 4 +-- .../tests/test_feature_agglomeration.py | 10 ++++-- sklearn/cluster/tests/test_hierarchical.py | 15 +++++---- sklearn/cluster/tests/test_spectral.py | 4 +-- 5 files changed, 38 insertions(+), 28 deletions(-) diff --git a/sklearn/cluster/tests/test_affinity_propagation.py b/sklearn/cluster/tests/test_affinity_propagation.py index 446b0f43c74d9..51b4fd425349e 100644 --- a/sklearn/cluster/tests/test_affinity_propagation.py +++ b/sklearn/cluster/tests/test_affinity_propagation.py @@ -8,9 +8,7 @@ from scipy.sparse import csr_matrix from sklearn.exceptions import ConvergenceWarning -from sklearn.utils._testing import ( - assert_array_equal, assert_warns, - assert_warns_message, assert_no_warnings) +from sklearn.utils._testing import assert_array_equal from sklearn.cluster import AffinityPropagation from sklearn.cluster._affinity_propagation import ( @@ -72,6 +70,7 @@ def test_affinity_propagation(): with pytest.raises(TypeError): af_2.fit(csr_matrix((3, 3))) + def test_affinity_propagation_predict(): # Test AffinityPropagation.predict af = AffinityPropagation(affinity="euclidean", random_state=63) @@ -104,7 +103,8 @@ def test_affinity_propagation_fit_non_convergence(): # Force non-convergence by allowing only a single iteration af = AffinityPropagation(preference=-10, max_iter=1, random_state=82) - assert_warns(ConvergenceWarning, af.fit, X) + with pytest.warns(ConvergenceWarning): + af.fit(X) assert_array_equal(np.empty((0, 2)), af.cluster_centers_) assert_array_equal(np.array([-1, -1, -1]), af.labels_) @@ -114,24 +114,28 @@ def test_affinity_propagation_equal_mutual_similarities(): S = -euclidean_distances(X, squared=True) # setting preference > similarity - cluster_center_indices, labels = assert_warns_message( - UserWarning, "mutually equal", affinity_propagation, S, preference=0) + with pytest.warns(UserWarning, match="mutually equal"): + cluster_center_indices, labels = affinity_propagation( + S, preference=0) # expect every sample to become an exemplar assert_array_equal([0, 1], cluster_center_indices) assert_array_equal([0, 1], labels) # setting preference < similarity - cluster_center_indices, labels = assert_warns_message( - UserWarning, "mutually equal", affinity_propagation, S, preference=-10) + with 
pytest.warns(UserWarning, match="mutually equal"): + cluster_center_indices, labels = affinity_propagation( + S, preference=-10) # expect one cluster, with arbitrary (first) sample as exemplar assert_array_equal([0], cluster_center_indices) assert_array_equal([0, 0], labels) # setting different preferences - cluster_center_indices, labels = assert_no_warnings( - affinity_propagation, S, preference=[-20, -10], random_state=37) + with pytest.warns(None) as record: + cluster_center_indices, labels = affinity_propagation( + S, preference=[-20, -10], random_state=37) + assert not len(record) # expect one cluster, with highest-preference sample as exemplar assert_array_equal([1], cluster_center_indices) @@ -144,14 +148,15 @@ def test_affinity_propagation_predict_non_convergence(): X = np.array([[0, 0], [1, 1], [-2, -2]]) # Force non-convergence by allowing only a single iteration - af = assert_warns(ConvergenceWarning, - AffinityPropagation(preference=-10, - max_iter=1, random_state=75).fit, X) + with pytest.warns(ConvergenceWarning): + af = AffinityPropagation(preference=-10, + max_iter=1, random_state=75).fit(X) # At prediction time, consider new samples as noise since there are no # clusters to_predict = np.array([[2, 2], [3, 3], [4, 4]]) - y = assert_warns(ConvergenceWarning, af.predict, to_predict) + with pytest.warns(ConvergenceWarning): + y = af.predict(to_predict) assert_array_equal(np.array([-1, -1, -1]), y) diff --git a/sklearn/cluster/tests/test_birch.py b/sklearn/cluster/tests/test_birch.py index 37c9a083842b1..e199c897f97ef 100644 --- a/sklearn/cluster/tests/test_birch.py +++ b/sklearn/cluster/tests/test_birch.py @@ -17,7 +17,6 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_warns def test_n_samples_leaves_roots(): @@ -92,7 +91,8 @@ def test_n_clusters(): # Test that a small number of clusters raises a warning. brc4 = Birch(threshold=10000.) 
- assert_warns(ConvergenceWarning, brc4.fit, X) + with pytest.warns(ConvergenceWarning): + brc4.fit(X) def test_sparse_X(): diff --git a/sklearn/cluster/tests/test_feature_agglomeration.py b/sklearn/cluster/tests/test_feature_agglomeration.py index 8afb5854252f3..ebc2fe49d7a7f 100644 --- a/sklearn/cluster/tests/test_feature_agglomeration.py +++ b/sklearn/cluster/tests/test_feature_agglomeration.py @@ -3,8 +3,8 @@ """ # Authors: Sergul Aydore 2017 import numpy as np +import pytest from sklearn.cluster import FeatureAgglomeration -from sklearn.utils._testing import assert_no_warnings from sklearn.utils._testing import assert_array_almost_equal @@ -16,8 +16,12 @@ def test_feature_agglomeration(): pooling_func=np.mean) agglo_median = FeatureAgglomeration(n_clusters=n_clusters, pooling_func=np.median) - assert_no_warnings(agglo_mean.fit, X) - assert_no_warnings(agglo_median.fit, X) + with pytest.warns(None) as record: + agglo_mean.fit(X) + assert not len(record) + with pytest.warns(None) as record: + agglo_median.fit(X) + assert not len(record) assert np.size(np.unique(agglo_mean.labels_)) == n_clusters assert np.size(np.unique(agglo_median.labels_)) == n_clusters assert np.size(agglo_mean.labels_) == X.shape[1] diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 26f30dcd87847..b5a2d9bbf2701 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -33,7 +33,6 @@ from sklearn.cluster._hierarchical_fast import average_merge, max_merge from sklearn.utils._fast_dict import IntFloatDict from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_warns from sklearn.datasets import make_moons, make_circles @@ -94,17 +93,18 @@ def test_unstructured_linkage_tree(): # With specified a number of clusters just for the sake of # raising a warning and testing the warning code with ignore_warnings(): - children, n_nodes, n_leaves, parent = assert_warns( - UserWarning, ward_tree, this_X.T, n_clusters=10) + with pytest.warns(UserWarning): + children, n_nodes, n_leaves, parent = ward_tree( + this_X.T, n_clusters=10) n_nodes = 2 * X.shape[1] - 1 assert len(children) + n_leaves == n_nodes for tree_builder in _TREE_BUILDERS.values(): for this_X in (X, X[0]): with ignore_warnings(): - children, n_nodes, n_leaves, parent = assert_warns( - UserWarning, tree_builder, this_X.T, n_clusters=10) - + with pytest.warns(UserWarning): + children, n_nodes, n_leaves, parent = tree_builder( + this_X.T, n_clusters=10) n_nodes = 2 * X.shape[1] - 1 assert len(children) + n_leaves == n_nodes @@ -550,7 +550,8 @@ def test_connectivity_fixing_non_lil(): m = np.array([[True, False], [False, True]]) c = grid_to_graph(n_x=2, n_y=2, mask=m) w = AgglomerativeClustering(connectivity=c, linkage='ward') - assert_warns(UserWarning, w.fit, x) + with pytest.warns(UserWarning): + w.fit(x) def test_int_float_dict(): diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 7af3b8089a09c..6962e98917ed0 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -10,7 +10,6 @@ from sklearn.utils import check_random_state from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_warns_message from sklearn.cluster import SpectralClustering, spectral_clustering from sklearn.cluster._spectral import discretize @@ -132,7 +131,8 @@ def test_affinities(): # nearest neighbors affinity sp = 
SpectralClustering(n_clusters=2, affinity='nearest_neighbors', random_state=0) - assert_warns_message(UserWarning, 'not fully connected', sp.fit, X) + with pytest.warns(UserWarning, match='not fully connected'): + sp.fit(X) assert adjusted_rand_score(y, sp.labels_) == 1 sp = SpectralClustering(n_clusters=2, gamma=2, random_state=0) From c282d6cdb7647fceb1f73a5e343c112c42a21a04 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 18 Feb 2021 10:55:00 -0500 Subject: [PATCH 180/478] DOC clarify that n_iter_no_change is not just for early-stopping for SGD (#19462) --- sklearn/linear_model/_stochastic_gradient.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 948910e61b51c..65f6cc6966ba4 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -813,6 +813,8 @@ class SGDClassifier(BaseSGDClassifier): The stopping criterion. If it is not None, training will stop when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive epochs. + Convergence is checked against the training loss or the + validation loss depending on the `early_stopping` parameter. .. versionadded:: 0.19 @@ -884,7 +886,10 @@ class SGDClassifier(BaseSGDClassifier): Added 'validation_fraction' option n_iter_no_change : int, default=5 - Number of iterations with no improvement to wait before early stopping. + Number of iterations with no improvement to wait before stopping + fitting. + Convergence is checked against the training loss or the + validation loss depending on the `early_stopping` parameter. .. versionadded:: 0.20 Added 'n_iter_no_change' option @@ -1431,6 +1436,8 @@ class SGDRegressor(BaseSGDRegressor): The stopping criterion. If it is not None, training will stop when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive epochs. + Convergence is checked against the training loss or the + validation loss depending on the `early_stopping` parameter. .. versionadded:: 0.19 @@ -1495,7 +1502,10 @@ class SGDRegressor(BaseSGDRegressor): Added 'validation_fraction' option n_iter_no_change : int, default=5 - Number of iterations with no improvement to wait before early stopping. + Number of iterations with no improvement to wait before stopping + fitting. + Convergence is checked against the training loss or the + validation loss depending on the `early_stopping` parameter. .. versionadded:: 0.20 Added 'n_iter_no_change' option From eec623a8c5219b293625090e7ea5a01975741e37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?xavier=20dupr=C3=A9?= Date: Thu, 18 Feb 2021 18:28:23 +0100 Subject: [PATCH 181/478] DOC Replace onnxmltools by sklearn-onnx (#19484) --- doc/related_projects.rst | 2 +- doc/roadmap.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/related_projects.rst b/doc/related_projects.rst index 0f8f2c21eabc5..acc2689388896 100644 --- a/doc/related_projects.rst +++ b/doc/related_projects.rst @@ -97,7 +97,7 @@ enhance the functionality of scikit-learn's estimators. **Model export for production** -- `onnxmltools `_ Serializes many +- `sklearn-onnx `_ Serialization of many Scikit-learn pipelines to `ONNX `_ for interchange and prediction. diff --git a/doc/roadmap.rst b/doc/roadmap.rst index 7076e22b40287..30c9f58339502 100644 --- a/doc/roadmap.rst +++ b/doc/roadmap.rst @@ -223,7 +223,7 @@ the document up to date as we work on these issues. 
(to be discussed); * Extend documentation to mention how to deploy models in Python-free - environments for instance `ONNX `_. + environments for instance `ONNX `_. and use the above best practices to assess predictive consistency between scikit-learn and ONNX prediction functions on validation set. * Document good practices to detect temporal distribution drift for deployed From bea9211cdbed5f5beb6f7f283831373a642cb0a4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 18 Feb 2021 18:59:09 +0100 Subject: [PATCH 182/478] DOC update whats new 0.24 for backport (#19434) --- doc/whats_new/v0.24.rst | 14 ++++++++++++++ doc/whats_new/v1.0.rst | 11 ----------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 891d238c0ac43..6f2584dccdd10 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -19,6 +19,13 @@ Changelog with `sample_weight` parameter and `least_absolute_deviation` loss function. :pr:`19407` by :user:`Vadim Ushtanit `. +:mod:`sklearn.linear_model` +........................... + +- |Fix|: Fixed a bug in :class:`linear_model.LogisticRegression`: the + sample_weight object is not modified anymore. :pr:`19182` by + :user:`Yosuke KOBAYASHI `. + :mod:`sklearn.preprocessing` ............................ @@ -27,6 +34,13 @@ Changelog `'use_encoded_value'` strategies. :pr:`19234` by `Guillaume Lemaitre `. +:mod:`sklearn.semi_supervised` +.............................. + +- |Fix| Avoid NaN during label propagation in + :class:`~sklearn.semi_supervised.LabelPropagation`. + :pr:`19271` by :user:`Zhaowei Wang `. + :mod:`sklearn.utils` .................... diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 024fefe3fd825..518aec8f4d7ba 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -122,10 +122,6 @@ Changelog not corresponding to their objective. :pr:`19172` by :user:`Mathurin Massias ` -- |Fix|: Fixed a bug in :class:`linear_model.LogisticRegression`: the - sample_weight object is not modified anymore. :pr:`19182` by - :user:`Yosuke KOBAYASHI `. - - |API|: The parameter ``normalize`` of :class:`linear_model.LinearRegression` is deprecated and will be removed in 1.2. Motivation for this deprecation: ``normalize`` parameter did not take any @@ -180,13 +176,6 @@ Changelog for non-English characters. :pr:`18959` by :user:`Zero ` and :user:`wstates `. -:mod:`sklearn.semi_supervised` -.............................. - -- |Fix| Avoid NaN during label propagation in - :class:`~sklearn.semi_supervised.LabelPropagation`. - :pr:`19271` by :user:`Zhaowei Wang `. 
- Code and Documentation Contributors ----------------------------------- From 6a6217f15f654284d227c27c9179bb02a793e811 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?xavier=20dupr=C3=A9?= Date: Thu, 18 Feb 2021 19:43:40 +0100 Subject: [PATCH 183/478] ENH Add mean_pinball_loss metric for quantile regression (#19415) Co-authored-by: Olivier Grisel Co-authored-by: Christian Lorentzen --- doc/modules/classes.rst | 1 + doc/modules/model_evaluation.rst | 71 +++- doc/whats_new/v1.0.rst | 4 + .../plot_gradient_boosting_quantile.py | 347 +++++++++++++++--- .../test_gradient_boosting_loss_functions.py | 18 +- sklearn/metrics/__init__.py | 2 + sklearn/metrics/_regression.py | 83 +++++ sklearn/metrics/tests/test_common.py | 8 +- sklearn/metrics/tests/test_regression.py | 126 ++++++- 9 files changed, 595 insertions(+), 65 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 65d555f978df0..c658bc6b12452 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -991,6 +991,7 @@ details. metrics.mean_poisson_deviance metrics.mean_gamma_deviance metrics.mean_tweedie_deviance + metrics.mean_pinball_loss Multilabel ranking metrics -------------------------- diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 86e64f997cdd8..c807af982e277 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -416,7 +416,7 @@ defined as .. math:: - \texttt{accuracy}(y, \hat{y}) = \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples}-1} 1(\hat{y}_i = y_i) + \texttt{accuracy}(y, \hat{y}) = \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples}-1} 1(\hat{y}_i = y_i) where :math:`1(x)` is the `indicator function `_. @@ -1960,8 +1960,8 @@ Regression metrics The :mod:`sklearn.metrics` module implements several loss, score, and utility functions to measure regression performance. Some of those have been enhanced to handle the multioutput case: :func:`mean_squared_error`, -:func:`mean_absolute_error`, :func:`explained_variance_score` and -:func:`r2_score`. +:func:`mean_absolute_error`, :func:`explained_variance_score`, +:func:`r2_score` and :func:`mean_pinball_loss`. These functions have an ``multioutput`` keyword argument which specifies the @@ -2354,6 +2354,71 @@ the difference in errors decreases. Finally, by setting, ``power=2``:: we would get identical errors. The deviance when ``power=2`` is thus only sensitive to relative errors. +.. _pinball_loss: + +Pinball loss +------------ + +The :func:`mean_pinball_loss` function is used to evaluate the predictive +performance of quantile regression models. The `pinball loss +`_ is equivalent +to :func:`mean_absolute_error` when the quantile parameter ``alpha`` is set to +0.5. + +.. math:: + + \text{pinball}(y, \hat{y}) = \frac{1}{n_{\text{samples}}} \sum_{i=0}^{n_{\text{samples}}-1} \alpha \max(y_i - \hat{y}_i, 0) + (1 - \alpha) \max(\hat{y}_i - y_i, 0) + +Here is a small example of usage of the :func:`mean_pinball_loss` function:: + + >>> from sklearn.metrics import mean_pinball_loss + >>> y_true = [1, 2, 3] + >>> mean_pinball_loss(y_true, [0, 2, 3], alpha=0.1) + 0.03... + >>> mean_pinball_loss(y_true, [1, 2, 4], alpha=0.1) + 0.3... + >>> mean_pinball_loss(y_true, [0, 2, 3], alpha=0.9) + 0.3... + >>> mean_pinball_loss(y_true, [1, 2, 4], alpha=0.9) + 0.03... 
+ >>> mean_pinball_loss(y_true, y_true, alpha=0.1) + 0.0 + >>> mean_pinball_loss(y_true, y_true, alpha=0.9) + 0.0 + +It is possible to build a scorer object with a specific choice of alpha:: + + >>> from sklearn.metrics import make_scorer + >>> mean_pinball_loss_95p = make_scorer(mean_pinball_loss, alpha=0.95) + +Such a scorer can be used to evaluate the generalization performance of a +quantile regressor via cross-validation: + + >>> from sklearn.datasets import make_regression + >>> from sklearn.model_selection import cross_val_score + >>> from sklearn.ensemble import GradientBoostingRegressor + >>> + >>> X, y = make_regression(n_samples=100, random_state=0) + >>> estimator = GradientBoostingRegressor( + ... loss="quantile", + ... alpha=0.95, + ... random_state=0, + ... ) + >>> cross_val_score(estimator, X, y, cv=5, scoring=mean_pinball_loss_95p) + array([11.1..., 10.4... , 24.4..., 9.2..., 12.9...]) + +It is also possible to build scorer objects for hyper-parameter tuning. The +sign of the loss must be switched to ensure that greater means better as +explained in the example linked below. + +.. topic:: Example: + + * See :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py` + for an example of using a the pinball loss to evaluate and tune the + hyper-parameters of quantile regression models on data with non-symmetric + noise and outliers. + + .. _clustering_metrics: Clustering metrics diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 518aec8f4d7ba..1005920b891d3 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -145,6 +145,10 @@ Changelog class methods and will be removed in 1.2. :pr:`18543` by `Guillaume Lemaitre`_. +- |Feature| :func:`metrics.mean_pinball_loss` exposes the pinball loss for + quantile regression. :pr:`19415` by :user:`Xavier Dupré ` + and :user:`Oliver Grisel `. + :mod:`sklearn.naive_bayes` .......................... diff --git a/examples/ensemble/plot_gradient_boosting_quantile.py b/examples/ensemble/plot_gradient_boosting_quantile.py index ef40a2247bcc5..f29a87fe6cff7 100644 --- a/examples/ensemble/plot_gradient_boosting_quantile.py +++ b/examples/ensemble/plot_gradient_boosting_quantile.py @@ -3,77 +3,330 @@ Prediction Intervals for Gradient Boosting Regression ===================================================== -This example shows how quantile regression can be used -to create prediction intervals. +This example shows how quantile regression can be used to create prediction +intervals. """ - +# %% +# Generate some data for a synthetic regression problem by applying the +# function f to uniformly sampled random inputs. import numpy as np -import matplotlib.pyplot as plt - -from sklearn.ensemble import GradientBoostingRegressor - -np.random.seed(1) +from sklearn.model_selection import train_test_split def f(x): """The function to predict.""" return x * np.sin(x) -#---------------------------------------------------------------------- -# First the noiseless case -X = np.atleast_2d(np.random.uniform(0, 10.0, size=100)).T -X = X.astype(np.float32) -# Observations -y = f(X).ravel() +rng = np.random.RandomState(42) +X = np.atleast_2d(rng.uniform(0, 10.0, size=1000)).T +expected_y = f(X).ravel() + +# %% +# To make the problem interesting, we generate observations of the target y as +# the sum of a deterministic term computed by the function f and a random noise +# term that follows a centered `log-normal +# `_. 
To make this even +# more interesting we consider the case where the amplitude of the noise +# depends on the input variable x (heteroscedastic noise). +# +# The lognormal distribution is non-symmetric and long tailed: observing large +# outliers is likely but it is impossible to observe small outliers. +sigma = 0.5 + X.ravel() / 10 +noise = rng.lognormal(sigma=sigma) - np.exp(sigma ** 2 / 2) +y = expected_y + noise + +# %% +# Split into train, test datasets: +X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + +# %% +# Fitting non-linear quantile and least squares regressors +# -------------------------------------------------------- +# +# Fit gradient boosting models trained with the quantile loss and +# alpha=0.05, 0.5, 0.95. +# +# The models obtained for alpha=0.05 and alpha=0.95 produce a 90% confidence +# interval (95% - 5% = 90%). +# +# The model trained with alpha=0.5 produces a regression of the median: on +# average, there should be the same number of target observations above and +# below the predicted values. +from sklearn.ensemble import GradientBoostingRegressor +from sklearn.metrics import mean_pinball_loss, mean_squared_error + -dy = 1.5 + 1.0 * np.random.random(y.shape) -noise = np.random.normal(0, dy) -y += noise -y = y.astype(np.float32) +all_models = {} +common_params = dict( + learning_rate=0.05, + n_estimators=250, + max_depth=2, + min_samples_leaf=9, + min_samples_split=9, +) +for alpha in [0.05, 0.5, 0.95]: + gbr = GradientBoostingRegressor(loss='quantile', alpha=alpha, + **common_params) + all_models["q %1.2f" % alpha] = gbr.fit(X_train, y_train) -# Mesh the input space for evaluations of the real function, the prediction and -# its MSE +# %% +# For the sake of comparison, also fit a baseline model trained with the usual +# least squares loss (ls), also known as the mean squared error (MSE). +gbr_ls = GradientBoostingRegressor(loss='ls', **common_params) +all_models["ls"] = gbr_ls.fit(X_train, y_train) + +# %% +# Create an evenly spaced evaluation set of input values spanning the [0, 10] +# range. xx = np.atleast_2d(np.linspace(0, 10, 1000)).T -xx = xx.astype(np.float32) -alpha = 0.95 +# %% +# Plot the true conditional mean function f, the prediction of the conditional +# mean (least squares loss), the conditional median and the conditional 90% +# interval (from 5th to 95th conditional percentiles). +import matplotlib.pyplot as plt + + +y_pred = all_models['ls'].predict(xx) +y_lower = all_models['q 0.05'].predict(xx) +y_upper = all_models['q 0.95'].predict(xx) +y_med = all_models['q 0.50'].predict(xx) + +fig = plt.figure(figsize=(10, 10)) +plt.plot(xx, f(xx), 'g:', linewidth=3, label=r'$f(x) = x\,\sin(x)$') +plt.plot(X_test, y_test, 'b.', markersize=10, label='Test observations') +plt.plot(xx, y_med, 'r-', label='Predicted median', color="orange") +plt.plot(xx, y_pred, 'r-', label='Predicted mean') +plt.plot(xx, y_upper, 'k-') +plt.plot(xx, y_lower, 'k-') +plt.fill_between(xx.ravel(), y_lower, y_upper, alpha=0.4, + label='Predicted 90% interval') +plt.xlabel('$x$') +plt.ylabel('$f(x)$') +plt.ylim(-10, 25) +plt.legend(loc='upper left') +plt.show() + +# %% +# Comparing the predicted median with the predicted mean, we note that the +# median is on average below the mean as the noise is skewed towards high +# values (large outliers). The median estimate also seems to be smoother +# because of its natural robustness to outliers. 
+#
+# Also observe that the inductive bias of gradient boosting trees is
+# unfortunately keeping our 0.05 quantile from fully capturing the sinusoidal
+# shape of the signal, in particular around x=8. Tuning hyper-parameters can
+# reduce this effect as shown in the last part of this notebook.
+#
+# Analysis of the error metrics
+# -----------------------------
+#
+# Measure the models with :func:`mean_squared_error` and
+# :func:`mean_pinball_loss` metrics on the training dataset.
+import pandas as pd
+
+
+def highlight_min(x):
+    x_min = x.min()
+    return ['font-weight: bold' if v == x_min else ''
+            for v in x]
+
+
+results = []
+for name, gbr in sorted(all_models.items()):
+    metrics = {'model': name}
+    y_pred = gbr.predict(X_train)
+    for alpha in [0.05, 0.5, 0.95]:
+        metrics["pbl=%1.2f" % alpha] = mean_pinball_loss(
+            y_train, y_pred, alpha=alpha)
+    metrics['MSE'] = mean_squared_error(y_train, y_pred)
+    results.append(metrics)
+
+pd.DataFrame(results).set_index('model').style.apply(highlight_min)
+
+# %%
+# One column shows all models evaluated by the same metric. The minimum number
+# on a column should be obtained when the model is trained and measured with
+# the same metric. This should always be the case on the training set if the
+# training converged.
+#
+# Note that because the target distribution is asymmetric, the expected
+# conditional mean and conditional median are significantly different and
+# therefore one could not use the least squares model to get a good estimate
+# of the conditional median nor the converse.
+#
+# If the target distribution were symmetric and had no outliers (e.g. with a
+# Gaussian noise), then median estimator and the least squares estimator would
+# have yielded similar predictions.
+#
+# We then do the same on the test set.
+results = []
+for name, gbr in sorted(all_models.items()):
+    metrics = {'model': name}
+    y_pred = gbr.predict(X_test)
+    for alpha in [0.05, 0.5, 0.95]:
+        metrics["pbl=%1.2f" % alpha] = mean_pinball_loss(
+            y_test, y_pred, alpha=alpha)
+    metrics['MSE'] = mean_squared_error(y_test, y_pred)
+    results.append(metrics)
 
-clf = GradientBoostingRegressor(loss='quantile', alpha=alpha,
-                                n_estimators=250, max_depth=3,
-                                learning_rate=.1, min_samples_leaf=9,
-                                min_samples_split=9)
+pd.DataFrame(results).set_index('model').style.apply(highlight_min)
 
-clf.fit(X, y)
 
-# Make the prediction on the meshed x-axis
-y_upper = clf.predict(xx)
 
+# %%
+# Errors are higher meaning the models slightly overfitted the data. It still
+# shows that the best test metric is obtained when the model is trained by
+# minimizing this same metric.
+#
+# Note that the conditional median estimator is competitive with the least
+# squares estimator in terms of MSE on the test set: this can be explained by
+# the fact the least squares estimator is very sensitive to large outliers
+# which can cause significant overfitting. This can be seen on the right hand
+# side of the previous plot. The conditional median estimator is biased
+# (underestimation for this asymmetric noise) but is also naturally robust to
+# outliers and overfits less.
+#
+# Calibration of the confidence interval
+# --------------------------------------
+#
+# We can also evaluate the ability of the two extreme quantile estimators to
+# produce a well-calibrated conditional 90%-confidence interval. 
+# +# To do this we can compute the fraction of observations that fall between the +# predictions: +def coverage_fraction(y, y_low, y_high): + return np.mean(np.logical_and(y >= y_low, y <= y_high)) -clf.set_params(alpha=1.0 - alpha) -clf.fit(X, y) -# Make the prediction on the meshed x-axis -y_lower = clf.predict(xx) +coverage_fraction(y_train, + all_models['q 0.05'].predict(X_train), + all_models['q 0.95'].predict(X_train)) -clf.set_params(loss='ls') -clf.fit(X, y) +# %% +# On the training set the calibration is very close to the expected coverage +# value for a 90% confidence interval. +coverage_fraction(y_test, + all_models['q 0.05'].predict(X_test), + all_models['q 0.95'].predict(X_test)) -# Make the prediction on the meshed x-axis -y_pred = clf.predict(xx) -# Plot the function, the prediction and the 95% confidence interval based on -# the MSE -fig = plt.figure() -plt.plot(xx, f(xx), 'g:', label=r'$f(x) = x\,\sin(x)$') -plt.plot(X, y, 'b.', markersize=10, label=u'Observations') -plt.plot(xx, y_pred, 'r-', label=u'Prediction') +# %% +# On the test set, the estimated confidence interval is slightly too narrow. +# Note, however, that we would need to wrap those metrics in a cross-validation +# loop to assess their variability under data resampling. +# +# Tuning the hyper-parameters of the quantile regressors +# ------------------------------------------------------ +# +# In the plot above, we observed that the 5th percentile regressor seems to +# underfit and could not adapt to sinusoidal shape of the signal. +# +# The hyper-parameters of the model were approximately hand-tuned for the +# median regressor and there is no reason than the same hyper-parameters are +# suitable for the 5th percentile regressor. +# +# To confirm this hypothesis, we tune the hyper-parameters of a new regressor +# of the 5th percentile by selecting the best model parameters by +# cross-validation on the pinball loss with alpha=0.05: + +# %% +from sklearn.model_selection import RandomizedSearchCV +from sklearn.metrics import make_scorer +from pprint import pprint + + +param_grid = dict( + learning_rate=[0.01, 0.05, 0.1], + n_estimators=[100, 150, 200, 250, 300], + max_depth=[2, 5, 10, 15, 20], + min_samples_leaf=[1, 5, 10, 20, 30, 50], + min_samples_split=[2, 5, 10, 20, 30, 50], +) +alpha = 0.05 +neg_mean_pinball_loss_05p_scorer = make_scorer( + mean_pinball_loss, + alpha=alpha, + greater_is_better=False, # maximize the negative loss +) +gbr = GradientBoostingRegressor(loss="quantile", alpha=alpha, random_state=0) +search_05p = RandomizedSearchCV( + gbr, + param_grid, + n_iter=10, # increase this if computational budget allows + scoring=neg_mean_pinball_loss_05p_scorer, + n_jobs=2, + random_state=0, +).fit(X_train, y_train) +pprint(search_05p.best_params_) + +# %% +# We observe that the search procedure identifies that deeper trees are needed +# to get a good fit for the 5th percentile regressor. Deeper trees are more +# expressive and less likely to underfit. +# +# Let's now tune the hyper-parameters for the 95th percentile regressor. 
We +# need to redefine the `scoring` metric used to select the best model, along +# with adjusting the alpha parameter of the inner gradient boosting estimator +# itself: +from sklearn.base import clone + +alpha = 0.95 +neg_mean_pinball_loss_95p_scorer = make_scorer( + mean_pinball_loss, + alpha=alpha, + greater_is_better=False, # maximize the negative loss +) +search_95p = clone(search_05p).set_params( + estimator__alpha=alpha, + scoring=neg_mean_pinball_loss_95p_scorer, +) +search_95p.fit(X_train, y_train) +pprint(search_95p.best_params_) + +# %% +# This time, shallower trees are selected and lead to a more constant piecewise +# and therefore more robust estimation of the 95th percentile. This is +# beneficial as it avoids overfitting the large outliers of the log-normal +# additive noise. +# +# We can confirm this intuition by displaying the predicted 90% confidence +# interval comprised by the predictions of those two tuned quantile regressors: +# the prediction of the upper 95th percentile has a much coarser shape than the +# prediction of the lower 5th percentile: +y_lower = search_05p.predict(xx) +y_upper = search_95p.predict(xx) + +fig = plt.figure(figsize=(10, 10)) +plt.plot(xx, f(xx), 'g:', linewidth=3, label=r'$f(x) = x\,\sin(x)$') +plt.plot(X_test, y_test, 'b.', markersize=10, label='Test observations') plt.plot(xx, y_upper, 'k-') plt.plot(xx, y_lower, 'k-') -plt.fill(np.concatenate([xx, xx[::-1]]), - np.concatenate([y_upper, y_lower[::-1]]), - alpha=.5, fc='b', ec='None', label='95% prediction interval') +plt.fill_between(xx.ravel(), y_lower, y_upper, alpha=0.4, + label='Predicted 90% interval') plt.xlabel('$x$') plt.ylabel('$f(x)$') -plt.ylim(-10, 20) +plt.ylim(-10, 25) plt.legend(loc='upper left') +plt.title("Prediction with tuned hyper-parameters") plt.show() + +# %% +# The plot looks qualitatively better than for the untuned models, especially +# for the shape of the of lower quantile. +# +# We now quantitatively evaluate the joint-calibration of the pair of +# estimators: +coverage_fraction(y_train, + search_05p.predict(X_train), + search_95p.predict(X_train)) +# %% +coverage_fraction(y_test, + search_05p.predict(X_test), + search_95p.predict(X_test)) +# %% +# The calibration of the tuned pair is sadly not better on the test set: the +# width of the estimated confidence interval is still too narrow. +# +# Again, we would need to wrap this study in a cross-validation loop to +# better assess the variability of those estimates. 
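For reference, the metric wired into the example above can be cross-checked against the
pinball loss formula added to the user guide earlier in this commit. The sketch below is
an illustration only, not part of the patch: it assumes nothing beyond NumPy and the
:func:`~sklearn.metrics.mean_pinball_loss` function introduced here, and it also shows
the sign-flipped scorer pattern used for the hyper-parameter searches::

    import numpy as np
    from sklearn.metrics import make_scorer, mean_pinball_loss

    rng = np.random.RandomState(0)
    y_true = rng.lognormal(size=100)
    y_pred = np.full_like(y_true, np.median(y_true))
    alpha = 0.95

    # Direct translation of the user-guide formula:
    # mean of alpha * max(y - y_hat, 0) + (1 - alpha) * max(y_hat - y, 0)
    diff = y_true - y_pred
    manual_loss = np.mean(alpha * np.maximum(diff, 0)
                          + (1 - alpha) * np.maximum(-diff, 0))
    assert np.isclose(manual_loss,
                      mean_pinball_loss(y_true, y_pred, alpha=alpha))

    # Lower pinball loss is better, so model selection utilities that
    # maximize a score need the negated version of the metric.
    neg_mean_pinball_loss_95p = make_scorer(
        mean_pinball_loss, alpha=alpha, greater_is_better=False)

Such a scorer object can then be passed as ``scoring=...`` to ``GridSearchCV`` or
``RandomizedSearchCV``, exactly as done for the 5th and 95th percentile regressors in the
example above.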
diff --git a/sklearn/ensemble/tests/test_gradient_boosting_loss_functions.py b/sklearn/ensemble/tests/test_gradient_boosting_loss_functions.py index d0300ddc371c7..4d7ea9bfe9bb3 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting_loss_functions.py +++ b/sklearn/ensemble/tests/test_gradient_boosting_loss_functions.py @@ -8,6 +8,7 @@ from pytest import approx from sklearn.utils import check_random_state +from sklearn.metrics import mean_pinball_loss from sklearn.ensemble._gb_losses import RegressionLossFunction from sklearn.ensemble._gb_losses import LeastSquaresError from sklearn.ensemble._gb_losses import LeastAbsoluteError @@ -115,6 +116,8 @@ def test_quantile_loss_function(): y_found = QuantileLossFunction(0.9)(x, np.zeros_like(x)) y_expected = np.asarray([0.1, 0.0, 0.9]).mean() np.testing.assert_allclose(y_found, y_expected) + y_found_p = mean_pinball_loss(x, np.zeros_like(x), alpha=0.9) + np.testing.assert_allclose(y_found, y_found_p) def test_sample_weight_deviance(): @@ -293,10 +296,11 @@ def test_init_raw_predictions_values(): @pytest.mark.parametrize('seed', range(5)) -def test_lad_equals_quantile_50(seed): +@pytest.mark.parametrize('alpha', [0.4, 0.5, 0.6]) +def test_lad_equals_quantiles(seed, alpha): # Make sure quantile loss with alpha = .5 is equivalent to LAD lad = LeastAbsoluteError() - ql = QuantileLossFunction(alpha=0.5) + ql = QuantileLossFunction(alpha=alpha) n_samples = 50 rng = np.random.RandomState(seed) @@ -305,9 +309,15 @@ def test_lad_equals_quantile_50(seed): lad_loss = lad(y_true, raw_predictions) ql_loss = ql(y_true, raw_predictions) - assert lad_loss == approx(2 * ql_loss) + if alpha == 0.5: + assert lad_loss == approx(2 * ql_loss) weights = np.linspace(0, 1, n_samples) ** 2 lad_weighted_loss = lad(y_true, raw_predictions, sample_weight=weights) ql_weighted_loss = ql(y_true, raw_predictions, sample_weight=weights) - assert lad_weighted_loss == approx(2 * ql_weighted_loss) + if alpha == 0.5: + assert lad_weighted_loss == approx(2 * ql_weighted_loss) + pbl_weighted_loss = mean_pinball_loss(y_true, raw_predictions, + sample_weight=weights, + alpha=alpha) + assert pbl_weighted_loss == approx(ql_weighted_loss) diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 84e7c98e29324..bca22e3916c61 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -69,6 +69,7 @@ from ._regression import mean_squared_log_error from ._regression import median_absolute_error from ._regression import mean_absolute_percentage_error +from ._regression import mean_pinball_loss from ._regression import r2_score from ._regression import mean_tweedie_deviance from ._regression import mean_poisson_deviance @@ -133,6 +134,7 @@ 'mean_absolute_error', 'mean_squared_error', 'mean_squared_log_error', + 'mean_pinball_loss', 'mean_poisson_deviance', 'mean_gamma_deviance', 'mean_tweedie_deviance', diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 0d8fddd0ba24e..7edf7924e50e1 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -43,6 +43,7 @@ "mean_squared_log_error", "median_absolute_error", "mean_absolute_percentage_error", + "mean_pinball_loss", "r2_score", "explained_variance_score", "mean_tweedie_deviance", @@ -194,6 +195,88 @@ def mean_absolute_error(y_true, y_pred, *, return np.average(output_errors, weights=multioutput) +def mean_pinball_loss(y_true, y_pred, *, + sample_weight=None, + alpha=0.5, + multioutput='uniform_average'): + """Pinball loss for quantile regression. 
+ + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + alpha: double, slope of the pinball loss, default=0.5, + this loss is equivalent to :ref:`mean_absolute_error` when `alpha=0.5`, + `alpha=0.95` is minimized by estimators of the 95th percentile. + + multioutput : {'raw_values', 'uniform_average'} or array-like of shape \ + (n_outputs,), default='uniform_average' + Defines aggregating of multiple output values. + Array-like value defines weights used to average errors. + + 'raw_values' : + Returns a full set of errors in case of multioutput input. + + 'uniform_average' : + Errors of all outputs are averaged with uniform weight. + Returns + ------- + loss : float or ndarray of floats + If multioutput is 'raw_values', then mean absolute error is returned + for each output separately. + If multioutput is 'uniform_average' or an ndarray of weights, then the + weighted average of all output errors is returned. + + The pinball loss output is a non-negative floating point. The best + value is 0.0. + + Examples + -------- + >>> from sklearn.metrics import mean_pinball_loss + >>> y_true = [1, 2, 3] + >>> mean_pinball_loss(y_true, [0, 2, 3], alpha=0.1) + 0.03... + >>> mean_pinball_loss(y_true, [1, 2, 4], alpha=0.1) + 0.3... + >>> mean_pinball_loss(y_true, [0, 2, 3], alpha=0.9) + 0.3... + >>> mean_pinball_loss(y_true, [1, 2, 4], alpha=0.9) + 0.03... + >>> mean_pinball_loss(y_true, y_true, alpha=0.1) + 0.0 + >>> mean_pinball_loss(y_true, y_true, alpha=0.9) + 0.0 + """ + y_type, y_true, y_pred, multioutput = _check_reg_targets( + y_true, y_pred, multioutput) + check_consistent_length(y_true, y_pred, sample_weight) + diff = y_true - y_pred + sign = (diff >= 0).astype(diff.dtype) + loss = alpha * sign * diff - (1 - alpha) * (1 - sign) * diff + output_errors = np.average(loss, weights=sample_weight, axis=0) + if isinstance(multioutput, str): + if multioutput == 'raw_values': + return output_errors + elif multioutput == 'uniform_average': + # pass None as weights to np.average: uniform mean + multioutput = None + else: + raise ValueError("multioutput is expected to be 'raw_values' " + "or 'uniform_average' but we got %r" + " instead." 
% multioutput) + + return np.average(output_errors, weights=multioutput) + + def mean_absolute_percentage_error(y_true, y_pred, sample_weight=None, multioutput='uniform_average'): diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 181baf19de3c2..dbf1bdd458f1a 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -50,6 +50,7 @@ from sklearn.metrics import mean_gamma_deviance from sklearn.metrics import median_absolute_error from sklearn.metrics import multilabel_confusion_matrix +from sklearn.metrics import mean_pinball_loss from sklearn.metrics import precision_recall_curve from sklearn.metrics import precision_score from sklearn.metrics import r2_score @@ -101,6 +102,7 @@ "max_error": max_error, "mean_absolute_error": mean_absolute_error, "mean_squared_error": mean_squared_error, + "mean_pinball_loss": mean_pinball_loss, "median_absolute_error": median_absolute_error, "mean_absolute_percentage_error": mean_absolute_percentage_error, "explained_variance_score": explained_variance_score, @@ -437,7 +439,8 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): # Regression metrics with "multioutput-continuous" format support MULTIOUTPUT_METRICS = { "mean_absolute_error", "median_absolute_error", "mean_squared_error", - "r2_score", "explained_variance_score", "mean_absolute_percentage_error" + "r2_score", "explained_variance_score", "mean_absolute_percentage_error", + "mean_pinball_loss" } # Symmetric with respect to their input arguments y_true and y_pred @@ -460,6 +463,9 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): "matthews_corrcoef_score", "mean_absolute_error", "mean_squared_error", "median_absolute_error", "max_error", + # Pinball loss is only symmetric for alpha=0.5 which is the default. + "mean_pinball_loss", + "cohen_kappa_score", "mean_normal_deviance" } diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 5b8406cf7a61f..8e935173d3319 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -1,5 +1,6 @@ import numpy as np +from scipy import optimize from numpy.testing import assert_allclose from itertools import product import pytest @@ -7,6 +8,8 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal +from sklearn.dummy import DummyRegressor +from sklearn.model_selection import GridSearchCV from sklearn.metrics import explained_variance_score from sklearn.metrics import mean_absolute_error @@ -15,23 +18,30 @@ from sklearn.metrics import median_absolute_error from sklearn.metrics import mean_absolute_percentage_error from sklearn.metrics import max_error +from sklearn.metrics import mean_pinball_loss from sklearn.metrics import r2_score from sklearn.metrics import mean_tweedie_deviance +from sklearn.metrics import make_scorer from sklearn.metrics._regression import _check_reg_targets -from ...exceptions import UndefinedMetricWarning +from sklearn.exceptions import UndefinedMetricWarning def test_regression_metrics(n_samples=50): y_true = np.arange(n_samples) y_pred = y_true + 1 + y_pred_2 = y_true - 1 assert_almost_equal(mean_squared_error(y_true, y_pred), 1.) assert_almost_equal(mean_squared_log_error(y_true, y_pred), mean_squared_error(np.log(1 + y_true), np.log(1 + y_pred))) assert_almost_equal(mean_absolute_error(y_true, y_pred), 1.) 
+ assert_almost_equal(mean_pinball_loss(y_true, y_pred), 0.5) + assert_almost_equal(mean_pinball_loss(y_true, y_pred_2), 0.5) + assert_almost_equal(mean_pinball_loss(y_true, y_pred, alpha=0.4), 0.6) + assert_almost_equal(mean_pinball_loss(y_true, y_pred_2, alpha=0.4), 0.4) assert_almost_equal(median_absolute_error(y_true, y_pred), 1.) mape = mean_absolute_percentage_error(y_true, y_pred) assert np.isfinite(mape) @@ -90,6 +100,9 @@ def test_multioutput_regression(): error = mean_absolute_error(y_true, y_pred) assert_almost_equal(error, (1. + 2. / 3) / 4.) + error = mean_pinball_loss(y_true, y_pred) + assert_almost_equal(error, (1. + 2. / 3) / 8.) + error = np.around(mean_absolute_percentage_error(y_true, y_pred), decimals=2) assert np.isfinite(error) @@ -104,15 +117,16 @@ def test_multioutput_regression(): def test_regression_metrics_at_limits(): - assert_almost_equal(mean_squared_error([0.], [0.]), 0.00, 2) - assert_almost_equal(mean_squared_error([0.], [0.], squared=False), 0.00, 2) - assert_almost_equal(mean_squared_log_error([0.], [0.]), 0.00, 2) - assert_almost_equal(mean_absolute_error([0.], [0.]), 0.00, 2) - assert_almost_equal(mean_absolute_percentage_error([0.], [0.]), 0.00, 2) - assert_almost_equal(median_absolute_error([0.], [0.]), 0.00, 2) - assert_almost_equal(max_error([0.], [0.]), 0.00, 2) - assert_almost_equal(explained_variance_score([0.], [0.]), 1.00, 2) - assert_almost_equal(r2_score([0., 1], [0., 1]), 1.00, 2) + assert_almost_equal(mean_squared_error([0.], [0.]), 0.0) + assert_almost_equal(mean_squared_error([0.], [0.], squared=False), 0.0) + assert_almost_equal(mean_squared_log_error([0.], [0.]), 0.0) + assert_almost_equal(mean_absolute_error([0.], [0.]), 0.0) + assert_almost_equal(mean_pinball_loss([0.], [0.]), 0.0) + assert_almost_equal(mean_absolute_percentage_error([0.], [0.]), 0.0) + assert_almost_equal(median_absolute_error([0.], [0.]), 0.0) + assert_almost_equal(max_error([0.], [0.]), 0.0) + assert_almost_equal(explained_variance_score([0.], [0.]), 1.0) + assert_almost_equal(r2_score([0., 1], [0., 1]), 1.0) err_msg = ("Mean Squared Logarithmic Error cannot be used when targets " "contain negative values.") with pytest.raises(ValueError, match=err_msg): @@ -207,6 +221,11 @@ def test_regression_multioutput_array(): mse = mean_squared_error(y_true, y_pred, multioutput='raw_values') mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values') + err_msg = ("multioutput is expected to be 'raw_values' " + "or 'uniform_average' but we got 'variance_weighted' instead.") + with pytest.raises(ValueError, match=err_msg): + mean_pinball_loss(y_true, y_pred, multioutput='variance_weighted') + pbl = mean_pinball_loss(y_true, y_pred, multioutput='raw_values') mape = mean_absolute_percentage_error(y_true, y_pred, multioutput='raw_values') r = r2_score(y_true, y_pred, multioutput='raw_values') @@ -214,6 +233,7 @@ def test_regression_multioutput_array(): assert_array_almost_equal(mse, [0.125, 0.5625], decimal=2) assert_array_almost_equal(mae, [0.25, 0.625], decimal=2) + assert_array_almost_equal(pbl, [0.25/2, 0.625/2], decimal=2) assert_array_almost_equal(mape, [0.0778, 0.2262], decimal=2) assert_array_almost_equal(r, [0.95, 0.93], decimal=2) assert_array_almost_equal(evs, [0.95, 0.93], decimal=2) @@ -224,9 +244,11 @@ def test_regression_multioutput_array(): y_pred = [[1, 1]]*4 mse = mean_squared_error(y_true, y_pred, multioutput='raw_values') mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values') + pbl = mean_pinball_loss(y_true, y_pred, 
multioutput='raw_values') r = r2_score(y_true, y_pred, multioutput='raw_values') assert_array_almost_equal(mse, [1., 1.], decimal=2) assert_array_almost_equal(mae, [1., 1.], decimal=2) + assert_array_almost_equal(pbl, [0.5, 0.5], decimal=2) assert_array_almost_equal(r, [0., 0.], decimal=2) r = r2_score([[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput='raw_values') @@ -330,3 +352,87 @@ def test_mean_absolute_percentage_error(): y_true = random_number_generator.exponential(size=100) y_pred = 1.2 * y_true assert mean_absolute_percentage_error(y_true, y_pred) == pytest.approx(0.2) + + +@pytest.mark.parametrize("distribution", + ["normal", "lognormal", "exponential", "uniform"]) +@pytest.mark.parametrize("target_quantile", [0.05, 0.5, 0.75]) +def test_mean_pinball_loss_on_constant_predictions( + distribution, + target_quantile +): + if not hasattr(np, "quantile"): + pytest.skip("This test requires a more recent version of numpy " + "with support for np.quantile.") + + # Check that the pinball loss is minimized by the empirical quantile. + n_samples = 3000 + rng = np.random.RandomState(42) + data = getattr(rng, distribution)(size=n_samples) + + # Compute the best possible pinball loss for any constant predictor: + best_pred = np.quantile(data, target_quantile) + best_constant_pred = np.full(n_samples, fill_value=best_pred) + best_pbl = mean_pinball_loss(data, best_constant_pred, + alpha=target_quantile) + + # Evaluate the loss on a grid of quantiles + candidate_predictions = np.quantile(data, np.linspace(0, 1, 100)) + for pred in candidate_predictions: + # Compute the pinball loss of a constant predictor: + constant_pred = np.full(n_samples, fill_value=pred) + pbl = mean_pinball_loss(data, constant_pred, alpha=target_quantile) + + # Check that the loss of this constant predictor is greater or equal + # than the loss of using the optimal quantile (up to machine + # precision): + assert pbl >= best_pbl - np.finfo(best_pbl.dtype).eps + + # Check that the value of the pinball loss matches the analytical + # formula. + expected_pbl = ( + (pred - data[data < pred]).sum() * (1 - target_quantile) + + (data[data >= pred] - pred).sum() * target_quantile + ) + expected_pbl /= n_samples + assert_almost_equal(expected_pbl, pbl) + + # Check that we can actually recover the target_quantile by minimizing the + # pinball loss w.r.t. the constant prediction quantile. + def objective_func(x): + constant_pred = np.full(n_samples, fill_value=x) + return mean_pinball_loss(data, constant_pred, alpha=target_quantile) + + result = optimize.minimize(objective_func, data.mean(), + method="Nelder-Mead") + assert result.success + # The minimum is not unique with limited data, hence the large tolerance. + assert result.x == pytest.approx(best_pred, rel=1e-2) + assert result.fun == pytest.approx(best_pbl) + + +def test_dummy_quantile_parameter_tuning(): + # Integration test to check that it is possible to use the pinball loss to + # tune the hyperparameter of a quantile regressor. This is conceptually + # similar to the previous test but using the scikit-learn estimator and + # scoring API instead. 
+ n_samples = 1000 + rng = np.random.RandomState(0) + X = rng.normal(size=(n_samples, 5)) # Ignored + y = rng.exponential(size=n_samples) + + all_quantiles = [0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95] + for alpha in all_quantiles: + neg_mean_pinball_loss = make_scorer( + mean_pinball_loss, + alpha=alpha, + greater_is_better=False, + ) + regressor = DummyRegressor(strategy="quantile", quantile=0.25) + grid_search = GridSearchCV( + regressor, + param_grid=dict(quantile=all_quantiles), + scoring=neg_mean_pinball_loss, + ).fit(X, y) + + assert grid_search.best_params_["quantile"] == pytest.approx(alpha) From e9c6fcaa17b983858400465fd39a2616c980c3db Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Thu, 18 Feb 2021 19:55:43 +0100 Subject: [PATCH 184/478] Fix underflow issues due to float precision in TSNE (#19472) --- doc/whats_new/v1.0.rst | 10 +++++++++- sklearn/manifold/_utils.pyx | 18 +++++++++--------- sklearn/manifold/tests/test_t_sne.py | 12 ++++++++++++ 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 1005920b891d3..66272c97d7a16 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -22,7 +22,8 @@ parameters, may produce different models from the previous version. This often occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. - +- |Fix| :class:`manifold.TSNE` now avoids numerical underflow issues during + affinity matrix computation. Details are listed in the changelog below. @@ -134,6 +135,13 @@ Changelog :pr:`17743` by :user:`Maria Telenczuk ` and :user:`Alexandre Gramfort `. +:mod:`sklearn.manifold` +....................... + +- |Fix| Change numerical precision to prevent underflow issues + during affinity matrix computation for :class:`manifold.TSNE`. + :pr:`19472` by :user:`Dmitry Kobak `. + :mod:`sklearn.metrics` ...................... diff --git a/sklearn/manifold/_utils.pyx b/sklearn/manifold/_utils.pyx index 0cc2b0af137cc..cd6ade795ae91 100644 --- a/sklearn/manifold/_utils.pyx +++ b/sklearn/manifold/_utils.pyx @@ -51,18 +51,18 @@ cpdef np.ndarray[np.float32_t, ndim=2] _binary_search_perplexity( cdef long n_neighbors = sqdistances.shape[1] cdef int using_neighbors = n_neighbors < n_samples # Precisions of conditional Gaussian distributions - cdef float beta - cdef float beta_min - cdef float beta_max - cdef float beta_sum = 0.0 + cdef double beta + cdef double beta_min + cdef double beta_max + cdef double beta_sum = 0.0 # Use log scale - cdef float desired_entropy = math.log(desired_perplexity) - cdef float entropy_diff + cdef double desired_entropy = math.log(desired_perplexity) + cdef double entropy_diff - cdef float entropy - cdef float sum_Pi - cdef float sum_disti_Pi + cdef double entropy + cdef double sum_Pi + cdef double sum_disti_Pi cdef long i, j, k, l # This array is later used as a 32bit array. It has multiple intermediate diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 716c031d4f5bf..bd0cc3df339bf 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -116,6 +116,18 @@ def test_binary_search(): assert_almost_equal(mean_perplexity, desired_perplexity, decimal=3) +def test_binary_search_underflow(): + # Test if the binary search finds Gaussians with desired perplexity. + # A more challenging case than the one above, producing numeric + # underflow in float precision (see issue #19471 and PR #19472). 
+ random_state = check_random_state(42) + data = random_state.randn(1, 90).astype(np.float32) + 100 + desired_perplexity = 30.0 + P = _binary_search_perplexity(data, desired_perplexity, verbose=0) + perplexity = 2 ** -np.nansum(P[0, 1:] * np.log2(P[0, 1:])) + assert_almost_equal(perplexity, desired_perplexity, decimal=3) + + def test_binary_search_neighbors(): # Binary perplexity search approximation. # Should be approximately equal to the slow method when we use From 0c39dd32ea23922f52312615d65521f03ed247a3 Mon Sep 17 00:00:00 2001 From: James Alan Preiss Date: Fri, 19 Feb 2021 01:39:49 -0800 Subject: [PATCH 185/478] DOC enet_path docstring: fix sub/superscript swap (#19493) --- sklearn/linear_model/_coordinate_descent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 9b50ea93c78c2..4fdeb783db194 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -332,7 +332,7 @@ def enet_path(X, y, *, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, For multi-output tasks it is:: - (1 / (2 * n_samples)) * ||Y - XW||^Fro_2 + (1 / (2 * n_samples)) * ||Y - XW||_Fro^2 + alpha * l1_ratio * ||W||_21 + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2 From 43241b1979f901a44b3c30ac58e005b0179d784a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 19 Feb 2021 11:41:17 +0100 Subject: [PATCH 186/478] DOC fix missing closing quote in user guide --- doc/modules/preprocessing.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index a339b4bfae4e2..e1b4c5599c3b5 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -878,8 +878,9 @@ three middle diagonals are non-zero for ``degree=2``. The higher the degree, the more overlapping of the splines. Interestingly, a :class:`SplineTransformer` of ``degree=0`` is the same as -:class:`~sklearn.preprocessing.KBinsDiscretizer` with ``encode='onehot-dense`` -and ``n_bins = n_knots - 1`` if ``knots = strategy``. +:class:`~sklearn.preprocessing.KBinsDiscretizer` with +``encode='onehot-dense'`` and ``n_bins = n_knots - 1`` if +``knots = strategy``. .. topic:: Examples: From b5e55f79fdfcb0f41f0cfb279e54a123822bca43 Mon Sep 17 00:00:00 2001 From: Atsushi Nukariya Date: Fri, 19 Feb 2021 19:44:09 +0900 Subject: [PATCH 187/478] TST replace assert_warns* by pytest.warns in model_selection/tests (#19458) --- sklearn/model_selection/_validation.py | 2 +- sklearn/model_selection/tests/test_search.py | 20 ++++++--- sklearn/model_selection/tests/test_split.py | 5 +-- .../model_selection/tests/test_validation.py | 41 ++++++++++++------- 4 files changed, 43 insertions(+), 25 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 7a52b656e1804..8452c4eafbf90 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -1473,7 +1473,7 @@ def _translate_train_sizes(train_sizes, n_max_training_samples): if n_ticks > train_sizes_abs.shape[0]: warnings.warn("Removed duplicate entries from 'train_sizes'. Number " "of ticks will be less than the size of " - "'train_sizes' %d instead of %d)." + "'train_sizes': %d instead of %d." 
% (train_sizes_abs.shape[0], n_ticks), RuntimeWarning) return train_sizes_abs diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 5e63716164b6f..f9e0babebe3ad 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -14,8 +14,6 @@ import pytest from sklearn.utils._testing import ( - assert_warns, - assert_warns_message, assert_raise_message, assert_array_equal, assert_array_almost_equal, @@ -1433,7 +1431,12 @@ def test_grid_search_failing_classifier(): # error in this test. gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy', refit=False, error_score=0.0) - assert_warns(FitFailedWarning, gs.fit, X, y) + warning_message = ( + "Estimator fit failed. The score on this train-test partition " + "for these parameters will be set to 0.0.*." + ) + with pytest.warns(FitFailedWarning, match=warning_message): + gs.fit(X, y) n_candidates = len(gs.cv_results_['params']) # Ensure that grid scores were set to zero as required for those fits @@ -1449,7 +1452,12 @@ def get_cand_scores(i): gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy', refit=False, error_score=float('nan')) - assert_warns(FitFailedWarning, gs.fit, X, y) + warning_message = ( + "Estimator fit failed. The score on this train-test partition " + "for these parameters will be set to nan." + ) + with pytest.warns(FitFailedWarning, match=warning_message): + gs.fit(X, y) n_candidates = len(gs.cv_results_['params']) assert all(np.all(np.isnan(get_cand_scores(cand_i))) for cand_i in range(n_candidates) @@ -1492,8 +1500,8 @@ def test_parameters_sampler_replacement(): 'than n_iter=%d. Running %d iterations. For ' 'exhaustive searches, use GridSearchCV.' % (grid_size, n_iter, grid_size)) - assert_warns_message(UserWarning, expected_warning, - list, sampler) + with pytest.warns(UserWarning, match=expected_warning): + list(sampler) # degenerates to GridSearchCV if n_iter the same as grid_size sampler = ParameterSampler(params, n_iter=8) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 5d91a505238ef..183a2eab84b63 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -14,7 +14,6 @@ from sklearn.utils._testing import assert_raises_regexp from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import ignore_warnings from sklearn.utils.validation import _num_samples @@ -193,8 +192,8 @@ def test_kfold_valueerrors(): y = np.array([3, 3, -1, -1, 3]) skf_3 = StratifiedKFold(3) - assert_warns_message(Warning, "The least populated class", - next, skf_3.split(X2, y)) + with pytest.warns(Warning, match="The least populated class"): + next(skf_3.split(X2, y)) # Check that despite the warning the folds are still computed even # though all the classes are not necessarily represented at on each diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 8405d3b38c452..8bb853bcd51b4 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -17,8 +17,6 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_raises from sklearn.utils._testing import 
assert_raise_message -from sklearn.utils._testing import assert_warns -from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import assert_raises_regex from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal @@ -857,13 +855,12 @@ def split(self, X, y=None, groups=None): X, y = load_iris(return_X_y=True) - warning_message = ('Number of classes in training fold (2) does ' - 'not match total number of classes (3). ' + warning_message = (r'Number of classes in training fold \(2\) does ' + r'not match total number of classes \(3\). ' 'Results may not be appropriate for your use case.') - assert_warns_message(RuntimeWarning, warning_message, - cross_val_predict, - LogisticRegression(solver="liblinear"), - X, y, method='predict_proba', cv=KFold(2)) + with pytest.warns(RuntimeWarning, match=warning_message): + cross_val_predict(LogisticRegression(solver="liblinear"), + X, y, method='predict_proba', cv=KFold(2)) def test_cross_val_predict_decision_function_shape(): @@ -1210,9 +1207,13 @@ def test_learning_curve_remove_duplicate_sample_sizes(): n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=0) estimator = MockImprovingEstimator(2) - train_sizes, _, _ = assert_warns( - RuntimeWarning, learning_curve, estimator, X, y, cv=3, - train_sizes=np.linspace(0.33, 1.0, 3)) + warning_message = ( + "Removed duplicate entries from 'train_sizes'. Number of ticks " + "will be less than the size of 'train_sizes': 2 instead of 3." + ) + with pytest.warns(RuntimeWarning, match=warning_message): + train_sizes, _, _ = learning_curve( + estimator, X, y, cv=3, train_sizes=np.linspace(0.33, 1.0, 3)) assert_array_equal(train_sizes, [1, 2]) @@ -1753,8 +1754,13 @@ def test_fit_and_score_failing(): # passing error score to trigger the warning message fit_and_score_kwargs = {'error_score': 0} # check if the warning message type is as expected - assert_warns(FitFailedWarning, _fit_and_score, *fit_and_score_args, - **fit_and_score_kwargs) + warning_message = ( + "Estimator fit failed. The score on this train-test partition for " + "these parameters will be set to %f." + % (fit_and_score_kwargs['error_score']) + ) + with pytest.warns(FitFailedWarning, match=warning_message): + _fit_and_score(*fit_and_score_args, **fit_and_score_kwargs) # since we're using FailingClassfier, our error will be the following error_message = "ValueError: Failing classifier failed as required" # the warning message we're expecting to see @@ -1769,8 +1775,13 @@ def test_warn_trace(msg): mtb = split[0] + '\n' + split[-1] return warning_message in mtb # check traceback is included - assert_warns_message(FitFailedWarning, test_warn_trace, _fit_and_score, - *fit_and_score_args, **fit_and_score_kwargs) + warning_message = ( + "Estimator fit failed. The score on this train-test partition for " + "these parameters will be set to %f." 
+ % (fit_and_score_kwargs['error_score']) + ) + with pytest.warns(FitFailedWarning, match=warning_message): + _fit_and_score(*fit_and_score_args, **fit_and_score_kwargs) fit_and_score_kwargs = {'error_score': 'raise'} # check if exception was raised, with default error_score='raise' From b169bc09b06bc257186feebb9706d38048365987 Mon Sep 17 00:00:00 2001 From: Maren Westermann Date: Sat, 20 Feb 2021 09:50:57 +0100 Subject: [PATCH 188/478] FIX RuntimeWarning by dividing by zero in test_sanity_check_pls_regression_constant_column_Y (#19480) --- sklearn/cross_decomposition/tests/test_pls.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/cross_decomposition/tests/test_pls.py b/sklearn/cross_decomposition/tests/test_pls.py index c01e790ca1644..04c791fd4154a 100644 --- a/sklearn/cross_decomposition/tests/test_pls.py +++ b/sklearn/cross_decomposition/tests/test_pls.py @@ -148,9 +148,12 @@ def test_sanity_check_pls_regression_constant_column_Y(): x_loadings_sign_flip = np.sign(expected_x_loadings / pls.x_loadings_) x_weights_sign_flip = np.sign(expected_x_weights / pls.x_weights_) - y_loadings_sign_flip = np.sign(expected_y_loadings / pls.y_loadings_) + # we ignore the first full-zeros row for y + y_loadings_sign_flip = np.sign(expected_y_loadings[1:] / + pls.y_loadings_[1:]) + assert_array_equal(x_loadings_sign_flip, x_weights_sign_flip) - assert_array_equal(x_loadings_sign_flip[1:], y_loadings_sign_flip[1:]) + assert_array_equal(x_loadings_sign_flip[1:], y_loadings_sign_flip) def test_sanity_check_pls_canonical(): From 70c6ac9d04c396faaf604c2fd1d3945f25e4d6d4 Mon Sep 17 00:00:00 2001 From: Ashish Date: Sat, 20 Feb 2021 22:31:19 +0530 Subject: [PATCH 189/478] DOC Fixed typos in documentation (#19511) --- sklearn/inspection/_permutation_importance.py | 2 +- sklearn/model_selection/_search.py | 4 ++-- sklearn/model_selection/_validation.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py index 9f2bdb0916254..2a7b6cd23147b 100644 --- a/sklearn/inspection/_permutation_importance.py +++ b/sklearn/inspection/_permutation_importance.py @@ -115,7 +115,7 @@ def permutation_importance(estimator, X, y, *, scoring=None, n_repeats=5, - a single string (see :ref:`scoring_parameter`); - a callable (see :ref:`scoring`) that returns a single value. - If `scoring` reprents multiple scores, one can use: + If `scoring` represents multiple scores, one can use: - a list or tuple of unique strings; - a callable returning a dictionary where the keys are the metric diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index e9c498816eae2..abe3b87488d8c 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -1006,7 +1006,7 @@ class GridSearchCV(BaseSearchCV): - a single string (see :ref:`scoring_parameter`); - a callable (see :ref:`scoring`) that returns a single value. - If `scoring` reprents multiple scores, one can use: + If `scoring` represents multiple scores, one can use: - a list or tuple of unique strings; - a callable returning a dictionary where the keys are the metric @@ -1346,7 +1346,7 @@ class RandomizedSearchCV(BaseSearchCV): - a single string (see :ref:`scoring_parameter`); - a callable (see :ref:`scoring`) that returns a single value. 
- If `scoring` reprents multiple scores, one can use: + If `scoring` represents multiple scores, one can use: - a list or tuple of unique strings; - a callable returning a dictionary where the keys are the metric diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 8452c4eafbf90..63f9a53fcf91f 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -74,7 +74,7 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, - a single string (see :ref:`scoring_parameter`); - a callable (see :ref:`scoring`) that returns a single value. - If `scoring` reprents multiple scores, one can use: + If `scoring` represents multiple scores, one can use: - a list or tuple of unique strings; - a callable returning a dictionary where the keys are the metric From 23f8df3c8e96697cc965116416bcfc85f3a716e8 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 22 Feb 2021 08:38:43 +0100 Subject: [PATCH 190/478] TST Improve ridge solver consistency tests (#19503) --- sklearn/linear_model/tests/test_ridge.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index 6e8a6761dda26..8e33514af83f9 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -39,6 +39,7 @@ from sklearn.model_selection import cross_val_predict from sklearn.model_selection import LeaveOneOut +from sklearn.preprocessing import minmax_scale from sklearn.utils import check_random_state from sklearn.datasets import make_multilabel_classification @@ -415,24 +416,32 @@ def _make_sparse_offset_regression( @pytest.mark.parametrize( 'n_samples,dtype,proportion_nonzero', [(20, 'float32', .1), (40, 'float32', 1.), (20, 'float64', .2)]) +@pytest.mark.parametrize('normalize', [True, False]) @pytest.mark.parametrize('seed', np.arange(3)) def test_solver_consistency( - solver, proportion_nonzero, n_samples, dtype, sparse_X, seed): + solver, proportion_nonzero, n_samples, dtype, sparse_X, seed, + normalize): alpha = 1. noise = 50. if proportion_nonzero > .9 else 500. X, y = _make_sparse_offset_regression( bias=10, n_features=30, proportion_nonzero=proportion_nonzero, noise=noise, random_state=seed, n_samples=n_samples) + if not normalize: + # Manually scale the data to avoid pathological cases. We use + # minmax_scale to deal with the sparse case without breaking + # the sparsity pattern. 
+ X = minmax_scale(X) svd_ridge = Ridge( - solver='svd', normalize=True, alpha=alpha).fit(X, y) + solver='svd', normalize=normalize, alpha=alpha).fit(X, y) X = X.astype(dtype, copy=False) y = y.astype(dtype, copy=False) if sparse_X: X = sp.csr_matrix(X) if solver == 'ridgecv': - ridge = RidgeCV(alphas=[alpha], normalize=True) + ridge = RidgeCV(alphas=[alpha], normalize=normalize) else: - ridge = Ridge(solver=solver, tol=1e-10, normalize=True, alpha=alpha) + ridge = Ridge(solver=solver, tol=1e-10, normalize=normalize, + alpha=alpha) ridge.fit(X, y) assert_allclose( ridge.coef_, svd_ridge.coef_, atol=1e-3, rtol=1e-3) From c3eb5eda0fe08d3c0341031a23211b89721ae3a8 Mon Sep 17 00:00:00 2001 From: Haidar Almubarak Date: Mon, 22 Feb 2021 11:54:31 +0300 Subject: [PATCH 191/478] TST replace assert_raise_* by pytest.raises in neighbors module (#19388) Co-authored-by: SteveKola Co-authored-by: Olivier Grisel --- sklearn/cluster/tests/test_hierarchical.py | 7 +- sklearn/cluster/tests/test_mean_shift.py | 7 +- sklearn/cluster/tests/test_optics.py | 7 +- sklearn/neighbors/tests/test_dist_metrics.py | 7 +- sklearn/neighbors/tests/test_kde.py | 39 +++-- sklearn/neighbors/tests/test_lof.py | 20 +-- sklearn/neighbors/tests/test_nca.py | 165 +++++++++--------- .../neighbors/tests/test_nearest_centroid.py | 6 +- sklearn/neighbors/tests/test_neighbors.py | 139 ++++++++------- 9 files changed, 203 insertions(+), 194 deletions(-) diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index b5a2d9bbf2701..1f835a52f0105 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -17,7 +17,6 @@ from sklearn.metrics.cluster import adjusted_rand_score from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import ignore_warnings from sklearn.cluster import ward_tree @@ -140,7 +139,8 @@ def test_zero_cosine_linkage_tree(): X = np.array([[0, 1], [0, 0]]) msg = 'Cosine affinity cannot be used when X contains zero vectors' - assert_raise_message(ValueError, msg, linkage_tree, X, affinity='cosine') + with pytest.raises(ValueError, match=msg): + linkage_tree(X, affinity='cosine') @pytest.mark.parametrize('n_clusters, distance_threshold', @@ -644,7 +644,8 @@ def test_agg_n_clusters(): agc = AgglomerativeClustering(n_clusters=n_clus) msg = ("n_clusters should be an integer greater than 0." " %s was provided." % str(agc.n_clusters)) - assert_raise_message(ValueError, msg, agc.fit, X) + with pytest.raises(ValueError, match=msg): + agc.fit(X) def test_affinity_passed_to_fix_connectivity(): diff --git a/sklearn/cluster/tests/test_mean_shift.py b/sklearn/cluster/tests/test_mean_shift.py index 7d2300711466a..2feb5363c28c8 100644 --- a/sklearn/cluster/tests/test_mean_shift.py +++ b/sklearn/cluster/tests/test_mean_shift.py @@ -11,7 +11,6 @@ from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import assert_allclose from sklearn.cluster import MeanShift @@ -73,7 +72,8 @@ def test_estimate_bandwidth_with_sparse_matrix(): # Test estimate_bandwidth with sparse matrix X = sparse.lil_matrix((1000, 1000)) msg = "A sparse matrix was passed, but dense data is required." 
- assert_raise_message(TypeError, msg, estimate_bandwidth, X) + with pytest.raises(TypeError, match=msg): + estimate_bandwidth(X) def test_parallel(): @@ -103,7 +103,8 @@ def test_meanshift_all_orphans(): # init away from the data, crash with a sensible warning ms = MeanShift(bandwidth=0.1, seeds=[[-9, -9], [-10, -10]]) msg = "No point was within bandwidth=0.1" - assert_raise_message(ValueError, msg, ms.fit, X,) + with pytest.raises(ValueError, match=msg): + ms.fit(X,) def test_unfitted(): diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 8578c68d0f0dc..b253173c0b957 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -13,7 +13,6 @@ from sklearn.cluster import DBSCAN from sklearn.utils import shuffle from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import assert_allclose from sklearn.cluster.tests.common import generate_clustered_data @@ -181,7 +180,8 @@ def test_minimum_number_of_sample_check(): clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1) # Run the fit - assert_raise_message(ValueError, msg, clust.fit, X) + with pytest.raises(ValueError, match=msg): + clust.fit(X) def test_bad_extract(): @@ -195,7 +195,8 @@ def test_bad_extract(): clust = OPTICS(max_eps=5.0 * 0.03, cluster_method='dbscan', eps=0.3, min_samples=10) - assert_raise_message(ValueError, msg, clust.fit, X) + with pytest.raises(ValueError, match=msg): + clust.fit(X) def test_bad_reachability(): diff --git a/sklearn/neighbors/tests/test_dist_metrics.py b/sklearn/neighbors/tests/test_dist_metrics.py index 441bcc134fe6b..05e0f4294ebb6 100644 --- a/sklearn/neighbors/tests/test_dist_metrics.py +++ b/sklearn/neighbors/tests/test_dist_metrics.py @@ -10,7 +10,6 @@ from sklearn.neighbors import DistanceMetric from sklearn.neighbors import BallTree from sklearn.utils import check_random_state -from sklearn.utils._testing import assert_raises_regex from sklearn.utils.fixes import sp_version, parse_version @@ -207,9 +206,9 @@ def wrong_distance(x, y): return "1" X = np.ones((5, 2)) - assert_raises_regex(TypeError, - "Custom distance function must accept two vectors", - BallTree, X, metric=wrong_distance) + msg = "Custom distance function must accept two vectors" + with pytest.raises(TypeError, match=msg): + BallTree(X, metric=wrong_distance) def test_input_data_size(): diff --git a/sklearn/neighbors/tests/test_kde.py b/sklearn/neighbors/tests/test_kde.py index cff7ffafe5acd..90ce667e5c284 100644 --- a/sklearn/neighbors/tests/test_kde.py +++ b/sklearn/neighbors/tests/test_kde.py @@ -2,7 +2,7 @@ import pytest -from sklearn.utils._testing import assert_allclose, assert_raises +from sklearn.utils._testing import assert_allclose from sklearn.neighbors import KernelDensity, KDTree, NearestNeighbors from sklearn.neighbors._ball_tree import kernel_norm from sklearn.pipeline import make_pipeline @@ -92,7 +92,8 @@ def test_kernel_density_sampling(n_samples=100, n_features=3): # check unsupported kernels for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']: kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X) - assert_raises(NotImplementedError, kde.sample, 100) + with pytest.raises(NotImplementedError): + kde.sample(100) # non-regression test: used to return a scalar X = rng.randn(4, 1) @@ -111,8 +112,8 @@ def test_kde_algorithm_metric_choice(algorithm, metric): Y = rng.randn(10, 2) if algorithm == 'kd_tree' and metric not in 
KDTree.valid_metrics: - assert_raises(ValueError, KernelDensity, - algorithm=algorithm, metric=metric) + with pytest.raises(ValueError): + KernelDensity(algorithm=algorithm, metric=metric) else: kde = KernelDensity(algorithm=algorithm, metric=metric) kde.fit(X) @@ -129,21 +130,23 @@ def test_kde_score(n_samples=100, n_features=3): def test_kde_badargs(): - assert_raises(ValueError, KernelDensity, - algorithm='blah') - assert_raises(ValueError, KernelDensity, - bandwidth=0) - assert_raises(ValueError, KernelDensity, - kernel='blah') - assert_raises(ValueError, KernelDensity, - metric='blah') - assert_raises(ValueError, KernelDensity, - algorithm='kd_tree', metric='blah') + with pytest.raises(ValueError): + KernelDensity(algorithm='blah') + with pytest.raises(ValueError): + KernelDensity(bandwidth=0) + with pytest.raises(ValueError): + KernelDensity(kernel='blah') + with pytest.raises(ValueError): + KernelDensity(metric='blah') + with pytest.raises(ValueError): + KernelDensity(algorithm='kd_tree', metric='blah') kde = KernelDensity() - assert_raises(ValueError, kde.fit, np.random.random((200, 10)), - sample_weight=np.random.random((200, 10))) - assert_raises(ValueError, kde.fit, np.random.random((200, 10)), - sample_weight=-np.random.random(200)) + with pytest.raises(ValueError): + kde.fit(np.random.random((200, 10)), + sample_weight=np.random.random((200, 10))) + with pytest.raises(ValueError): + kde.fit(np.random.random((200, 10)), + sample_weight=-np.random.random(200)) def test_kde_pipeline_gridsearch(): diff --git a/sklearn/neighbors/tests/test_lof.py b/sklearn/neighbors/tests/test_lof.py index 750fc57a8f457..5d479d5b141f7 100644 --- a/sklearn/neighbors/tests/test_lof.py +++ b/sklearn/neighbors/tests/test_lof.py @@ -6,7 +6,7 @@ import numpy as np from sklearn import neighbors - +import re import pytest from numpy.testing import assert_array_equal @@ -15,9 +15,6 @@ from sklearn.utils import check_random_state from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_warns_message -from sklearn.utils._testing import assert_raises -from sklearn.utils._testing import assert_raises_regex from sklearn.utils.estimator_checks import check_estimator from sklearn.utils.estimator_checks import check_outlier_corruption @@ -125,9 +122,9 @@ def test_n_neighbors_attribute(): assert clf.n_neighbors_ == X.shape[0] - 1 clf = neighbors.LocalOutlierFactor(n_neighbors=500) - assert_warns_message(UserWarning, - "n_neighbors will be set to (n_samples - 1)", - clf.fit, X) + msg = "n_neighbors will be set to (n_samples - 1)" + with pytest.warns(UserWarning, match=re.escape(msg)): + clf.fit(X) assert clf.n_neighbors_ == X.shape[0] - 1 @@ -149,7 +146,8 @@ def test_score_samples(): def test_contamination(): X = [[1, 1], [1, 0]] clf = neighbors.LocalOutlierFactor(contamination=0.6) - assert_raises(ValueError, clf.fit, X) + with pytest.raises(ValueError): + clf.fit(X) def test_novelty_errors(): @@ -161,12 +159,14 @@ def test_novelty_errors(): # predict, decision_function and score_samples raise ValueError for method in ['predict', 'decision_function', 'score_samples']: msg = ('{} is not available when novelty=False'.format(method)) - assert_raises_regex(AttributeError, msg, getattr, clf, method) + with pytest.raises(AttributeError, match=msg): + getattr(clf, method) # check errors for novelty=True clf = neighbors.LocalOutlierFactor(novelty=True) msg = 'fit_predict is not available when novelty=True' - assert_raises_regex(AttributeError, msg, getattr, clf, 'fit_predict') 
+ with pytest.raises(AttributeError, match=msg): + getattr(clf, 'fit_predict') def test_novelty_training_scores(): diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 0bf6d3c0d1763..e7fc741899209 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -15,8 +15,6 @@ from sklearn import clone from sklearn.exceptions import ConvergenceWarning from sklearn.utils import check_random_state -from sklearn.utils._testing import (assert_raises, - assert_raise_message, assert_warns_message) from sklearn.datasets import load_iris, make_classification, make_blobs from sklearn.neighbors import NeighborhoodComponentsAnalysis from sklearn.metrics import pairwise_distances @@ -127,38 +125,42 @@ def test_params_validation(): rng = np.random.RandomState(42) # TypeError - assert_raises(TypeError, NCA(max_iter='21').fit, X, y) - assert_raises(TypeError, NCA(verbose='true').fit, X, y) - assert_raises(TypeError, NCA(tol='1').fit, X, y) - assert_raises(TypeError, NCA(n_components='invalid').fit, X, y) - assert_raises(TypeError, NCA(warm_start=1).fit, X, y) + with pytest.raises(TypeError): + NCA(max_iter='21').fit(X, y) + with pytest.raises(TypeError): + NCA(verbose='true').fit(X, y) + with pytest.raises(TypeError): + NCA(tol='1').fit(X, y) + with pytest.raises(TypeError): + NCA(n_components='invalid').fit(X, y) + with pytest.raises(TypeError): + NCA(warm_start=1).fit(X, y) # ValueError - assert_raise_message(ValueError, - "`init` must be 'auto', 'pca', 'lda', 'identity', " - "'random' or a numpy array of shape " - "(n_components, n_features).", - NCA(init=1).fit, X, y) - assert_raise_message(ValueError, - '`max_iter`= -1, must be >= 1.', - NCA(max_iter=-1).fit, X, y) - + msg = ( + r"`init` must be 'auto', 'pca', 'lda', 'identity', 'random' or a " + r"numpy array of shape (n_components, n_features)." + ) + with pytest.raises(ValueError, match=re.escape(msg)): + NCA(init=1).fit(X, y) + with pytest.raises(ValueError, match='`max_iter`= -1, must be >= 1.'): + NCA(max_iter=-1).fit(X, y) init = rng.rand(5, 3) - assert_raise_message(ValueError, - 'The output dimensionality ({}) of the given linear ' - 'transformation `init` cannot be greater than its ' - 'input dimensionality ({}).' - .format(init.shape[0], init.shape[1]), - NCA(init=init).fit, X, y) - + msg = ( + f"The output dimensionality ({init.shape[0]}) " + "of the given linear transformation `init` cannot be " + f"greater than its input dimensionality ({init.shape[1]})." + ) + with pytest.raises(ValueError, match=re.escape(msg)): + NCA(init=init).fit(X, y) n_components = 10 - assert_raise_message(ValueError, - 'The preferred dimensionality of the ' - 'projected space `n_components` ({}) cannot ' - 'be greater than the given data ' - 'dimensionality ({})!' - .format(n_components, X.shape[1]), - NCA(n_components=n_components).fit, X, y) + msg = ( + "The preferred dimensionality of the projected space " + f"`n_components` ({n_components}) cannot be greater " + f"than the given data dimensionality ({X.shape[1]})!" 
+ ) + with pytest.raises(ValueError, match=re.escape(msg)): + NCA(n_components=n_components).fit(X, y) def test_transformation_dimensions(): @@ -167,17 +169,15 @@ def test_transformation_dimensions(): # Fail if transformation input dimension does not match inputs dimensions transformation = np.array([[1, 2], [3, 4]]) - assert_raises(ValueError, - NeighborhoodComponentsAnalysis(init=transformation).fit, - X, y) + with pytest.raises(ValueError): + NeighborhoodComponentsAnalysis(init=transformation).fit(X, y) # Fail if transformation output dimension is larger than # transformation input dimension transformation = np.array([[1, 2], [3, 4], [5, 6]]) # len(transformation) > len(transformation[0]) - assert_raises(ValueError, - NeighborhoodComponentsAnalysis(init=transformation).fit, - X, y) + with pytest.raises(ValueError): + NeighborhoodComponentsAnalysis(init=transformation).fit(X, y) # Pass otherwise transformation = np.arange(9).reshape(3, 3) @@ -194,24 +194,25 @@ def test_n_components(): # n_components = X.shape[1] != transformation.shape[0] n_components = X.shape[1] nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) - assert_raise_message(ValueError, - 'The preferred dimensionality of the ' - 'projected space `n_components` ({}) does not match ' - 'the output dimensionality of the given ' - 'linear transformation `init` ({})!' - .format(n_components, init.shape[0]), - nca.fit, X, y) + msg = ( + "The preferred dimensionality of the projected space " + f"`n_components` ({n_components}) does not match the output " + "dimensionality of the given linear transformation " + f"`init` ({init.shape[0]})!" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X, y) # n_components > X.shape[1] n_components = X.shape[1] + 2 nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) - assert_raise_message(ValueError, - 'The preferred dimensionality of the ' - 'projected space `n_components` ({}) cannot ' - 'be greater than the given data ' - 'dimensionality ({})!' - .format(n_components, X.shape[1]), - nca.fit, X, y) + msg = ( + "The preferred dimensionality of the projected space " + f"`n_components` ({n_components}) cannot be greater than " + f"the given data dimensionality ({X.shape[1]})!" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X, y) # n_components < X.shape[1] nca = NeighborhoodComponentsAnalysis(n_components=2, init='identity') @@ -249,34 +250,37 @@ def test_init_transformation(): # init.shape[1] must match X.shape[1] init = rng.rand(X.shape[1], X.shape[1] + 1) nca = NeighborhoodComponentsAnalysis(init=init) - assert_raise_message(ValueError, - 'The input dimensionality ({}) of the given ' - 'linear transformation `init` must match the ' - 'dimensionality of the given inputs `X` ({}).' - .format(init.shape[1], X.shape[1]), - nca.fit, X, y) + msg = ( + f"The input dimensionality ({init.shape[1]}) of the given " + "linear transformation `init` must match the " + f"dimensionality of the given inputs `X` ({X.shape[1]})." + ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X, y) # init.shape[0] must be <= init.shape[1] init = rng.rand(X.shape[1] + 1, X.shape[1]) nca = NeighborhoodComponentsAnalysis(init=init) - assert_raise_message(ValueError, - 'The output dimensionality ({}) of the given ' - 'linear transformation `init` cannot be ' - 'greater than its input dimensionality ({}).' 
- .format(init.shape[0], init.shape[1]), - nca.fit, X, y) + msg = ( + f"The output dimensionality ({init.shape[0]}) of the given " + "linear transformation `init` cannot be " + f"greater than its input dimensionality ({init.shape[1]})." + ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X, y) # init.shape[0] must match n_components init = rng.rand(X.shape[1], X.shape[1]) n_components = X.shape[1] - 2 nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) - assert_raise_message(ValueError, - 'The preferred dimensionality of the ' - 'projected space `n_components` ({}) does not match ' - 'the output dimensionality of the given ' - 'linear transformation `init` ({})!' - .format(n_components, init.shape[0]), - nca.fit, X, y) + msg = ( + "The preferred dimensionality of the " + f"projected space `n_components` ({n_components}) " + "does not match the output dimensionality of the given " + f"linear transformation `init` ({init.shape[0]})!" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X, y) @pytest.mark.parametrize('n_samples', [3, 5, 7, 11]) @@ -325,13 +329,13 @@ def test_warm_start_validation(): X_less_features, y = make_classification(n_samples=30, n_features=4, n_classes=4, n_redundant=0, n_informative=4, random_state=0) - assert_raise_message(ValueError, - 'The new inputs dimensionality ({}) does not ' - 'match the input dimensionality of the ' - 'previously learned transformation ({}).' - .format(X_less_features.shape[1], - nca.components_.shape[1]), - nca.fit, X_less_features, y) + msg = ( + f"The new inputs dimensionality ({X_less_features.shape[1]}) " + "does not match the input dimensionality of the previously learned " + f"transformation ({nca.components_.shape[1]})." + ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X_less_features, y) def test_warm_start_effectiveness(): @@ -466,7 +470,8 @@ def test_callback(capsys): y = iris_target nca = NeighborhoodComponentsAnalysis(callback='my_cb') - assert_raises(ValueError, nca.fit, X, y) + with pytest.raises(ValueError): + nca.fit(X, y) max_iter = 10 @@ -515,9 +520,9 @@ def callback(self, transformation, n_iter): def test_convergence_warning(): nca = NeighborhoodComponentsAnalysis(max_iter=2, verbose=1) cls_name = nca.__class__.__name__ - assert_warns_message(ConvergenceWarning, - '[{}] NCA did not converge'.format(cls_name), - nca.fit, iris_data, iris_target) + msg = '[{}] NCA did not converge'.format(cls_name) + with pytest.warns(ConvergenceWarning, match=re.escape(msg)): + nca.fit(iris_data, iris_target) @pytest.mark.parametrize('param, value', [('n_components', np.int32(3)), diff --git a/sklearn/neighbors/tests/test_nearest_centroid.py b/sklearn/neighbors/tests/test_nearest_centroid.py index 451aeff377e19..f91cae74b0585 100644 --- a/sklearn/neighbors/tests/test_nearest_centroid.py +++ b/sklearn/neighbors/tests/test_nearest_centroid.py @@ -3,12 +3,12 @@ """ import numpy as np +import pytest from scipy import sparse as sp from numpy.testing import assert_array_equal from sklearn.neighbors import NearestCentroid from sklearn import datasets -from sklearn.utils._testing import assert_raises # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] @@ -56,7 +56,7 @@ def test_classification_toy(): def test_precomputed(): clf = NearestCentroid(metric='precomputed') - with assert_raises(ValueError): + with pytest.raises(ValueError): clf.fit(X, y) @@ -158,5 +158,5 @@ def test_features_zero_var(): y[0] = 1 clf = NearestCentroid(shrink_threshold=0.1) - 
with assert_raises(ValueError): + with pytest.raises(ValueError): clf.fit(X, y) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index a4b55afd090c3..8ce52119faa02 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1,6 +1,7 @@ from itertools import product import pytest +import re import numpy as np from scipy.sparse import (bsr_matrix, coo_matrix, csc_matrix, csr_matrix, dok_matrix, lil_matrix, issparse) @@ -19,11 +20,6 @@ from sklearn.pipeline import make_pipeline from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_raises -from sklearn.utils._testing import assert_raises_regex -from sklearn.utils._testing import assert_warns -from sklearn.utils._testing import assert_warns_message -from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import ignore_warnings from sklearn.utils.validation import check_random_state from sklearn.utils.fixes import sp_version, parse_version @@ -128,18 +124,21 @@ def test_n_neighbors_datatype(): msg = "Expected n_neighbors > 0. Got -3" neighbors_ = neighbors.NearestNeighbors(n_neighbors=3.) - assert_raises_regex(TypeError, expected_msg, neighbors_.fit, X) - assert_raises_regex(ValueError, msg, - neighbors_.kneighbors, X=X, n_neighbors=-3) - assert_raises_regex(TypeError, expected_msg, - neighbors_.kneighbors, X=X, n_neighbors=3.) + with pytest.raises(TypeError, match=expected_msg): + neighbors_.fit(X) + with pytest.raises(ValueError, match=msg): + neighbors_.kneighbors(X=X, n_neighbors=-3) + with pytest.raises(TypeError, match=expected_msg): + neighbors_.kneighbors(X=X, n_neighbors=3.) def test_not_fitted_error_gets_raised(): X = [[1]] neighbors_ = neighbors.NearestNeighbors() - assert_raises(NotFittedError, neighbors_.kneighbors_graph, X) - assert_raises(NotFittedError, neighbors_.radius_neighbors_graph, X) + with pytest.raises(NotFittedError): + neighbors_.kneighbors_graph(X) + with pytest.raises(NotFittedError): + neighbors_.radius_neighbors_graph(X) @ignore_warnings(category=EfficiencyWarning) @@ -181,7 +180,8 @@ def check_precomputed(make_train_test, estimators): assert_array_almost_equal(ind_X, ind_D) # Must raise a ValueError if the matrix is not of correct shape - assert_raises(ValueError, getattr(nbrs_D, method), X) + with pytest.raises(ValueError): + getattr(nbrs_D, method)(X) target = np.arange(X.shape[0]) for Est in estimators: @@ -295,14 +295,15 @@ def test_precomputed_sparse_invalid(): dist_csr = csr_matrix(dist) neigh.fit(dist_csr) msg = "2 neighbors per samples are required, but some samples have only 1" - assert_raises_regex(ValueError, msg, neigh.kneighbors, None, n_neighbors=1) + with pytest.raises(ValueError, match=msg): + neigh.kneighbors(None, n_neighbors=1) # Checks error with inconsistent distance matrix dist = np.array([[5., 2., 1.], [-2., 0., 3.], [1., 3., 0.]]) dist_csr = csr_matrix(dist) msg = "Negative values in data passed to precomputed distance matrix." 
- assert_raises_regex(ValueError, msg, neigh.kneighbors, dist_csr, - n_neighbors=1) + with pytest.raises(ValueError, match=msg): + neigh.kneighbors(dist_csr, n_neighbors=1) def test_precomputed_cross_validation(): @@ -486,7 +487,8 @@ def test_radius_neighbors_classifier_when_no_neighbors(): assert_array_equal(np.array([1, 2]), clf.predict(z1)) if outlier_label is None: - assert_raises(ValueError, clf.predict, z2) + with pytest.raises(ValueError): + clf.predict(z2) def test_radius_neighbors_classifier_outlier_labeling(): @@ -526,13 +528,15 @@ def test_radius_neighbors_classifier_outlier_labeling(): def check_array_exception(): clf = RNC(radius=1, outlier_label=[[5]]) clf.fit(X, y) - assert_raises(TypeError, check_array_exception) + with pytest.raises(TypeError): + check_array_exception() # test invalid outlier_label dtype def check_dtype_exception(): clf = RNC(radius=1, outlier_label='a') clf.fit(X, y) - assert_raises(TypeError, check_dtype_exception) + with pytest.raises(TypeError): + check_dtype_exception() # test most frequent clf = RNC(radius=1, outlier_label='most_frequent') @@ -553,7 +557,8 @@ def check_warning(): clf = RNC(radius=1, outlier_label=4) clf.fit(X, y) clf.predict_proba([[1], [15]]) - assert_warns(UserWarning, check_warning) + with pytest.warns(UserWarning): + check_warning() # test multi output same outlier label y_multi = [[0, 1], [2, 1], [2, 2], [1, 2], [1, 2], @@ -580,7 +585,8 @@ def check_warning(): def check_exception(): clf = RNC(radius=1, outlier_label=[0, 1, 2]) clf.fit(X, y_multi) - assert_raises(ValueError, check_exception) + with pytest.raises(ValueError): + check_exception() def test_radius_neighbors_classifier_zero_distance(): @@ -934,10 +940,8 @@ def test_radius_neighbors_regressor(n_samples=40, X_test_nan = np.full((1, n_features), -1.) 
empty_warning_msg = ("One or more samples have no neighbors " "within specified radius; predicting NaN.") - pred = assert_warns_message(UserWarning, - empty_warning_msg, - neigh.predict, - X_test_nan) + with pytest.warns(UserWarning, match=re.escape(empty_warning_msg)): + pred = neigh.predict(X_test_nan) assert np.all(np.isnan(pred)) @@ -1044,8 +1048,7 @@ def test_neighbors_iris(): rgs = neighbors.KNeighborsRegressor(n_neighbors=5, algorithm=algorithm) rgs.fit(iris.data, iris.target) - assert (np.mean(rgs.predict(iris.data).round() == iris.target) > - 0.95) + assert (np.mean(rgs.predict(iris.data).round() == iris.target) > 0.95) def test_neighbors_digits(): @@ -1166,9 +1169,8 @@ def test_radius_neighbors_graph_sparse(seed=36): def test_neighbors_badargs(): # Test bad argument values: these should all raise ValueErrors - assert_raises(ValueError, - neighbors.NearestNeighbors, - algorithm='blah') + with pytest.raises(ValueError): + neighbors.NearestNeighbors(algorithm='blah') X = rng.random_sample((10, 2)) Xsparse = csr_matrix(X) @@ -1179,49 +1181,45 @@ def test_neighbors_badargs(): neighbors.RadiusNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.RadiusNeighborsRegressor): - assert_raises(ValueError, - cls, - weights='blah') - assert_raises(ValueError, - cls, p=-1) - assert_raises(ValueError, - cls, algorithm='blah') + with pytest.raises(ValueError): + cls(weights='blah') + with pytest.raises(ValueError): + cls(p=-1) + with pytest.raises(ValueError): + cls(algorithm='blah') nbrs = cls(algorithm='ball_tree', metric='haversine') - assert_raises(ValueError, - nbrs.predict, - X) - assert_raises(ValueError, - ignore_warnings(nbrs.fit), - Xsparse, y) + with pytest.raises(ValueError): + nbrs.predict(X) + with pytest.raises(ValueError): + ignore_warnings(nbrs.fit(Xsparse, y)) nbrs = cls(metric='haversine', algorithm='brute') nbrs.fit(X3, y) - assert_raise_message(ValueError, - "Haversine distance only valid in 2 dimensions", - nbrs.predict, - X3) + msg = "Haversine distance only valid in 2 dimensions" + with pytest.raises(ValueError, match=msg): + nbrs.predict(X3) nbrs = cls() - assert_raises(ValueError, - nbrs.fit, - np.ones((0, 2)), np.ones(0)) - assert_raises(ValueError, - nbrs.fit, - X[:, :, None], y) + with pytest.raises(ValueError): + nbrs.fit(np.ones((0, 2)), np.ones(0)) + with pytest.raises(ValueError): + nbrs.fit(X[:, :, None], y) nbrs.fit(X, y) - assert_raises(ValueError, - nbrs.predict, - [[]]) + with pytest.raises(ValueError): + nbrs.predict([[]]) if (issubclass(cls, neighbors.KNeighborsClassifier) or issubclass(cls, neighbors.KNeighborsRegressor)): nbrs = cls(n_neighbors=-1) - assert_raises(ValueError, nbrs.fit, X, y) + with pytest.raises(ValueError): + nbrs.fit(X, y) nbrs = neighbors.NearestNeighbors().fit(X) - assert_raises(ValueError, nbrs.kneighbors_graph, X, mode='blah') - assert_raises(ValueError, nbrs.radius_neighbors_graph, X, mode='blah') + with pytest.raises(ValueError): + nbrs.kneighbors_graph(X, mode='blah') + with pytest.raises(ValueError): + nbrs.radius_neighbors_graph(X, mode='blah') def test_neighbors_metrics(n_samples=20, n_features=3, @@ -1257,10 +1255,10 @@ def test_neighbors_metrics(n_samples=20, n_features=3, # KD tree doesn't support all metrics if (algorithm == 'kd_tree' and metric not in neighbors.KDTree.valid_metrics): - assert_raises(ValueError, - neighbors.NearestNeighbors, - algorithm=algorithm, - metric=metric, metric_params=metric_params) + with pytest.raises(ValueError): + neighbors.NearestNeighbors(algorithm=algorithm, + metric=metric, + 
metric_params=metric_params) continue neigh = neighbors.NearestNeighbors(n_neighbors=n_neighbors, algorithm=algorithm, @@ -1363,8 +1361,8 @@ def test_valid_brute_metric_for_auto_algorithm(): def test_metric_params_interface(): - assert_warns(SyntaxWarning, neighbors.KNeighborsClassifier, - metric_params={'p': 3}) + with pytest.warns(SyntaxWarning): + neighbors.KNeighborsClassifier(metric_params={'p': 3}) def test_predict_sparse_ball_kd_tree(): @@ -1375,7 +1373,8 @@ def test_predict_sparse_ball_kd_tree(): nbrs2 = neighbors.KNeighborsRegressor(1, algorithm='ball_tree') for model in [nbrs1, nbrs2]: model.fit(X, y) - assert_raises(ValueError, model.predict, csr_matrix(X)) + with pytest.raises(ValueError): + model.predict(csr_matrix(X)) def test_non_euclidean_kneighbors(): @@ -1406,12 +1405,12 @@ def test_non_euclidean_kneighbors(): # Raise error when wrong parameters are supplied, X_nbrs = neighbors.NearestNeighbors(n_neighbors=3, metric='manhattan') X_nbrs.fit(X) - assert_raises(ValueError, neighbors.kneighbors_graph, X_nbrs, 3, - metric='euclidean') + with pytest.raises(ValueError): + neighbors.kneighbors_graph(X_nbrs, 3, metric='euclidean') X_nbrs = neighbors.NearestNeighbors(radius=radius, metric='manhattan') X_nbrs.fit(X) - assert_raises(ValueError, neighbors.radius_neighbors_graph, X_nbrs, - radius, metric='euclidean') + with pytest.raises(ValueError): + neighbors.radius_neighbors_graph(X_nbrs, radius, metric='euclidean') def check_object_arrays(nparray, list_check): From dbd68b2846905efb3682db46c798298b1fd3d6c2 Mon Sep 17 00:00:00 2001 From: Maria Telenczuk Date: Mon, 22 Feb 2021 11:48:54 +0100 Subject: [PATCH 192/478] MRG fix Normalize for linear models when used with sample_weight (#19426) Co-authored-by: Alexandre Gramfort Co-authored-by: Guillaume Lemaitre Co-authored-by: Olivier Grisel --- doc/whats_new/v1.0.rst | 6 + sklearn/linear_model/_base.py | 40 ++--- sklearn/linear_model/tests/test_base.py | 138 ++++++++++++----- .../tests/test_coordinate_descent.py | 145 ++++++++++++------ 4 files changed, 222 insertions(+), 107 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 66272c97d7a16..25e0b369bebd3 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -135,6 +135,12 @@ Changelog :pr:`17743` by :user:`Maria Telenczuk ` and :user:`Alexandre Gramfort `. +- |Fix|: `sample_weight` are now fully taken into account in linear models + when `normalize=True` for both feature centering and feature + scaling. + :pr:`19426` by :user:`Alexandre Gramfort ` and + :user:`Maria Telenczuk `. + :mod:`sklearn.manifold` ....................... diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index f84d4234c193c..61005cb4b5d4a 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -33,6 +33,7 @@ from ..utils.validation import _deprecate_positional_args from ..utils import check_random_state from ..utils.extmath import safe_sparse_dot +from ..utils.extmath import _incremental_mean_and_var from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale from ..utils.fixes import sparse_lsqr from ..utils._seq_dataset import ArrayDataset32, CSRDataset32 @@ -40,7 +41,6 @@ from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils.fixes import delayed -from ..preprocessing import normalize as f_normalize # TODO: bayesian_ridge_regression and bayesian_regression_ard # should be squashed into its respective objects. 
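Note on the hunk below: it rewrites `_preprocess_data` so that, with `fit_intercept=True`, features are centered on the sample_weight-weighted mean and, when `normalize=True`, divided by the weighted standard deviation times `sqrt(n_samples)` (near-constant features keep a scale of 1). The following is only an illustrative NumPy sketch of that relationship, mirroring the check added in `test_preprocess_data_weighted` further down; the variable names are placeholders, not part of the patch.

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(20, 3)
sample_weight = rng.rand(20)

# weighted mean and variance, as the new code path computes them
X_mean = np.average(X, axis=0, weights=sample_weight)
X_var = np.average((X - X_mean) ** 2, axis=0, weights=sample_weight)

# normalize=True divides the centered data by sqrt(weighted variance * n_samples)
X_scale = np.sqrt(X_var * X.shape[0])
Xt = (X - X_mean) / X_scale

# i.e. weighted standardization up to a factor of sqrt(n_samples)
np.testing.assert_allclose(
    Xt * np.sqrt(X.shape[0]), (X - X_mean) / np.sqrt(X_var)
)
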
@@ -229,33 +229,33 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, if fit_intercept: if sp.issparse(X): - X_offset, X_var = mean_variance_axis(X, axis=0) + X_offset, X_var = mean_variance_axis( + X, axis=0, weights=sample_weight + ) if not return_mean: X_offset[:] = X.dtype.type(0) + else: + X_offset, X_var, _ = _incremental_mean_and_var( + X, last_mean=0., last_variance=0., last_sample_count=0., + sample_weight=sample_weight + ) - if normalize: + X_offset = X_offset.astype(X.dtype) + X -= X_offset - # TODO: f_normalize could be used here as well but the function - # inplace_csr_row_normalize_l2 must be changed such that it - # can return also the norms computed internally + X_var = X_var.astype(X.dtype, copy=False) - # transform variance to norm in-place - X_var *= X.shape[0] - X_scale = np.sqrt(X_var, X_var) - del X_var - X_scale[X_scale == 0] = 1 + if normalize: + X_var *= X.shape[0] + X_scale = np.sqrt(X_var, out=X_var) + X_scale[X_scale < 10 * np.finfo(X_scale.dtype).eps] = 1. + if sp.issparse(X): inplace_column_scale(X, 1. / X_scale) else: - X_scale = np.ones(X.shape[1], dtype=X.dtype) - + X /= X_scale else: - X_offset = np.average(X, axis=0, weights=sample_weight) - X -= X_offset - if normalize: - X, X_scale = f_normalize(X, axis=0, copy=False, - return_norm=True) - else: - X_scale = np.ones(X.shape[1], dtype=X.dtype) + X_scale = np.ones(X.shape[1], dtype=X.dtype) + y_offset = np.average(y, axis=0, weights=sample_weight) y = y - y_offset else: diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index 75cc9dd5fd8f1..56ee18f5f0d06 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -1,5 +1,6 @@ # Author: Alexandre Gramfort # Fabian Pedregosa +# Maria Telenczuk # # License: BSD 3 clause @@ -24,6 +25,7 @@ from sklearn.datasets import make_sparse_uncorrelated from sklearn.datasets import make_regression from sklearn.datasets import load_iris +from sklearn.preprocessing import StandardScaler rng = np.random.RandomState(0) rtol = 1e-6 @@ -407,31 +409,31 @@ def test_preprocess_data(): X = rng.rand(n_samples, n_features) y = rng.rand(n_samples) expected_X_mean = np.mean(X, axis=0) - expected_X_norm = np.std(X, axis=0) * np.sqrt(X.shape[0]) + expected_X_scale = np.std(X, axis=0) * np.sqrt(X.shape[0]) expected_y_mean = np.mean(y, axis=0) - Xt, yt, X_mean, y_mean, X_norm = \ + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=False, normalize=False) assert_array_almost_equal(X_mean, np.zeros(n_features)) assert_array_almost_equal(y_mean, 0) - assert_array_almost_equal(X_norm, np.ones(n_features)) + assert_array_almost_equal(X_scale, np.ones(n_features)) assert_array_almost_equal(Xt, X) assert_array_almost_equal(yt, y) - Xt, yt, X_mean, y_mean, X_norm = \ + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=True, normalize=False) assert_array_almost_equal(X_mean, expected_X_mean) assert_array_almost_equal(y_mean, expected_y_mean) - assert_array_almost_equal(X_norm, np.ones(n_features)) + assert_array_almost_equal(X_scale, np.ones(n_features)) assert_array_almost_equal(Xt, X - expected_X_mean) assert_array_almost_equal(yt, y - expected_y_mean) - Xt, yt, X_mean, y_mean, X_norm = \ + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=True, normalize=True) assert_array_almost_equal(X_mean, expected_X_mean) assert_array_almost_equal(y_mean, expected_y_mean) - assert_array_almost_equal(X_norm, expected_X_norm) - 
assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_norm) + assert_array_almost_equal(X_scale, expected_X_scale) + assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_scale) assert_array_almost_equal(yt, y - expected_y_mean) @@ -461,36 +463,94 @@ def test_preprocess_data_multioutput(): assert_array_almost_equal(yt, y - y_mean) -def test_preprocess_data_weighted(): +@pytest.mark.parametrize("is_sparse", [False, True]) +def test_preprocess_data_weighted(is_sparse): n_samples = 200 - n_features = 2 + n_features = 4 + # Generate random data with 50% of zero values to make sure + # that the sparse variant of this test is actually sparse. This also + # shifts the mean value for each columns in X further away from + # zero. X = rng.rand(n_samples, n_features) + X[X < 0.5] = 0. + + # Scale the first feature of X to be 10 larger than the other to + # better check the impact of feature scaling. + X[:, 0] *= 10 + + # Constant non-zero feature: this edge-case is currently not handled + # correctly for sparse data, see: + # https://github.com/scikit-learn/scikit-learn/issues/19450 + # X[:, 2] = 1. + + # Constant zero feature (non-materialized in the sparse case) + X[:, 3] = 0. y = rng.rand(n_samples) + sample_weight = rng.rand(n_samples) expected_X_mean = np.average(X, axis=0, weights=sample_weight) expected_y_mean = np.average(y, axis=0, weights=sample_weight) - # XXX: if normalize=True, should we expect a weighted standard deviation? - # Currently not weighted, but calculated with respect to weighted mean - expected_X_norm = (np.sqrt(X.shape[0]) * - np.mean((X - expected_X_mean) ** 2, axis=0) ** .5) + X_sample_weight_avg = np.average(X, weights=sample_weight, axis=0) + X_sample_weight_var = np.average((X - X_sample_weight_avg)**2, + weights=sample_weight, + axis=0) + expected_X_scale = np.sqrt(X_sample_weight_var) * np.sqrt(n_samples) + + # near constant features should not be scaled + expected_X_scale[expected_X_scale < 10 * np.finfo(np.float64).eps] = 1 + + if is_sparse: + X = sparse.csr_matrix(X) - Xt, yt, X_mean, y_mean, X_norm = \ + # normalize is False + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=True, normalize=False, - sample_weight=sample_weight) + sample_weight=sample_weight, return_mean=True) assert_array_almost_equal(X_mean, expected_X_mean) assert_array_almost_equal(y_mean, expected_y_mean) - assert_array_almost_equal(X_norm, np.ones(n_features)) - assert_array_almost_equal(Xt, X - expected_X_mean) + assert_array_almost_equal(X_scale, np.ones(n_features)) + if is_sparse: + assert_array_almost_equal(Xt.toarray(), X.toarray()) + else: + assert_array_almost_equal(Xt, X - expected_X_mean) assert_array_almost_equal(yt, y - expected_y_mean) - Xt, yt, X_mean, y_mean, X_norm = \ + # normalize is True + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=True, normalize=True, - sample_weight=sample_weight) + sample_weight=sample_weight, return_mean=True) + assert_array_almost_equal(X_mean, expected_X_mean) assert_array_almost_equal(y_mean, expected_y_mean) - assert_array_almost_equal(X_norm, expected_X_norm) - assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_norm) + assert_array_almost_equal(X_scale, expected_X_scale) + + if is_sparse: + # X is not centered + assert_array_almost_equal( + Xt.toarray(), X.toarray() / expected_X_scale + ) + else: + assert_array_almost_equal( + Xt, (X - expected_X_mean) / expected_X_scale + ) + + # _preprocess_data with normalize=True scales the data by the 
feature-wise + # euclidean norms while StandardScaler scales the data by the feature-wise + # standard deviations. + # The two are equivalent up to a ratio of np.sqrt(n_samples) + if is_sparse: + scaler = StandardScaler(with_mean=False).fit( + X, sample_weight=sample_weight) + + assert_array_almost_equal( + scaler.transform(X).toarray() / np.sqrt(n_samples), Xt.toarray() + ) + else: + scaler = StandardScaler(with_mean=True).fit( + X, sample_weight=sample_weight) + assert_array_almost_equal(scaler.mean_, X_mean) + assert_array_almost_equal(scaler.transform(X) / np.sqrt(n_samples), Xt) assert_array_almost_equal(yt, y - expected_y_mean) @@ -502,33 +562,33 @@ def test_sparse_preprocess_data_with_return_mean(): X = X.tolil() y = rng.rand(n_samples) XA = X.toarray() - expected_X_norm = np.std(XA, axis=0) * np.sqrt(X.shape[0]) + expected_X_scale = np.std(XA, axis=0) * np.sqrt(X.shape[0]) - Xt, yt, X_mean, y_mean, X_norm = \ + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=False, normalize=False, return_mean=True) assert_array_almost_equal(X_mean, np.zeros(n_features)) assert_array_almost_equal(y_mean, 0) - assert_array_almost_equal(X_norm, np.ones(n_features)) + assert_array_almost_equal(X_scale, np.ones(n_features)) assert_array_almost_equal(Xt.A, XA) assert_array_almost_equal(yt, y) - Xt, yt, X_mean, y_mean, X_norm = \ + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=True, normalize=False, return_mean=True) assert_array_almost_equal(X_mean, np.mean(XA, axis=0)) assert_array_almost_equal(y_mean, np.mean(y, axis=0)) - assert_array_almost_equal(X_norm, np.ones(n_features)) + assert_array_almost_equal(X_scale, np.ones(n_features)) assert_array_almost_equal(Xt.A, XA) assert_array_almost_equal(yt, y - np.mean(y, axis=0)) - Xt, yt, X_mean, y_mean, X_norm = \ + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=True, normalize=True, return_mean=True) assert_array_almost_equal(X_mean, np.mean(XA, axis=0)) assert_array_almost_equal(y_mean, np.mean(y, axis=0)) - assert_array_almost_equal(X_norm, expected_X_norm) - assert_array_almost_equal(Xt.A, XA / expected_X_norm) + assert_array_almost_equal(X_scale, expected_X_scale) + assert_array_almost_equal(Xt.A, XA / expected_X_scale) assert_array_almost_equal(yt, y - np.mean(y, axis=0)) @@ -577,19 +637,19 @@ def test_dtype_preprocess_data(): for fit_intercept in [True, False]: for normalize in [True, False]: - Xt_32, yt_32, X_mean_32, y_mean_32, X_norm_32 = _preprocess_data( + Xt_32, yt_32, X_mean_32, y_mean_32, X_scale_32 = _preprocess_data( X_32, y_32, fit_intercept=fit_intercept, normalize=normalize, return_mean=True) - Xt_64, yt_64, X_mean_64, y_mean_64, X_norm_64 = _preprocess_data( + Xt_64, yt_64, X_mean_64, y_mean_64, X_scale_64 = _preprocess_data( X_64, y_64, fit_intercept=fit_intercept, normalize=normalize, return_mean=True) - Xt_3264, yt_3264, X_mean_3264, y_mean_3264, X_norm_3264 = ( + Xt_3264, yt_3264, X_mean_3264, y_mean_3264, X_scale_3264 = ( _preprocess_data(X_32, y_64, fit_intercept=fit_intercept, normalize=normalize, return_mean=True)) - Xt_6432, yt_6432, X_mean_6432, y_mean_6432, X_norm_6432 = ( + Xt_6432, yt_6432, X_mean_6432, y_mean_6432, X_scale_6432 = ( _preprocess_data(X_64, y_32, fit_intercept=fit_intercept, normalize=normalize, return_mean=True)) @@ -597,25 +657,25 @@ def test_dtype_preprocess_data(): assert yt_32.dtype == np.float32 assert X_mean_32.dtype == np.float32 assert y_mean_32.dtype == np.float32 - assert X_norm_32.dtype == np.float32 + assert 
X_scale_32.dtype == np.float32 assert Xt_64.dtype == np.float64 assert yt_64.dtype == np.float64 assert X_mean_64.dtype == np.float64 assert y_mean_64.dtype == np.float64 - assert X_norm_64.dtype == np.float64 + assert X_scale_64.dtype == np.float64 assert Xt_3264.dtype == np.float32 assert yt_3264.dtype == np.float32 assert X_mean_3264.dtype == np.float32 assert y_mean_3264.dtype == np.float32 - assert X_norm_3264.dtype == np.float32 + assert X_scale_3264.dtype == np.float32 assert Xt_6432.dtype == np.float64 assert yt_6432.dtype == np.float64 assert X_mean_6432.dtype == np.float64 assert y_mean_6432.dtype == np.float64 - assert X_norm_6432.dtype == np.float64 + assert X_scale_6432.dtype == np.float64 assert X_32.dtype == np.float32 assert y_32.dtype == np.float32 @@ -626,7 +686,7 @@ def test_dtype_preprocess_data(): assert_array_almost_equal(yt_32, yt_64) assert_array_almost_equal(X_mean_32, X_mean_64) assert_array_almost_equal(y_mean_32, y_mean_64) - assert_array_almost_equal(X_norm_32, X_norm_64) + assert_array_almost_equal(X_scale_32, X_scale_64) @pytest.mark.parametrize('n_targets', [None, 2]) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index b6acb78838a33..3eba535d70c89 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -13,6 +13,7 @@ from sklearn.datasets import make_regression from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline +from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler from sklearn.exceptions import ConvergenceWarning from sklearn.utils._testing import assert_allclose @@ -25,6 +26,7 @@ from sklearn.utils._testing import _convert_container from sklearn.utils._testing import TempMemmap from sklearn.utils.fixes import parse_version +from sklearn.utils.sparsefuncs import mean_variance_axis from sklearn.linear_model import ( ARDRegression, @@ -298,7 +300,33 @@ def test_lasso_cv_positive_constraint(): assert min(clf_constrained.coef_) >= 0 -# FIXME: 'normalize' to be removed in 1.2 +def _scale_alpha_inplace(estimator, n_samples): + """Rescale the parameter alpha from when the estimator is evoked with + normalize set to True to when it is evoked in a Pipeline with normalize set + to False and with a StandardScaler. 
+ """ + if 'alpha' not in estimator.get_params(): + return + + if isinstance(estimator, (Lasso, LassoLars, MultiTaskLasso)): + alpha = estimator.alpha * np.sqrt(n_samples) + if isinstance(estimator, (Ridge, RidgeClassifier)): + alpha = estimator.alpha * n_samples + if isinstance(estimator, (ElasticNet, MultiTaskElasticNet)): + if estimator.l1_ratio == 1: + alpha = estimator.alpha * np.sqrt(n_samples) + elif estimator.l1_ratio == 0: + alpha = estimator.alpha * n_samples + else: + # To avoid silent errors in case of refactoring + raise NotImplementedError + + estimator.set_params(alpha=alpha) + + +# FIXME: 'normalize' to be removed in 1.2 for all the models excluding: +# OrthogonalMatchingPursuit, Lars, LassoLars, LarsCV, LassoLarsCV +# for which it is to be removed in 1.4 @pytest.mark.filterwarnings("ignore:'normalize' was deprecated") @pytest.mark.parametrize( "LinearModel, params", @@ -324,7 +352,6 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): # in the pipeline and with normalize set to False # normalize is True - model_name = LinearModel.__name__ model_normalize = LinearModel(normalize=True, fit_intercept=True, **params) pipeline = make_pipeline( @@ -351,22 +378,7 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) - if 'alpha' in params: - model_normalize.set_params(alpha=params['alpha']) - if model_name in ['Lasso', 'LassoLars', 'MultiTaskLasso']: - new_params = dict( - alpha=params['alpha'] * np.sqrt(X_train.shape[0])) - if model_name in ['Ridge', 'RidgeClassifier']: - new_params = dict(alpha=params['alpha'] * X_train.shape[0]) - if model_name in ['ElasticNet', 'MultiTaskElasticNet']: - if params['l1_ratio'] == 1: - new_params = dict( - alpha=params['alpha'] * np.sqrt(X_train.shape[0])) - if params['l1_ratio'] == 0: - new_params = dict(alpha=params['alpha'] * X_train.shape[0]) - - if 'new_params' in locals(): - pipeline[1].set_params(**new_params) + _scale_alpha_inplace(pipeline[1], X_train.shape[0]) model_normalize.fit(X_train, y_train) y_pred_normalize = model_normalize.predict(X_test) @@ -386,24 +398,47 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): # FIXME: 'normalize' to be removed in 1.2 @pytest.mark.filterwarnings("ignore:'normalize' was deprecated") @pytest.mark.parametrize( - "estimator, is_sparse, with_mean", - [(LinearRegression, True, False), - (LinearRegression, False, True), - (LinearRegression, False, False)] + "estimator, params", + [ + (Lasso, {"tol": 1e-16, "alpha": 0.1}), + (RidgeClassifier, {"solver": 'sparse_cg', "alpha": 0.1}), + (ElasticNet, {"tol": 1e-16, 'l1_ratio': 1, "alpha": 0.1}), + (ElasticNet, {"tol": 1e-16, 'l1_ratio': 0, "alpha": 0.1}), + (Ridge, {"solver": 'sparse_cg', 'tol': 1e-12, "alpha": 0.1}), + (LinearRegression, {}), + ] +) +@pytest.mark.parametrize( + "is_sparse, with_mean", [ + (False, True), + (False, False), + (True, False) + # No need to test sparse and with_mean=True + ] ) def test_linear_model_sample_weights_normalize_in_pipeline( - estimator, is_sparse, with_mean + is_sparse, with_mean, estimator, params ): - # Test that the results for running linear regression LinearRegression with - # sample_weight set and with normalize set to True gives similar results as - # LinearRegression with no normalize in a pipeline with a StandardScaler - # and set sample_weight. 
+ # Test that the results for running linear model with sample_weight + # and with normalize set to True gives similar results as the same linear + # model with normalize set to False in a pipeline with + # a StandardScaler and sample_weight. + model_name = estimator.__name__ + + if model_name in ['Lasso', 'ElasticNet'] and is_sparse: + pytest.skip(f'{model_name} does not support sample_weight with sparse') + rng = np.random.RandomState(0) X, y = make_regression(n_samples=20, n_features=5, noise=1e-2, random_state=rng) + + if is_classifier(estimator): + y = np.sign(y) + # make sure the data is not centered to make the problem more - # difficult - X += 10 + # difficult + add 0s for the sparse case + X[X < 0] = 0 + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=rng) if is_sparse: @@ -412,27 +447,41 @@ def test_linear_model_sample_weights_normalize_in_pipeline( sample_weight = rng.rand(X_train.shape[0]) - # linear estimator with explicit sample_weight - reg_with_normalize = estimator(normalize=True) + # linear estimator with built-in feature normalization + reg_with_normalize = estimator(normalize=True, fit_intercept=True, + **params) reg_with_normalize.fit(X_train, y_train, sample_weight=sample_weight) - # linear estimator in a pipeline - reg_with_scaler = make_pipeline( - StandardScaler(with_mean=with_mean), - estimator(normalize=False) - ) - kwargs = {reg_with_scaler.steps[-1][0] + '__sample_weight': - sample_weight} - reg_with_scaler.fit(X_train, y_train, **kwargs) - - y_pred_norm = reg_with_normalize.predict(X_test) - y_pred_pip = reg_with_scaler.predict(X_test) - - assert_allclose( - reg_with_normalize.coef_ * reg_with_scaler[0].scale_, - reg_with_scaler[1].coef_ - ) - assert_allclose(y_pred_norm, y_pred_pip) + # linear estimator in a pipeline with a StandardScaler, normalize=False + linear_regressor = estimator(normalize=False, fit_intercept=True, **params) + _scale_alpha_inplace(linear_regressor, X_train.shape[0]) # rescale alpha + reg_with_scaler = Pipeline([ + ("scaler", StandardScaler(with_mean=with_mean)), + ("linear_regressor", linear_regressor) + ]) + + fit_params = { + "scaler__sample_weight": sample_weight, + "linear_regressor__sample_weight": sample_weight, + } + + reg_with_scaler.fit(X_train, y_train, **fit_params) + + # Check that the 2 regressions models are exactly equivalent in the + # sense that they predict exactly the same outcome. 
+ y_pred_normalize = reg_with_normalize.predict(X_test) + y_pred_scaler = reg_with_scaler.predict(X_test) + assert_allclose(y_pred_normalize, y_pred_scaler) + # Check intercept computation when normalize is True + y_train_mean = np.average(y_train, weights=sample_weight) + if is_sparse: + X_train_mean, _ = mean_variance_axis(X_train, axis=0, + weights=sample_weight) + else: + X_train_mean = np.average(X_train, weights=sample_weight, axis=0) + assert (reg_with_normalize.intercept_ == + pytest.approx(y_train_mean - + reg_with_normalize.coef_.dot(X_train_mean))) # FIXME: 'normalize' to be removed in 1.2 From 04534204f2125741505172ecd8dc3c92f9917698 Mon Sep 17 00:00:00 2001 From: putschblos <75161135+putschblos@users.noreply.github.com> Date: Mon, 22 Feb 2021 16:23:55 +0100 Subject: [PATCH 193/478] EXA Fix overlapping titles in clustering overview (#19506) --- examples/cluster/plot_cluster_comparison.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/cluster/plot_cluster_comparison.py b/examples/cluster/plot_cluster_comparison.py index 5791464f3dc67..0eea5ee1e27e1 100644 --- a/examples/cluster/plot_cluster_comparison.py +++ b/examples/cluster/plot_cluster_comparison.py @@ -63,8 +63,8 @@ # ============ # Set up cluster parameters # ============ -plt.figure(figsize=(9 * 2 + 3, 12.5)) -plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05, +plt.figure(figsize=(9 * 2 + 3, 13)) +plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.95, wspace=.05, hspace=.01) plot_num = 1 @@ -135,16 +135,16 @@ n_components=params['n_clusters'], covariance_type='full') clustering_algorithms = ( - ('MiniBatchKMeans', two_means), - ('AffinityPropagation', affinity_propagation), + ('MiniBatch\nKMeans', two_means), + ('Affinity\nPropagation', affinity_propagation), ('MeanShift', ms), - ('SpectralClustering', spectral), + ('Spectral\nClustering', spectral), ('Ward', ward), - ('AgglomerativeClustering', average_linkage), + ('Agglomerative\nClustering', average_linkage), ('DBSCAN', dbscan), ('OPTICS', optics), ('BIRCH', birch), - ('GaussianMixture', gmm) + ('Gaussian\nMixture', gmm) ) for name, algorithm in clustering_algorithms: From 1000d0a61be311542e01d56f6745178307406395 Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Mon, 22 Feb 2021 15:25:34 +0000 Subject: [PATCH 194/478] TST replace assert_raise with pytest.raises in test_base.py (#19500) Co-authored-by: Alihan Zihna --- sklearn/tests/test_base.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 7dd8d02f3c0bf..666df1499d7dc 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -7,7 +7,6 @@ import sklearn from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_raises from sklearn.utils._testing import assert_no_warnings from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import ignore_warnings @@ -145,16 +144,20 @@ def test_clone_buggy(): # Check that clone raises an error on buggy estimators. 
buggy = Buggy() buggy.a = 2 - assert_raises(RuntimeError, clone, buggy) + with pytest.raises(RuntimeError): + clone(buggy) no_estimator = NoEstimator() - assert_raises(TypeError, clone, no_estimator) + with pytest.raises(TypeError): + clone(no_estimator) varg_est = VargEstimator() - assert_raises(RuntimeError, clone, varg_est) + with pytest.raises(RuntimeError): + clone(varg_est) est = ModifyInitParams() - assert_raises(RuntimeError, clone, est) + with pytest.raises(RuntimeError): + clone(est) def test_clone_empty_array(): @@ -233,7 +236,9 @@ def test_get_params(): test.set_params(a__d=2) assert test.a.d == 2 - assert_raises(ValueError, test.set_params, a__a=2) + + with pytest.raises(ValueError): + test.set_params(a__a=2) def test_is_classifier(): @@ -248,10 +253,15 @@ def test_is_classifier(): def test_set_params(): # test nested estimator parameter setting clf = Pipeline([("svc", SVC())]) + # non-existing parameter in svc - assert_raises(ValueError, clf.set_params, svc__stupid_param=True) + with pytest.raises(ValueError): + clf.set_params(svc__stupid_param=True) + # non-existing parameter of pipeline - assert_raises(ValueError, clf.set_params, svm__stupid_param=True) + with pytest.raises(ValueError): + clf.set_params(svm__stupid_param=True) + # we don't currently catch if the things in pipeline are estimators # bad_pipeline = Pipeline([("bad", NoEstimator())]) # assert_raises(AttributeError, bad_pipeline.set_params, From 6852e31da88a94262c8a6a82a6ad00a34ddc385b Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 22 Feb 2021 21:18:21 +0100 Subject: [PATCH 195/478] ENH Adds n_features_in_ to ensemble module (#19326) Co-authored-by: Guillaume Lemaitre --- sklearn/ensemble/_bagging.py | 62 +++++++++---------- sklearn/ensemble/_forest.py | 42 ++++++++++--- sklearn/ensemble/_gb.py | 47 +++++++++----- .../gradient_boosting.py | 8 ++- sklearn/ensemble/_iforest.py | 15 +++-- sklearn/ensemble/_weight_boosting.py | 8 ++- sklearn/ensemble/tests/test_bagging.py | 22 ++++--- sklearn/ensemble/tests/test_forest.py | 18 ++++++ .../ensemble/tests/test_gradient_boosting.py | 16 +++++ sklearn/ensemble/tests/test_iforest.py | 12 ++++ sklearn/tests/test_common.py | 1 - 11 files changed, 174 insertions(+), 77 deletions(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 070bc374f3123..1ac309f00ad69 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -16,7 +16,7 @@ from ..base import ClassifierMixin, RegressorMixin from ..metrics import r2_score, accuracy_score from ..tree import DecisionTreeClassifier, DecisionTreeRegressor -from ..utils import check_random_state, check_array, column_or_1d +from ..utils import check_random_state, column_or_1d, deprecated from ..utils import indices_to_mask from ..utils.metaestimators import if_delegate_has_method from ..utils.multiclass import check_classification_targets @@ -287,7 +287,7 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): sample_weight = _check_sample_weight(sample_weight, X, dtype=None) # Remap output - n_samples, self.n_features_ = X.shape + n_samples = X.shape[0] self._n_samples = n_samples y = self._validate_y(y) @@ -313,11 +313,11 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): if isinstance(self.max_features, numbers.Integral): max_features = self.max_features elif isinstance(self.max_features, float): - max_features = self.max_features * self.n_features_ + max_features = self.max_features * self.n_features_in_ else: raise 
ValueError("max_features must be int or float") - if not (0 < max_features <= self.n_features_): + if not (0 < max_features <= self.n_features_in_): raise ValueError("max_features must be in (0, n_features]") max_features = max(1, int(max_features)) @@ -408,7 +408,7 @@ def _get_estimators_indices(self): # to those in `_parallel_build_estimators()` feature_indices, sample_indices = _generate_bagging_indices( seed, self.bootstrap_features, self.bootstrap, - self.n_features_, self._n_samples, self._max_features, + self.n_features_in_, self._n_samples, self._max_features, self._max_samples) yield feature_indices, sample_indices @@ -429,6 +429,16 @@ def estimators_samples_(self): return [sample_indices for _, sample_indices in self._get_estimators_indices()] + # TODO: Remove in 1.2 + # mypy error: Decorated property not supported + @deprecated( # type: ignore + "Attribute n_features_ was deprecated in version 1.0 and will be " + "removed in 1.2. Use 'n_features_in_' instead." + ) + @property + def n_features_(self): + return self.n_features_in_ + class BaggingClassifier(ClassifierMixin, BaseBagging): """A Bagging classifier. @@ -523,6 +533,10 @@ class BaggingClassifier(ClassifierMixin, BaseBagging): n_features_ : int The number of features when :meth:`fit` is performed. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + estimators_ : list of estimators The collection of fitted base estimators. @@ -702,17 +716,11 @@ def predict_proba(self, X): """ check_is_fitted(self) # Check data - X = check_array( + X = self._validate_data( X, accept_sparse=['csr', 'csc'], dtype=None, - force_all_finite=False + force_all_finite=False, reset=False ) - if self.n_features_ != X.shape[1]: - raise ValueError("Number of features of the model must " - "match the input. Model n_features is {0} and " - "input n_features is {1}." - "".format(self.n_features_, X.shape[1])) - # Parallel loop n_jobs, n_estimators, starts = _partition_estimators(self.n_estimators, self.n_jobs) @@ -753,17 +761,11 @@ def predict_log_proba(self, X): check_is_fitted(self) if hasattr(self.base_estimator_, "predict_log_proba"): # Check data - X = check_array( + X = self._validate_data( X, accept_sparse=['csr', 'csc'], dtype=None, - force_all_finite=False + force_all_finite=False, reset=False ) - if self.n_features_ != X.shape[1]: - raise ValueError("Number of features of the model must " - "match the input. Model n_features is {0} " - "and input n_features is {1} " - "".format(self.n_features_, X.shape[1])) - # Parallel loop n_jobs, n_estimators, starts = _partition_estimators( self.n_estimators, self.n_jobs) @@ -811,17 +813,11 @@ def decision_function(self, X): check_is_fitted(self) # Check data - X = check_array( + X = self._validate_data( X, accept_sparse=['csr', 'csc'], dtype=None, - force_all_finite=False + force_all_finite=False, reset=False ) - if self.n_features_ != X.shape[1]: - raise ValueError("Number of features of the model must " - "match the input. Model n_features is {0} and " - "input n_features is {1} " - "".format(self.n_features_, X.shape[1])) - # Parallel loop n_jobs, n_estimators, starts = _partition_estimators(self.n_estimators, self.n_jobs) @@ -929,6 +925,10 @@ class BaggingRegressor(RegressorMixin, BaseBagging): n_features_ : int The number of features when :meth:`fit` is performed. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. 
+ estimators_ : list of estimators The collection of fitted sub-estimators. @@ -1024,9 +1024,9 @@ def predict(self, X): """ check_is_fitted(self) # Check data - X = check_array( + X = self._validate_data( X, accept_sparse=['csr', 'csc'], dtype=None, - force_all_finite=False + force_all_finite=False, reset=False ) # Parallel loop diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index c0b190c60ef54..a93e9b7ee877e 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -57,7 +57,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from ..tree import (DecisionTreeClassifier, DecisionTreeRegressor, ExtraTreeClassifier, ExtraTreeRegressor) from ..tree._tree import DTYPE, DOUBLE -from ..utils import check_random_state, check_array, compute_sample_weight +from ..utils import check_random_state, compute_sample_weight, deprecated from ..exceptions import DataConversionWarning from ._base import BaseEnsemble, _partition_estimators from ..utils.fixes import delayed @@ -312,9 +312,6 @@ def fit(self, X, y, sample_weight=None): # ensemble sorts the indices. X.sort_indices() - # Remap output - self.n_features_ = X.shape[1] - y = np.atleast_1d(y) if y.ndim == 2 and y.shape[1] == 1: warn("A column-vector y was passed when a 1d array was" @@ -446,7 +443,8 @@ def _compute_oob_predictions(self, X, y): (n_samples, 1, n_outputs) The OOB predictions. """ - X = check_array(X, dtype=DTYPE, accept_sparse='csr') + X = self._validate_data(X, dtype=DTYPE, accept_sparse='csr', + reset=False) n_samples = y.shape[0] n_outputs = self.n_outputs_ @@ -530,12 +528,22 @@ def feature_importances_(self): for tree in self.estimators_ if tree.tree_.node_count > 1) if not all_importances: - return np.zeros(self.n_features_, dtype=np.float64) + return np.zeros(self.n_features_in_, dtype=np.float64) all_importances = np.mean(all_importances, axis=0, dtype=np.float64) return all_importances / np.sum(all_importances) + # TODO: Remove in 1.2 + # mypy error: Decorated property not supported + @deprecated( # type: ignore + "Attribute n_features_ was deprecated in version 1.0 and will be " + "removed in 1.2. Use 'n_features_in_' instead." + ) + @property + def n_features_(self): + return self.n_features_in_ + def _accumulate_prediction(predict, X, out, lock): """ @@ -1164,6 +1172,10 @@ class labels (multi-output problem). n_features_ : int The number of features when ``fit`` is performed. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + n_outputs_ : int The number of outputs when ``fit`` is performed. @@ -1465,6 +1477,10 @@ class RandomForestRegressor(ForestRegressor): n_features_ : int The number of features when ``fit`` is performed. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + n_outputs_ : int The number of outputs when ``fit`` is performed. @@ -1786,6 +1802,10 @@ class labels (multi-output problem). n_features_ : int The number of features when ``fit`` is performed. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + n_outputs_ : int The number of outputs when ``fit`` is performed. @@ -2072,6 +2092,10 @@ class ExtraTreesRegressor(ForestRegressor): n_features_ : int The number of features. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. 
Use `n_features_in_` instead. + n_outputs_ : int The number of outputs. @@ -2296,6 +2320,10 @@ class RandomTreesEmbedding(BaseForest): n_features_ : int The number of features when ``fit`` is performed. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + n_outputs_ : int The number of outputs when ``fit`` is performed. @@ -2425,7 +2453,7 @@ def fit_transform(self, X, y=None, sample_weight=None): X_transformed : sparse matrix of shape (n_samples, n_out) Transformed dataset. """ - X = check_array(X, accept_sparse=['csc']) + X = self._validate_data(X, accept_sparse=['csc']) if issparse(X): # Pre-sort indices to avoid that each individual tree of the # ensemble sorts the indices. diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 15f5404f4701c..e9f7402188860 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -273,25 +273,25 @@ def _check_params(self): if isinstance(self.max_features, str): if self.max_features == "auto": if is_classifier(self): - max_features = max(1, int(np.sqrt(self.n_features_))) + max_features = max(1, int(np.sqrt(self.n_features_in_))) else: - max_features = self.n_features_ + max_features = self.n_features_in_ elif self.max_features == "sqrt": - max_features = max(1, int(np.sqrt(self.n_features_))) + max_features = max(1, int(np.sqrt(self.n_features_in_))) elif self.max_features == "log2": - max_features = max(1, int(np.log2(self.n_features_))) + max_features = max(1, int(np.log2(self.n_features_in_))) else: raise ValueError("Invalid value for max_features: %r. " "Allowed string values are 'auto', 'sqrt' " "or 'log2'." % self.max_features) elif self.max_features is None: - max_features = self.n_features_ + max_features = self.n_features_in_ elif isinstance(self.max_features, numbers.Integral): max_features = self.max_features else: # float if 0. < self.max_features <= 1.: max_features = max(int(self.max_features * - self.n_features_), 1) + self.n_features_in_), 1) else: raise ValueError("max_features must be in (0, n_features]") @@ -411,7 +411,6 @@ def fit(self, X, y, sample_weight=None, monitor=None): X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=DTYPE, multi_output=True) - n_samples, self.n_features_ = X.shape sample_weight_is_none = sample_weight is None @@ -608,9 +607,6 @@ def _raw_predict_init(self, X): """Check input and compute raw predictions of the init estimator.""" self._check_initialized() X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True) - if X.shape[1] != self.n_features_: - raise ValueError("X.shape[1] should be {0:d}, not {1:d}.".format( - self.n_features_, X.shape[1])) if self.init_ == 'zero': raw_predictions = np.zeros(shape=(X.shape[0], self.loss_.K), dtype=np.float64) @@ -647,7 +643,8 @@ def _staged_raw_predict(self, X): Regression and binary classification are special cases with ``k == 1``, otherwise ``k==n_classes``. 
""" - X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr') + X = self._validate_data(X, dtype=DTYPE, order="C", accept_sparse='csr', + reset=False) raw_predictions = self._raw_predict_init(X) for i in range(self.estimators_.shape[0]): predict_stage(self.estimators_, i, X, self.learning_rate, @@ -681,7 +678,7 @@ def feature_importances_(self): if tree.tree_.node_count > 1] if not relevant_trees: # degenerate case where all trees have only one node - return np.zeros(shape=self.n_features_, dtype=np.float64) + return np.zeros(shape=self.n_features_in_, dtype=np.float64) relevant_feature_importances = [ tree.tree_.compute_feature_importances(normalize=False) @@ -764,6 +761,16 @@ def apply(self, X): return leaves + # TODO: Remove in 1.2 + # mypy error: Decorated property not supported + @deprecated( # type: ignore + "Attribute n_features_ was deprecated in version 1.0 and will be " + "removed in 1.2. Use 'n_features_in_' instead." + ) + @property + def n_features_(self): + return self.n_features_in_ + class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): """Gradient Boosting for classification. @@ -1005,7 +1012,7 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): Set via the ``init`` argument or ``loss.init_estimator``. estimators_ : ndarray of DecisionTreeRegressor of \ -shape (n_estimators, ``loss_.K``) + shape (n_estimators, ``loss_.K``) The collection of fitted sub-estimators. ``loss_.K`` is 1 for binary classification, otherwise n_classes. @@ -1015,6 +1022,10 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): n_features_ : int The number of data features. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + n_classes_ : int The number of classes. @@ -1140,7 +1151,8 @@ def decision_function(self, X): :term:`classes_`. Regression and binary classification produce an array of shape (n_samples,). """ - X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr') + X = self._validate_data(X, dtype=DTYPE, order="C", accept_sparse='csr', + reset=False) raw_predictions = self._raw_predict(X) if raw_predictions.shape[1] == 1: return raw_predictions.ravel() @@ -1548,6 +1560,10 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): n_features_ : int The number of data features. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + max_features_ : int The inferred value of max_features. @@ -1647,7 +1663,8 @@ def predict(self, X): y : ndarray of shape (n_samples,) The predicted values. """ - X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr') + X = self._validate_data(X, dtype=DTYPE, order="C", accept_sparse='csr', + reset=False) # In regression we can directly return the raw value from the trees. 
return self._raw_predict(X).ravel() diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 15b4c95f8cd54..4fff6030b0d5a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -8,7 +8,7 @@ from timeit import default_timer as time from ...base import (BaseEstimator, RegressorMixin, ClassifierMixin, is_classifier) -from ...utils import check_random_state, check_array, resample +from ...utils import check_random_state, resample from ...utils.validation import (check_is_fitted, check_consistent_length, _check_sample_weight, @@ -733,7 +733,8 @@ def _raw_predict(self, X): """ is_binned = getattr(self, '_in_fit', False) dtype = X_BINNED_DTYPE if is_binned else X_DTYPE - X = check_array(X, dtype=dtype, force_all_finite=False) + X = self._validate_data(X, dtype=dtype, force_all_finite=False, + reset=False) check_is_fitted(self) if X.shape[1] != self._n_features: raise ValueError( @@ -789,7 +790,8 @@ def _staged_raw_predict(self, X): The raw predictions of the input samples. The order of the classes corresponds to that in the attribute :term:`classes_`. """ - X = check_array(X, dtype=X_DTYPE, force_all_finite=False) + X = self._validate_data(X, dtype=X_DTYPE, force_all_finite=False, + reset=False) check_is_fitted(self) if X.shape[1] != self._n_features: raise ValueError( diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index e607342456cd4..588b1bbef299c 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -144,6 +144,10 @@ class IsolationForest(OutlierMixin, BaseBagging): n_features_ : int The number of features when ``fit`` is performed. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + Notes ----- The implementation is based on an ensemble of ExtraTreeRegressor. The @@ -238,7 +242,7 @@ def fit(self, X, y=None, sample_weight=None): self : object Fitted estimator. """ - X = check_array(X, accept_sparse=['csc']) + X = self._validate_data(X, accept_sparse=['csc']) if issparse(X): # Pre-sort indices to avoid that each individual tree of the # ensemble sorts the indices. @@ -314,7 +318,7 @@ def predict(self, X): be considered as an inlier according to the fitted model. """ check_is_fitted(self) - X = check_array(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr', reset=False) is_inlier = np.ones(X.shape[0], dtype=int) is_inlier[self.decision_function(X) < 0] = -1 return is_inlier @@ -380,12 +384,7 @@ def score_samples(self, X): check_is_fitted(self) # Check data - X = check_array(X, accept_sparse='csr') - if self.n_features_ != X.shape[1]: - raise ValueError("Number of features of the model must " - "match the input. Model n_features is {0} and " - "input n_features is {1}." 
- "".format(self.n_features_, X.shape[1])) + X = self._validate_data(X, accept_sparse='csr', reset=False) # Take the opposite of the scores as bigger is better (here less # abnormal) diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index 3ea94cff7da53..d5354232a4385 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -33,7 +33,7 @@ from ..base import ClassifierMixin, RegressorMixin, is_classifier, is_regressor from ..tree import DecisionTreeClassifier, DecisionTreeRegressor -from ..utils import check_array, check_random_state, _safe_indexing +from ..utils import check_random_state, _safe_indexing from ..utils.extmath import softmax from ..utils.extmath import stable_cumsum from ..metrics import accuracy_score, r2_score @@ -73,8 +73,10 @@ def __init__(self, self.random_state = random_state def _check_X(self, X): - return check_array(X, accept_sparse=['csr', 'csc'], ensure_2d=True, - allow_nd=True, dtype=None) + # Only called to validate X in non-fit methods, therefore reset=False + return self._validate_data( + X, accept_sparse=['csr', 'csc'], ensure_2d=True, allow_nd=True, + dtype=None, reset=False) def fit(self, X, y, sample_weight=None): """Build a boosted classifier/regressor from the training set (X, y). diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index e7cb11185fa5c..b17cbf7c147ac 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -480,15 +480,6 @@ def test_parallel_classification(): decisions2 = ensemble.decision_function(X_test) assert_array_almost_equal(decisions1, decisions2) - X_err = np.hstack((X_test, np.zeros((X_test.shape[0], 1)))) - err_msg = ( - f"Number of features of the model must match the input. Model " - f"n_features is {X_test.shape[1]} and input n_features is " - f"{X_err.shape[1]} " - ) - with pytest.raises(ValueError, match=err_msg): - ensemble.decision_function(X_err) - ensemble = BaggingClassifier(SVC(decision_function_shape='ovr'), n_jobs=1, random_state=0).fit(X_train, y_train) @@ -921,3 +912,16 @@ def fit(self, X, y): assert_array_equal(clf.estimators_[0]._sample_indices, clf.estimators_samples_[0]) + + +# FIXME: remove in 1.2 +@pytest.mark.parametrize("Estimator", [BaggingClassifier, BaggingRegressor]) +def test_n_features_deprecation(Estimator): + # Check that we raise the proper deprecation warning if accessing + # `n_features_`. + X = np.array([[1, 2], [3, 4]]) + y = np.array([1, 0]) + est = Estimator().fit(X, y) + + with pytest.warns(FutureWarning, match="n_features_ was deprecated"): + est.n_features_ diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 89ded326d21aa..c05cad26708b4 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -1476,3 +1476,21 @@ def test_little_tree_with_small_max_samples(ForestClass): msg = "Tree without `max_samples` restriction should have more nodes" assert tree1.node_count > tree2.node_count, msg + + +# FIXME: remove in 1.2 +@pytest.mark.parametrize( + "Estimator", + [ExtraTreesClassifier, ExtraTreesRegressor, + RandomForestClassifier, RandomForestRegressor, + RandomTreesEmbedding] +) +def test_n_features_deprecation(Estimator): + # Check that we raise the proper deprecation warning if accessing + # `n_features_`. 
+ X = np.array([[1, 2], [3, 4]]) + y = np.array([1, 0]) + est = Estimator().fit(X, y) + + with pytest.warns(FutureWarning, match="n_features_ was deprecated"): + est.n_features_ diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 57ac93f52d0d3..63d4e668e674f 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -1353,3 +1353,19 @@ def test_criterion_mae_deprecation(estimator): "will be removed in version 1.1") with pytest.warns(FutureWarning, match=msg): estimator.fit(X, y) + + +# FIXME: remove in 1.2 +@pytest.mark.parametrize( + "Estimator", + [GradientBoostingClassifier, GradientBoostingRegressor] +) +def test_n_features_deprecation(Estimator): + # Check that we raise the proper deprecation warning if accessing + # `n_features_`. + X = np.array([[1, 2], [3, 4]]) + y = np.array([1, 0]) + est = Estimator().fit(X, y) + + with pytest.warns(FutureWarning, match="n_features_ was deprecated"): + est.n_features_ diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py index de0c56fff793b..0b3a521346b30 100644 --- a/sklearn/ensemble/tests/test_iforest.py +++ b/sklearn/ensemble/tests/test_iforest.py @@ -345,3 +345,15 @@ def test_iforest_with_uniform_data(): assert all(iforest.predict(X) == 1) assert all(iforest.predict(rng.randn(100, 10)) == 1) assert all(iforest.predict(np.ones((100, 10))) == 1) + + +# FIXME: remove in 1.2 +def test_n_features_deprecation(): + # Check that we raise the proper deprecation warning if accessing + # `n_features_`. + X = np.array([[1, 2], [3, 4]]) + y = np.array([1, 0]) + est = IsolationForest().fit(X, y) + + with pytest.warns(FutureWarning, match="n_features_ was deprecated"): + est.n_features_ diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index dbac492d5efb9..4cdae851f9b9c 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -266,7 +266,6 @@ def test_search_cv(estimator, check, request): N_FEATURES_IN_AFTER_FIT_MODULES_TO_IGNORE = { 'calibration', 'compose', - 'ensemble', 'feature_extraction', 'isotonic', 'manifold', From 8b71a677004f5ffe665b0eb6ee3341ce17573ec3 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Mon, 22 Feb 2021 15:21:20 -0500 Subject: [PATCH 196/478] CI Fixes twitter workflow (#19525) --- .github/workflows/twitter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/twitter.yml b/.github/workflows/twitter.yml index 7c219b13ec28c..96b32ec902efa 100644 --- a/.github/workflows/twitter.yml +++ b/.github/workflows/twitter.yml @@ -16,7 +16,7 @@ jobs: steps: - name: Tweet URL of last commit as @sklearn_commits if: github.repository == 'scikit-learn/scikit-learn' - uses: xorilog/twitter-action@0.1 + uses: docker://thomasjpfan/twitter-action:0.3 with: args: "-message \"https://github.com/scikit-learn/scikit-learn/commit/${{ github.sha }}\"" env: From 26c5530e792c1319ddd3335e23d1f36cf90f6c3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Carlos=20Alfaro=20Jim=C3=A9nez?= Date: Mon, 22 Feb 2021 21:22:49 +0100 Subject: [PATCH 197/478] MNT Set non-interactive installation mode for the pypy job (#19461) --- .circleci/config.yml | 3 +++ sklearn/gaussian_process/tests/test_kernels.py | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4ca26a110f28c..bc4acd8a35fcb 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -104,6 +104,9 @@ jobs: pypy3: docker: - image: condaforge/miniforge3 + environment: + # Avoid the interactive dialog when installing tzdata + - DEBIAN_FRONTEND: noninteractive steps: - restore_cache: keys: diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py index 4627117677c8b..1f8e196104e75 100644 --- a/sklearn/gaussian_process/tests/test_kernels.py +++ b/sklearn/gaussian_process/tests/test_kernels.py @@ -20,7 +20,8 @@ from sklearn.utils._testing import (assert_almost_equal, assert_array_equal, assert_array_almost_equal, assert_allclose, - assert_raise_message) + assert_raise_message, + fails_if_pypy) X = np.random.RandomState(0).normal(0, 1, (5, 2)) @@ -49,6 +50,8 @@ kernels.append(PairwiseKernel(gamma=1.0, metric=metric)) +# Numerical precisions errors in PyPy +@fails_if_pypy @pytest.mark.parametrize('kernel', kernels) def test_kernel_gradient(kernel): # Compare analytic and numeric gradient of kernels. 
From 5c246225ddf130f1eee398e889e4c2a19b5f1791 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 23 Feb 2021 17:40:18 +0100 Subject: [PATCH 198/478] ENH Adds n_features_in_ to naive_bayes (#19485) --- sklearn/naive_bayes.py | 92 ++++++++++++++++++++----------- sklearn/tests/test_common.py | 1 - sklearn/tests/test_naive_bayes.py | 75 +++++++++++-------------- 3 files changed, 91 insertions(+), 77 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index d32e0756f2907..70f5993f98b1a 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -27,10 +27,10 @@ from .preprocessing import binarize from .preprocessing import LabelBinarizer from .preprocessing import label_binarize -from .utils import check_X_y, check_array, deprecated +from .utils import deprecated from .utils.extmath import safe_sparse_dot from .utils.multiclass import _check_partial_fit_first_call -from .utils.validation import check_is_fitted, check_non_negative, column_or_1d +from .utils.validation import check_is_fitted, check_non_negative from .utils.validation import _check_sample_weight from .utils.validation import _deprecate_positional_args @@ -55,7 +55,10 @@ def _joint_log_likelihood(self, X): @abstractmethod def _check_X(self, X): - """To be overridden in subclasses with the actual checks.""" + """To be overridden in subclasses with the actual checks. + + Only used in predict* methods. + """ def predict(self, X): """ @@ -214,12 +217,12 @@ def fit(self, X, y, sample_weight=None): self : object """ X, y = self._validate_data(X, y) - y = column_or_1d(y, warn=True) return self._partial_fit(X, y, np.unique(y), _refit=True, sample_weight=sample_weight) def _check_X(self, X): - return check_array(X) + """Validate X, used only in predict* methods.""" + return self._validate_data(X, reset=False) @staticmethod def _update_mean_variance(n_past, mu, var, X, sample_weight=None): @@ -367,7 +370,11 @@ def _partial_fit(self, X, y, classes=None, _refit=False, ------- self : object """ - X, y = check_X_y(X, y) + if _refit: + self.classes_ = None + + first_call = _check_partial_fit_first_call(self, classes) + X, y = self._validate_data(X, y, reset=first_call) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) @@ -377,10 +384,7 @@ def _partial_fit(self, X, y, classes=None, _refit=False, # deviation of the largest dimension. self.epsilon_ = self.var_smoothing * np.var(X, axis=0).max() - if _refit: - self.classes_ = None - - if _check_partial_fit_first_call(self, classes): + if first_call: # This is the first call to partial_fit: # initialize various cumulative counters n_features = X.shape[1] @@ -488,10 +492,12 @@ class _BaseDiscreteNB(_BaseNB): """ def _check_X(self, X): - return check_array(X, accept_sparse='csr') + """Validate X, used only in predict* methods.""" + return self._validate_data(X, accept_sparse='csr', reset=False) - def _check_X_y(self, X, y): - return self._validate_data(X, y, accept_sparse='csr') + def _check_X_y(self, X, y, reset=True): + """Validate X and y in fit methods.""" + return self._validate_data(X, y, accept_sparse='csr', reset=reset) def _update_class_log_prior(self, class_prior=None): n_classes = len(self.classes_) @@ -518,7 +524,7 @@ def _check_alpha(self): raise ValueError('Smoothing parameter alpha = %.1e. ' 'alpha should be > 0.' 
% np.min(self.alpha)) if isinstance(self.alpha, np.ndarray): - if not self.alpha.shape[0] == self.n_features_: + if not self.alpha.shape[0] == self.n_features_in_: raise ValueError("alpha should be a scalar or a numpy array " "with shape [n_features]") if np.min(self.alpha) < _ALPHA_MIN: @@ -563,7 +569,8 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): ------- self : object """ - X, y = self._check_X_y(X, y) + first_call = not hasattr(self, "classes_") + X, y = self._check_X_y(X, y, reset=first_call) _, n_features = X.shape if _check_partial_fit_first_call(self, classes): @@ -571,10 +578,6 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): # initialize various cumulative counters n_classes = len(classes) self._init_counters(n_classes, n_features) - self.n_features_ = n_features - elif n_features != self.n_features_: - msg = "Number of features %d does not match previous data %d." - raise ValueError(msg % (n_features, self.n_features_)) Y = label_binarize(y, classes=self.classes_) if Y.shape[1] == 1: @@ -631,7 +634,6 @@ def fit(self, X, y, sample_weight=None): """ X, y = self._check_X_y(X, y) _, n_features = X.shape - self.n_features_ = n_features labelbin = LabelBinarizer() Y = labelbin.fit_transform(y) @@ -687,6 +689,16 @@ def intercept_(self): def _more_tags(self): return {'poor_score': True} + # TODO: Remove in 1.2 + # mypy error: Decorated property not supported + @deprecated( # type: ignore + "Attribute n_features_ was deprecated in version 1.0 and will be " + "removed in 1.2. Use 'n_features_in_' instead." + ) + @property + def n_features_(self): + return self.n_features_in_ + class MultinomialNB(_BaseDiscreteNB): """ @@ -753,6 +765,10 @@ class MultinomialNB(_BaseDiscreteNB): n_features_ : int Number of features of each sample. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + Examples -------- >>> import numpy as np @@ -879,6 +895,10 @@ class ComplementNB(_BaseDiscreteNB): n_features_ : int Number of features of each sample. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + Examples -------- >>> import numpy as np @@ -996,6 +1016,10 @@ class BernoulliNB(_BaseDiscreteNB): n_features_ : int Number of features of each sample. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + Examples -------- >>> import numpy as np @@ -1032,13 +1056,14 @@ def __init__(self, *, alpha=1.0, binarize=.0, fit_prior=True, self.class_prior = class_prior def _check_X(self, X): + """Validate X, used only in predict* methods.""" X = super()._check_X(X) if self.binarize is not None: X = binarize(X, threshold=self.binarize) return X - def _check_X_y(self, X, y): - X, y = super()._check_X_y(X, y) + def _check_X_y(self, X, y, reset=True): + X, y = super()._check_X_y(X, y, reset=reset) if self.binarize is not None: X = binarize(X, threshold=self.binarize) return X, y @@ -1133,6 +1158,10 @@ class CategoricalNB(_BaseDiscreteNB): n_features_ : int Number of features of each sample. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + n_categories_ : ndarray of shape (n_features,), dtype=np.int64 Number of categories for each feature. This value is inferred from the data or set by the minimum number of categories. 
@@ -1235,14 +1264,15 @@ def _more_tags(self): return {'requires_positive_X': True} def _check_X(self, X): - X = check_array(X, dtype='int', accept_sparse=False, - force_all_finite=True) + """Validate X, used only in predict* methods.""" + X = self._validate_data(X, dtype='int', accept_sparse=False, + force_all_finite=True, reset=False) check_non_negative(X, "CategoricalNB (input X)") return X - def _check_X_y(self, X, y): + def _check_X_y(self, X, y, reset=True): X, y = self._validate_data(X, y, dtype='int', accept_sparse=False, - force_all_finite=True) + force_all_finite=True, reset=reset) check_non_negative(X, "CategoricalNB (input X)") return X, y @@ -1297,7 +1327,7 @@ def _update_cat_count(X_feature, Y, cat_count, n_classes): self.class_count_ += Y.sum(axis=0) self.n_categories_ = self._validate_n_categories( X, self.min_categories) - for i in range(self.n_features_): + for i in range(self.n_features_in_): X_feature = X[:, i] self.category_count_[i] = _update_cat_count_dims( self.category_count_[i], self.n_categories_[i] - 1) @@ -1307,7 +1337,7 @@ def _update_cat_count(X_feature, Y, cat_count, n_classes): def _update_feature_log_prob(self, alpha): feature_log_prob = [] - for i in range(self.n_features_): + for i in range(self.n_features_in_): smoothed_cat_count = self.category_count_[i] + alpha smoothed_class_count = smoothed_cat_count.sum(axis=1) feature_log_prob.append( @@ -1316,11 +1346,9 @@ def _update_feature_log_prob(self, alpha): self.feature_log_prob_ = feature_log_prob def _joint_log_likelihood(self, X): - if not X.shape[1] == self.n_features_: - raise ValueError("Expected input with %d features, got %d instead" - % (self.n_features_, X.shape[1])) + self._check_n_features(X, reset=False) jll = np.zeros((X.shape[0], self.class_count_.shape[0])) - for i in range(self.n_features_): + for i in range(self.n_features_in_): indices = X[:, i] jll += self.feature_log_prob_[i][:, indices].T total_ll = jll + self.class_log_prior_ diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 4cdae851f9b9c..4f6f232a8f716 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -273,7 +273,6 @@ def test_search_cv(estimator, check, request): 'model_selection', 'multiclass', 'multioutput', - 'naive_bayes', 'pipeline', 'random_projection', } diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index 02b83e51ac8b6..dcd4b07712357 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -57,7 +57,11 @@ def test_gnb(): # Test whether label mismatch between target y and classes raises # an Error # FIXME Remove this test once the more general partial_fit tests are merged - assert_raises(ValueError, GaussianNB().partial_fit, X, y, classes=[0, 1]) + with pytest.raises( + ValueError, + match="The target label.* in y do not exist in the initial classes" + ): + GaussianNB().partial_fit(X, y, classes=[0, 1]) # TODO remove in 1.2 once sigma_ attribute is removed (GH #18842) @@ -74,7 +78,7 @@ def test_gnb_prior(): clf = GaussianNB().fit(X, y) assert_array_almost_equal(np.array([3, 3]) / 6.0, clf.class_prior_, 8) - clf.fit(X1, y1) + clf = GaussianNB().fit(X1, y1) # Check that the class priors sum to 1 assert_array_almost_equal(clf.class_prior_.sum(), 1) @@ -171,16 +175,6 @@ def test_gnb_check_update_with_no_data(): assert tvar == var -def test_gnb_pfit_wrong_nb_features(): - """Test whether an error is raised when the number of feature changes - between two partial fit""" - clf = GaussianNB() - # Fit for 
the first time the GNB - clf.fit(X, y) - # Partial fit a second time with an incoherent X - assert_raises(ValueError, clf.partial_fit, np.hstack((X, X)), y) - - def test_gnb_partial_fit(): clf = GaussianNB().fit(X, y) clf_pf = GaussianNB().partial_fit(X, y, np.unique(y)) @@ -272,37 +266,22 @@ def test_discretenb_partial_fit(DiscreteNaiveBayes): @pytest.mark.parametrize('NaiveBayes', ALL_NAIVE_BAYES_CLASSES) -def test_naive_bayes_input_check_fit(NaiveBayes): - # Test input checks for the fit method - - # check shape consistency for number of samples at fit time - assert_raises(ValueError, NaiveBayes().fit, X2, y2[:-1]) - - # check shape consistency for number of input features at predict time - clf = NaiveBayes().fit(X2, y2) - assert_raises(ValueError, clf.predict, X2[:, :-1]) - - -@pytest.mark.parametrize('DiscreteNaiveBayes', DISCRETE_NAIVE_BAYES_CLASSES) -def test_discretenb_input_check_partial_fit(DiscreteNaiveBayes): - # check shape consistency - assert_raises(ValueError, DiscreteNaiveBayes().partial_fit, X2, y2[:-1], - classes=np.unique(y2)) - +def test_NB_partial_fit_no_first_classes(NaiveBayes): # classes is required for first call to partial fit - assert_raises(ValueError, DiscreteNaiveBayes().partial_fit, X2, y2) + with pytest.raises( + ValueError, + match="classes must be passed on the first call to partial_fit." + ): + NaiveBayes().partial_fit(X2, y2) # check consistency of consecutive classes values - clf = DiscreteNaiveBayes() + clf = NaiveBayes() clf.partial_fit(X2, y2, classes=np.unique(y2)) - assert_raises(ValueError, clf.partial_fit, X2, y2, - classes=np.arange(42)) - - # check consistency of input shape for partial_fit - assert_raises(ValueError, clf.partial_fit, X2[:, :-1], y2) - - # check consistency of input shape for predict - assert_raises(ValueError, clf.predict, X2[:, :-1]) + with pytest.raises( + ValueError, + match="is not the same as on last call to partial_fit" + ): + clf.partial_fit(X2, y2, classes=np.arange(42)) # TODO: Remove in version 1.1 @@ -725,11 +704,6 @@ def test_categoricalnb(): assert_raise_message(ValueError, error_msg, clf.predict, X) assert_raise_message(ValueError, error_msg, clf.fit, X, y) - # Check error is raised for incorrect X - X = np.array([[1, 4, 1], [2, 5, 6]]) - msg = "Expected input with 2 features, got 3 instead" - assert_raise_message(ValueError, msg, clf.predict, X) - # Test alpha X3_test = np.array([[2, 5]]) # alpha=1 increases the count of all categories by one so the final @@ -941,3 +915,16 @@ def test_check_accuracy_on_digits(): scores = cross_val_score(GaussianNB(), X_3v8, y_3v8, cv=10) assert scores.mean() > 0.86 + + +# FIXME: remove in 1.2 +@pytest.mark.parametrize("Estimator", DISCRETE_NAIVE_BAYES_CLASSES) +def test_n_features_deprecation(Estimator): + # Check that we raise the proper deprecation warning if accessing + # `n_features_`. + X = np.array([[1, 2], [3, 4]]) + y = np.array([1, 0]) + est = Estimator().fit(X, y) + + with pytest.warns(FutureWarning, match="n_features_ was deprecated"): + est.n_features_ From fab739c480ed8641cb7e1c6fb2cc30f9346056e5 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 23 Feb 2021 17:55:20 +0100 Subject: [PATCH 199/478] CI Add workflow to check Changelog entry. 
(#19155) Co-authored-by: Nicolas Hug --- .github/workflows/check-changelog.yml | 54 +++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 .github/workflows/check-changelog.yml diff --git a/.github/workflows/check-changelog.yml b/.github/workflows/check-changelog.yml new file mode 100644 index 0000000000000..7d954c530cff8 --- /dev/null +++ b/.github/workflows/check-changelog.yml @@ -0,0 +1,54 @@ +name: Check Changelog +# This check makes sure that the changelog is properly updated +# when a PR introduces a change in a test file. +# To bypass this check, label the PR with "No Changelog Needed". +on: + pull_request: + +jobs: + check: + runs-on: ubuntu-latest + if: ${{ contains(github.event.pull_request.labels.*.name, 'No Changelog Needed') == 0 }} + steps: + - name: Get PR number and milestone + run: | + echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV + echo "TAGGED_MILESTONE=${{ github.event.pull_request.milestone.title }} >> $GITHUB_ENV + - uses: actions/checkout@v2 + with: + fetch-depth: '0' + - name: Check the changelog + run: | + set -xe + changed_files=$(git diff --name-only origin/master) + # Changelog should be updated only if tests have been modified + if [[ ! "$changed_files" =~ tests ]] + then + exit 0 + fi + all_changelogs=$(cat ./doc/whats_new/v*.rst) + if [[ "$all_changelogs" =~ :pr:\`$PR_NUMBER\` ]] + then + echo "Changelog has been updated." + # If the pull request is milestoned check the correspondent changelog + if exist -f ./doc/whats_new/v${TAGGED_MILESTONE:0:4}.rst + then + expected_changelog=$(cat ./doc/whats_new/v${TAGGED_MILESTONE:0:4}.rst) + if [[ "$expected_changelog" =~ :pr:\`$PR_NUMBER\` ]] + then + echo "Changelog and milestone correspond." + else + echo "Changelog and milestone do not correspond." + echo "If you see this error make sure that the tagged milestone for the PR" + echo "and the changelog name properly match." + exit 1 + fi + fi + else + echo "Changelog entry is missing." + echo "If you see this error and there is already a changelog entry then make sure that" + echo "the PR number is correct. If no changelog entry is required for this PR," + echo "label the PR with 'No Changelog Needed' to bypass this check." + exit 1 + fi + From e23dd851476ef54c2153d6178500a3e2345f95b4 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Tue, 23 Feb 2021 13:18:41 -0500 Subject: [PATCH 200/478] TST Does not use cache in openml test (#19534) --- sklearn/datasets/tests/test_openml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index a84e705b0db68..9f55909c6643b 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -528,12 +528,12 @@ def test_fetch_openml_as_frame_auto(monkeypatch): data_id = 61 # iris dataset version 1 _monkey_patch_webbased_functions(monkeypatch, data_id, True) - data = fetch_openml(data_id=data_id, as_frame='auto') + data = fetch_openml(data_id=data_id, as_frame='auto', cache=False) assert isinstance(data.data, pd.DataFrame) data_id = 292 # Australian dataset version 1 _monkey_patch_webbased_functions(monkeypatch, data_id, True) - data = fetch_openml(data_id=data_id, as_frame='auto') + data = fetch_openml(data_id=data_id, as_frame='auto', cache=False) assert isinstance(data.data, scipy.sparse.csr_matrix) From f2943c6d10f68dd2144a80ad4b12475a25ff635a Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 24 Feb 2021 14:44:08 +1100 Subject: [PATCH 201/478] CI Fix shell syntax introduced in #19155 --- .github/workflows/check-changelog.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-changelog.yml b/.github/workflows/check-changelog.yml index 7d954c530cff8..9560e4cb9d680 100644 --- a/.github/workflows/check-changelog.yml +++ b/.github/workflows/check-changelog.yml @@ -13,7 +13,7 @@ jobs: - name: Get PR number and milestone run: | echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV - echo "TAGGED_MILESTONE=${{ github.event.pull_request.milestone.title }} >> $GITHUB_ENV + echo "TAGGED_MILESTONE=${{ github.event.pull_request.milestone.title }}" >> $GITHUB_ENV - uses: actions/checkout@v2 with: fetch-depth: '0' From 86445abc8086305e6723993beb1a55ed7344ef19 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 24 Feb 2021 17:16:44 +1100 Subject: [PATCH 202/478] CI Fix origin/master -> origin/main in check-changelog --- .github/workflows/check-changelog.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-changelog.yml b/.github/workflows/check-changelog.yml index 9560e4cb9d680..5957744d907c7 100644 --- a/.github/workflows/check-changelog.yml +++ b/.github/workflows/check-changelog.yml @@ -20,7 +20,7 @@ jobs: - name: Check the changelog run: | set -xe - changed_files=$(git diff --name-only origin/master) + changed_files=$(git diff --name-only origin/main) # Changelog should be updated only if tests have been modified if [[ ! "$changed_files" =~ tests ]] then From 638b7689bbbfae4bcc4592c6f8a43ce86b571f0b Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 24 Feb 2021 01:23:44 -0500 Subject: [PATCH 203/478] ENH Adds nan passthrough in OrdinalEncoder (#19069) --- doc/modules/preprocessing.rst | 11 ++ doc/whats_new/v1.0.rst | 6 + sklearn/preprocessing/_encoders.py | 35 ++++- sklearn/preprocessing/tests/test_encoders.py | 137 ++++++++++++++++--- 4 files changed, 167 insertions(+), 22 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index e1b4c5599c3b5..b87971ec4ae5a 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -482,6 +482,17 @@ scikit-learn estimators, as these expect continuous input, and would interpret the categories as being ordered, which is often not desired (i.e. 
the set of browsers was ordered arbitrarily). +:class:`OrdinalEncoder` will also passthrough missing values that are +indicated by `np.nan`. + + >>> enc = preprocessing.OrdinalEncoder() + >>> X = [['male'], ['female'], [np.nan], ['female']] + >>> enc.fit_transform(X) + array([[ 1.], + [ 0.], + [nan], + [ 0.]]) + Another possibility to convert categorical features to features that can be used with scikit-learn estimators is to use a one-of-K, also known as one-hot or dummy encoding. diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 25e0b369bebd3..6a565b8d5e21b 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -123,6 +123,12 @@ Changelog not corresponding to their objective. :pr:`19172` by :user:`Mathurin Massias ` +:mod:`sklearn.preprocessing` +............................ + +- |Feature| :class:`preprocessing.OrdinalEncoder` supports passing through + missing values by default. :pr:`19069` by `Thomas Fan`_. + - |API|: The parameter ``normalize`` of :class:`linear_model.LinearRegression` is deprecated and will be removed in 1.2. Motivation for this deprecation: ``normalize`` parameter did not take any diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 342b730ba91ed..043f9fc40ef53 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -10,6 +10,7 @@ from ..utils import check_array, is_scalar_nan from ..utils.validation import check_is_fitted from ..utils.validation import _deprecate_positional_args +from ..utils._mask import _get_mask from ..utils._encode import _encode, _check_unknown, _unique @@ -752,7 +753,7 @@ def fit(self, X, y=None): if np.dtype(self.dtype).kind != 'f': raise ValueError( f"When unknown_value is np.nan, the dtype " - "parameter should be " + f"parameter should be " f"a float dtype. Got {self.dtype}." ) elif not isinstance(self.unknown_value, numbers.Integral): @@ -765,7 +766,7 @@ def fit(self, X, y=None): f"handle_unknown is 'use_encoded_value', " f"got {self.unknown_value}.") - self._fit(X) + self._fit(X, force_all_finite='allow-nan') if self.handle_unknown == 'use_encoded_value': for feature_cats in self.categories_: @@ -775,6 +776,21 @@ def fit(self, X, y=None): f"values already used for encoding the " f"seen categories.") + # stores the missing indices per category + self._missing_indices = {} + for cat_idx, categories_for_idx in enumerate(self.categories_): + for i, cat in enumerate(categories_for_idx): + if is_scalar_nan(cat): + self._missing_indices[cat_idx] = i + continue + + if np.dtype(self.dtype).kind != 'f' and self._missing_indices: + raise ValueError( + "There are missing values in features " + f"{list(self._missing_indices)}. For OrdinalEncoder to " + "passthrough missing values, the dtype parameter must be a " + "float") + return self def transform(self, X): @@ -791,9 +807,14 @@ def transform(self, X): X_out : sparse matrix or a 2-d array Transformed input. """ - X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) + X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown, + force_all_finite='allow-nan') X_trans = X_int.astype(self.dtype, copy=False) + for cat_idx, missing_idx in self._missing_indices.items(): + X_missing_mask = X_int[:, cat_idx] == missing_idx + X_trans[X_missing_mask, cat_idx] = np.nan + # create separate category for unknown values if self.handle_unknown == 'use_encoded_value': X_trans[~X_mask] = self.unknown_value @@ -814,7 +835,7 @@ def inverse_transform(self, X): Inverse transformed array. 
""" check_is_fitted(self) - X = check_array(X, accept_sparse='csr') + X = check_array(X, accept_sparse='csr', force_all_finite='allow-nan') n_samples, _ = X.shape n_features = len(self.categories_) @@ -833,6 +854,12 @@ def inverse_transform(self, X): for i in range(n_features): labels = X[:, i].astype('int64', copy=False) + + # replace values of X[:, i] that were nan with actual indices + if i in self._missing_indices: + X_i_mask = _get_mask(X[:, i], np.nan) + labels[X_i_mask] = self._missing_indices[i] + if self.handle_unknown == 'use_encoded_value': unknown_labels = labels == self.unknown_value X_tr[:, i] = self.categories_[i][np.where( diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index fd28d8c40b46c..b1eff0cad21e0 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -574,24 +574,6 @@ def test_ordinal_encoder_inverse(): enc.inverse_transform(X_tr) -@pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T, - np.array([['a', np.nan]], dtype=object).T], - ids=['numeric', 'object']) -def test_ordinal_encoder_raise_missing(X): - ohe = OrdinalEncoder() - - with pytest.raises(ValueError, match="Input contains NaN"): - ohe.fit(X) - - with pytest.raises(ValueError, match="Input contains NaN"): - ohe.fit_transform(X) - - ohe.fit(X[:1, :]) - - with pytest.raises(ValueError, match="Input contains NaN"): - ohe.transform(X) - - def test_ordinal_encoder_handle_unknowns_string(): enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-2) X_fit = np.array([['a', 'x'], ['b', 'y'], ['c', 'z']], dtype=object) @@ -930,3 +912,122 @@ def test_ohe_missing_value_support_pandas_categorical(pd_nan_type): assert len(ohe.categories_) == 1 assert_array_equal(ohe.categories_[0][:-1], ['a', 'b', 'c']) assert np.isnan(ohe.categories_[0][-1]) + + +def test_ordinal_encoder_passthrough_missing_values_float_errors_dtype(): + """Test ordinal encoder with nan passthrough fails when dtype=np.int32.""" + + X = np.array([[np.nan, 3.0, 1.0, 3.0]]).T + oe = OrdinalEncoder(dtype=np.int32) + + msg = (r"There are missing values in features \[0\]. 
For OrdinalEncoder " + "to passthrough missing values, the dtype parameter must be a " + "float") + with pytest.raises(ValueError, match=msg): + oe.fit(X) + + +def test_ordinal_encoder_passthrough_missing_values_float(): + """Test ordinal encoder with nan on float dtypes.""" + + X = np.array([[np.nan, 3.0, 1.0, 3.0]], dtype=np.float64).T + oe = OrdinalEncoder().fit(X) + + assert len(oe.categories_) == 1 + assert_allclose(oe.categories_[0], [1.0, 3.0, np.nan]) + + X_trans = oe.transform(X) + assert_allclose(X_trans, [[np.nan], [1.0], [0.0], [1.0]]) + + X_inverse = oe.inverse_transform(X_trans) + assert_allclose(X_inverse, X) + + +@pytest.mark.parametrize('pd_nan_type', ['pd.NA', 'np.nan']) +def test_ordinal_encoder_missing_value_support_pandas_categorical(pd_nan_type): + """Check ordinal encoder is compatible with pandas.""" + # checks pandas dataframe with categorical features + if pd_nan_type == 'pd.NA': + # pd.NA is in pandas 1.0 + pd = pytest.importorskip('pandas', minversion="1.0") + pd_missing_value = pd.NA + else: # np.nan + pd = pytest.importorskip('pandas') + pd_missing_value = np.nan + + df = pd.DataFrame({ + 'col1': pd.Series(['c', 'a', pd_missing_value, 'b', 'a'], + dtype='category'), + }) + + oe = OrdinalEncoder().fit(df) + assert len(oe.categories_) == 1 + assert_array_equal(oe.categories_[0][:3], ['a', 'b', 'c']) + assert np.isnan(oe.categories_[0][-1]) + + df_trans = oe.transform(df) + + assert_allclose(df_trans, [[2.0], [0.0], [np.nan], [1.0], [0.0]]) + + X_inverse = oe.inverse_transform(df_trans) + assert X_inverse.shape == (5, 1) + assert_array_equal(X_inverse[:2, 0], ['c', 'a']) + assert_array_equal(X_inverse[3:, 0], ['b', 'a']) + assert np.isnan(X_inverse[2, 0]) + + +@pytest.mark.parametrize("X, X2, cats, cat_dtype", [ + ((np.array([['a', np.nan]], dtype=object).T, + np.array([['a', 'b']], dtype=object).T, + [np.array(['a', np.nan, 'd'], dtype=object)], np.object_)), + ((np.array([['a', np.nan]], dtype=object).T, + np.array([['a', 'b']], dtype=object).T, + [np.array(['a', np.nan, 'd'], dtype=object)], np.object_)), + ((np.array([[2.0, np.nan]], dtype=np.float64).T, + np.array([[3.0]], dtype=np.float64).T, + [np.array([2.0, 4.0, np.nan])], np.float64)), + ], ids=['object-None-missing-value', 'object-nan-missing_value', + 'numeric-missing-value']) +def test_ordinal_encoder_specified_categories_missing_passthrough( + X, X2, cats, cat_dtype): + """Test ordinal encoder for specified categories.""" + oe = OrdinalEncoder(categories=cats) + exp = np.array([[0.], [np.nan]]) + assert_array_equal(oe.fit_transform(X), exp) + # manually specified categories should have same dtype as + # the data when coerced from lists + assert oe.categories_[0].dtype == cat_dtype + + # when specifying categories manually, unknown categories should already + # raise when fitting + oe = OrdinalEncoder(categories=cats) + with pytest.raises(ValueError, match="Found unknown categories"): + oe.fit(X2) + + +@pytest.mark.parametrize("X, expected_X_trans, X_test", [ + (np.array([[1.0, np.nan, 3.0]]).T, + np.array([[0.0, np.nan, 1.0]]).T, + np.array([[4.0]])), + (np.array([[1.0, 4.0, 3.0]]).T, + np.array([[0.0, 2.0, 1.0]]).T, + np.array([[np.nan]])), + (np.array([['c', np.nan, 'b']], dtype=object).T, + np.array([[1.0, np.nan, 0.0]]).T, + np.array([['d']], dtype=object)), + (np.array([['c', 'a', 'b']], dtype=object).T, + np.array([[2.0, 0.0, 1.0]]).T, + np.array([[np.nan]], dtype=object)), +]) +def test_ordinal_encoder_handle_missing_and_unknown( + X, expected_X_trans, X_test +): + """Test the interaction 
between missing values and handle_unknown""" + + oe = OrdinalEncoder(handle_unknown="use_encoded_value", + unknown_value=-1) + + X_trans = oe.fit_transform(X) + assert_allclose(X_trans, expected_X_trans) + + assert_allclose(oe.transform(X_test), [[-1.0]]) From c748e465c76c43a173ad5ab2fd82639210f8e895 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 25 Feb 2021 10:21:52 +0100 Subject: [PATCH 204/478] FIX Don't scale near-constant features to large values (#19527) --- doc/whats_new/v1.0.rst | 7 +++ sklearn/linear_model/_base.py | 6 ++- sklearn/linear_model/tests/test_base.py | 24 ++++++--- sklearn/preprocessing/_data.py | 34 ++++++++++--- sklearn/preprocessing/tests/test_data.py | 63 ++++++++++++++++++++++-- 5 files changed, 113 insertions(+), 21 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 6a565b8d5e21b..3e36438dda095 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -193,6 +193,13 @@ Changelog positioning strategy ``knots``. :pr:`18368` by :user:`Christian Lorentzen `. +- |Fix| :func:`preprocessing.scale`, :class:`preprocessing.StandardScaler` + and similar scalers detect near-constant features to avoid scaling them to + very large values. This problem happens in particular when using a scaler on + sparse data with a constant column with sample weights, in which case + centering is typically disabled. :pr:`19527` by :user:`Oliver Grisel + ` and :user:`Maria Telenczuk `. + :mod:`sklearn.tree` ................... diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 61005cb4b5d4a..28cc386b4ecda 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -246,9 +246,13 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, X_var = X_var.astype(X.dtype, copy=False) if normalize: + # Detect constant features on the computed variance, before taking + # the np.sqrt. Otherwise constant features cannot be detected with + # sample_weights. + constant_mask = X_var < 10 * np.finfo(X.dtype).eps X_var *= X.shape[0] X_scale = np.sqrt(X_var, out=X_var) - X_scale[X_scale < 10 * np.finfo(X_scale.dtype).eps] = 1. + X_scale[constant_mask] = 1. if sp.issparse(X): inplace_column_scale(X, 1. / X_scale) else: diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index 56ee18f5f0d06..bf7a2696fcda2 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -478,10 +478,8 @@ def test_preprocess_data_weighted(is_sparse): # better check the impact of feature scaling. X[:, 0] *= 10 - # Constant non-zero feature: this edge-case is currently not handled - # correctly for sparse data, see: - # https://github.com/scikit-learn/scikit-learn/issues/19450 - # X[:, 2] = 1. + # Constant non-zero feature. + X[:, 2] = 1. # Constant zero feature (non-materialized in the sparse case) X[:, 3] = 0. 
@@ -495,10 +493,12 @@ def test_preprocess_data_weighted(is_sparse): X_sample_weight_var = np.average((X - X_sample_weight_avg)**2, weights=sample_weight, axis=0) + constant_mask = X_sample_weight_var < 10 * np.finfo(X.dtype).eps + assert_array_equal(constant_mask, [0, 0, 1, 1]) expected_X_scale = np.sqrt(X_sample_weight_var) * np.sqrt(n_samples) # near constant features should not be scaled - expected_X_scale[expected_X_scale < 10 * np.finfo(np.float64).eps] = 1 + expected_X_scale[constant_mask] = 1 if is_sparse: X = sparse.csr_matrix(X) @@ -538,14 +538,22 @@ def test_preprocess_data_weighted(is_sparse): # _preprocess_data with normalize=True scales the data by the feature-wise # euclidean norms while StandardScaler scales the data by the feature-wise # standard deviations. - # The two are equivalent up to a ratio of np.sqrt(n_samples) + # The two are equivalent up to a ratio of np.sqrt(n_samples). if is_sparse: scaler = StandardScaler(with_mean=False).fit( X, sample_weight=sample_weight) + # Non-constant features are scaled similarly with np.sqrt(n_samples) assert_array_almost_equal( - scaler.transform(X).toarray() / np.sqrt(n_samples), Xt.toarray() - ) + scaler.transform(X).toarray()[:, :2] / np.sqrt(n_samples), + Xt.toarray()[:, :2] + ) + + # Constant features go through un-scaled. + assert_array_almost_equal( + scaler.transform(X).toarray()[:, 2:], + Xt.toarray()[:, 2:] + ) else: scaler = StandardScaler(with_mean=True).fit( X, sample_weight=sample_weight) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 92a4135147b87..29190dd6e2b67 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -60,22 +60,36 @@ ] -def _handle_zeros_in_scale(scale, copy=True): - """Makes sure that whenever scale is zero, we handle it correctly. +def _handle_zeros_in_scale(scale, copy=True, constant_mask=None): + """Set scales of near constant features to 1. - This happens in most scalers when we have constant features. - """ + The goal is to avoid division by very small or zero values. + + Near constant features are detected automatically by identifying + scales close to machine precision unless they are precomputed by + the caller and passed with the `constant_mask` kwarg. + Typically for standard scaling, the scales are the standard + deviation while near constant features are better detected on the + computed variances which are closer to machine precision by + construction. + """ # if we are fitting on 1D arrays, scale might be a scalar if np.isscalar(scale): if scale == .0: scale = 1. return scale elif isinstance(scale, np.ndarray): + if constant_mask is None: + # Detect near constant values to avoid dividing by a very small + # value that could lead to suprising results and numerical + # stability issues. 
+ constant_mask = scale < 10 * np.finfo(scale.dtype).eps + if copy: # New array to avoid side-effects scale = scale.copy() - scale[scale == 0.0] = 1.0 + scale[constant_mask] = 1.0 return scale @@ -408,7 +422,7 @@ def partial_fit(self, X, y=None): data_range = data_max - data_min self.scale_ = ((feature_range[1] - feature_range[0]) / - _handle_zeros_in_scale(data_range)) + _handle_zeros_in_scale(data_range, copy=True)) self.min_ = feature_range[0] - data_min * self.scale_ self.data_min_ = data_min self.data_max_ = data_max @@ -850,7 +864,11 @@ def partial_fit(self, X, y=None, sample_weight=None): self.n_samples_seen_ = self.n_samples_seen_[0] if self.with_std: - self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_)) + # Extract the list of near constant features on the raw variances, + # before taking the square root. + constant_mask = self.var_ < 10 * np.finfo(X.dtype).eps + self.scale_ = _handle_zeros_in_scale( + np.sqrt(self.var_), copy=False, constant_mask=constant_mask) else: self.scale_ = None @@ -1078,7 +1096,7 @@ def partial_fit(self, X, y=None): self.n_samples_seen_ += X.shape[0] self.max_abs_ = max_abs - self.scale_ = _handle_zeros_in_scale(max_abs) + self.scale_ = _handle_zeros_in_scale(max_abs, copy=True) return self def transform(self, X): diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 974dad31258eb..fdd88be0ccff4 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -414,6 +414,62 @@ def test_standard_scaler_dtype(add_sample_weight, sparse_constructor): assert scaler.scale_.dtype == np.float64 +@pytest.mark.parametrize("scaler", [ + StandardScaler(with_mean=False), + RobustScaler(with_centering=False), +]) +@pytest.mark.parametrize("sparse_constructor", + [np.asarray, sparse.csc_matrix, sparse.csr_matrix]) +@pytest.mark.parametrize("add_sample_weight", [False, True]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("constant", [0, 1., 100.]) +def test_standard_scaler_constant_features( + scaler, add_sample_weight, sparse_constructor, dtype, constant): + if (isinstance(scaler, StandardScaler) + and constant > 1 + and sparse_constructor is not np.asarray + and add_sample_weight): + # https://github.com/scikit-learn/scikit-learn/issues/19546 + pytest.xfail("Computation of weighted variance is numerically unstable" + " for sparse data. See: #19546.") + + if isinstance(scaler, RobustScaler) and add_sample_weight: + pytest.skip(f"{scaler.__class__.__name__} does not yet support" + f" sample_weight") + + rng = np.random.RandomState(0) + n_samples = 100 + n_features = 1 + if add_sample_weight: + fit_params = dict(sample_weight=rng.uniform(size=n_samples) * 2) + else: + fit_params = {} + X_array = np.full(shape=(n_samples, n_features), fill_value=constant, + dtype=dtype) + X = sparse_constructor(X_array) + X_scaled = scaler.fit(X, **fit_params).transform(X) + + if isinstance(scaler, StandardScaler): + # The variance info should be close to zero for constant features. + assert_allclose(scaler.var_, np.zeros(X.shape[1]), atol=1e-7) + + # Constant features should not be scaled (scale of 1.): + assert_allclose(scaler.scale_, np.ones(X.shape[1])) + + if hasattr(X_scaled, "toarray"): + assert_allclose(X_scaled.toarray(), X_array) + else: + assert_allclose(X_scaled, X) + + if isinstance(scaler, StandardScaler) and not add_sample_weight: + # Also check consistency with the standard scale function. 
+ X_scaled_2 = scale(X, with_mean=scaler.with_mean) + if hasattr(X_scaled_2, "toarray"): + assert_allclose(X_scaled_2.toarray(), X_scaled_2.toarray()) + else: + assert_allclose(X_scaled_2, X_scaled_2) + + def test_scale_1d(): # 1-d inputs X_list = [1., 3., 5., 0.] @@ -538,12 +594,11 @@ def test_scaler_float16_overflow(): def test_handle_zeros_in_scale(): - s1 = np.array([0, 1, 2, 3]) + s1 = np.array([0, 1e-16, 1, 2, 3]) s2 = _handle_zeros_in_scale(s1, copy=True) - assert not s1[0] == s2[0] - assert_array_equal(s1, np.array([0, 1, 2, 3])) - assert_array_equal(s2, np.array([1, 1, 2, 3])) + assert_allclose(s1, np.array([0, 1e-16, 1, 2, 3])) + assert_allclose(s2, np.array([1, 1, 1, 2, 3])) def test_minmax_scaler_partial_fit(): From 15d2df47b12d0bb3243d76d71401f9f4cc71caeb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Carlos=20Alfaro=20Jim=C3=A9nez?= Date: Thu, 25 Feb 2021 10:22:30 +0100 Subject: [PATCH 205/478] MNT Clear travis installation script (#19532) --- build_tools/travis/install_main.sh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/build_tools/travis/install_main.sh b/build_tools/travis/install_main.sh index 383fefa5bd1a3..c0795139859bb 100755 --- a/build_tools/travis/install_main.sh +++ b/build_tools/travis/install_main.sh @@ -32,12 +32,7 @@ ccache --max-size 100M --show-stats # to setup a conda-based environment instead deactivate -if [[ $TRAVIS_CPU_ARCH == arm64 ]]; then - # Different Miniconda URL for ARM64 architectures - MINICONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-aarch64.sh" -else - MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh" -fi +MINICONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-aarch64.sh" # Install Miniconda wget $MINICONDA_URL -O miniconda.sh From 70af34c4afd34dbb604ef888846b2d62e93cf225 Mon Sep 17 00:00:00 2001 From: DS_anas <32871888+anashas@users.noreply.github.com> Date: Thu, 25 Feb 2021 10:23:41 +0100 Subject: [PATCH 206/478] TST Use pytest.warns in sklearn.semi_supervised tests (#19510) --- .../tests/test_label_propagation.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py index 652f83b90a3d6..9f355281d9881 100644 --- a/sklearn/semi_supervised/tests/test_label_propagation.py +++ b/sklearn/semi_supervised/tests/test_label_propagation.py @@ -4,8 +4,6 @@ import pytest from scipy.sparse import issparse -from sklearn.utils._testing import assert_warns -from sklearn.utils._testing import assert_no_warnings from sklearn.semi_supervised import _label_propagation as label_propagation from sklearn.metrics.pairwise import rbf_kernel from sklearn.model_selection import train_test_split @@ -143,18 +141,25 @@ def test_convergence_warning(): X = np.array([[1., 0.], [0., 1.], [1., 2.5]]) y = np.array([0, 1, -1]) mdl = label_propagation.LabelSpreading(kernel='rbf', max_iter=1) - assert_warns(ConvergenceWarning, mdl.fit, X, y) + warn_msg = ('max_iter=1 was reached without convergence.') + with pytest.warns(ConvergenceWarning, match=warn_msg): + mdl.fit(X, y) assert mdl.n_iter_ == mdl.max_iter mdl = label_propagation.LabelPropagation(kernel='rbf', max_iter=1) - assert_warns(ConvergenceWarning, mdl.fit, X, y) + with pytest.warns(ConvergenceWarning, match=warn_msg): + mdl.fit(X, y) assert mdl.n_iter_ == mdl.max_iter mdl = 
label_propagation.LabelSpreading(kernel='rbf', max_iter=500) - assert_no_warnings(mdl.fit, X, y) + with pytest.warns(None) as record: + mdl.fit(X, y) + assert len(record) == 0 mdl = label_propagation.LabelPropagation(kernel='rbf', max_iter=500) - assert_no_warnings(mdl.fit, X, y) + with pytest.warns(None) as record: + mdl.fit(X, y) + assert len(record) == 0 @pytest.mark.parametrize("LabelPropagationCls", @@ -170,7 +175,9 @@ def test_label_propagation_non_zero_normalizer(LabelPropagationCls): mdl = LabelPropagationCls(kernel='knn', max_iter=100, n_neighbors=1) - assert_no_warnings(mdl.fit, X, y) + with pytest.warns(None) as record: + mdl.fit(X, y) + assert len(record) == 0 def test_predict_sparse_callable_kernel(): From 97fbf4eb2e162fc1bedbfd7fa4b65bc70af9f6a4 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Thu, 25 Feb 2021 11:02:54 +0100 Subject: [PATCH 207/478] [CI] Add trigging events to check-changelog workflow. (#19545) Co-authored-by: Joel Nothman --- .github/workflows/check-changelog.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-changelog.yml b/.github/workflows/check-changelog.yml index 5957744d907c7..753f473354131 100644 --- a/.github/workflows/check-changelog.yml +++ b/.github/workflows/check-changelog.yml @@ -4,6 +4,7 @@ name: Check Changelog # To bypass this check, label the PR with "No Changelog Needed". on: pull_request: + types: [opened, edited, labeled, unlabeled, synchronize] jobs: check: @@ -51,4 +52,3 @@ jobs: echo "label the PR with 'No Changelog Needed' to bypass this check." exit 1 fi - From 052efae8916080bd26722e7027cbfdf9296077f2 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 25 Feb 2021 06:02:08 -0500 Subject: [PATCH 208/478] TST Allows isotonic and manifold (#19539) --- sklearn/tests/test_common.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 4f6f232a8f716..6a4702aefa34c 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -267,8 +267,6 @@ def test_search_cv(estimator, check, request): 'calibration', 'compose', 'feature_extraction', - 'isotonic', - 'manifold', 'mixture', 'model_selection', 'multiclass', From 12db86ee1ba602f9352a49cab8d731b4dc55cd08 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 25 Feb 2021 08:59:27 -0500 Subject: [PATCH 209/478] ENH Checks n_features_in_ after fitting in random_projection (#19541) --- sklearn/random_projection.py | 5 ++--- sklearn/tests/test_common.py | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 33dc108a59a4e..8e968088e8141 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -38,7 +38,7 @@ from .utils import check_random_state from .utils.extmath import safe_sparse_dot from .utils.random import sample_without_replacement -from .utils.validation import check_array, check_is_fitted +from .utils.validation import check_is_fitted from .utils.validation import _deprecate_positional_args from .exceptions import DataDimensionalityWarning @@ -402,9 +402,8 @@ def transform(self, X): X_new : {ndarray, sparse matrix} of shape (n_samples, n_components) Projected array. 
""" - X = check_array(X, accept_sparse=['csr', 'csc']) - check_is_fitted(self) + X = self._validate_data(X, accept_sparse=['csr', 'csc'], reset=False) if X.shape[1] != self.components_.shape[1]: raise ValueError( diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 6a4702aefa34c..859335843fd76 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -272,7 +272,6 @@ def test_search_cv(estimator, check, request): 'multiclass', 'multioutput', 'pipeline', - 'random_projection', } N_FEATURES_IN_AFTER_FIT_ESTIMATORS = [ From 139d75148ee22f1aa4f44ca561a47b62b4864801 Mon Sep 17 00:00:00 2001 From: Vangelis Gkiastas <50487017+egkiastas@users.noreply.github.com> Date: Thu, 25 Feb 2021 16:11:20 +0200 Subject: [PATCH 210/478] DOC Update calibration.rst (#19557) --- doc/modules/calibration.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/calibration.rst b/doc/modules/calibration.rst index 146601d70765e..d0a9737dac612 100644 --- a/doc/modules/calibration.rst +++ b/doc/modules/calibration.rst @@ -181,7 +181,7 @@ common kernel functions on various benchmark datasets in section 2.1 of Platt 1999 [3]_ but does not necessarily hold in general. Additionally, the logistic model works best if the calibration error is symmetrical, meaning the classifier output for each binary class is normally distributed with -the same variance [6]_. This is can be a problem for highly imbalanced +the same variance [6]_. This can be a problem for highly imbalanced classification problems, where outputs do not have equal variance. In general this method is most effective when the un-calibrated model is From 94abe05b4b96de2ca30d998fb9adb2fbd3eb1bde Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 25 Feb 2021 09:16:14 -0500 Subject: [PATCH 211/478] ENH Enables common test for bicluster (#19542) --- sklearn/cluster/_bicluster.py | 13 +++++++++++++ sklearn/tests/test_common.py | 3 --- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index 6d293206bddd8..3bde33399a8e0 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -178,6 +178,19 @@ def _k_means(self, data, n_clusters): labels = model.labels_ return centroid, labels + def _more_tags(self): + return { + "_xfail_checks": { + "check_estimators_dtypes": "raises nan error", + "check_fit2d_1sample": "_scale_normalize fails", + "check_fit2d_1feature": "raises apply_along_axis error", + "check_estimator_sparse_data": "does not fail gracefully", + "check_methods_subset_invariance": "empty array passed inside", + "check_dont_overwrite_parameters": "empty array passed inside", + "check_fit2d_predict1d": "emptry array passed inside", + } + } + class SpectralCoclustering(BaseSpectral): """Spectral Co-Clustering algorithm (Dhillon, 2001). 
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 859335843fd76..bfd7f98268350 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -23,7 +23,6 @@ from sklearn.utils.estimator_checks import check_estimator import sklearn -from sklearn.base import BiclusterMixin from sklearn.decomposition import PCA from sklearn.linear_model._base import LinearClassifierMixin @@ -73,8 +72,6 @@ def test_get_check_estimator_ids(val, expected): def _tested_estimators(): for name, Estimator in all_estimators(): - if issubclass(Estimator, BiclusterMixin): - continue try: estimator = _construct_instance(Estimator) except SkipTest: From e0f0c7f8533550dc73822c93837bf1c609659096 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 27 Feb 2021 04:02:58 +0100 Subject: [PATCH 212/478] DOC Fix documentation on pickle portability (#19561) Co-authored-by: Olivier Grisel --- doc/modules/model_persistence.rst | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/doc/modules/model_persistence.rst b/doc/modules/model_persistence.rst index 19d3e12205c12..e00212d80fd10 100644 --- a/doc/modules/model_persistence.rst +++ b/doc/modules/model_persistence.rst @@ -59,10 +59,10 @@ Security & maintainability limitations pickle (and joblib by extension), has some issues regarding maintainability and security. Because of this, -* Never unpickle untrusted data as it could lead to malicious code being +* Never unpickle untrusted data as it could lead to malicious code being executed upon loading. -* While models saved using one version of scikit-learn might load in - other versions, this is entirely unsupported and inadvisable. It should +* While models saved using one version of scikit-learn might load in + other versions, this is entirely unsupported and inadvisable. It should also be kept in mind that operations performed on such data could give different and unexpected results. @@ -77,12 +77,11 @@ additional metadata should be saved along the pickled model: This should make it possible to check that the cross-validation score is in the same range as before. -Since a model internal representation may be different on two different -architectures, dumping a model on one architecture and loading it on -another architecture is not a supported behaviour, even if it might work -on some cases. -To overcome the issue of portability, pickle models are often deployed in -production using containers, like docker. +Aside for a few exceptions, pickled models should be portable across +architectures assuming the same versions of dependencies and Python are used. +If you encounter an estimator that is not portable please open an issue on +GitHub. Pickled models are often deployed in production using containers, like +Docker, in order to freeze the environment and dependencies. If you want to know more about these issues and explore other possible serialization methods, please refer to this @@ -108,7 +107,7 @@ models between different machine learning frameworks, and to improve their portability on different computing architectures. More details are available from the `ONNX tutorial `_. To convert scikit-learn model to ONNX a specific tool `sklearn-onnx -`_ has been developed. +`_ has been developed. 
PMML is an implementation of the `XML `_ document standard From f0a6f054e03bbdba96219b9698760583b3e5037e Mon Sep 17 00:00:00 2001 From: mlondschien <61679398+mlondschien@users.noreply.github.com> Date: Sat, 27 Feb 2021 13:09:43 +0100 Subject: [PATCH 213/478] FIX bug in SplineTransformer.n_features_out_ (#19577) --- sklearn/preprocessing/_polynomial.py | 4 ++-- sklearn/preprocessing/tests/test_polynomial.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 47ab90be2ebcd..26587e7f05823 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -307,7 +307,7 @@ def fit(self, X, y=None): ] self.bsplines_ = bsplines - self.n_features_out_ = n_out - n_features * self.include_bias + self.n_features_out_ = n_out - n_features * (1 - self.include_bias) return self def transform(self, X): @@ -336,7 +336,7 @@ def transform(self, X): # Note that scipy BSpline returns float64 arrays and converts input # x=X[:, i] to c-contiguous float64. - n_out = self.n_features_out_ + n_features * self.include_bias + n_out = self.n_features_out_ + n_features * (1 - self.include_bias) if X.dtype in FLOAT_DTYPES: dtype = X.dtype else: diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py index 9dd65c44d8bba..2ca3260f7c05e 100644 --- a/sklearn/preprocessing/tests/test_polynomial.py +++ b/sklearn/preprocessing/tests/test_polynomial.py @@ -243,3 +243,19 @@ def test_spline_transformer_kbindiscretizer(): # Though they should be exactly equal, we test approximately with high # accuracy. assert_allclose(splines, kbins, rtol=1e-13) + + +@pytest.mark.parametrize("n_knots", [5, 10]) +@pytest.mark.parametrize("include_bias", [True, False]) +@pytest.mark.parametrize("degree", [3, 5]) +def test_spline_transformer_n_features_out(n_knots, include_bias, degree): + """Test that transform results in n_features_out_ features.""" + splt = SplineTransformer( + n_knots=n_knots, + degree=degree, + include_bias=include_bias + ) + X = np.linspace(0, 1, 10)[:, None] + splt.fit(X) + + assert splt.transform(X).shape[1] == splt.n_features_out_ From 0df7abfc87fd7aa875d0ee5ad133c455b2ded423 Mon Sep 17 00:00:00 2001 From: Steven Kolawole <45284829+SteveKola@users.noreply.github.com> Date: Sat, 27 Feb 2021 16:01:46 +0100 Subject: [PATCH 214/478] TST replace asert_warns by pytest.warns in compose/tests (#19492) Co-authored-by: Olivier Grisel Co-authored-by: Olivier Grisel --- sklearn/compose/tests/test_target.py | 7 +++---- sklearn/neighbors/tests/test_nearest_centroid.py | 1 - 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/sklearn/compose/tests/test_target.py b/sklearn/compose/tests/test_target.py index 573518b3fa43a..dc5d8d95743ef 100644 --- a/sklearn/compose/tests/test_target.py +++ b/sklearn/compose/tests/test_target.py @@ -8,7 +8,6 @@ from sklearn.dummy import DummyRegressor from sklearn.utils._testing import assert_allclose -from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import assert_no_warnings from sklearn.preprocessing import FunctionTransformer @@ -54,9 +53,9 @@ def test_transform_target_regressor_invertible(): regr = TransformedTargetRegressor(regressor=LinearRegression(), func=np.sqrt, inverse_func=np.log, check_inverse=True) - assert_warns_message(UserWarning, "The provided functions or transformer" - " are not strictly inverse of each other.", - regr.fit, X, y) + with 
pytest.warns(UserWarning, match="The provided functions or" + " transformer are not strictly inverse of each other."): + regr.fit(X, y) regr = TransformedTargetRegressor(regressor=LinearRegression(), func=np.sqrt, inverse_func=np.log) regr.set_params(check_inverse=False) diff --git a/sklearn/neighbors/tests/test_nearest_centroid.py b/sklearn/neighbors/tests/test_nearest_centroid.py index f91cae74b0585..9af02b07e2a96 100644 --- a/sklearn/neighbors/tests/test_nearest_centroid.py +++ b/sklearn/neighbors/tests/test_nearest_centroid.py @@ -1,7 +1,6 @@ """ Testing for the nearest centroid module. """ - import numpy as np import pytest from scipy import sparse as sp From 15c2c72e27c6ea18566f4e786506c7a3aef8a5de Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 27 Feb 2021 12:35:40 -0500 Subject: [PATCH 215/478] FIX Do not call get_feature_names for empty column selections (#19579) --- doc/whats_new/v0.24.rst | 7 +++++++ sklearn/compose/_column_transformer.py | 3 +-- sklearn/compose/tests/test_column_transformer.py | 10 ++++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 6f2584dccdd10..5ac6f74f3d7df 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -12,6 +12,13 @@ Version 0.24.2 Changelog --------- +:mod:`sklearn.compose` +...................... + +- |Fix| :meth:`compose.ColumnTransformer.get_feature_names` does not call + :term:`get_feature_names` on transformers with an empty column selection. + :pr:`19579` by `Thomas Fan`_. + :mod:`sklearn.ensemble` ....................... diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 3d71c1e5abbf5..c0444fe2d6cda 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -351,8 +351,7 @@ def get_feature_names(self): check_is_fitted(self) feature_names = [] for name, trans, column, _ in self._iter(fitted=True): - if trans == 'drop' or ( - hasattr(column, '__len__') and not len(column)): + if trans == 'drop' or _is_empty_column_selection(column): continue if trans == 'passthrough': if self._feature_names_in is not None: diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index f2a32d6f065f4..ae2e25b68210f 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1420,3 +1420,13 @@ def test_sk_visual_block_remainder_fitted_numpy(remainder): assert visual_block.names == ('scale', 'remainder') assert visual_block.name_details == ([0, 2], [1]) assert visual_block.estimators == (scaler, remainder) + + +@pytest.mark.parametrize("selector", [[], [False, False]]) +def test_get_feature_names_empty_selection(selector): + """Test that get_feature_names is only called for transformers that + were selected. Non-regression test for #19550. 
+ """ + ct = ColumnTransformer([('ohe', OneHotEncoder(drop='first'), selector)]) + ct.fit([[1, 2], [3, 4]]) + assert ct.get_feature_names() == [] From c00c4bd7e441fbe181302a74d24f8b08c67abff3 Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Mon, 1 Mar 2021 09:36:59 +0000 Subject: [PATCH 216/478] Change assert_raises to pytest_raises (#19509) Co-authored-by: Alihan Zihna --- sklearn/tests/test_calibration.py | 17 +++++++++-------- sklearn/tests/test_check_build.py | 7 ++++--- sklearn/tests/test_metaestimators.py | 12 ++++++------ 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py index 7c3ccd06815b3..4ba1599eba3e6 100644 --- a/sklearn/tests/test_calibration.py +++ b/sklearn/tests/test_calibration.py @@ -13,7 +13,7 @@ from sklearn.utils._testing import (assert_array_almost_equal, assert_almost_equal, assert_array_equal, - assert_raises, ignore_warnings) + ignore_warnings) from sklearn.utils.extmath import softmax from sklearn.exceptions import NotFittedError from sklearn.datasets import make_classification, make_blobs @@ -60,7 +60,8 @@ def test_calibration(data, method, ensemble): prob_pos_clf = clf.predict_proba(X_test)[:, 1] cal_clf = CalibratedClassifierCV(clf, cv=y.size + 1, ensemble=ensemble) - assert_raises(ValueError, cal_clf.fit, X, y) + with pytest.raises(ValueError): + cal_clf.fit(X, y) # Naive Bayes with calibration for this_X_train, this_X_test in [(X_train, X_test), @@ -386,8 +387,8 @@ def test_sigmoid_calibration(): # check that _SigmoidCalibration().fit only accepts 1d array or 2d column # arrays - assert_raises(ValueError, _SigmoidCalibration().fit, - np.vstack((exF, exF)), exY) + with pytest.raises(ValueError): + _SigmoidCalibration().fit(np.vstack((exF, exF)), exY) def test_calibration_curve(): @@ -406,8 +407,8 @@ def test_calibration_curve(): # probabilities outside [0, 1] should not be accepted when normalize # is set to False - assert_raises(ValueError, calibration_curve, [1.1], [-0.1], - normalize=False) + with pytest.raises(ValueError): + calibration_curve([1.1], [-0.1], normalize=False) # test that quantiles work as expected y_true2 = np.array([0, 0, 0, 0, 1, 1]) @@ -421,8 +422,8 @@ def test_calibration_curve(): assert_almost_equal(prob_pred_quantile, [0.1, 0.8]) # Check that error is raised when invalid strategy is selected - assert_raises(ValueError, calibration_curve, y_true2, y_pred2, - strategy='percentile') + with pytest.raises(ValueError): + calibration_curve(y_true2, y_pred2, strategy='percentile') @pytest.mark.parametrize('ensemble', [True, False]) diff --git a/sklearn/tests/test_check_build.py b/sklearn/tests/test_check_build.py index a7799ad1b3789..3c8e64e1ba906 100644 --- a/sklearn/tests/test_check_build.py +++ b/sklearn/tests/test_check_build.py @@ -5,10 +5,11 @@ # Author: G Varoquaux # License: BSD 3 clause -from sklearn.__check_build import raise_build_error +import pytest -from sklearn.utils._testing import assert_raises +from sklearn.__check_build import raise_build_error def test_raise_build_error(): - assert_raises(ImportError, raise_build_error, ImportError()) + with pytest.raises(ImportError): + raise_build_error(ImportError()) diff --git a/sklearn/tests/test_metaestimators.py b/sklearn/tests/test_metaestimators.py index 69a994c1b5fc0..2caa01d71c444 100644 --- a/sklearn/tests/test_metaestimators.py +++ b/sklearn/tests/test_metaestimators.py @@ -2,11 +2,11 @@ import functools import numpy as np +import pytest from sklearn.base import BaseEstimator from 
sklearn.datasets import make_classification -from sklearn.utils._testing import assert_raises from sklearn.utils.validation import check_is_fitted from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV, RandomizedSearchCV @@ -124,12 +124,12 @@ def score(self, X, y, *args, **kwargs): % (delegator_data.name, method)) # delegation before fit raises a NotFittedError if method == 'score': - assert_raises(NotFittedError, getattr(delegator, method), - delegator_data.fit_args[0], - delegator_data.fit_args[1]) + with pytest.raises(NotFittedError): + getattr(delegator, method)(delegator_data.fit_args[0], + delegator_data.fit_args[1]) else: - assert_raises(NotFittedError, getattr(delegator, method), - delegator_data.fit_args[0]) + with pytest.raises(NotFittedError): + getattr(delegator, method)(delegator_data.fit_args[0]) delegator.fit(*delegator_data.fit_args) for method in methods: From 8d3b4241120a1290c0477e77beb7d2fff454462e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 1 Mar 2021 14:45:01 +0100 Subject: [PATCH 217/478] FIX race condition in get_data_home causing FileExistsError (#19560) --- sklearn/datasets/_base.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index adcad1474550a..17d2db9f2075b 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -12,7 +12,7 @@ import shutil from collections import namedtuple from os import environ, listdir, makedirs -from os.path import dirname, exists, expanduser, isdir, join, splitext +from os.path import dirname, expanduser, isdir, join, splitext from ..utils import Bunch from ..utils import check_random_state @@ -52,8 +52,7 @@ def get_data_home(data_home=None) -> str: data_home = environ.get('SCIKIT_LEARN_DATA', join('~', 'scikit_learn_data')) data_home = expanduser(data_home) - if not exists(data_home): - makedirs(data_home) + makedirs(data_home, exist_ok=True) return data_home From 72db93cc40884f42e05e4290d6ab63713d0075c9 Mon Sep 17 00:00:00 2001 From: Mohamed Haseeb Date: Mon, 1 Mar 2021 18:07:56 +0100 Subject: [PATCH 218/478] TST replaces assert_raises* by pytest.raises in model_selection/tests/test_split.py (#19585) Co-authored-by: Cycks --- sklearn/model_selection/tests/test_split.py | 79 +++++++++++++-------- 1 file changed, 48 insertions(+), 31 deletions(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 183a2eab84b63..e6900c90e7a87 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -10,8 +10,6 @@ from itertools import permutations from sklearn.utils._testing import assert_allclose -from sklearn.utils._testing import assert_raises -from sklearn.utils._testing import assert_raises_regexp from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_raise_message @@ -206,11 +204,14 @@ def test_kfold_valueerrors(): # classes are less than n_splits. 
y = np.array([3, 3, -1, -1, 2]) - assert_raises(ValueError, next, skf_3.split(X2, y)) + with pytest.raises(ValueError): + next(skf_3.split(X2, y)) # Error when number of folds is <= 1 - assert_raises(ValueError, KFold, 0) - assert_raises(ValueError, KFold, 1) + with pytest.raises(ValueError): + KFold(0) + with pytest.raises(ValueError): + KFold(1) error_string = ("k-fold cross-validation requires at least one" " train/test split") assert_raise_message(ValueError, error_string, @@ -219,13 +220,18 @@ def test_kfold_valueerrors(): StratifiedKFold, 1) # When n_splits is not integer: - assert_raises(ValueError, KFold, 1.5) - assert_raises(ValueError, KFold, 2.0) - assert_raises(ValueError, StratifiedKFold, 1.5) - assert_raises(ValueError, StratifiedKFold, 2.0) + with pytest.raises(ValueError): + KFold(1.5) + with pytest.raises(ValueError): + KFold(2.0) + with pytest.raises(ValueError): + StratifiedKFold(1.5) + with pytest.raises(ValueError): + StratifiedKFold(2.0) # When shuffle is not a bool: - assert_raises(TypeError, KFold, n_splits=4, shuffle=None) + with pytest.raises(TypeError): + KFold(n_splits=4, shuffle=None) def test_kfold_indices(): @@ -565,24 +571,25 @@ def test_stratified_shuffle_split_init(): X = np.arange(7) y = np.asarray([0, 1, 1, 1, 2, 2, 2]) # Check that error is raised if there is a class with only one sample - assert_raises(ValueError, next, - StratifiedShuffleSplit(3, 0.2).split(X, y)) + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(3, 0.2).split(X, y)) # Check that error is raised if the test set size is smaller than n_classes - assert_raises(ValueError, next, StratifiedShuffleSplit(3, 2).split(X, y)) + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(3, 2).split(X, y)) # Check that error is raised if the train set size is smaller than # n_classes - assert_raises(ValueError, next, - StratifiedShuffleSplit(3, 3, 2).split(X, y)) + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(3, 3, 2).split(X, y)) X = np.arange(9) y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2]) # Train size or test size too small - assert_raises(ValueError, next, - StratifiedShuffleSplit(train_size=2).split(X, y)) - assert_raises(ValueError, next, - StratifiedShuffleSplit(test_size=2).split(X, y)) + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(train_size=2).split(X, y)) + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(test_size=2).split(X, y)) def test_stratified_shuffle_split_respects_test_size(): @@ -845,9 +852,9 @@ def test_leave_one_p_group_out(): assert lpgo_1.get_n_splits(groups=np.arange(4)) == 4 # raise ValueError if a `groups` parameter is illegal - with assert_raises(ValueError): + with pytest.raises(ValueError): logo.get_n_splits(None, None, [0.0, np.nan, 0.0]) - with assert_raises(ValueError): + with pytest.raises(ValueError): lpgo_2.get_n_splits(None, None, [0.0, np.inf, 0.0]) msg = "The 'groups' parameter should not be None." 
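The test-suite patches in this series replace the old `assert_raises` / `assert_warns` helpers with plain pytest context managers. A self-contained sketch of the two idioms being adopted (note that the `pytest.warns(None)` form used in some of the patches above was later deprecated by pytest itself):

import warnings

import pytest


def strictly_positive(x):
    if x <= 0:
        raise ValueError("x must be strictly positive")
    return x


def legacy_api():
    warnings.warn("legacy_api is deprecated", FutureWarning)


def test_raises_with_match():
    # The `match` argument is a regular expression searched in str(error).
    with pytest.raises(ValueError, match="strictly positive"):
        strictly_positive(-1)


def test_warns_with_match():
    with pytest.warns(FutureWarning, match="deprecated"):
        legacy_api()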
@@ -911,8 +918,10 @@ def test_leave_one_p_group_out_error_on_fewer_number_of_groups(): def test_repeated_cv_value_errors(): # n_repeats is not integer or <= 0 for cv in (RepeatedKFold, RepeatedStratifiedKFold): - assert_raises(ValueError, cv, n_repeats=0) - assert_raises(ValueError, cv, n_repeats=1.5) + with pytest.raises(ValueError): + cv(n_repeats=0) + with pytest.raises(ValueError): + cv(n_repeats=1.5) @pytest.mark.parametrize( @@ -954,7 +963,8 @@ def test_repeated_kfold_determinstic_split(): assert_array_equal(train, [2, 3, 4]) assert_array_equal(test, [0, 1]) - assert_raises(StopIteration, next, splits) + with pytest.raises(StopIteration): + next(splits) def test_get_n_splits_for_repeated_kfold(): @@ -1002,7 +1012,8 @@ def test_repeated_stratified_kfold_determinstic_split(): assert_array_equal(train, [0, 1, 4]) assert_array_equal(test, [2, 3]) - assert_raises(StopIteration, next, splits) + with pytest.raises(StopIteration): + next(splits) def test_train_test_split_errors(): @@ -1258,7 +1269,8 @@ def test_check_cv(): cv = check_cv(3, y_multioutput, classifier=True) np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X))) - assert_raises(ValueError, check_cv, cv="lolo") + with pytest.raises(ValueError): + check_cv(cv="lolo") def test_cv_iterable_wrapper(): @@ -1375,17 +1387,22 @@ def test_group_kfold(): # Should fail if there are more folds than groups groups = np.array([1, 1, 1, 2, 2]) X = y = np.ones(len(groups)) - assert_raises_regexp(ValueError, "Cannot have number of splits.*greater", - next, GroupKFold(n_splits=3).split(X, y, groups)) + with pytest.raises( + ValueError, + match="Cannot have number of splits.*greater" + ): + next(GroupKFold(n_splits=3).split(X, y, groups)) def test_time_series_cv(): X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]] # Should fail if there are more folds than samples - assert_raises_regexp(ValueError, "Cannot have number of folds.*greater", - next, - TimeSeriesSplit(n_splits=7).split(X)) + with pytest.raises( + ValueError, + match="Cannot have number of folds.*greater" + ): + next(TimeSeriesSplit(n_splits=7).split(X)) tscv = TimeSeriesSplit(2) From 192952affa8d7db7902d3dd3bba6062bb296d294 Mon Sep 17 00:00:00 2001 From: Samuel Brice <7470577+samdbrice@users.noreply.github.com> Date: Mon, 1 Mar 2021 20:04:58 -0500 Subject: [PATCH 219/478] FIX Deep copy criterion in trees to fix concurrency bug (#19580) Co-authored-by: Samuel Brice --- doc/whats_new/v0.24.rst | 10 ++++++++++ sklearn/ensemble/tests/test_forest.py | 18 ++++++++++++++++++ sklearn/tree/_classes.py | 5 +++++ 3 files changed, 33 insertions(+) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 5ac6f74f3d7df..84e712c05ea79 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -48,6 +48,16 @@ Changelog :class:`~sklearn.semi_supervised.LabelPropagation`. :pr:`19271` by :user:`Zhaowei Wang `. +:mod:`sklearn.tree` +....................... + +- |Fix| Fix a bug in `fit` of :class:`tree.BaseDecisionTree` that caused + segmentation faults under certain conditions. `fit` now deep copies the + `Criterion` object to prevent shared concurrent accesses. + :pr:`19580` by :user:`Samuel Brice ` and + :user:`Alex Adamson ` and + :user:`Wil Yegelwel `. + :mod:`sklearn.utils` .................... 
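A rough sketch of the idea behind this fix, reusing the private `MSE` criterion exactly as the smoke test below does (assumptions: `MSE(n_outputs, n_samples)` is internal API and may change; with the patch applied, `fit` performs the deep copy itself, so the explicit `copy.deepcopy` here only illustrates why per-tree copies of a shared mutable criterion avoid concurrent-state problems):

import copy

from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree._criterion import MSE   # private API, see smoke test below

X, y = make_regression(n_samples=50, n_features=4, random_state=0)
shared_criterion = MSE(1, X.shape[0])     # a single mutable criterion object

# Each tree gets its own deep copy, so two fits running in parallel cannot
# trample each other's internal buffers.
trees = [
    DecisionTreeRegressor(criterion=copy.deepcopy(shared_criterion),
                          random_state=i).fit(X, y)
    for i in range(2)
]
print([tree.get_depth() for tree in trees])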
diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index c05cad26708b4..efb1a645842bc 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -1494,3 +1494,21 @@ def test_n_features_deprecation(Estimator): with pytest.warns(FutureWarning, match="n_features_ was deprecated"): est.n_features_ + + +@pytest.mark.parametrize('Forest', FOREST_REGRESSORS) +def test_mse_criterion_object_segfault_smoke_test(Forest): + # This is a smoke test to ensure that passing a mutable criterion + # does not cause a segfault when fitting with concurrent threads. + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/12623 + from sklearn.tree._criterion import MSE + + y = y_reg.reshape(-1, 1) + n_samples, n_outputs = y.shape + mse_criterion = MSE(n_outputs, n_samples) + est = FOREST_REGRESSORS[Forest]( + n_estimators=2, n_jobs=2, criterion=mse_criterion + ) + + est.fit(X_reg, y) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index c09ebe388aa5d..f7ae823c0070f 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -16,6 +16,7 @@ import numbers import warnings +import copy from abc import ABCMeta from abc import abstractmethod from math import ceil @@ -349,6 +350,10 @@ def fit(self, X, y, sample_weight=None, check_input=True, else: criterion = CRITERIA_REG[self.criterion](self.n_outputs_, n_samples) + else: + # Make a deepcopy in case the criterion has mutable attributes that + # might be shared and modified concurrently during parallel fitting + criterion = copy.deepcopy(criterion) SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS From 482a7781bd7ab01ab2afe1682e6bfa64c93611f5 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 2 Mar 2021 11:41:34 +0100 Subject: [PATCH 220/478] MAINT Update _arff.py (#19597) --- sklearn/externals/_arff.py | 175 +++++++++++++++++-------------------- 1 file changed, 79 insertions(+), 96 deletions(-) diff --git a/sklearn/externals/_arff.py b/sklearn/externals/_arff.py index 8330eec8adb87..ccfbbc5e5e971 100644 --- a/sklearn/externals/_arff.py +++ b/sklearn/externals/_arff.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # ============================================================================= # Federal University of Rio Grande do Sul (UFRGS) # Connectionist Artificial Intelligence Laboratory (LIAC) @@ -98,7 +97,7 @@ The above keys must follow the case which were described, i.e., the keys are case sensitive. The attribute type ``attribute_type`` must be one of these strings (they are not case sensitive): ``NUMERIC``, ``INTEGER``, ``REAL`` or -``STRING``. For nominal attributes, the ``attribute_type`` must be a list of +``STRING``. For nominal attributes, the ``atribute_type`` must be a list of strings. 
In this format, the XOR dataset presented above can be represented as a python @@ -138,7 +137,7 @@ - Supports read and write the descriptions of files; - Supports missing values and names with spaces; - Supports unicode values and names; -- Fully compatible with Python 2.7+, Python 3.3+, pypy and pypy3; +- Fully compatible with Python 2.7+, Python 3.5+, pypy and pypy3; - Under `MIT License `_ ''' @@ -148,12 +147,11 @@ 'joel.nothman@gmail.com') __version__ = '2.4.0' -from typing import TYPE_CHECKING -from typing import Optional, List, Dict, Any, Iterator, Union, Tuple - import re import sys import csv +import typing +from typing import Optional, List, Dict, Any, Iterator, Union, Tuple # CONSTANTS =================================================================== _SIMPLE_TYPES = ['NUMERIC', 'REAL', 'INTEGER', 'STRING'] @@ -166,7 +164,6 @@ _RE_RELATION = re.compile(r'^([^\{\}%,\s]*|\".*\"|\'.*\')$', re.UNICODE) _RE_ATTRIBUTE = re.compile(r'^(\".*\"|\'.*\'|[^\{\}%,\s]*)\s+(.+)$', re.UNICODE) -_RE_TYPE_NOMINAL = re.compile(r'^\{\s*((\".*\"|\'.*\'|\S*)\s*,\s*)*(\".*\"|\'.*\'|\S*)\s*\}$', re.UNICODE) _RE_QUOTE_CHARS = re.compile(r'["\'\\\s%,\000-\031]', re.UNICODE) _RE_ESCAPE_CHARS = re.compile(r'(?=["\'\\%])|[\n\r\t\000-\031]') _RE_SPARSE_LINE = re.compile(r'^\s*\{.*\}\s*$', re.UNICODE) @@ -176,7 +173,7 @@ ArffSparseDataType = Tuple[List, ...] -if TYPE_CHECKING: +if typing.TYPE_CHECKING: # typing_extensions is available when mypy is installed from typing_extensions import TypedDict @@ -218,10 +215,10 @@ def _build_re_values(): dense = re.compile(r'''(?x) , # may follow ',' \s* - ((?=,)|$|%(value_re)s) # empty or value + ((?=,)|$|{value_re}) # empty or value | (\S.*) # error - ''' % {'value_re': value_re}) + '''.format(value_re=value_re)) # This captures (key, value) groups and will have an empty key/value # in case of syntax errors. 
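The module being refactored here converts ARFF files to and from the nested dict layout described in its docstring. A small round-trip sketch against the vendored helpers (attribute names mimic the docstring's XOR example; `ArffEncoder` / `ArffDecoder` are internal to `sklearn.externals._arff`, while the standalone `liac-arff` package exposes the same object layout):

from sklearn.externals._arff import ArffDecoder, ArffEncoder

xor_dataset = {
    'description': 'XOR Dataset',
    'relation': 'XOR',
    'attributes': [
        ('input1', 'REAL'),
        ('input2', 'REAL'),
        ('y', 'REAL'),
    ],
    'data': [
        [0.0, 0.0, 0.0],
        [0.0, 1.0, 1.0],
        [1.0, 0.0, 1.0],
        [1.0, 1.0, 0.0],
    ],
}

text = ArffEncoder().encode(xor_dataset)   # ARFF file as a single string
decoded = ArffDecoder().decode(text)       # back to the dict layout (DENSE)

assert decoded['relation'] == 'XOR'
assert decoded['data'] == xor_dataset['data']
print(text.splitlines()[:3])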
@@ -240,6 +237,7 @@ def _build_re_values(): return dense, sparse + _RE_DENSE_VALUES, _RE_SPARSE_KEY_VALUES = _build_re_values() @@ -265,10 +263,10 @@ def _escape_sub_callback(match): if len(s) == 2: try: return _ESCAPE_SUB_MAP[s] - except KeyError as e: - raise ValueError('Unsupported escape sequence: %s' % s) from e + except KeyError: + raise ValueError('Unsupported escape sequence: %s' % s) if s[1] == 'u': - return unichr(int(s[2:], 16)) + return chr(int(s[2:], 16)) else: return chr(int(s[1:], 8)) @@ -303,8 +301,8 @@ def _parse_values(s): # an ARFF syntax error in sparse data for match in _RE_SPARSE_KEY_VALUES.finditer(s): if not match.group(1): - raise BadLayout('Error parsing %r' % match.group()) from exc - raise BadLayout('Unknown parsing error') from exc + raise BadLayout('Error parsing %r' % match.group()) + raise BadLayout('Unknown parsing error') else: # an ARFF syntax error for match in _RE_DENSE_VALUES.finditer(s): @@ -321,24 +319,10 @@ def _parse_values(s): LOD_GEN = 4 # Generator of dictionaries _SUPPORTED_DATA_STRUCTURES = [DENSE, COO, LOD, DENSE_GEN, LOD_GEN] -# ============================================================================= - -# COMPATIBILITY WITH PYTHON 3 ================================================= -PY3 = sys.version_info[0] == 3 -if PY3: - unicode = str - basestring = str - xrange = range - unichr = chr -# COMPABILITY WITH PYTHON 2 =================================================== -# ============================================================================= -PY2 = sys.version_info[0] == 2 -if PY2: - from itertools import izip as zip # EXCEPTIONS ================================================================== class ArffException(Exception): - message : Optional[str] = None + message: Optional[str] = None def __init__(self): self.line = -1 @@ -357,7 +341,7 @@ class BadAttributeFormat(ArffException): class BadDataFormat(ArffException): '''Error raised when some data instance is in an invalid format.''' def __init__(self, value): - super(BadDataFormat, self).__init__() + super().__init__() self.message = ( 'Bad @DATA instance format in line %d: ' + ('%s' % value) @@ -373,7 +357,7 @@ class BadAttributeName(ArffException): declaration.''' def __init__(self, value, value2): - super(BadAttributeName, self).__init__() + super().__init__() self.message = ( ('Bad @ATTRIBUTE name %s at line' % value) + ' %d, this name is already in use in line' + @@ -385,7 +369,7 @@ class BadNominalValue(ArffException): declared into it respective attribute declaration.''' def __init__(self, value): - super(BadNominalValue, self).__init__() + super().__init__() self.message = ( ('Data value %s not found in nominal declaration, ' % value) + 'at line %d.' @@ -394,7 +378,7 @@ def __init__(self, value): class BadNominalFormatting(ArffException): '''Error raised when a nominal value with space is not properly quoted.''' def __init__(self, value): - super(BadNominalFormatting, self).__init__() + super().__init__() self.message = ( ('Nominal data value "%s" not properly quoted in line ' % value) + '%d.' @@ -414,7 +398,7 @@ class BadLayout(ArffException): message = 'Invalid layout of the ARFF file, at line %d.' 
def __init__(self, msg=''): - super(BadLayout, self).__init__() + super().__init__() if msg: self.message = BadLayout.message + ' ' + msg.replace('%', '%%') @@ -437,11 +421,11 @@ def _unescape_sub_callback(match): def encode_string(s): if _RE_QUOTE_CHARS.search(s): - return u"'%s'" % _RE_ESCAPE_CHARS.sub(_unescape_sub_callback, s) + return "'%s'" % _RE_ESCAPE_CHARS.sub(_unescape_sub_callback, s) return s -class EncodedNominalConversor(object): +class EncodedNominalConversor: def __init__(self, values): self.values = {v: i for i, v in enumerate(values)} self.values[0] = 0 @@ -449,11 +433,11 @@ def __init__(self, values): def __call__(self, value): try: return self.values[value] - except KeyError as e: - raise BadNominalValue(value) from e + except KeyError: + raise BadNominalValue(value) -class NominalConversor(object): +class NominalConversor: def __init__(self, values): self.values = set(values) self.zero_value = values[0] @@ -467,10 +451,10 @@ def __call__(self, value): # with EncodedNominalConversor. return self.zero_value raise BadNominalValue(value) - return unicode(value) + return str(value) -class DenseGeneratorData(object): +class DenseGeneratorData: '''Internal helper class to allow for different matrix types without making the code a huge collection of if statements.''' @@ -483,7 +467,7 @@ def decode_rows(self, stream, conversors): raise BadDataFormat(row) # XXX: int 0 is used for implicit values, not '0' values = [values[i] if i in values else 0 for i in - xrange(len(conversors))] + range(len(conversors))] else: if len(values) != len(conversors): raise BadDataFormat(row) @@ -498,7 +482,7 @@ def _decode_values(values, conversors): in zip(conversors, values)] except ValueError as exc: if 'float: ' in str(exc): - raise BadNumericalValue from exc + raise BadNumericalValue() return values def encode_data(self, data, attributes): @@ -522,27 +506,27 @@ def encode_data(self, data, attributes): new_data = [] for value in inst: - if value is None or value == u'' or value != value: + if value is None or value == '' or value != value: s = '?' 
else: - s = encode_string(unicode(value)) + s = encode_string(str(value)) new_data.append(s) current_row += 1 - yield u','.join(new_data) + yield ','.join(new_data) -class _DataListMixin(object): +class _DataListMixin: """Mixin to return a list from decode_rows instead of a generator""" def decode_rows(self, stream, conversors): - return list(super(_DataListMixin, self).decode_rows(stream, conversors)) + return list(super().decode_rows(stream, conversors)) class Data(_DataListMixin, DenseGeneratorData): pass -class COOData(object): +class COOData: def decode_rows(self, stream, conversors): data, rows, cols = [], [], [] for i, row in enumerate(stream): @@ -557,11 +541,11 @@ def decode_rows(self, stream, conversors): for key, value in zip(row_cols, values)] except ValueError as exc: if 'float: ' in str(exc): - raise BadNumericalValue from exc + raise BadNumericalValue() raise - except IndexError as e: + except IndexError: # conversor out of range - raise BadDataFormat(row) from e + raise BadDataFormat(row) data.extend(values) rows.extend([i] * len(values)) @@ -579,7 +563,7 @@ def encode_data(self, data, attributes): data = data.data # Check if the rows are sorted - if not all(row[i] <= row[i + 1] for i in xrange(len(row) - 1)): + if not all(row[i] <= row[i + 1] for i in range(len(row) - 1)): raise ValueError("liac-arff can only output COO matrices with " "sorted rows.") @@ -587,7 +571,7 @@ def encode_data(self, data, attributes): if row > current_row: # Add empty rows if necessary while current_row < row: - yield " ".join([u"{", u','.join(new_data), u"}"]) + yield " ".join(["{", ','.join(new_data), "}"]) new_data = [] current_row += 1 @@ -597,15 +581,15 @@ def encode_data(self, data, attributes): (current_row, col + 1, num_attributes) ) - if v is None or v == u'' or v != v: + if v is None or v == '' or v != v: s = '?' else: - s = encode_string(unicode(v)) + s = encode_string(str(v)) new_data.append("%d %s" % (col, s)) - yield " ".join([u"{", u','.join(new_data), u"}"]) + yield " ".join(["{", ','.join(new_data), "}"]) -class LODGeneratorData(object): +class LODGeneratorData: def decode_rows(self, stream, conversors): for row in stream: values = _parse_values(row) @@ -617,11 +601,11 @@ def decode_rows(self, stream, conversors): for key, value in values.items()} except ValueError as exc: if 'float: ' in str(exc): - raise BadNumericalValue from exc + raise BadNumericalValue() raise - except IndexError as e: + except IndexError: # conversor out of range - raise BadDataFormat(row) from e + raise BadDataFormat(row) def encode_data(self, data, attributes): current_row = 0 @@ -638,14 +622,14 @@ def encode_data(self, data, attributes): for col in sorted(row): v = row[col] - if v is None or v == u'' or v != v: + if v is None or v == '' or v != v: s = '?' 
else: - s = encode_string(unicode(v)) + s = encode_string(str(v)) new_data.append("%d %s" % (col, s)) current_row += 1 - yield " ".join([u"{", u','.join(new_data), u"}"]) + yield " ".join(["{", ','.join(new_data), "}"]) class LODData(_DataListMixin, LODGeneratorData): pass @@ -680,7 +664,7 @@ def _get_data_object_for_encoding(matrix): # ============================================================================= # ADVANCED INTERFACE ========================================================== -class ArffDecoder(object): +class ArffDecoder: '''An ARFF decoder.''' def __init__(self): @@ -724,7 +708,7 @@ def _decode_relation(self, s): if not _RE_RELATION.match(v): raise BadRelationFormat() - res = unicode(v.strip('"\'')) + res = str(v.strip('"\'')) return res def _decode_attribute(self, s): @@ -766,20 +750,20 @@ def _decode_attribute(self, s): name, type_ = m.groups() # Extracts the final name - name = unicode(name.strip('"\'')) + name = str(name.strip('"\'')) # Extracts the final type - if _RE_TYPE_NOMINAL.match(type_): + if type_[:1] == "{" and type_[-1:] == "}": try: type_ = _parse_values(type_.strip('{} ')) - except Exception as e: - raise BadAttributeType from e + except Exception: + raise BadAttributeType() if isinstance(type_, dict): raise BadAttributeType() else: # If not nominal, verify the type name - type_ = unicode(type_).upper() + type_ = str(type_).upper() if type_ not in ['NUMERIC', 'REAL', 'INTEGER', 'STRING']: raise BadAttributeType() @@ -792,15 +776,15 @@ def _decode(self, s, encode_nominal=False, matrix_type=DENSE): self._current_line = 0 # If string, convert to a list of lines - if isinstance(s, basestring): + if isinstance(s, str): s = s.strip('\r\n ').replace('\r\n', '\n').split('\n') # Create the return object obj: ArffContainerType = { - u'description': u'', - u'relation': u'', - u'attributes': [], - u'data': [] + 'description': '', + 'relation': '', + 'attributes': [], + 'data': [] } attribute_names = {} @@ -852,7 +836,7 @@ def _decode(self, s, encode_nominal=False, matrix_type=DENSE): else: conversor = NominalConversor(attr[1]) else: - CONVERSOR_MAP = {'STRING': unicode, + CONVERSOR_MAP = {'STRING': str, 'INTEGER': lambda x: int(float(x)), 'NUMERIC': float, 'REAL': float} @@ -915,7 +899,7 @@ def decode(self, s, encode_nominal=False, return_type=DENSE): raise e -class ArffEncoder(object): +class ArffEncoder: '''An ARFF encoder.''' def _encode_comment(self, s=''): @@ -931,9 +915,9 @@ def _encode_comment(self, s=''): :return: a string with the encoded comment line. ''' if s: - return u'%s %s'%(_TK_COMMENT, s) + return '%s %s'%(_TK_COMMENT, s) else: - return u'%s' % _TK_COMMENT + return '%s' % _TK_COMMENT def _encode_relation(self, name): '''(INTERNAL) Decodes a relation line. @@ -949,7 +933,7 @@ def _encode_relation(self, name): name = '"%s"'%name break - return u'%s %s'%(_TK_RELATION, name) + return '%s %s'%(_TK_RELATION, name) def _encode_attribute(self, name, type_): '''(INTERNAL) Encodes an attribute line. @@ -980,20 +964,20 @@ def _encode_attribute(self, name, type_): break if isinstance(type_, (tuple, list)): - type_tmp = [u'%s' % encode_string(type_k) for type_k in type_] - type_ = u'{%s}'%(u', '.join(type_tmp)) + type_tmp = ['%s' % encode_string(type_k) for type_k in type_] + type_ = '{%s}'%(', '.join(type_tmp)) - return u'%s %s %s'%(_TK_ATTRIBUTE, name, type_) + return '%s %s %s'%(_TK_ATTRIBUTE, name, type_) def encode(self, obj): '''Encodes a given object to an ARFF file. :param obj: the object containing the ARFF information. 
- :return: the ARFF file as an unicode string. + :return: the ARFF file as an string. ''' data = [row for row in self.iter_encode(obj)] - return u'\n'.join(data) + return '\n'.join(data) def iter_encode(self, obj): '''The iterative version of `arff.ArffEncoder.encode`. @@ -1002,7 +986,7 @@ def iter_encode(self, obj): lines of the ARFF file. :param obj: the object containing the ARFF information. - :return: (yields) the ARFF file as unicode strings. + :return: (yields) the ARFF file as strings. ''' # DESCRIPTION if obj.get('description', None): @@ -1014,7 +998,7 @@ def iter_encode(self, obj): raise BadObject('Relation name not found or with invalid value.') yield self._encode_relation(obj['relation']) - yield u'' + yield '' # ATTRIBUTES if not obj.get('attributes'): @@ -1025,10 +1009,10 @@ def iter_encode(self, obj): # Verify for bad object format if not isinstance(attr, (tuple, list)) or \ len(attr) != 2 or \ - not isinstance(attr[0], basestring): + not isinstance(attr[0], str): raise BadObject('Invalid attribute declaration "%s"'%str(attr)) - if isinstance(attr[1], basestring): + if isinstance(attr[1], str): # Verify for invalid types if attr[1] not in _SIMPLE_TYPES: raise BadObject('Invalid attribute type "%s"'%str(attr)) @@ -1045,17 +1029,16 @@ def iter_encode(self, obj): attribute_names.add(attr[0]) yield self._encode_attribute(attr[0], attr[1]) - yield u'' + yield '' attributes = obj['attributes'] # DATA yield _TK_DATA if 'data' in obj: data = _get_data_object_for_encoding(obj.get('data')) - for line in data.encode_data(obj.get('data'), attributes): - yield line + yield from data.encode_data(obj.get('data'), attributes) - yield u'' + yield '' # ============================================================================= @@ -1108,7 +1091,7 @@ def dump(obj, fp): last_row = next(generator) for row in generator: - fp.write(last_row + u'\n') + fp.write(last_row + '\n') last_row = row fp.write(last_row) From 5c0bbb0a4a4e674ba8017e3cdc664e0b7c7c8dc0 Mon Sep 17 00:00:00 2001 From: Mohamed Haseeb Date: Tue, 2 Mar 2021 12:14:26 +0100 Subject: [PATCH 221/478] TST replaces assert_raise* by pytest.raises in model_selection (#19592) Co-authored-by: Cycks --- .../model_selection/tests/test_validation.py | 184 +++++++++--------- 1 file changed, 95 insertions(+), 89 deletions(-) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 8bb853bcd51b4..c280d1e8ef140 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -15,9 +15,6 @@ from sklearn.model_selection.tests.test_search import FailingClassifier from sklearn.utils._testing import assert_almost_equal -from sklearn.utils._testing import assert_raises -from sklearn.utils._testing import assert_raise_message -from sklearn.utils._testing import assert_raises_regex from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_allclose @@ -125,7 +122,6 @@ def _is_training_data(self, X): def partial_fit(self, X, y=None, **params): self.train_sizes += X.shape[0] self.x = X[0] - if self.expected_fit_params: missing = set(self.expected_fit_params) - set(params) if missing: @@ -281,7 +277,8 @@ def test_cross_val_score(): clf = CheckingClassifier(check_y=list_check) scores = cross_val_score(clf, X, y2.tolist(), cv=3) - assert_raises(ValueError, cross_val_score, clf, X, y2, scoring="sklearn") + with pytest.raises(ValueError): + 
cross_val_score(clf, X, y2, scoring="sklearn") # test with 3d X and X_3d = X[:, :, np.newaxis] @@ -289,8 +286,8 @@ def test_cross_val_score(): scores = cross_val_score(clf, X_3d, y2) clf = MockClassifier(allow_nd=False) - assert_raises(ValueError, cross_val_score, clf, X_3d, y2, - error_score='raise') + with pytest.raises(ValueError): + cross_val_score(clf, X_3d, y2, error_score='raise') def test_cross_validate_many_jobs(): @@ -312,38 +309,39 @@ def test_cross_validate_invalid_scoring_param(): # List/tuple of callables should raise a message advising users to use # dict of names to callables mapping - assert_raises_regex(ValueError, error_message_regexp, - cross_validate, estimator, X, y, - scoring=(make_scorer(precision_score), - make_scorer(accuracy_score))) - assert_raises_regex(ValueError, error_message_regexp, - cross_validate, estimator, X, y, - scoring=(make_scorer(precision_score),)) + with pytest.raises(ValueError, match=error_message_regexp): + cross_validate(estimator, X, y, scoring=(make_scorer(precision_score), + make_scorer(accuracy_score))) + with pytest.raises(ValueError, match=error_message_regexp): + cross_validate(estimator, X, y, + scoring=(make_scorer(precision_score),)) # So should empty lists/tuples - assert_raises_regex(ValueError, error_message_regexp + "Empty list.*", - cross_validate, estimator, X, y, scoring=()) + with pytest.raises( + ValueError, + match=error_message_regexp + "Empty list.*" + ): + cross_validate(estimator, X, y, scoring=()) # So should duplicated entries - assert_raises_regex(ValueError, error_message_regexp + "Duplicate.*", - cross_validate, estimator, X, y, - scoring=('f1_micro', 'f1_micro')) + with pytest.raises(ValueError, match=error_message_regexp + "Duplicate.*"): + cross_validate(estimator, X, y, scoring=('f1_micro', 'f1_micro')) # Nested Lists should raise a generic error message - assert_raises_regex(ValueError, error_message_regexp, - cross_validate, estimator, X, y, - scoring=[[make_scorer(precision_score)]]) + with pytest.raises(ValueError, match=error_message_regexp): + cross_validate(estimator, X, y, + scoring=[[make_scorer(precision_score)]]) error_message_regexp = (".*scoring is invalid.*Refer to the scoring " "glossary for details:.*") # Empty dict should raise invalid scoring error - assert_raises_regex(ValueError, "An empty dict", - cross_validate, estimator, X, y, scoring=(dict())) + with pytest.raises(ValueError, match="An empty dict"): + cross_validate(estimator, X, y, scoring=(dict())) # And so should any other invalid entry - assert_raises_regex(ValueError, error_message_regexp, - cross_validate, estimator, X, y, scoring=5) + with pytest.raises(ValueError, match=error_message_regexp): + cross_validate(estimator, X, y, scoring=5) multiclass_scorer = make_scorer(precision_recall_fscore_support) @@ -359,8 +357,11 @@ def test_cross_validate_invalid_scoring_param(): with pytest.warns(UserWarning, match=warning_message): cross_validate(estimator, X, y, scoring={"foo": multiclass_scorer}) - assert_raises_regex(ValueError, "'mse' is not a valid scoring value.", - cross_validate, SVC(), X, y, scoring="mse") + with pytest.raises( + ValueError, + match="'mse' is not a valid scoring value." + ): + cross_validate(SVC(), X, y, scoring="mse") def test_cross_validate_nested_estimator(): @@ -532,13 +533,12 @@ def test_cross_val_score_predict_groups(): group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(), GroupShuffleSplit()] + error_message = "The 'groups' parameter should not be None." 
for cv in group_cvs: - assert_raise_message(ValueError, - "The 'groups' parameter should not be None.", - cross_val_score, estimator=clf, X=X, y=y, cv=cv) - assert_raise_message(ValueError, - "The 'groups' parameter should not be None.", - cross_val_predict, estimator=clf, X=X, y=y, cv=cv) + with pytest.raises(ValueError, match=error_message): + cross_val_score(estimator=clf, X=X, y=y, cv=cv) + with pytest.raises(ValueError, match=error_message): + cross_val_predict(estimator=clf, X=X, y=y, cv=cv) @pytest.mark.filterwarnings('ignore: Using or importing the ABCs from') @@ -597,12 +597,13 @@ def test_cross_val_score_precomputed(): # Error raised for non-square X svm = SVC(kernel="precomputed") - assert_raises(ValueError, cross_val_score, svm, X, y) + with pytest.raises(ValueError): + cross_val_score(svm, X, y) # test error is raised when the precomputed kernel is not array-like # or sparse - assert_raises(ValueError, cross_val_score, svm, - linear_kernel.tolist(), y) + with pytest.raises(ValueError): + cross_val_score(svm, linear_kernel.tolist(), y) def test_cross_val_score_fit_params(): @@ -657,7 +658,8 @@ def test_cross_val_score_errors(): class BrokenEstimator: pass - assert_raises(TypeError, cross_val_score, BrokenEstimator(), X) + with pytest.raises(TypeError): + cross_val_score(BrokenEstimator(), X) def test_cross_val_score_with_score_func_classification(): @@ -851,7 +853,8 @@ def split(self, X, y=None, groups=None): for i in range(4): yield np.array([0, 1, 2, 3]), np.array([4, 5, 6, 7, 8]) - assert_raises(ValueError, cross_val_predict, est, X, y, cv=BadCV()) + with pytest.raises(ValueError): + cross_val_predict(est, X, y, cv=BadCV()) X, y = load_iris(return_X_y=True) @@ -882,15 +885,15 @@ def test_cross_val_predict_decision_function_shape(): # class. X = X[:100] y = y[:100] - assert_raise_message(ValueError, - 'Only 1 class/es in training fold,' - ' but 2 in overall dataset. This' - ' is not supported for decision_function' - ' with imbalanced folds. To fix ' - 'this, use a cross-validation technique ' - 'resulting in properly stratified folds', - cross_val_predict, RidgeClassifier(), X, y, - method='decision_function', cv=KFold(2)) + error_message = 'Only 1 class/es in training fold,'\ + ' but 2 in overall dataset. This'\ + ' is not supported for decision_function'\ + ' with imbalanced folds. To fix '\ + 'this, use a cross-validation technique '\ + 'resulting in properly stratified folds' + with pytest.raises(ValueError, match=error_message): + cross_val_predict(RidgeClassifier(), X, y, method='decision_function', + cv=KFold(2)) X, y = load_digits(return_X_y=True) est = SVC(kernel='linear', decision_function_shape='ovo') @@ -902,12 +905,13 @@ def test_cross_val_predict_decision_function_shape(): ind = np.argsort(y) X, y = X[ind], y[ind] - assert_raises_regex(ValueError, - r'Output shape \(599L?, 21L?\) of decision_function ' - r'does not match number of classes \(7\) in fold. ' - 'Irregular decision_function .*', - cross_val_predict, est, X, y, - cv=KFold(n_splits=3), method='decision_function') + error_message_regexp = r'Output shape \(599L?, 21L?\) of ' \ + 'decision_function does not match number of ' \ + r'classes \(7\) in fold. 
Irregular ' \ + 'decision_function .*' + with pytest.raises(ValueError, match=error_message_regexp): + cross_val_predict(est, X, y, cv=KFold(n_splits=3), + method='decision_function') def test_cross_val_predict_predict_proba_shape(): @@ -1126,8 +1130,8 @@ def test_learning_curve_incremental_learning_not_possible(): n_clusters_per_class=1, random_state=0) # The mockup does not have partial_fit() estimator = MockImprovingEstimator(1) - assert_raises(ValueError, learning_curve, estimator, X, y, - exploit_incremental_learning=True) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, exploit_incremental_learning=True) def test_learning_curve_incremental_learning(): @@ -1190,16 +1194,16 @@ def test_learning_curve_n_sample_range_out_of_bounds(): n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=0) estimator = MockImprovingEstimator(20) - assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - train_sizes=[0, 1]) - assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - train_sizes=[0.0, 1.0]) - assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - train_sizes=[0.1, 1.1]) - assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - train_sizes=[0, 20]) - assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - train_sizes=[1, 21]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0, 1]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0.0, 1.0]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0.1, 1.1]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0, 20]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[1, 21]) def test_learning_curve_remove_duplicate_sample_sizes(): @@ -1253,9 +1257,10 @@ def test_learning_curve_with_shuffle(): np.array([0.75, 0.3, 0.36111111])) assert_array_almost_equal(test_scores_batch.mean(axis=1), np.array([0.36111111, 0.25, 0.25])) - assert_raises(ValueError, learning_curve, estimator, X, y, cv=cv, n_jobs=1, - train_sizes=np.linspace(0.3, 1.0, 3), groups=groups, - error_score='raise') + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=cv, n_jobs=1, + train_sizes=np.linspace(0.3, 1.0, 3), groups=groups, + error_score='raise') train_sizes_inc, train_scores_inc, test_scores_inc = learning_curve( estimator, X, y, cv=cv, n_jobs=1, train_sizes=np.linspace(0.3, 1.0, 3), @@ -1709,8 +1714,8 @@ def test_score_memmap(): score = np.memmap(tf.name, shape=(), mode='r', dtype=np.float64) try: cross_val_score(clf, X, y, scoring=lambda est, X, y: score) - assert_raises(ValueError, cross_val_score, clf, X, y, - scoring=lambda est, X, y: scores) + with pytest.raises(ValueError): + cross_val_score(clf, X, y, scoring=lambda est, X, y: scores) finally: # Best effort to release the mmap file handles before deleting the # backing file under Windows @@ -1785,26 +1790,28 @@ def test_warn_trace(msg): fit_and_score_kwargs = {'error_score': 'raise'} # check if exception was raised, with default error_score='raise' - assert_raise_message(ValueError, "Failing classifier failed as required", - _fit_and_score, *fit_and_score_args, - **fit_and_score_kwargs) + with pytest.raises( + ValueError, + match="Failing classifier failed as required" + ): + _fit_and_score(*fit_and_score_args, **fit_and_score_kwargs) # check that functions upstream pass error_score param to _fit_and_score - error_message = ("error_score must 
be the string 'raise' or a" - " numeric value. (Hint: if using 'raise', please" - " make sure that it has been spelled correctly.)") - - assert_raise_message(ValueError, error_message, cross_validate, - failing_clf, X, cv=3, error_score='unvalid-string') + error_message = re.escape( + "error_score must be the string 'raise' or a numeric value. (Hint: if " + "using 'raise', please make sure that it has been spelled correctly.)" + ) + with pytest.raises(ValueError, match=error_message): + cross_validate(failing_clf, X, cv=3, error_score='unvalid-string') - assert_raise_message(ValueError, error_message, cross_val_score, - failing_clf, X, cv=3, error_score='unvalid-string') + with pytest.raises(ValueError, match=error_message): + cross_val_score(failing_clf, X, cv=3, error_score='unvalid-string') - assert_raise_message(ValueError, error_message, learning_curve, - failing_clf, X, y, cv=3, error_score='unvalid-string') + with pytest.raises(ValueError, match=error_message): + learning_curve(failing_clf, X, y, cv=3, error_score='unvalid-string') - assert_raise_message(ValueError, error_message, validation_curve, - failing_clf, X, y, param_name='parameter', + with pytest.raises(ValueError, match=error_message): + validation_curve(failing_clf, X, y, param_name='parameter', param_range=[FailingClassifier.FAILING_PARAMETER], cv=3, error_score='unvalid-string') @@ -1907,7 +1914,6 @@ def test_cross_validate_failing_scorer( assert_allclose(results[key], error_score) - def three_params_scorer(i, j, k): return 3.4213 @@ -1952,8 +1958,8 @@ def test_score(): def two_params_scorer(estimator, X_test): return None fit_and_score_args = [None, None, None, two_params_scorer] - assert_raise_message(ValueError, error_message, - _score, *fit_and_score_args, error_score=np.nan) + with pytest.raises(ValueError, match=error_message): + _score(*fit_and_score_args, error_score=np.nan) def test_callable_multimetric_confusion_matrix_cross_validate(): From 28ee486b44f8e7e6440f3439e7315ba1e6d35e43 Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Tue, 2 Mar 2021 11:19:56 +0000 Subject: [PATCH 222/478] TST Change assert to pytest style in tests/test_discriminant.py (#19558) Co-authored-by: Alihan Zihna Co-authored-by: Olivier Grisel --- sklearn/tests/test_discriminant_analysis.py | 71 ++++++++++++++------- 1 file changed, 49 insertions(+), 22 deletions(-) diff --git a/sklearn/tests/test_discriminant_analysis.py b/sklearn/tests/test_discriminant_analysis.py index 18364ce156f87..3dd22e2154400 100644 --- a/sklearn/tests/test_discriminant_analysis.py +++ b/sklearn/tests/test_discriminant_analysis.py @@ -5,14 +5,10 @@ from scipy import linalg from sklearn.utils import check_random_state -from sklearn.utils._testing import assert_array_equal, assert_no_warnings +from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_almost_equal -from sklearn.utils._testing import assert_raises -from sklearn.utils._testing import assert_raise_message -from sklearn.utils._testing import assert_warns -from sklearn.utils._testing import ignore_warnings from sklearn.datasets import make_blobs from sklearn.discriminant_analysis import LinearDiscriminantAnalysis @@ -89,15 +85,22 @@ def test_lda_predict(): # Test invalid shrinkages clf = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=-0.2231) - assert_raises(ValueError, clf.fit, X, y) + with pytest.raises(ValueError): + clf.fit(X, y) + clf = 
LinearDiscriminantAnalysis(solver="eigen", shrinkage="dummy") - assert_raises(ValueError, clf.fit, X, y) + with pytest.raises(ValueError): + clf.fit(X, y) + clf = LinearDiscriminantAnalysis(solver="svd", shrinkage="auto") - assert_raises(NotImplementedError, clf.fit, X, y) + with pytest.raises(NotImplementedError): + clf.fit(X, y) + clf = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=np.array([1, 2])) with pytest.raises(TypeError, match="shrinkage must be a float or a string"): clf.fit(X, y) + clf = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=0.1, covariance_estimator=ShrunkCovariance()) @@ -106,9 +109,11 @@ def test_lda_predict(): "parameters are not None. " "Only one of the two can be set.")): clf.fit(X, y) + # Test unknown solver clf = LinearDiscriminantAnalysis(solver="dummy") - assert_raises(ValueError, clf.fit, X, y) + with pytest.raises(ValueError): + clf.fit(X, y) # test bad solver with covariance_estimator clf = LinearDiscriminantAnalysis(solver="svd", @@ -199,7 +204,9 @@ def test_lda_priors(): priors = np.array([0.5, -0.5]) clf = LinearDiscriminantAnalysis(priors=priors) msg = "priors must be non-negative" - assert_raise_message(ValueError, msg, clf.fit, X, y) + + with pytest.raises(ValueError, match=msg): + clf.fit(X, y) # Test that priors passed as a list are correctly handled (run to see if # failure) @@ -210,7 +217,10 @@ def test_lda_priors(): priors = np.array([0.5, 0.6]) prior_norm = np.array([0.45, 0.55]) clf = LinearDiscriminantAnalysis(priors=priors) - assert_warns(UserWarning, clf.fit, X, y) + + with pytest.warns(UserWarning): + clf.fit(X, y) + assert_array_almost_equal(clf.priors_, prior_norm, 2) @@ -247,7 +257,9 @@ def test_lda_transform(): clf = LinearDiscriminantAnalysis(solver="lsqr", n_components=1) clf.fit(X, y) msg = "transform not implemented for 'lsqr'" - assert_raise_message(NotImplementedError, msg, clf.transform, X) + + with pytest.raises(NotImplementedError, match=msg): + clf.transform(X) def test_lda_explained_variance_ratio(): @@ -424,7 +436,8 @@ def test_lda_dimension_warning(n_classes, n_features): for n_components in [max_components - 1, None, max_components]: # if n_components <= min(n_classes - 1, n_features), no warning lda = LinearDiscriminantAnalysis(n_components=n_components) - assert_no_warnings(lda.fit, X, y) + with pytest.warns(None): + lda.fit(X, y) for n_components in [max_components + 1, max(n_features, n_classes - 1) + 1]: @@ -486,7 +499,8 @@ def test_qda(): assert np.any(y_pred3 != y7) # Classes should have at least 2 elements - assert_raises(ValueError, clf.fit, X6, y4) + with pytest.raises(ValueError): + clf.fit(X6, y4) def test_qda_priors(): @@ -523,23 +537,36 @@ def test_qda_store_covariance(): def test_qda_regularization(): - # the default is reg_param=0. and will cause issues - # when there is a constant variable + # The default is reg_param=0. and will cause issues when there is a + # constant variable. + + # Fitting on data with constant variable triggers an UserWarning. + collinear_msg = "Variables are collinear" clf = QuadraticDiscriminantAnalysis() - with ignore_warnings(): - y_pred = clf.fit(X2, y6).predict(X2) + with pytest.warns(UserWarning, match=collinear_msg): + y_pred = clf.fit(X2, y6) + + # XXX: RuntimeWarning is also raised at predict time because of divisions + # by zero when the model is fit with a constant feature and without + # regularization: should this be considered a bug? 
Either by the fit-time + # message more informative, raising and exception instead of a warning in + # this case or somehow changing predict to avoid division by zero. + with pytest.warns(RuntimeWarning, match="divide by zero"): + y_pred = clf.predict(X2) assert np.any(y_pred != y6) - # adding a little regularization fixes the problem + # Adding a little regularization fixes the division by zero at predict + # time. But UserWarning will persist at fit time. clf = QuadraticDiscriminantAnalysis(reg_param=0.01) - with ignore_warnings(): + with pytest.warns(UserWarning, match=collinear_msg): clf.fit(X2, y6) y_pred = clf.predict(X2) assert_array_equal(y_pred, y6) - # Case n_samples_in_a_class < n_features + # UserWarning should also be there for the n_samples_in_a_class < + # n_features case. clf = QuadraticDiscriminantAnalysis(reg_param=0.1) - with ignore_warnings(): + with pytest.warns(UserWarning, match=collinear_msg): clf.fit(X5, y5) y_pred5 = clf.predict(X5) assert_array_equal(y_pred5, y5) From bd53f54fb666459dc54af4de52032a65fca551be Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Wed, 3 Mar 2021 17:01:07 +0000 Subject: [PATCH 223/478] TST Change assert from sklearn to pytest style in tests/test_multiclass.py (#19593) --- sklearn/tests/test_multiclass.py | 95 ++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 36 deletions(-) diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index 96bd1b807a95f..74b380505e45a 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -6,10 +6,6 @@ from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_almost_equal -from sklearn.utils._testing import assert_raises -from sklearn.utils._testing import assert_warns -from sklearn.utils._testing import assert_raise_message -from sklearn.utils._testing import assert_raises_regexp from sklearn.utils._testing import ignore_warnings from sklearn.utils._mocking import CheckingClassifier from sklearn.multiclass import OneVsRestClassifier @@ -35,6 +31,7 @@ from sklearn.pipeline import Pipeline, make_pipeline from sklearn.impute import SimpleImputer from sklearn import svm +from sklearn.exceptions import NotFittedError from sklearn import datasets iris = datasets.load_iris() @@ -47,22 +44,30 @@ def test_ovr_exceptions(): ovr = OneVsRestClassifier(LinearSVC(random_state=0)) - assert_raises(ValueError, ovr.predict, []) + + # test predicting without fitting + with pytest.raises(NotFittedError): + ovr.predict([]) # Fail on multioutput data - assert_raises(ValueError, OneVsRestClassifier(MultinomialNB()).fit, - np.array([[1, 0], [0, 1]]), - np.array([[1, 2], [3, 1]])) - assert_raises(ValueError, OneVsRestClassifier(MultinomialNB()).fit, - np.array([[1, 0], [0, 1]]), - np.array([[1.5, 2.4], [3.1, 0.8]])) + msg = "Multioutput target data is not supported with label binarization" + with pytest.raises(ValueError, match=msg): + X = np.array([[1, 0], [0, 1]]) + y = np.array([[1, 2], [3, 1]]) + OneVsRestClassifier(MultinomialNB()).fit(X, y) + + with pytest.raises(ValueError, match=msg): + X = np.array([[1, 0], [0, 1]]) + y = np.array([[1.5, 2.4], [3.1, 0.8]]) + OneVsRestClassifier(MultinomialNB()).fit(X, y) def test_check_classification_targets(): # Test that check_classification_target return correct type. 
#5782 y = np.array([0.0, 1.1, 2.0, 3.0]) msg = type_of_target(y) - assert_raise_message(ValueError, msg, check_classification_targets, y) + with pytest.raises(ValueError, match=msg): + check_classification_targets(y) def test_ovr_fit_predict(): @@ -120,12 +125,12 @@ def test_ovr_partial_fit_exceptions(): X = np.abs(np.random.randn(14, 2)) y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3] ovr.partial_fit(X[:7], y[:7], np.unique(y)) - # A new class value which was not in the first call of partial_fit - # It should raise ValueError + # If a new class that was not in the first call of partial fit is seen + # it should raise ValueError y1 = [5] + y[7:-1] - assert_raises_regexp(ValueError, r"Mini-batch contains \[.+\] while " - r"classes must be subset of \[.+\]", - ovr.partial_fit, X=X[7:], y=y1) + msg = r"Mini-batch contains \[.+\] while classes must be subset of \[.+\]" + with pytest.raises(ValueError, match=msg): + ovr.partial_fit(X=X[7:], y=y1) def test_ovr_ovo_regressor(): @@ -201,7 +206,9 @@ def test_ovr_always_present(): y[:, 2] = 1 ovr = OneVsRestClassifier(LogisticRegression()) - assert_warns(UserWarning, ovr.fit, X, y) + msg = r'Label .+ is present in all training examples' + with pytest.warns(UserWarning, match=msg): + ovr.fit(X, y) y_pred = ovr.predict(X) assert_array_equal(np.array(y_pred), np.array(y)) y_pred = ovr.decision_function(X) @@ -213,7 +220,10 @@ def test_ovr_always_present(): y = np.zeros((10, 2)) y[5:, 0] = 1 # variable label ovr = OneVsRestClassifier(LogisticRegression()) - assert_warns(UserWarning, ovr.fit, X, y) + + msg = r'Label not 1 is present in all training examples' + with pytest.warns(UserWarning, match=msg): + ovr.fit(X, y) y_pred = ovr.predict_proba(X) assert_array_equal(y_pred[:, -1], np.zeros(X.shape[0])) @@ -266,7 +276,7 @@ def conduct_test(base_clf, test_predict_proba=False): probabilities = clf.predict_proba(X_test) assert 2 == len(probabilities[0]) assert (clf.classes_[np.argmax(probabilities, axis=1)] == - clf.predict(X_test)) + clf.predict(X_test)) # test input as label indicator matrix clf = OneVsRestClassifier(base_clf).fit(X, Y) @@ -389,8 +399,8 @@ def test_ovr_single_label_predict_proba(): assert_almost_equal(Y_proba.sum(axis=1), 1.0) # predict assigns a label if the probability that the - # sample has the label is greater than 0.5. - pred = np.array([l.argmax() for l in Y_proba]) + # sample has the label with the greatest predictive probability. + pred = Y_proba.argmax(axis=1) assert not (pred - Y_pred).any() @@ -458,7 +468,7 @@ def test_ovr_coef_(): assert shape[1] == iris.data.shape[1] # don't densify sparse coefficients assert (sp.issparse(ovr.estimators_[0].coef_) == - sp.issparse(ovr.coef_)) + sp.issparse(ovr.coef_)) # TODO: Remove this test in version 1.1 @@ -467,13 +477,16 @@ def test_ovr_coef_(): def test_ovr_coef_exceptions(): # Not fitted exception! ovr = OneVsRestClassifier(LinearSVC(random_state=0)) - # lambda is needed because we don't want coef_ to be evaluated right away - assert_raises(ValueError, lambda x: ovr.coef_, None) + + with pytest.raises(NotFittedError): + ovr.coef_ # Doesn't have coef_ exception! 
ovr = OneVsRestClassifier(DecisionTreeClassifier()) ovr.fit(iris.data, iris.target) - assert_raises(AttributeError, lambda x: ovr.coef_, None) + msg = "Base estimator doesn't have a coef_ attribute" + with pytest.raises(AttributeError, match=msg): + ovr.coef_ # TODO: Remove this test in version 1.1 when @@ -494,7 +507,8 @@ def test_ovr_deprecated_coef_intercept(): def test_ovo_exceptions(): ovo = OneVsOneClassifier(LinearSVC(random_state=0)) - assert_raises(ValueError, ovo.predict, []) + with pytest.raises(NotFittedError): + ovo.predict([]) def test_ovo_fit_on_list(): @@ -563,8 +577,8 @@ def test_ovo_partial_fit_predict(): message_re = escape("Mini-batch contains {0} while " "it must be subset of {1}".format(np.unique(error_y), np.unique(y))) - assert_raises_regexp(ValueError, message_re, ovo.partial_fit, X[:7], - error_y, np.unique(y)) + with pytest.raises(ValueError, match=message_re): + ovo.partial_fit(X[:7], error_y, np.unique(y)) # test partial_fit only exists if estimator has it: ovr = OneVsOneClassifier(SVC()) @@ -682,7 +696,9 @@ def test_ovo_one_class(): y = np.array(['a'] * 4) ovo = OneVsOneClassifier(LinearSVC()) - assert_raise_message(ValueError, "when only one class", ovo.fit, X, y) + msg = "when only one class" + with pytest.raises(ValueError, match=msg): + ovo.fit(X, y) def test_ovo_float_y(): @@ -691,12 +707,15 @@ def test_ovo_float_y(): y = iris.data[:, 0] ovo = OneVsOneClassifier(LinearSVC()) - assert_raise_message(ValueError, "Unknown label type", ovo.fit, X, y) + msg = "Unknown label type" + with pytest.raises(ValueError, match=msg): + ovo.fit(X, y) def test_ecoc_exceptions(): ecoc = OutputCodeClassifier(LinearSVC(random_state=0)) - assert_raises(ValueError, ecoc.predict, []) + with pytest.raises(NotFittedError): + ecoc.predict([]) def test_ecoc_fit_predict(): @@ -728,10 +747,14 @@ def test_ecoc_float_y(): y = iris.data[:, 0] ovo = OutputCodeClassifier(LinearSVC()) - assert_raise_message(ValueError, "Unknown label type", ovo.fit, X, y) + msg = "Unknown label type" + with pytest.raises(ValueError, match=msg): + ovo.fit(X, y) + ovo = OutputCodeClassifier(LinearSVC(), code_size=-1) - assert_raise_message(ValueError, "code_size should be greater than 0," - " got -1", ovo.fit, X, y) + msg = "code_size should be greater than 0, got -1" + with pytest.raises(ValueError, match=msg): + ovo.fit(X, y) def test_ecoc_delegate_sparse_base_estimator(): @@ -773,7 +796,7 @@ def test_pairwise_indices(): for idx in precomputed_indices: assert (idx.shape[0] * n_estimators / (n_estimators - 1) == - linear_kernel.shape[0]) + linear_kernel.shape[0]) @ignore_warnings(category=FutureWarning) From 4e732f893c04c45e1cdb287abaef67cf83b731eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Wed, 3 Mar 2021 18:14:45 +0100 Subject: [PATCH 224/478] Add BNP Paribas Cardif testimonial. 
(#19586)
---
 .../images/bnp_paribas_cardif.png             | Bin 0 -> 65058 bytes
 doc/testimonials/testimonials.rst             |  41 ++++++++++++++++++
 2 files changed, 41 insertions(+)
 create mode 100644 doc/testimonials/images/bnp_paribas_cardif.png

diff --git a/doc/testimonials/images/bnp_paribas_cardif.png b/doc/testimonials/images/bnp_paribas_cardif.png
new file mode 100644
index 0000000000000000000000000000000000000000..0c7a040bae329d65086384fd1ba90d70bfaaf0a5
GIT binary patch
literal 65058
[base85-encoded PNG data (65058 bytes) omitted]
zhI;G$;6>*$lBU@~bwaM>egntMpdznibi{`ZL{eWl@nHj(zpZK%ix%G#YVGrTjV3e? zAljuBT!lEAG|z8JNcz0rwUDQzi#h4B$UVi0(EGLCNcrC4XpHN#enp9e4R=L{Ztin3 ztp_v7XXFiVEU-GX+_d8iH+oVG>g-HY z@B3g3BVR1lX{I1LJwe`5L{NG{fQ$pv;$r|&J^@6bRJ6J|CD8LfIj&xp=UIG;*4YPI z$=sB}uw(K;KF}Nj%(TaxP3BQXYX{P!t)rx)>hav80W@;T9rzKnCz-eJ<)D1kbZ_SF@IwV3;T~&vW|`*!G}1(yom(3_GxRkLrjT>pyfrG zcunbwo_ym+tG18xmCF2xwFJdzgypR54p-8Fg7iByMiOAB0my9=FF@8Z9Zb9Lmcqb_ z@Gp>(d(vhya;aFBzUqYVDfL%)kxa;iznzLh?8}>cU;&?)**O7$u%j!z2o>l1qE0g%F*$k_MrK;Leo z8euPYcuA~XBlZ&zv(TXMy@gC!Dj)#}Al%(I3*F!8Z;$9jJlOiboP!}&*n6&+wJbCp z3CQ{{f~MzbK5{4p5oSrnofVla?qdJ)nLkh+5&I~|p5boYx(HzMe*wyvV0Z(_4B(Iz zCuO|sIuO!gyTy-odpKeC$k%LDX3N(u9Y(K6ift5H7kx+d9Va1y2pijny%E>L{{A0n z$1*C@RXWiHJc;aBS<bHl2)*TE;m<{v^ux;#bH#c!}I|J=@hY@rgW)Ct6V0fEqlD9K9c@ZaQV z>jviFTiFf>SOAQ>z<_jYr*iLp<1-otMJ!}wl6CP0+}E&(IVNc}l35ETr^lRzAVvhm zK(9&q_c{26T|!8DSxB&X#%o`t&SJS)IEQ1H59H@Rd!?>Fb$Pxr8GJRlz4Mj8+v3}$ z#*ZJ1UO>#s>x#;V3CV+r5s1x$R7{a}Ghv zt(Kc@!`66c23BH&6Bk@Nuoqg)kg*Rbi;)IZvC<5ZxVKNQh0T0#5F^}Ak`yi2AtM5t z?tZlia_L2nwXiGtvt0Pwp=R{p1^?b8;&DAlh=H4x5g>~tMV6H3g|D^T^bAgx^!yod zD6^hMc98Ocfp$BZ>toC zT>!vu0zkl10ER%bVO1=%k_vahGa4Vdk0(kJhem>&m!ZS~5OjbBx!|1Q4W}peQExA1 z0=-0nYtTu~#dK>IQocCUsAl>+ck=gX?O;9YRR%(+=CvP^{G5%4#C5pjE- zbG}lFVdwq;=r#(?f^V6Ptj51my z?HlF$u$JC&_)gphh=KCsPTDFRO&WLZV##Gcbk{QyU2)CoVh-o{-kj^Xv zqY>31vigG|)~pa)ZBlFoLi&DH1?~EAyJcWc*DrSdGaS$a*8n|Nr!Y}-U4jYUmsD(pmUN(&iVUOYAomi2sqj+}ygon4)fRkLO8+XL;UV&R1^juY*y6SH zOu-|-YR@XFz?jCW82mzDlS5N!G1%9))*JG&ZbOTN8F-wP_JwLORHngY^7x}9uL+M; zht=}4p;t>JibIGH1hyT1Z;D1zF*&DFZ}9CA@x^KA&4;ZNrAg|OcPf(wC1Koh6a^#$ zKS&INj7gBJquI9rJSq`>eRhLS7qBaFxU;L&G54m=I9}M+2tO+}$Djo3nMRg@MEvm<)MAew8)ZA@ zoK*4W@&lhDgz+R=)?2;s)XXOMD?%1ex=5#t>Z8$?NTgOp1c3df)?q^u#m=b2L0?ct*~TeU2o=1Rwtk(KAhNgE6aR(}DkNw(9Br`l};|KfDU z+||#37@rmI4B2T`XXWSjpmotZXZz->g@?s?Mj%*CxgRb|)py33V)emwM()2n|BjXP zg(#JZm8RvhGLIHo>)VHR?z>Yrz^O8v%-~RKOx~Zbf@|q3@uo5oYAkWn@+0 ze#{7NYK!I3w6W{`SVrSc8}i($PNBqU)?jacp_#`w;BJa5jP$2DIhs}6hbN#t!v zaM0G0+ih3yXBqWd%xiP;sMZXMF@n&N2k?s5ippg6LRjgTO}b$$=@t!(($!?NX3YsT zTOa%v{}>;lI!Q}hM4}RE2(RTi7-l*nMm>vQDb>e5Z4p}mJ+;4_S{FcTnw6U?+h#I= zcFmy?^&wekbsgOPQ6As^nNq$$Cg$g`l3rLA!OE*w)=FEnPQ>g& zY|5&rk_BGRbSA2OR0OJCw^OeQ36>`KQkEl^m%ac>0r)f7`91OS#r4>9_tcH!YxSzoWfAD{kYIX>C2>7BS`RRd zSW+RN;bTzM{?XkE7_$_o^(n)@H<~k*OLO^N@qo|C8f8NjJTm$Hpk*9G_-&y6HXS`Z zPQlWNC7zq8s3tdd{A3@N&$tPk{s zDq}>!o+nGEG0!WKmU9)1OB-;>V3URF6jZC%GFV?a(DLT3@I1~CR#GI8o32WS8X=UtS{^AO0yx0 z*9hOGp#d&#pdr5h-kFslV6>7_0=Sq)x}WVDrj~C}1c-taMpOq zIE7x_2cTO5@H@9RA5N!@<242DQO_6{vLgYl_%1+B_ULzCP32F$FIUTZu%m!aAMV`l ztdI1xB&vG`4OPt%ummr3n z_H6)B-R%sHjp@hfh_>IS!eIH78Az1nR9`ghB%oQNhh|RE0+I5lxe+4m`LbSV zD2?z-D7XXH^?}-@$A{NHavU(?f}ee=9VE7+n8V>fOE+1||DAYpK|s@2^6M z#FTebCm|oTp4`p`smWx|q4EPCKf7sTzjb!{9V3@TRe87R}iY5Z5c15%;epDj&r;Tc~{1Fau4 zg9k)E;ORmHoTT{s`@2HgWm9At{u+H;97z_ve%~7r(^n+f_4Nyb%~;oqNA>mf7Th~7 zQTIZ=u)XhF2Ua^&)YMD`tNBjFsLp0XW?{!CK3Oi22xhu5*asWC`T{mV=dI ziH1f+R}Jf|vU;gj&(_wvHXoPyCfpnW=V6u$e$vJ-DYRi$Yg#j`ffzXO`mMGw;=bSFU)rlJB;eL82yFYu1 zZcc>mSopE316ps0CiZ<3h921fP}XQE16uICuFd>UD%`2eJwR6X(}_JwabAMckjqwG z)`fWJAe;Ki3q^hR%tY2&c*X6sUG+HjlkS|Te%;#p1&-saqBLxORV@JVZl5sGBPLFq z3Ln$kg3g<3kY;za4++OA!e0($K%tnX52YrOSTG*ClH4bS@mgwF$yp7tB>otNjWX3> zlLunsp<XJ1yzj+X&oZ3{Ngm>b!VIF3eLaJ zbpDiPpLVL^4t;BezjkD?NtF^}=n_RlaVC%UF{7HzuAicCL+urCa6_1}U@&_TI240r zc1_cPUcr>lN()c#&PAAGsyOvN_Kcpj3SZijU#x*0H8-=|^$=kWqSO{%f`+Ch>s`mn zDTmt8x|&S_Eu7fidX*Rx?1lWVgglhZ3~K?(tUfJXah*I6p&wgPE1G7lrX_t2)TG$~ z8jR9yCh{K_1@eqPMD>jvty|T9uf)j{x$bQA+}rh<^NnrM+RoSa@7nkz(7t&Tz08$x`j9-e%-RgC4Y^}Ik7V!>brf_pA78ayIk|Tyv2JcL4{&-De z{g-X)N^A1ce>jPVu<_qrh*W54P(^6lt5A~qlid^@{Z!?0H)K`%oUaiU8Dk8C1NnQ2 
zu6krtl2*AQwxqm<5@pFBZ7xf5>p6mkr%t6jkfgKZe^`Wichk4jsr=(xl|I4@#mmO< z$_pTvxFwuY&@dQgL^TU-jEH76>jz4e^EN4~*ns(;m>5LK$DbGVNcVbUkB*K^YAwh8 z?j}FT$!hz(^Q2bM`Bh?}zDScrluLWMSoo};WHjoYazN!8@Cr94$;LFKuV5rcN zXX$c7%djJ*BX7p_rMu;=apS|#$UN7q;Mfk;?j)b_lV4V#R_+YX$2FHtvvMv|=OQI@ zQO78}Wy)r9=kU*FX!b}K6X+is+54))p~`v!pU7QT=0A5VzSK2%Sal09-`|ZiTl1aMDkvs+ zL(Yy>X4GNzsAht5P-EL@b?WuT`YQ@{@iVz8Zb7%6_3wH$F+-H77fS4!Ra4(SznXP( zA3yqo|Gj;%Ipw5*Os?9_*jTLDlUazANvDFfIPG2=Y-o{h#Egi< z{DAE-Pa}CuSWF@C2>UvHEagRBlOBN%BF~?>IhkVe|DFk!tKqGuX;W8@wtwrABZa5g zGSC|*NJEo5=^+tVQuiFsae5u(#*xlKlp-R*V|ozh*}@-1Z9>7Z4_`Txwf+>a_zHh6 zH_C$w__L5{YSCz-8;whY;=tt1doJiSya21H?+9N0i!eI#4-(&`+icqy(VdL$4+`Ku zL#==d&ydNVb<=Ve!d!BLqbrYxOGbi$MGCoGK8 z$|*;_UaTu`8hn+?syxA1Cj8jio(ZC4mG3kVUH$%U@HV?Dg|-w_u^>}|(WY#M8`D}S zOsa+G1pD_lnZ{>?u#%aBO~=i|hLbSgt=9$E1%z%_@GaJ9Y)8DI{I^NNDs9PEnPIHr z!m{O2pKfaeW@9D`_G0qaP>T*K=a>tl$D08?tYD_`DtxkQ!)2lB-Se?r?_Z2b60LpT z3}wkO!x^WVX1q~GDBPmRL4O=<@o+6_)rZm?EZug$u_3fOj!6`m@?8_)# zcRXYFjrvY8wUi(;iS)2W7akHUcIF5jl9E{E5TY+5JMlOaZ5<14;@mz5>qt+Q;R)r@ z9#EW|KROc399~u)?8+SNj*L{mxX+A0FfylwDZW}#jJ7c9l&W&Vh~(ujf=iNA3)=X4 zOJ<~Iuq$5Hvr0v)o!`^1;9wLg+v?%vnINy5d-H^JGD2PG1P`8sxAU!M&G$Htr%gPn zgQxX@u!tR2hSDpS__KdCr-wn`VF9^w|4hl%Uzu+2j`}`n<)KVD4z-cWPRJrPOKS9NtVvXSJWF=wucyk zUu!z?c*uX0Ms>!gRHpKC-^a=z5%T$mGBd)9m?m1XA%|GdZ;5!m6!x z95kWUx`5UdC?{kuMxn9pyP?rVVb(!H7Nn>{=HhYgeEAYo@vIps>xBU(jT2yd;RpCdX2L=^w9!EkB_ z8B}R`wZT2t1kW2j$pWWMnf^YY!s6URWSWA&o6TkF_v#s`C~kT>zO8Q~Lh$n_ zytifFZxPS_X8ll}Gkm(vS~Dx)(?!_Lvqlq->V`B#RD?z)^0vIuWJmat?)0D+A- zwEo}qw`%TNC)n91!-L~S$rkFGe+sZ7Euvv0pSy+qtOfSPGF@lWru9-EyfXclk&w)R`?uP!X>Lj_``iiEvg6UZ z-M~Wzspwg+obkw@Znq(*;-s)T^ovcKl71^AiQgVLqCRsGc49+sK4KdQV}$A;B}FKx zk})ZD2YEFsESWjsj14;k$XgA(@G`Vc~`S-qYm{mWHiQOfARE>(bDjsam_JI>O5;a zy9fgdp-ZA%2$L_+v8>jazI=Trap=F>+PAhCl_V06O`;LQu!=RMBgu}8A{$aP-)W1D zJSy_1-YKA!_pXY@(IsAB;4`_dvFv^K_5STyZ-4j-hhI3J;a%FLBG3XZP zKv)7xGo2?scNNM~A_W!V6r&UBO!qH$&4^}a80tQH(z5>ZbfS)+TN-2l literal 0 HcmV?d00001 diff --git a/doc/testimonials/testimonials.rst b/doc/testimonials/testimonials.rst index cac1292d92fa7..88997285e347e 100644 --- a/doc/testimonials/testimonials.rst +++ b/doc/testimonials/testimonials.rst @@ -1108,3 +1108,44 @@ Michael Fitzke Next Generation Technologies Sr Leader, Mars Inc.
+
+`BNP Paribas Cardif <https://www.bnpparibascardif.com/>`_
+---------------------------------------------------------
+
+.. raw:: html
+
+
+
+
+BNP Paribas Cardif uses scikit-learn for several of its machine learning models
+in production. Our internal community of developers and data scientists has
+been using scikit-learn since 2015, for several reasons: the quality of the
+developments, documentation and contribution governance, and the sheer size of
+the contributing community. We even explicitly mention the use of
+scikit-learn's pipelines in our internal model risk governance as one of our
+good practices to decrease operational risks and overfitting risk. As a way to
+support open source software development and in particular scikit-learn
+project, we decided to participate to scikit-learn's consortium at La Fondation
+Inria since its creation in 2018.
+
+.. raw:: html
+
+
+
+Sébastien Conort, Chief Data Scientist, BNP Paribas Cardif
+
+.. raw:: html
+
+
+
+
+.. image:: images/bnp_paribas_cardif.png
+   :width: 120pt
+   :align: center
+   :target: https://www.bnpparibascardif.com/
+
+.. raw:: html
+
+
+
From 1045d16ec13b1cab7878e7555538573d1884aad3 Mon Sep 17 00:00:00 2001 From: Maria Telenczuk Date: Wed, 3 Mar 2021 18:43:17 +0100 Subject: [PATCH 225/478] PERF don't compute variance when normalize is False in linear models (#19606) --- sklearn/linear_model/_base.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 28cc386b4ecda..1842620dfa105 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -235,17 +235,19 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, if not return_mean: X_offset[:] = X.dtype.type(0) else: - X_offset, X_var, _ = _incremental_mean_and_var( - X, last_mean=0., last_variance=0., last_sample_count=0., - sample_weight=sample_weight - ) + if normalize: + X_offset, X_var, _ = _incremental_mean_and_var( + X, last_mean=0., last_variance=0., last_sample_count=0., + sample_weight=sample_weight + ) + else: + X_offset = np.average(X, axis=0, weights=sample_weight) - X_offset = X_offset.astype(X.dtype) + X_offset = X_offset.astype(X.dtype, copy=False) X -= X_offset - X_var = X_var.astype(X.dtype, copy=False) - if normalize: + X_var = X_var.astype(X.dtype, copy=False) # Detect constant features on the computed variance, before taking # the np.sqrt. Otherwise constant features cannot be detected with # sample_weights. From 42e90e9ba28fb37c2c9bd3e8aed1ac2387f1d5d5 Mon Sep 17 00:00:00 2001 From: RichardScottOZ <72196131+RichardScottOZ@users.noreply.github.com> Date: Mon, 8 Mar 2021 01:24:16 +1030 Subject: [PATCH 226/478] DOC Fixes spelling mistake in _kmeans.py (#19634) Compatibility typo --- sklearn/cluster/_kmeans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index c1d889b37db2d..1c54a5c9ff9e0 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -264,7 +264,7 @@ def k_means(X, n_clusters, *, sample_weight=None, init='k-means++', intensive due to the allocation of an extra array of shape (n_samples, n_clusters). - For now "auto" (kept for backward compatibiliy) chooses "elkan" but it + For now "auto" (kept for backward compatibility) chooses "elkan" but it might change in the future for a better heuristic. 
return_n_iter : bool, default=False From f2773e840a0fcc9dd673cdd0da82dc43299a713b Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Mon, 8 Mar 2021 18:03:56 +0000 Subject: [PATCH 227/478] TST replace assert_raise_* by pytest.raises in tests/test_multioutput.py (#19618) Co-authored-by: Alihan Zihna --- sklearn/tests/test_multioutput.py | 36 ++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/sklearn/tests/test_multioutput.py b/sklearn/tests/test_multioutput.py index edfcdef1bf89c..87e5218e08e22 100644 --- a/sklearn/tests/test_multioutput.py +++ b/sklearn/tests/test_multioutput.py @@ -5,9 +5,6 @@ from joblib import cpu_count from sklearn.utils._testing import assert_almost_equal -from sklearn.utils._testing import assert_raises -from sklearn.utils._testing import assert_raises_regex -from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn import datasets @@ -80,7 +77,9 @@ def test_multi_target_regression_one_target(): # Test multi target regression raises X, y = datasets.make_regression(n_targets=1) rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0)) - assert_raises(ValueError, rgr.fit, X, y) + msg = 'at least two dimensions' + with pytest.raises(ValueError, match=msg): + rgr.fit(X, y) def test_multi_target_sparse_regression(): @@ -106,8 +105,9 @@ def test_multi_target_sample_weights_api(): w = [0.8, 0.6] rgr = MultiOutputRegressor(OrthogonalMatchingPursuit()) - assert_raises_regex(ValueError, "does not support sample weights", - rgr.fit, X, y, w) + msg = "does not support sample weights" + with pytest.raises(ValueError, match=msg): + rgr.fit(X, y, w) # no exception should be raised if the base estimator supports weights rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0)) @@ -252,9 +252,9 @@ def test_multi_output_classification_partial_fit(): def test_multi_output_classification_partial_fit_no_first_classes_exception(): sgd_linear_clf = SGDClassifier(loss='log', random_state=1, max_iter=5) multi_target_linear = MultiOutputClassifier(sgd_linear_clf) - assert_raises_regex(ValueError, "classes must be passed on the first call " - "to partial_fit.", - multi_target_linear.partial_fit, X, y) + msg = "classes must be passed on the first call to partial_fit." 
+ with pytest.raises(ValueError, match=msg): + multi_target_linear.partial_fit(X, y) def test_multi_output_classification(): @@ -386,17 +386,27 @@ def test_multi_output_exceptions(): # NotFittedError when fit is not done but score, predict and # and predict_proba are called moc = MultiOutputClassifier(LinearSVC(random_state=0)) - assert_raises(NotFittedError, moc.predict, y) + + with pytest.raises(NotFittedError): + moc.predict(y) + with pytest.raises(NotFittedError): moc.predict_proba - assert_raises(NotFittedError, moc.score, X, y) + + with pytest.raises(NotFittedError): + moc.score(X, y) + # ValueError when number of outputs is different # for fit and score y_new = np.column_stack((y1, y2)) moc.fit(X, y) - assert_raises(ValueError, moc.score, X, y_new) + with pytest.raises(ValueError): + moc.score(X, y_new) + # ValueError when y is continuous - assert_raise_message(ValueError, "Unknown label type", moc.fit, X, X[:, 1]) + msg = "Unknown label type" + with pytest.raises(ValueError, match=msg): + moc.fit(X, X[:, 1]) def generate_multilabel_dataset_with_correlations(): From ae3d955c90d03479d4b6a8a3b359fba10826dc2a Mon Sep 17 00:00:00 2001 From: Mohamed Haseeb Date: Tue, 9 Mar 2021 17:45:57 +0100 Subject: [PATCH 228/478] TST Uses pytest.raises in model_selection/tests (#19621) --- sklearn/model_selection/tests/test_search.py | 72 +++++++++++--------- sklearn/model_selection/tests/test_split.py | 72 +++++++++++--------- 2 files changed, 80 insertions(+), 64 deletions(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index f9e0babebe3ad..c71c812b3368f 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -14,7 +14,6 @@ import pytest from sklearn.utils._testing import ( - assert_raise_message, assert_array_equal, assert_array_almost_equal, assert_allclose, @@ -270,8 +269,8 @@ def test_grid_search_no_score(): # giving no scoring function raises an error grid_search_no_score = GridSearchCV(clf_no_score, {'C': Cs}) - assert_raise_message(TypeError, "no scoring", grid_search_no_score.fit, - [[1]]) + with pytest.raises(TypeError, match="no scoring"): + grid_search_no_score.fit([[1]]) def test_grid_search_score_method(): @@ -316,11 +315,11 @@ def test_grid_search_groups(): group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(n_splits=3), GroupShuffleSplit()] + error_msg = "The 'groups' parameter should not be None." for cv in group_cvs: gs = GridSearchCV(clf, grid, cv=cv) - assert_raise_message(ValueError, - "The 'groups' parameter should not be None.", - gs.fit, X, y) + with pytest.raises(ValueError, match=error_msg): + gs.fit(X, y) gs.fit(X, y, groups=groups) non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit()] @@ -385,20 +384,21 @@ def test_no_refit(): # error messages for fn_name in ('predict', 'predict_proba', 'predict_log_proba', 'transform', 'inverse_transform'): - assert_raise_message(NotFittedError, - ('refit=False. %s is available only after ' - 'refitting on the best parameters' - % fn_name), getattr(grid_search, fn_name), X) + error_msg = (f"refit=False. 
{fn_name} is available only after " + f"refitting on the best parameters") + with pytest.raises(NotFittedError, match=error_msg): + getattr(grid_search, fn_name)(X) # Test that an invalid refit param raises appropriate error messages + error_msg = ("For multi-metric scoring, the parameter refit must be set to" + " a scorer key") for refit in ["", 5, True, 'recall', 'accuracy']: - assert_raise_message(ValueError, "For multi-metric scoring, the " - "parameter refit must be set to a scorer key", - GridSearchCV(clf, {}, refit=refit, - scoring={'acc': 'accuracy', - 'prec': 'precision'} - ).fit, - X, y) + with pytest.raises(ValueError, match=error_msg): + GridSearchCV( + clf, {}, + refit=refit, + scoring={'acc': 'accuracy', 'prec': 'precision'} + ).fit(X, y) def test_grid_search_error(): @@ -437,30 +437,33 @@ def test_grid_search_when_param_grid_includes_range(): def test_grid_search_bad_param_grid(): param_dict = {"C": 1} clf = SVC(gamma='auto') - assert_raise_message( - ValueError, + error_msg = re.escape( "Parameter grid for parameter (C) needs to" " be a list or numpy array, but got ()." " Single values need to be wrapped in a list" - " with one element.", - GridSearchCV, clf, param_dict) + " with one element." + ) + with pytest.raises(ValueError, match=error_msg): + GridSearchCV(clf, param_dict) param_dict = {"C": []} clf = SVC() - assert_raise_message( - ValueError, - "Parameter values for parameter (C) need to be a non-empty sequence.", - GridSearchCV, clf, param_dict) + error_msg = re.escape( + "Parameter values for parameter (C) need to be a non-empty sequence." + ) + with pytest.raises(ValueError, match=error_msg): + GridSearchCV(clf, param_dict) param_dict = {"C": "1,2,3"} clf = SVC(gamma='auto') - assert_raise_message( - ValueError, + error_msg = re.escape( "Parameter grid for parameter (C) needs to" " be a list or numpy array, but got ()." " Single values need to be wrapped in a list" - " with one element.", - GridSearchCV, clf, param_dict) + " with one element." 
+ ) + with pytest.raises(ValueError, match=error_msg): + GridSearchCV(clf, param_dict) param_dict = {"C": np.ones((3, 2))} clf = SVC() @@ -1293,10 +1296,13 @@ def test_fit_grid_point(): assert n_test_samples == test.size # Should raise an error upon multimetric scorer - assert_raise_message(ValueError, "For evaluating multiple scores, use " - "sklearn.model_selection.cross_validate instead.", - fit_grid_point, X, y, svc, params, train, test, - {'score': scorer}, verbose=True) + error_msg = ("For evaluating multiple scores, use " + "sklearn.model_selection.cross_validate instead.") + with pytest.raises(ValueError, match=error_msg): + fit_grid_point( + X, y, svc, params, train, test, {'score': scorer}, + verbose=True + ) # FIXME remove test_fit_grid_point_deprecated as diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index e6900c90e7a87..80c19c7f2e08c 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1,6 +1,7 @@ """Test the split module""" import warnings import pytest +import re import numpy as np from scipy.sparse import coo_matrix, csc_matrix, csr_matrix from scipy import stats @@ -12,7 +13,6 @@ from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import ignore_warnings from sklearn.utils.validation import _num_samples from sklearn.utils._mocking import MockDataFrame @@ -116,10 +116,10 @@ def test_cross_validator_with_default_params(): # ValueError for get_n_splits methods msg = "The 'X' parameter should not be None." - assert_raise_message(ValueError, msg, - loo.get_n_splits, None, y, groups) - assert_raise_message(ValueError, msg, - lpo.get_n_splits, None, y, groups) + with pytest.raises(ValueError, match=msg): + loo.get_n_splits(None, y, groups) + with pytest.raises(ValueError, match=msg): + lpo.get_n_splits(None, y, groups) def test_2d_y(): @@ -214,10 +214,10 @@ def test_kfold_valueerrors(): KFold(1) error_string = ("k-fold cross-validation requires at least one" " train/test split") - assert_raise_message(ValueError, error_string, - StratifiedKFold, 0) - assert_raise_message(ValueError, error_string, - StratifiedKFold, 1) + with pytest.raises(ValueError, match=error_string): + StratifiedKFold(0) + with pytest.raises(ValueError, match=error_string): + StratifiedKFold(1) # When n_splits is not integer: with pytest.raises(ValueError): @@ -858,10 +858,10 @@ def test_leave_one_p_group_out(): lpgo_2.get_n_splits(None, None, [0.0, np.inf, 0.0]) msg = "The 'groups' parameter should not be None." 
- assert_raise_message(ValueError, msg, - logo.get_n_splits, None, None, None) - assert_raise_message(ValueError, msg, - lpgo_1.get_n_splits, None, None, None) + with pytest.raises(ValueError, match=msg): + logo.get_n_splits(None, None, None) + with pytest.raises(ValueError, match=msg): + lpgo_1.get_n_splits(None, None, None) def test_leave_group_out_changing_groups(): @@ -891,27 +891,37 @@ def test_leave_group_out_changing_groups(): def test_leave_one_p_group_out_error_on_fewer_number_of_groups(): X = y = groups = np.ones(0) - assert_raise_message(ValueError, "Found array with 0 sample(s)", next, - LeaveOneGroupOut().split(X, y, groups)) + msg = re.escape("Found array with 0 sample(s)") + with pytest.raises(ValueError, match=msg): + next(LeaveOneGroupOut().split(X, y, groups)) + X = y = groups = np.ones(1) - msg = ("The groups parameter contains fewer than 2 unique groups ({}). " - "LeaveOneGroupOut expects at least 2.").format(groups) - assert_raise_message(ValueError, msg, next, - LeaveOneGroupOut().split(X, y, groups)) + msg = re.escape( + f"The groups parameter contains fewer than 2 unique groups ({groups})." + f" LeaveOneGroupOut expects at least 2." + ) + with pytest.raises(ValueError, match=msg): + next(LeaveOneGroupOut().split(X, y, groups)) + X = y = groups = np.ones(1) - msg = ("The groups parameter contains fewer than (or equal to) n_groups " - "(3) numbers of unique groups ({}). LeavePGroupsOut expects " - "that at least n_groups + 1 (4) unique groups " - "be present").format(groups) - assert_raise_message(ValueError, msg, next, - LeavePGroupsOut(n_groups=3).split(X, y, groups)) + msg = re.escape( + f"The groups parameter contains fewer than (or equal to) n_groups " + f"(3) numbers of unique groups ({groups}). LeavePGroupsOut expects " + f"that at least n_groups + 1 (4) unique groups " + f"be present" + ) + with pytest.raises(ValueError, match=msg): + next(LeavePGroupsOut(n_groups=3).split(X, y, groups)) + X = y = groups = np.arange(3) - msg = ("The groups parameter contains fewer than (or equal to) n_groups " - "(3) numbers of unique groups ({}). LeavePGroupsOut expects " - "that at least n_groups + 1 (4) unique groups " - "be present").format(groups) - assert_raise_message(ValueError, msg, next, - LeavePGroupsOut(n_groups=3).split(X, y, groups)) + msg = re.escape( + f"The groups parameter contains fewer than (or equal to) n_groups " + f"(3) numbers of unique groups ({groups}). LeavePGroupsOut expects " + f"that at least n_groups + 1 (4) unique groups " + f"be present" + ) + with pytest.raises(ValueError, match=msg): + next(LeavePGroupsOut(n_groups=3).split(X, y, groups)) @ignore_warnings From ad7c316d9d05b937aba98d6601db99b4dadf2e52 Mon Sep 17 00:00:00 2001 From: Alek Lefebvre Date: Wed, 10 Mar 2021 05:27:23 -0500 Subject: [PATCH 229/478] Fix Calibrated classifier cv predictions with pipeline (#19641) Co-authored-by: Alek Lefebvre Co-authored-by: Olivier Grisel --- doc/whats_new/v1.0.rst | 7 +++++++ sklearn/calibration.py | 17 +++++++---------- sklearn/tests/test_calibration.py | 15 ++++++++++++--- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 3e36438dda095..a566d03ae1bbc 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -207,6 +207,13 @@ Changelog for non-English characters. :pr:`18959` by :user:`Zero ` and :user:`wstates `. +:mod:`sklearn.calibration` +............................ 
+ +- |Fix| The predict and predict_proba methods of + :class:`calibration.CalibratedClassifierCV can now properly be used on + prefitted pipelines. :pr:`19641` by :user:`Alek Lefebvre ` + Code and Documentation Contributors ----------------------------------- diff --git a/sklearn/calibration.py b/sklearn/calibration.py index bff7f6c03502f..b60a415b4419b 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -24,15 +24,15 @@ MetaEstimatorMixin) from .preprocessing import label_binarize, LabelEncoder from .utils import ( - check_array, column_or_1d, deprecated, indexable, ) + from .utils.multiclass import check_classification_targets from .utils.fixes import delayed from .utils.validation import check_is_fitted, check_consistent_length -from .utils.validation import _check_sample_weight +from .utils.validation import _check_sample_weight, _num_samples from .pipeline import Pipeline from .isotonic import IsotonicRegression from .svm import LinearSVC @@ -344,8 +344,7 @@ def predict_proba(self, X): Parameters ---------- - X : array-like of shape (n_samples, n_features) - The samples. + X : The samples, as accepted by base_estimator.predict_proba Returns ------- @@ -353,11 +352,10 @@ def predict_proba(self, X): The predicted probas. """ check_is_fitted(self) - X = check_array(X, accept_sparse=['csc', 'csr', 'coo'], - force_all_finite=False) + # Compute the arithmetic mean of the predictions of the calibrated # classifiers - mean_proba = np.zeros((X.shape[0], len(self.classes_))) + mean_proba = np.zeros((_num_samples(X), len(self.classes_))) for calibrated_classifier in self.calibrated_classifiers_: proba = calibrated_classifier.predict_proba(X) mean_proba += proba @@ -373,8 +371,7 @@ class that has the highest probability, and can thus be different Parameters ---------- - X : array-like of shape (n_samples, n_features) - The samples. + X : The samples, as accepted by base_estimator.predict Returns ------- @@ -643,7 +640,7 @@ def predict_proba(self, X): self.base_estimator.classes_ ) - proba = np.zeros((X.shape[0], n_classes)) + proba = np.zeros((_num_samples(X), n_classes)) for class_idx, this_pred, calibrator in \ zip(pos_class_indices, predictions.T, self.calibrators): if n_classes == 2: diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py index 4ba1599eba3e6..86a638c4a7679 100644 --- a/sklearn/tests/test_calibration.py +++ b/sklearn/tests/test_calibration.py @@ -533,9 +533,14 @@ def text_data_pipeline(text_data): def test_calibration_pipeline(text_data, text_data_pipeline): - # Test that calibration works in prefit pipeline with transformer, - # where `X` is not array-like, sparse matrix or dataframe at the start. - # See https://github.com/scikit-learn/scikit-learn/issues/8710 + """Test that calibration works in prefit pipeline with transformer + + `X` is not array-like, sparse matrix or dataframe at the start. + See https://github.com/scikit-learn/scikit-learn/issues/8710 + + Also test it can predict without running into validation errors. 
+ See https://github.com/scikit-learn/scikit-learn/issues/19637 + """ X, y = text_data clf = text_data_pipeline calib_clf = CalibratedClassifierCV(clf, cv='prefit') @@ -546,6 +551,10 @@ def test_calibration_pipeline(text_data, text_data_pipeline): with pytest.raises(AttributeError, match=msg): calib_clf.n_features_in_ + # Ensure that no error is thrown with predict and predict_proba + calib_clf.predict(X) + calib_clf.predict_proba(X) + @pytest.mark.parametrize('clf, cv', [ pytest.param(LinearSVC(C=1), 2), From 4beb0c27fc0439c12dad244fe4063e96f8983a52 Mon Sep 17 00:00:00 2001 From: Geoffrey Thomas Date: Wed, 10 Mar 2021 08:26:22 -0500 Subject: [PATCH 230/478] MNT Make setup.py command parsing more robust (#19650) Separate arguments into options and commands, and use setuptools.setup if all the requested commands can / should run without NumPy installed, even if there are options present. This fixes a bug where `setup.py --no-user-site egg_info` wants NumPy to be installed, even though `setup.py egg_info` works fine. --- setup.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index f2d832a459d89..e44f941e0a114 100755 --- a/setup.py +++ b/setup.py @@ -266,14 +266,9 @@ def setup_package(): package_data={'': ['*.pxd']}, **extra_setuptools_args) - if len(sys.argv) == 1 or ( - len(sys.argv) >= 2 and ('--help' in sys.argv[1:] or - sys.argv[1] in ('--help-commands', - 'egg_info', - 'dist_info', - '--version', - 'clean', - 'check'))): + commands = [arg for arg in sys.argv[1:] if not arg.startswith('-')] + if all(command in ('egg_info', 'dist_info', 'clean', 'check') + for command in commands): # These actions are required to succeed without Numpy for example when # pip is used to install Scikit-learn when Numpy is not yet present in # the system. From 58af0196acc96603c9669b5bbc0c18cf118a150e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 11 Mar 2021 15:16:45 +0100 Subject: [PATCH 231/478] DOC Use term 'black people' instead of 'blacks' in Boston descr (#19661) --- sklearn/datasets/descr/boston_house_prices.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/descr/boston_house_prices.rst b/sklearn/datasets/descr/boston_house_prices.rst index dec9b999cd592..948bccf080c82 100644 --- a/sklearn/datasets/descr/boston_house_prices.rst +++ b/sklearn/datasets/descr/boston_house_prices.rst @@ -21,7 +21,7 @@ Boston house prices dataset - RAD index of accessibility to radial highways - TAX full-value property-tax rate per $10,000 - PTRATIO pupil-teacher ratio by town - - B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town + - B 1000(Bk - 0.63)^2 where Bk is the proportion of black people by town - LSTAT % lower status of the population - MEDV Median value of owner-occupied homes in $1000's From 598045569c8f96fb345059f5316ea8903d374ff4 Mon Sep 17 00:00:00 2001 From: Jon Crall Date: Thu, 11 Mar 2021 10:25:13 -0500 Subject: [PATCH 232/478] ENH Speedup confusion_matrix (#9843) Co-authored-by: Guillaume Lemaitre --- doc/whats_new/v1.0.rst | 4 ++++ sklearn/metrics/_classification.py | 26 +++++++++++++++++--------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index a566d03ae1bbc..0f4882f1b2970 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -169,6 +169,10 @@ Changelog quantile regression. :pr:`19415` by :user:`Xavier Dupré ` and :user:`Oliver Grisel `. 
+- |Efficiency| Improved speed of :func:`metrics.confusion_matrix` when labels + are integral. + :pr:`9843` by :user:`Jon Crall `. + :mod:`sklearn.naive_bayes` .......................... diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 708bde662e765..b4ab145d80937 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -309,7 +309,7 @@ def confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, raise ValueError("'labels' should contains at least one label.") elif y_true.size == 0: return np.zeros((n_labels, n_labels), dtype=int) - elif np.all([l not in y_true for l in labels]): + elif len(np.intersect1d(y_true, labels)) == 0: raise ValueError("At least one label specified must be in y_true") if sample_weight is None: @@ -324,17 +324,25 @@ def confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, "'all', None}") n_labels = labels.size - label_to_ind = {y: x for x, y in enumerate(labels)} - # convert yt, yp into index - y_pred = np.array([label_to_ind.get(x, n_labels + 1) for x in y_pred]) - y_true = np.array([label_to_ind.get(x, n_labels + 1) for x in y_true]) + # If labels are not consecutive integers starting from zero, then + # y_true and y_pred must be converted into index form + need_index_conversion = not ( + labels.dtype.kind in {'i', 'u', 'b'} and + np.all(labels == np.arange(n_labels)) and + y_true.min() >= 0 and y_pred.min() >= 0 + ) + if need_index_conversion: + label_to_ind = {y: x for x, y in enumerate(labels)} + y_pred = np.array([label_to_ind.get(x, n_labels + 1) for x in y_pred]) + y_true = np.array([label_to_ind.get(x, n_labels + 1) for x in y_true]) # intersect y_pred, y_true with labels, eliminate items not in labels ind = np.logical_and(y_pred < n_labels, y_true < n_labels) - y_pred = y_pred[ind] - y_true = y_true[ind] - # also eliminate weights of eliminated items - sample_weight = sample_weight[ind] + if not np.all(ind): + y_pred = y_pred[ind] + y_true = y_true[ind] + # also eliminate weights of eliminated items + sample_weight = sample_weight[ind] # Choose the accumulator dtype to always have high precision if sample_weight.dtype.kind in {'i', 'u', 'b'}: From 6f180d79f58b42a3fa06055c489b1edf857399ff Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 11 Mar 2021 12:34:04 -0500 Subject: [PATCH 233/478] BUG Fixes verbose > 2 for grid search (#19659) --- doc/whats_new/v0.24.rst | 7 +++++++ sklearn/model_selection/_validation.py | 20 ++++++++++++++------ sklearn/model_selection/tests/test_search.py | 19 +++++++++++++++++++ 3 files changed, 40 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 84e712c05ea79..68ea8ba0f7a72 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -33,6 +33,13 @@ Changelog sample_weight object is not modified anymore. :pr:`19182` by :user:`Yosuke KOBAYASHI `. +:mod:`sklearn.model_selection` +.............................. + +- |Fix| :class:`model_selection.RandomizedSearchCV` and + :class:`model_selection.GridSearchCV` now correctly shows the score for + single metrics and verbose > 2. :pr:`19659` by `Thomas Fan`_. + :mod:`sklearn.preprocessing` ............................ 
diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 63f9a53fcf91f..e61e693b2fa74 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -631,13 +631,21 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, total_time = score_time + fit_time end_msg = f"[CV{progress_msg}] END " result_msg = params_msg + (";" if params_msg else "") - if verbose > 2 and isinstance(test_scores, dict): - for scorer_name in sorted(test_scores): - result_msg += f" {scorer_name}: (" + if verbose > 2: + if isinstance(test_scores, dict): + for scorer_name in sorted(test_scores): + result_msg += f" {scorer_name}: (" + if return_train_score: + scorer_scores = train_scores[scorer_name] + result_msg += f"train={scorer_scores:.3f}, " + result_msg += f"test={test_scores[scorer_name]:.3f})" + else: + result_msg += ", score=" if return_train_score: - scorer_scores = train_scores[scorer_name] - result_msg += f"train={scorer_scores:.3f}, " - result_msg += f"test={test_scores[scorer_name]:.3f})" + result_msg += (f"(train={train_scores:.3f}, " + f"test={test_scores:.3f})") + else: + result_msg += f"{test_scores:.3f}" result_msg += f" total time={logger.short_format_time(total_time)}" # Right align the result_msg diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index c71c812b3368f..25c4ce8cc22f7 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -2137,3 +2137,22 @@ def test_search_cv_using_minimal_compatible_estimator(SearchCV, Predictor): else: assert_allclose(y_pred, y.mean()) assert search.score(X, y) == pytest.approx(r2_score(y, y_pred)) + + +@pytest.mark.parametrize("return_train_score", [True, False]) +def test_search_cv_verbose_3(capsys, return_train_score): + """Check that search cv with verbose>2 shows the score for single + metrics. non-regression test fo #19658.""" + X, y = make_classification(n_samples=100, n_classes=2, flip_y=.2, + random_state=0) + clf = LinearSVC(random_state=0) + grid = {'C': [.1]} + + GridSearchCV(clf, grid, scoring='accuracy', verbose=3, cv=3, + return_train_score=return_train_score).fit(X, y) + captured = capsys.readouterr().out + if return_train_score: + match = re.findall(r"score=\(train=[\d\.]+, test=[\d.]+\)", captured) + else: + match = re.findall(r"score=[\d\.]+", captured) + assert len(match) == 3 From bfd7b58c1d0d459257687da25419edb052443528 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 12 Mar 2021 08:48:24 -0500 Subject: [PATCH 234/478] TST Do not use cache in test_fetch_openml_iris (#19594) --- sklearn/datasets/tests/test_openml.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 9f55909c6643b..dac0762eb2160 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -772,15 +772,12 @@ def test_fetch_openml_iris(monkeypatch, gzip_response): data_name = 'iris' _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) - assert_warns_message( - UserWarning, - "Multiple active versions of the dataset matching the name" - " iris exist. Versions may be fundamentally different, " - "returning version 1.", - fetch_openml, - name=data_name, - as_frame=False - ) + + msg = ("Multiple active versions of the dataset matching the name" + " iris exist. 
Versions may be fundamentally different, " + "returning version 1.") + with pytest.warns(UserWarning, match=msg): + fetch_openml(name=data_name, as_frame=False, cache=False) def test_decode_iris(monkeypatch): From 5ccfabf08d13f50dc3f5b8a8e38dd362ab594c6e Mon Sep 17 00:00:00 2001 From: shivamgargsya Date: Fri, 12 Mar 2021 20:16:50 +0530 Subject: [PATCH 235/478] TST Change assert from sklearn to pytest style in module linear_model/tests (#19565) --- sklearn/linear_model/_omp.py | 8 ++- .../tests/test_coordinate_descent.py | 36 +++++++--- .../linear_model/tests/test_least_angle.py | 7 +- sklearn/linear_model/tests/test_logistic.py | 70 +++++++++++-------- sklearn/linear_model/tests/test_omp.py | 26 ++++--- sklearn/linear_model/tests/test_ransac.py | 11 ++- sklearn/linear_model/tests/test_ridge.py | 13 ++-- sklearn/linear_model/tests/test_sgd.py | 9 ++- .../tests/test_sparse_coordinate_descent.py | 9 ++- sklearn/linear_model/tests/test_theil_sen.py | 10 ++- 10 files changed, 133 insertions(+), 66 deletions(-) diff --git a/sklearn/linear_model/_omp.py b/sklearn/linear_model/_omp.py index e100cdef04fdb..3f995f0f34318 100644 --- a/sklearn/linear_model/_omp.py +++ b/sklearn/linear_model/_omp.py @@ -20,9 +20,11 @@ from ..utils.fixes import delayed from ..model_selection import check_cv -premature = """ Orthogonal matching pursuit ended prematurely due to linear -dependence in the dictionary. The requested precision might not have been met. -""" +premature = ( + "Orthogonal matching pursuit ended prematurely due to linear" + " dependence in the dictionary. The requested precision might" + " not have been met." +) def _cholesky_omp(X, y, n_nonzero_coefs, tol=None, copy_X=True, diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index 3eba535d70c89..ebddb6a7e47c6 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -19,8 +19,6 @@ from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_almost_equal -from sklearn.utils._testing import assert_warns -from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import ignore_warnings from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import _convert_container @@ -646,7 +644,13 @@ def test_lasso_alpha_warning(): Y = [-1, 0, 1] # just a straight line clf = Lasso(alpha=0) - assert_warns(UserWarning, clf.fit, X, Y) + warning_message = ( + "With alpha=0, this algorithm does not " + "converge well. You are advised to use the " + "LinearRegression estimator" + ) + with pytest.warns(UserWarning, match=warning_message): + clf.fit(X, Y) def test_lasso_positive_constraint(): @@ -733,7 +737,12 @@ def test_multi_task_lasso_and_enet(): assert_array_almost_equal(clf.coef_[0], clf.coef_[1]) clf = MultiTaskElasticNet(alpha=1.0, tol=1e-8, max_iter=1) - assert_warns_message(ConvergenceWarning, 'did not converge', clf.fit, X, Y) + warning_message = ( + "Objective did not converge. You might want to " + "increase the number of iterations." 
+ ) + with pytest.warns(ConvergenceWarning, match=warning_message): + clf.fit(X, Y) def test_lasso_readonly_data(): @@ -1075,11 +1084,13 @@ def test_overrided_gram_matrix(): X, y, _, _ = build_dataset(n_samples=20, n_features=10) Gram = X.T.dot(X) clf = ElasticNet(selection='cyclic', tol=1e-8, precompute=Gram) - assert_warns_message(UserWarning, - "Gram matrix was provided but X was centered" - " to fit intercept, " - "or X was normalized : recomputing Gram matrix.", - clf.fit, X, y) + warning_message = ( + "Gram matrix was provided but X was centered" + " to fit intercept, " + "or X was normalized : recomputing Gram matrix." + ) + with pytest.warns(UserWarning, match=warning_message): + clf.fit(X, y) @pytest.mark.parametrize('model', [ElasticNet, Lasso]) @@ -1214,7 +1225,12 @@ def test_enet_coordinate_descent(klass, n_classes, kwargs): y = np.ones((n_samples, n_classes)) if klass == Lasso: y = y.ravel() - assert_warns(ConvergenceWarning, clf.fit, X, y) + warning_message = ( + "Objective did not converge. You might want to" + " increase the number of iterations." + ) + with pytest.warns(ConvergenceWarning, match=warning_message): + clf.fit(X, y) def test_convergence_warnings(): diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index 96c5a8fedbf14..a8b0e939c080d 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -10,7 +10,6 @@ from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_raises from sklearn.utils._testing import ignore_warnings -from sklearn.utils._testing import assert_warns from sklearn.utils._testing import TempMemmap from sklearn.utils.fixes import np_version, parse_version from sklearn.exceptions import ConvergenceWarning @@ -372,7 +371,11 @@ def objective_function(coef): + alpha * linalg.norm(coef, 1)) lars = linear_model.LassoLars(alpha=alpha, normalize=False) - assert_warns(ConvergenceWarning, lars.fit, X, y) + warning_message = ( + "Regressors in active set degenerate." + ) + with pytest.warns(ConvergenceWarning, match=warning_message): + lars.fit(X, y) lars_coef_ = lars.coef_ lars_obj = objective_function(lars_coef_) diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 329f4f72f935b..bdc9a4a24914b 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -19,9 +19,7 @@ from sklearn.model_selection import cross_val_score from sklearn.preprocessing import LabelEncoder, StandardScaler from sklearn.utils import compute_class_weight, _IS_32BIT -from sklearn.utils._testing import assert_warns from sklearn.utils._testing import ignore_warnings -from sklearn.utils._testing import assert_warns_message from sklearn.utils import shuffle from sklearn.linear_model import SGDClassifier from sklearn.preprocessing import scale @@ -155,11 +153,13 @@ def test_lr_liblinear_warning(): target = iris.target_names[iris.target] lr = LogisticRegression(solver='liblinear', n_jobs=2) - assert_warns_message(UserWarning, - "'n_jobs' > 1 does not have any effect when" - " 'solver' is set to 'liblinear'. Got 'n_jobs'" - " = 2.", - lr.fit, iris.data, target) + warning_message = ( + "'n_jobs' > 1 does not have any effect when" + " 'solver' is set to 'liblinear'. Got 'n_jobs'" + " = 2." 
+ ) + with pytest.warns(UserWarning, match=warning_message): + lr.fit(iris.data, target) def test_predict_3_classes(): @@ -1188,23 +1188,34 @@ def test_logreg_predict_proba_multinomial(): assert clf_wrong_loss > clf_multi_loss -def test_max_iter(): +@pytest.mark.parametrize("max_iter", np.arange(1, 5)) +@pytest.mark.parametrize("multi_class", ['ovr', 'multinomial']) +@pytest.mark.parametrize( + "solver, message", + [("newton-cg", "newton-cg failed to converge. Increase the " + "number of iterations."), + ("liblinear", "Liblinear failed to converge, increase the " + "number of iterations."), + ("sag", "The max_iter was reached which means the " + "coef_ did not converge"), + ("saga", "The max_iter was reached which means the " + "coef_ did not converge"), + ("lbfgs", "lbfgs failed to converge")]) +def test_max_iter(max_iter, multi_class, solver, message): # Test that the maximum number of iteration is reached X, y_bin = iris.data, iris.target.copy() y_bin[y_bin == 2] = 0 - solvers = ['newton-cg', 'liblinear', 'sag', 'saga', 'lbfgs'] + if solver == 'liblinear' and multi_class == 'multinomial': + pytest.skip("'multinomial' is unavailable when solver='liblinear'") + + lr = LogisticRegression(max_iter=max_iter, tol=1e-15, + multi_class=multi_class, + random_state=0, solver=solver) + with pytest.warns(ConvergenceWarning, match=message): + lr.fit(X, y_bin) - for max_iter in range(1, 5): - for solver in solvers: - for multi_class in ['ovr', 'multinomial']: - if solver == 'liblinear' and multi_class == 'multinomial': - continue - lr = LogisticRegression(max_iter=max_iter, tol=1e-15, - multi_class=multi_class, - random_state=0, solver=solver) - assert_warns(ConvergenceWarning, lr.fit, X, y_bin) - assert lr.n_iter_[0] == max_iter + assert lr.n_iter_[0] == max_iter @pytest.mark.parametrize('solver', @@ -1644,12 +1655,11 @@ def test_l1_ratio_param(l1_ratio): l1_ratio=l1_ratio).fit(X, Y1) if l1_ratio is not None: - msg = ("l1_ratio parameter is only used when penalty is 'elasticnet'." - " Got (penalty=l1)") - - assert_warns_message(UserWarning, msg, - LogisticRegression(penalty='l1', solver='saga', - l1_ratio=l1_ratio).fit, X, Y1) + msg = (r"l1_ratio parameter is only used when penalty is" + r" 'elasticnet'\. Got \(penalty=l1\)") + with pytest.warns(UserWarning, match=msg): + LogisticRegression(penalty='l1', solver='saga', + l1_ratio=l1_ratio).fit(X, Y1) @pytest.mark.parametrize('l1_ratios', ([], [.5, 2], None, 'something_wrong')) @@ -1664,11 +1674,12 @@ def test_l1_ratios_param(l1_ratios): l1_ratios=l1_ratios, cv=2).fit(X, Y1) if l1_ratios is not None: - msg = ("l1_ratios parameter is only used when penalty is " - "'elasticnet'. Got (penalty=l1)") + msg = (r"l1_ratios parameter is only used when penalty" + r" is 'elasticnet'. 
Got \(penalty=l1\)") function = LogisticRegressionCV(penalty='l1', solver='saga', l1_ratios=l1_ratios, cv=2).fit - assert_warns_message(UserWarning, msg, function, X, Y1) + with pytest.warns(UserWarning, match=msg): + function(X, Y1) @pytest.mark.parametrize('C', np.logspace(-3, 2, 4)) @@ -1769,7 +1780,8 @@ def test_penalty_none(solver): msg = "Setting penalty='none' will ignore the C" lr = LogisticRegression(penalty='none', solver=solver, C=4) - assert_warns_message(UserWarning, msg, lr.fit, X, y) + with pytest.warns(UserWarning, match=msg): + lr.fit(X, y) lr_none = LogisticRegression(penalty='none', solver=solver, random_state=0) diff --git a/sklearn/linear_model/tests/test_omp.py b/sklearn/linear_model/tests/test_omp.py index f3f3080aebe66..3cbda003f0148 100644 --- a/sklearn/linear_model/tests/test_omp.py +++ b/sklearn/linear_model/tests/test_omp.py @@ -2,11 +2,11 @@ # License: BSD 3 clause import numpy as np +import pytest from sklearn.utils._testing import assert_raises from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_warns from sklearn.utils._testing import ignore_warnings @@ -76,12 +76,16 @@ def test_unreachable_accuracy(): assert_array_almost_equal( orthogonal_mp(X, y, tol=0), orthogonal_mp(X, y, n_nonzero_coefs=n_features)) - - assert_array_almost_equal( - assert_warns(RuntimeWarning, orthogonal_mp, X, y, tol=0, - precompute=True), - orthogonal_mp(X, y, precompute=True, - n_nonzero_coefs=n_features)) + warning_message = ( + "Orthogonal matching pursuit ended prematurely " + "due to linear dependence in the dictionary. " + "The requested precision might not have been met." + ) + with pytest.warns(RuntimeWarning, match=warning_message): + assert_array_almost_equal( + orthogonal_mp(X, y, tol=0, precompute=True), + orthogonal_mp(X, y, precompute=True, + n_nonzero_coefs=n_features)) def test_bad_input(): @@ -155,7 +159,13 @@ def test_identical_regressors(): gamma = np.zeros(n_features) gamma[0] = gamma[1] = 1. newy = np.dot(newX, gamma) - assert_warns(RuntimeWarning, orthogonal_mp, newX, newy, 2) + warning_message = ( + "Orthogonal matching pursuit ended prematurely " + "due to linear dependence in the dictionary. " + "The requested precision might not have been met." + ) + with pytest.warns(RuntimeWarning, match=warning_message): + orthogonal_mp(newX, newy, 2) def test_swapped_regressors(): diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index 757faacd2d67f..f631199a5d268 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -6,7 +6,6 @@ from numpy.testing import assert_array_equal from sklearn.utils import check_random_state -from sklearn.utils._testing import assert_warns from sklearn.utils._testing import assert_raises_regexp from sklearn.utils._testing import assert_allclose from sklearn.datasets import make_regression @@ -232,8 +231,14 @@ def is_data_valid(X, y): is_data_valid=is_data_valid, max_skips=3, max_trials=5) - - assert_warns(ConvergenceWarning, ransac_estimator.fit, X, y) + warning_message = ( + "RANSAC found a valid consensus set but exited " + "early due to skipping more iterations than " + "`max_skips`. See estimator attributes for " + "diagnostics." 
+ ) + with pytest.warns(ConvergenceWarning, match=warning_message): + ransac_estimator.fit(X, y) assert ransac_estimator.n_skips_no_inliers_ == 0 assert ransac_estimator.n_skips_invalid_data_ == 4 assert ransac_estimator.n_skips_invalid_model_ == 0 diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index 8e33514af83f9..01839fe0ba457 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -10,7 +10,6 @@ from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import ignore_warnings -from sklearn.utils._testing import assert_warns from sklearn.exceptions import ConvergenceWarning @@ -162,10 +161,14 @@ def test_ridge_regression_convergence_fail(): rng = np.random.RandomState(0) y = rng.randn(5) X = rng.randn(5, 10) - - assert_warns(ConvergenceWarning, ridge_regression, - X, y, alpha=1.0, solver="sparse_cg", - tol=0., max_iter=None, verbose=1) + warning_message = ( + r"sparse_cg did not converge after" + r" [0-9]+ iterations." + ) + with pytest.warns(ConvergenceWarning, match=warning_message): + ridge_regression(X, y, + alpha=1.0, solver="sparse_cg", + tol=0., max_iter=None, verbose=1) def test_ridge_sample_weights(): diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 0ac7ce779f5a7..217249631390d 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -9,7 +9,6 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_raises_regexp -from sklearn.utils._testing import assert_warns from sklearn.utils._testing import ignore_warnings from sklearn.utils.fixes import parse_version @@ -1446,7 +1445,13 @@ def test_tol_parameter(): # Strict tolerance and small max_iter should trigger a warning model_3 = SGDClassifier(max_iter=3, tol=1e-3, random_state=0) - model_3 = assert_warns(ConvergenceWarning, model_3.fit, X, y) + warning_message = ( + "Maximum number of iteration reached before " + "convergence. Consider increasing max_iter to " + "improve the fit." + ) + with pytest.warns(ConvergenceWarning, match=warning_message): + model_3.fit(X, y) assert model_3.n_iter_ == 3 diff --git a/sklearn/linear_model/tests/test_sparse_coordinate_descent.py b/sklearn/linear_model/tests/test_sparse_coordinate_descent.py index 5f131209c1547..23b57a699a655 100644 --- a/sklearn/linear_model/tests/test_sparse_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_sparse_coordinate_descent.py @@ -1,11 +1,11 @@ import numpy as np +import pytest import scipy.sparse as sp from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import ignore_warnings -from sklearn.utils._testing import assert_warns from sklearn.exceptions import ConvergenceWarning from sklearn.linear_model import Lasso, ElasticNet, LassoCV, ElasticNetCV @@ -297,4 +297,9 @@ def test_sparse_enet_coordinate_descent(): n_features = 2 X = sp.csc_matrix((n_samples, n_features)) * 1e50 y = np.ones(n_samples) - assert_warns(ConvergenceWarning, clf.fit, X, y) + warning_message = ( + "Objective did not converge. You might want " + "to increase the number of iterations." 
+ ) + with pytest.warns(ConvergenceWarning, match=warning_message): + clf.fit(X, y) diff --git a/sklearn/linear_model/tests/test_theil_sen.py b/sklearn/linear_model/tests/test_theil_sen.py index bd17298492ca0..c670fc3979b80 100644 --- a/sklearn/linear_model/tests/test_theil_sen.py +++ b/sklearn/linear_model/tests/test_theil_sen.py @@ -8,8 +8,9 @@ import sys from contextlib import contextmanager import numpy as np +import pytest from numpy.testing import assert_array_equal, assert_array_less -from numpy.testing import assert_array_almost_equal, assert_warns +from numpy.testing import assert_array_almost_equal from scipy.linalg import norm from scipy.optimize import fmin_bfgs from sklearn.exceptions import ConvergenceWarning @@ -154,7 +155,12 @@ def cost_func(y): fermat_weber = fmin_bfgs(cost_func, median, disp=False) assert_array_almost_equal(median, fermat_weber) # Check when maximum iteration is exceeded a warning is emitted - assert_warns(ConvergenceWarning, _spatial_median, X, max_iter=30, tol=0.) + warning_message = ( + "Maximum number of iterations 30 reached" + " in spatial median." + ) + with pytest.warns(ConvergenceWarning, match=warning_message): + _spatial_median(X, max_iter=30, tol=0.) def test_theil_sen_1d(): From 579e7de7f38f9f514ff2b2be049e67b14e723d17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Fri, 12 Mar 2021 16:08:36 +0100 Subject: [PATCH 236/478] move kmpp public next to kmpp private (#19666) --- sklearn/cluster/_kmeans.py | 187 ++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 94 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 1c54a5c9ff9e0..17272858ae476 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -47,6 +47,99 @@ ############################################################################### # Initialization heuristic +def kmeans_plusplus(X, n_clusters, *, x_squared_norms=None, + random_state=None, n_local_trials=None): + """Init n_clusters seeds according to k-means++ + + .. versionadded:: 0.24 + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to pick seeds from. + + n_clusters : int + The number of centroids to initialize + + x_squared_norms : array-like of shape (n_samples,), default=None + Squared Euclidean norm of each data point. + + random_state : int or RandomState instance, default=None + Determines random number generation for centroid initialization. Pass + an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + n_local_trials : int, default=None + The number of seeding trials for each center (except the first), + of which the one reducing inertia the most is greedily chosen. + Set to None to make the number of trials depend logarithmically + on the number of seeds (2+log(k)). + + Returns + ------- + centers : ndarray of shape (n_clusters, n_features) + The inital centers for k-means. + + indices : ndarray of shape (n_clusters,) + The index location of the chosen centers in the data array X. For a + given index and center, X[index] = center. + + Notes + ----- + Selects initial cluster centers for k-mean clustering in a smart way + to speed up convergence. see: Arthur, D. and Vassilvitskii, S. + "k-means++: the advantages of careful seeding". ACM-SIAM symposium + on Discrete algorithms. 
2007 + + Examples + -------- + + >>> from sklearn.cluster import kmeans_plusplus + >>> import numpy as np + >>> X = np.array([[1, 2], [1, 4], [1, 0], + ... [10, 2], [10, 4], [10, 0]]) + >>> centers, indices = kmeans_plusplus(X, n_clusters=2, random_state=0) + >>> centers + array([[10, 4], + [ 1, 0]]) + >>> indices + array([4, 2]) + """ + + # Check data + check_array(X, accept_sparse='csr', + dtype=[np.float64, np.float32]) + + if X.shape[0] < n_clusters: + raise ValueError(f"n_samples={X.shape[0]} should be >= " + f"n_clusters={n_clusters}.") + + # Check parameters + if x_squared_norms is None: + x_squared_norms = row_norms(X, squared=True) + else: + x_squared_norms = check_array(x_squared_norms, + dtype=X.dtype, + ensure_2d=False) + + if x_squared_norms.shape[0] != X.shape[0]: + raise ValueError( + f"The length of x_squared_norms {x_squared_norms.shape[0]} should " + f"be equal to the length of n_samples {X.shape[0]}.") + + if n_local_trials is not None and n_local_trials < 1: + raise ValueError( + f"n_local_trials is set to {n_local_trials} but should be an " + f"integer value greater than zero.") + + random_state = check_random_state(random_state) + + # Call private k-means++ + centers, indices = _kmeans_plusplus(X, n_clusters, x_squared_norms, + random_state, n_local_trials) + + return centers, indices + def _kmeans_plusplus(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): @@ -1924,97 +2017,3 @@ def _more_tags(self): 'zero sample_weight is not equivalent to removing samples', } } - - -def kmeans_plusplus(X, n_clusters, *, x_squared_norms=None, - random_state=None, n_local_trials=None): - """Init n_clusters seeds according to k-means++ - - .. versionadded:: 0.24 - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The data to pick seeds from. - - n_clusters : int - The number of centroids to initialize - - x_squared_norms : array-like of shape (n_samples,), default=None - Squared Euclidean norm of each data point. - - random_state : int or RandomState instance, default=None - Determines random number generation for centroid initialization. Pass - an int for reproducible output across multiple function calls. - See :term:`Glossary `. - - n_local_trials : int, default=None - The number of seeding trials for each center (except the first), - of which the one reducing inertia the most is greedily chosen. - Set to None to make the number of trials depend logarithmically - on the number of seeds (2+log(k)). - - Returns - ------- - centers : ndarray of shape (n_clusters, n_features) - The inital centers for k-means. - - indices : ndarray of shape (n_clusters,) - The index location of the chosen centers in the data array X. For a - given index and center, X[index] = center. - - Notes - ----- - Selects initial cluster centers for k-mean clustering in a smart way - to speed up convergence. see: Arthur, D. and Vassilvitskii, S. - "k-means++: the advantages of careful seeding". ACM-SIAM symposium - on Discrete algorithms. 2007 - - Examples - -------- - - >>> from sklearn.cluster import kmeans_plusplus - >>> import numpy as np - >>> X = np.array([[1, 2], [1, 4], [1, 0], - ... 
[10, 2], [10, 4], [10, 0]]) - >>> centers, indices = kmeans_plusplus(X, n_clusters=2, random_state=0) - >>> centers - array([[10, 4], - [ 1, 0]]) - >>> indices - array([4, 2]) - """ - - # Check data - check_array(X, accept_sparse='csr', - dtype=[np.float64, np.float32]) - - if X.shape[0] < n_clusters: - raise ValueError(f"n_samples={X.shape[0]} should be >= " - f"n_clusters={n_clusters}.") - - # Check parameters - if x_squared_norms is None: - x_squared_norms = row_norms(X, squared=True) - else: - x_squared_norms = check_array(x_squared_norms, - dtype=X.dtype, - ensure_2d=False) - - if x_squared_norms.shape[0] != X.shape[0]: - raise ValueError( - f"The length of x_squared_norms {x_squared_norms.shape[0]} should " - f"be equal to the length of n_samples {X.shape[0]}.") - - if n_local_trials is not None and n_local_trials < 1: - raise ValueError( - f"n_local_trials is set to {n_local_trials} but should be an " - f"integer value greater than zero.") - - random_state = check_random_state(random_state) - - # Call private k-means++ - centers, indices = _kmeans_plusplus(X, n_clusters, x_squared_norms, - random_state, n_local_trials) - - return centers, indices From 15fd026963be233d37752f322b5dd484c58e09a8 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Sat, 13 Mar 2021 00:02:29 +0100 Subject: [PATCH 237/478] RFC Make non_negative_factorization call NMF instead of the opposite (#19607) --- sklearn/decomposition/_nmf.py | 217 +++++++++++++++++++++------------- 1 file changed, 132 insertions(+), 85 deletions(-) diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 6d42fecb885a2..b978f1a33d3af 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -1021,74 +1021,14 @@ def non_negative_factorization(X, W=None, H=None, n_components=None, *, """ X = check_array(X, accept_sparse=('csr', 'csc'), dtype=[np.float64, np.float32]) - check_non_negative(X, "NMF (input X)") - beta_loss = _check_string_param(solver, regularization, beta_loss, init) - if X.min() == 0 and beta_loss <= 0: - raise ValueError("When beta_loss <= 0 and X contains zeros, " - "the solver may diverge. Please add small values to " - "X, or use a positive beta_loss.") + est = NMF(n_components=n_components, init=init, solver=solver, + beta_loss=beta_loss, tol=tol, max_iter=max_iter, + random_state=random_state, alpha=alpha, l1_ratio=l1_ratio, + verbose=verbose, shuffle=shuffle, regularization=regularization) - n_samples, n_features = X.shape - if n_components is None: - n_components = n_features - - if not isinstance(n_components, numbers.Integral) or n_components <= 0: - raise ValueError("Number of components must be a positive integer;" - " got (n_components=%r)" % n_components) - if not isinstance(max_iter, numbers.Integral) or max_iter < 0: - raise ValueError("Maximum number of iterations must be a positive " - "integer; got (max_iter=%r)" % max_iter) - if not isinstance(tol, numbers.Number) or tol < 0: - raise ValueError("Tolerance for stopping criteria must be " - "positive; got (tol=%r)" % tol) - - # check W and H, or initialize them - if init == 'custom' and update_H: - _check_init(H, (n_components, n_features), "NMF (input H)") - _check_init(W, (n_samples, n_components), "NMF (input W)") - if H.dtype != X.dtype or W.dtype != X.dtype: - raise TypeError("H and W should have the same dtype as X. Got " - "H.dtype = {} and W.dtype = {}." 
- .format(H.dtype, W.dtype)) - elif not update_H: - _check_init(H, (n_components, n_features), "NMF (input H)") - if H.dtype != X.dtype: - raise TypeError("H should have the same dtype as X. Got H.dtype = " - "{}.".format(H.dtype)) - # 'mu' solver should not be initialized by zeros - if solver == 'mu': - avg = np.sqrt(X.mean() / n_components) - W = np.full((n_samples, n_components), avg, dtype=X.dtype) - else: - W = np.zeros((n_samples, n_components), dtype=X.dtype) - else: - W, H = _initialize_nmf(X, n_components, init=init, - random_state=random_state) - - l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H = _compute_regularization( - alpha, l1_ratio, regularization) - - if solver == 'cd': - W, H, n_iter = _fit_coordinate_descent(X, W, H, tol, max_iter, - l1_reg_W, l1_reg_H, - l2_reg_W, l2_reg_H, - update_H=update_H, - verbose=verbose, - shuffle=shuffle, - random_state=random_state) - elif solver == 'mu': - W, H, n_iter = _fit_multiplicative_update(X, W, H, beta_loss, max_iter, - tol, l1_reg_W, l1_reg_H, - l2_reg_W, l2_reg_H, update_H, - verbose) - - else: - raise ValueError("Invalid solver parameter '%s'." % solver) - - if n_iter == max_iter and tol > 0: - warnings.warn("Maximum number of iterations %d reached. Increase it to" - " improve convergence." % max_iter, ConvergenceWarning) + with config_context(assume_finite=True): + W, H, n_iter = est._fit_transform(X, W=W, H=H, update_H=update_H) return W, H, n_iter @@ -1281,6 +1221,52 @@ def __init__(self, n_components=None, *, init='warn', solver='cd', def _more_tags(self): return {'requires_positive_X': True} + def _check_params(self, X): + self._n_components = self.n_components + if self._n_components is None: + self._n_components = X.shape[1] + if not isinstance( + self._n_components, numbers.Integral + ) or self._n_components <= 0: + raise ValueError("Number of components must be a positive integer;" + " got (n_components=%r)" % self._n_components) + if not isinstance( + self.max_iter, numbers.Integral + ) or self.max_iter < 0: + raise ValueError("Maximum number of iterations must be a positive " + "integer; got (max_iter=%r)" % self.max_iter) + if not isinstance(self.tol, numbers.Number) or self.tol < 0: + raise ValueError("Tolerance for stopping criteria must be " + "positive; got (tol=%r)" % self.tol) + return self + + def _check_w_h(self, X, W, H, update_H): + # check W and H, or initialize them + n_samples, n_features = X.shape + if self.init == 'custom' and update_H: + _check_init(H, (self._n_components, n_features), "NMF (input H)") + _check_init(W, (n_samples, self._n_components), "NMF (input W)") + if H.dtype != X.dtype or W.dtype != X.dtype: + raise TypeError("H and W should have the same dtype as X. Got " + "H.dtype = {} and W.dtype = {}." + .format(H.dtype, W.dtype)) + elif not update_H: + _check_init(H, (self._n_components, n_features), "NMF (input H)") + if H.dtype != X.dtype: + raise TypeError("H should have the same dtype as X. Got " + "H.dtype = {}.".format(H.dtype)) + # 'mu' solver should not be initialized by zeros + if self.solver == 'mu': + avg = np.sqrt(X.mean() / self._n_components) + W = np.full((n_samples, self._n_components), + avg, dtype=X.dtype) + else: + W = np.zeros((n_samples, self._n_components), dtype=X.dtype) + else: + W, H = _initialize_nmf(X, self._n_components, init=self.init, + random_state=self.random_state) + return W, H + def fit_transform(self, X, y=None, W=None, H=None): """Learn a NMF model for the data X and returns the transformed data. 
@@ -1308,23 +1294,92 @@ def fit_transform(self, X, y=None, W=None, H=None): dtype=[np.float64, np.float32]) with config_context(assume_finite=True): - W, H, n_iter_ = non_negative_factorization( - X=X, W=W, H=H, n_components=self.n_components, init=self.init, - update_H=True, solver=self.solver, beta_loss=self.beta_loss, - tol=self.tol, max_iter=self.max_iter, alpha=self.alpha, - l1_ratio=self.l1_ratio, regularization=self.regularization, - random_state=self.random_state, verbose=self.verbose, - shuffle=self.shuffle) - - self.reconstruction_err_ = _beta_divergence(X, W, H, self.beta_loss, + W, H, n_iter = self._fit_transform(X, W=W, H=H) + + self.reconstruction_err_ = _beta_divergence(X, W, H, self._beta_loss, square_root=True) self.n_components_ = H.shape[0] self.components_ = H - self.n_iter_ = n_iter_ + self.n_iter_ = n_iter return W + def _fit_transform(self, X, y=None, W=None, H=None, update_H=True): + """Learn a NMF model for the data X and returns the transformed data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Data matrix to be decomposed + + y : Ignored + + W : array-like of shape (n_samples, n_components) + If init='custom', it is used as initial guess for the solution. + + H : array-like of shape (n_components, n_features) + If init='custom', it is used as initial guess for the solution. + If update_H=False, it is used as a constant, to solve for W only. + + update_H : bool, default=True + If True, both W and H will be estimated from initial guesses, + this corresponds to a call to the 'fit_transform' method. + If False, only W will be estimated, this corresponds to a call + to the 'transform' method. + + Returns + ------- + W : ndarray of shape (n_samples, n_components) + Transformed data. + + H : ndarray of shape (n_components, n_features) + Factorization matrix, sometimes called 'dictionary'. + + n_iter_ : int + Actual number of iterations. + """ + check_non_negative(X, "NMF (input X)") + self._beta_loss = _check_string_param(self.solver, self.regularization, + self.beta_loss, self.init) + + if X.min() == 0 and self._beta_loss <= 0: + raise ValueError("When beta_loss <= 0 and X contains zeros, " + "the solver may diverge. Please add small values " + "to X, or use a positive beta_loss.") + + n_samples, n_features = X.shape + + # check parameters + self._check_params(X) + + # initialize or check W and H + W, H = self._check_w_h(X, W, H, update_H) + + l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H = _compute_regularization( + self.alpha, self.l1_ratio, self.regularization) + + if self.solver == 'cd': + W, H, n_iter = _fit_coordinate_descent( + X, W, H, self.tol, self.max_iter, l1_reg_W, l1_reg_H, + l2_reg_W, l2_reg_H, update_H=update_H, + verbose=self.verbose, shuffle=self.shuffle, + random_state=self.random_state) + elif self.solver == 'mu': + W, H, n_iter = _fit_multiplicative_update( + X, W, H, self._beta_loss, self.max_iter, self.tol, + l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H, + update_H=update_H, verbose=self.verbose) + else: + raise ValueError("Invalid solver parameter '%s'." % self.solver) + + if n_iter == self.max_iter and self.tol > 0: + warnings.warn("Maximum number of iterations %d reached. Increase " + "it to improve convergence." % self.max_iter, + ConvergenceWarning) + + return W, H, n_iter + def fit(self, X, y=None, **params): """Learn a NMF model for the data X. 
@@ -1361,15 +1416,7 @@ def transform(self, X): reset=False) with config_context(assume_finite=True): - W, _, n_iter_ = non_negative_factorization( - X=X, W=None, H=self.components_, - n_components=self.n_components_, - init=self.init, update_H=False, solver=self.solver, - beta_loss=self.beta_loss, tol=self.tol, max_iter=self.max_iter, - alpha=self.alpha, l1_ratio=self.l1_ratio, - regularization=self.regularization, - random_state=self.random_state, - verbose=self.verbose, shuffle=self.shuffle) + W, *_ = self._fit_transform(X, H=self.components_, update_H=False) return W From f4e692c0876425ef6afb6f514b54696f3e071c35 Mon Sep 17 00:00:00 2001 From: PierreAttard Date: Sat, 13 Mar 2021 00:45:00 +0100 Subject: [PATCH 238/478] ENH Raises error in hinge_loss when 'pred_decision' is invalid (#19643) Co-authored-by: Olivier Grisel --- doc/whats_new/v1.0.rst | 5 ++++ sklearn/metrics/_classification.py | 26 +++++++++++++++++--- sklearn/metrics/tests/test_classification.py | 26 ++++++++++++++++++++ 3 files changed, 53 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 0f4882f1b2970..a1f21723bac28 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -165,6 +165,11 @@ Changelog class methods and will be removed in 1.2. :pr:`18543` by `Guillaume Lemaitre`_. +- |Enhancement| A fix to raise an error in :func:`metrics.hinge_loss` when + ``pred_decision`` is 1d whereas it is a multiclass classification or when + ``pred_decision`` parameter is not consistent with the ``labels`` parameter. + :pr:`19643` by :user:`Pierre Attard `. + - |Feature| :func:`metrics.mean_pinball_loss` exposes the pinball loss for quantile regression. :pr:`19415` by :user:`Xavier Dupré ` and :user:`Oliver Grisel `. diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index b4ab145d80937..a68e17656a73b 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -2378,11 +2378,29 @@ def hinge_loss(y_true, pred_decision, *, labels=None, sample_weight=None): pred_decision = check_array(pred_decision, ensure_2d=False) y_true = column_or_1d(y_true) y_true_unique = np.unique(labels if labels is not None else y_true) + if y_true_unique.size > 2: - if (labels is None and pred_decision.ndim > 1 and - (np.size(y_true_unique) != pred_decision.shape[1])): - raise ValueError("Please include all labels in y_true " - "or pass labels as third argument") + + if pred_decision.ndim <= 1: + raise ValueError("The shape of pred_decision cannot be 1d array" + "with a multiclass target. pred_decision shape " + "must be (n_samples, n_classes), that is " + f"({y_true.shape[0]}, {y_true_unique.size})." + f" Got: {pred_decision.shape}") + + # pred_decision.ndim > 1 is true + if y_true_unique.size != pred_decision.shape[1]: + if labels is None: + raise ValueError("Please include all labels in y_true " + "or pass labels as third argument") + else: + raise ValueError("The shape of pred_decision is not " + "consistent with the number of classes. " + "With a multiclass target, pred_decision " + "shape must be " + "(n_samples, n_classes), that is " + f"({y_true.shape[0]}, {y_true_unique.size}). 
" + f"Got: {pred_decision.shape}") if labels is None: labels = y_true_unique le = LabelEncoder() diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index c32e9c89ada47..7b634e88f2275 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -4,6 +4,7 @@ from itertools import chain from itertools import permutations import warnings +import re import numpy as np from scipy import linalg @@ -2135,6 +2136,31 @@ def test_hinge_loss_multiclass_missing_labels_with_labels_none(): hinge_loss(y_true, pred_decision) +def test_hinge_loss_multiclass_no_consistent_pred_decision_shape(): + # test for inconsistency between multiclass problem and pred_decision + # argument + y_true = np.array([2, 1, 0, 1, 0, 1, 1]) + pred_decision = np.array([0, 1, 2, 1, 0, 2, 1]) + error_message = ("The shape of pred_decision cannot be 1d array" + "with a multiclass target. pred_decision shape " + "must be (n_samples, n_classes), that is " + "(7, 3). Got: (7,)") + with pytest.raises(ValueError, match=re.escape(error_message)): + hinge_loss(y_true=y_true, pred_decision=pred_decision) + + # test for inconsistency between pred_decision shape and labels number + pred_decision = np.array([[0, 1], [0, 1], [0, 1], [0, 1], + [2, 0], [0, 1], [1, 0]]) + labels = [0, 1, 2] + error_message = ("The shape of pred_decision is not " + "consistent with the number of classes. " + "With a multiclass target, pred_decision " + "shape must be (n_samples, n_classes), that is " + "(7, 3). Got: (7, 2)") + with pytest.raises(ValueError, match=re.escape(error_message)): + hinge_loss(y_true=y_true, pred_decision=pred_decision, labels=labels) + + def test_hinge_loss_multiclass_with_missing_labels(): pred_decision = np.array([ [+0.36, -0.17, -0.58, -0.99], From e3e4a778d3a39e17a21db596d89b3357277cc3dc Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sun, 14 Mar 2021 11:36:49 +0000 Subject: [PATCH 239/478] MNT Remove absolute imports (#19668) --- sklearn/gaussian_process/kernels.py | 2 +- sklearn/inspection/_partial_dependence.py | 2 +- sklearn/metrics/_plot/base.py | 2 +- sklearn/metrics/pairwise.py | 2 +- sklearn/utils/_estimator_html_repr.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py index c731dcac347cd..008c24f294737 100644 --- a/sklearn/gaussian_process/kernels.py +++ b/sklearn/gaussian_process/kernels.py @@ -31,9 +31,9 @@ from ..metrics.pairwise import pairwise_kernels from ..base import clone from ..utils.validation import _num_samples +from ..exceptions import ConvergenceWarning import warnings -from sklearn.exceptions import ConvergenceWarning def _check_length_scale(X, length_scale): diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index 1e9c0c9718a51..0736130f41524 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -27,7 +27,7 @@ from ..ensemble import RandomForestRegressor from ..exceptions import NotFittedError from ..ensemble._gb import BaseGradientBoosting -from sklearn.ensemble._hist_gradient_boosting.gradient_boosting import ( +from ..ensemble._hist_gradient_boosting.gradient_boosting import ( BaseHistGradientBoosting) diff --git a/sklearn/metrics/_plot/base.py b/sklearn/metrics/_plot/base.py index 0e44a7715a1ed..4ac561f6d3dfa 100644 --- a/sklearn/metrics/_plot/base.py +++ b/sklearn/metrics/_plot/base.py @@ -1,6 +1,6 @@ import 
numpy as np -from sklearn.base import is_classifier +from ...base import is_classifier def _check_classifier_response_method(estimator, response_method): diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index a3cf7f4bf1d72..45eb256d59f67 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -748,7 +748,7 @@ def haversine_distances(X, Y=None): array([[ 0. , 11099.54035582], [11099.54035582, 0. ]]) """ - from sklearn.neighbors import DistanceMetric + from ..neighbors import DistanceMetric return DistanceMetric.get_metric('haversine').pairwise(X, Y) diff --git a/sklearn/utils/_estimator_html_repr.py b/sklearn/utils/_estimator_html_repr.py index a593a6507371f..52fb779bee4d3 100644 --- a/sklearn/utils/_estimator_html_repr.py +++ b/sklearn/utils/_estimator_html_repr.py @@ -5,7 +5,7 @@ import uuid import html -from sklearn import config_context +from .. import config_context class _VisualBlock: From 0c74b8b7d5cdb60dc3a3240cdb36af40b9f40288 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Sun, 14 Mar 2021 15:43:51 +0100 Subject: [PATCH 240/478] ENH Optimize dot product order for LogisticRegression for dense matrices (#19571) * Use multi_dot for Hessian and gradient product. np.linalg.multi_dot quickly chooses the best order for the multiplication of three matrices. --- doc/whats_new/v1.0.rst | 5 +++++ sklearn/linear_model/_logistic.py | 5 ++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index a1f21723bac28..4698657c9a82e 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -113,6 +113,11 @@ Changelog :mod:`sklearn.linear_model` ........................... +- |Efficiency| The implementation of :class:`linear_model.LogisticRegression` + has been optimised for dense matrices when using `solver='newton-cg'` and + `multi_class!='multinomial'`. + :pr:`19571` by :user:`Julien Jerphanion `. + - |Enhancement| Validate user-supplied gram matrix passed to linear models via the `precompute` argument. :pr:`19004` by :user:`Adam Midvidy `. diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 2b8b6a716cbf7..be28c5806ede5 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -233,7 +233,10 @@ def _logistic_grad_hess(w, X, y, alpha, sample_weight=None): def Hs(s): ret = np.empty_like(s) - ret[:n_features] = X.T.dot(dX.dot(s[:n_features])) + if sparse.issparse(X): + ret[:n_features] = X.T.dot(dX.dot(s[:n_features])) + else: + ret[:n_features] = np.linalg.multi_dot([X.T, dX, s[:n_features]]) ret[:n_features] += alpha * s[:n_features] # For the fit intercept case. 
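The `newton-cg` change above relies on `np.linalg.multi_dot`, which evaluates a chain of matrix
products while automatically choosing the cheapest association order for the given shapes. A
minimal, self-contained sketch of the equivalence follows; the sizes are purely illustrative and
the `dX` array merely stands in for the reweighted design matrix used in the Hessian-vector
product, neither is taken from the patch itself.

import numpy as np

rng = np.random.RandomState(0)
n_samples, n_features = 5000, 20        # illustrative sizes only
X = rng.randn(n_samples, n_features)
dX = rng.randn(n_samples, n_features)   # placeholder for the reweighted X
s = rng.randn(n_features)

# Fixed right-to-left evaluation, as in the previous implementation:
hs_old = X.T.dot(dX.dot(s))

# Same product, but multi_dot picks the parenthesization with the lowest
# estimated cost for these shapes:
hs_new = np.linalg.multi_dot([X.T, dX, s])

np.testing.assert_allclose(hs_old, hs_new)

Both evaluations return the same Hessian-vector product; only the order in which the intermediate
products are formed differs.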
From 95c3c762fbc39799639279a1ad35716375a7a6e8 Mon Sep 17 00:00:00 2001 From: Alessia Marcolini <98marcolini@gmail.com> Date: Mon, 15 Mar 2021 10:46:22 +0100 Subject: [PATCH 241/478] DOC Fix typo in plot_multi_metric_evaluation example (#19675) --- examples/model_selection/plot_multi_metric_evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/model_selection/plot_multi_metric_evaluation.py b/examples/model_selection/plot_multi_metric_evaluation.py index 775d0af9817a8..4f03f1b19462d 100644 --- a/examples/model_selection/plot_multi_metric_evaluation.py +++ b/examples/model_selection/plot_multi_metric_evaluation.py @@ -36,7 +36,7 @@ X, y = make_hastie_10_2(n_samples=8000, random_state=42) -# The scorers can be either be one of the predefined metric strings or a scorer +# The scorers can be either one of the predefined metric strings or a scorer # callable, like the one returned by make_scorer scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)} From 77e998db353206e45e0d15ba6f8ab0fc412a7077 Mon Sep 17 00:00:00 2001 From: cliffordEmmanuel <45907515+cliffordEmmanuel@users.noreply.github.com> Date: Mon, 15 Mar 2021 11:22:16 +0000 Subject: [PATCH 242/478] ENH Deprecated the default random_state=0 in randomized_svd (#19670) Co-authored-by: Thomas J. Fan Co-authored-by: cinbez --- doc/whats_new/v1.0.rst | 9 ++++++++ sklearn/utils/extmath.py | 34 ++++++++++++++++++++++------- sklearn/utils/tests/test_extmath.py | 17 ++++++++++----- 3 files changed, 46 insertions(+), 14 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 4698657c9a82e..89280c7f01d0d 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -221,6 +221,15 @@ Changelog for non-English characters. :pr:`18959` by :user:`Zero ` and :user:`wstates `. +:mod:`sklearn.utils` +.................... + +- |Enhancement| Deprecated the default value of the `random_state=0` in + :func:`~sklearn.utils.extmath.randomized_svd`. Starting in 1.2, + the default value of `random_state` will be set to `None`. + :pr:`19459` by :user:`Cindy Bezuidenhout ` and + :user:`Clifford Akai-Nettey`. + :mod:`sklearn.calibration` ............................ diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 31ac63c42eb69..42a014dcd8ade 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -245,7 +245,7 @@ def randomized_range_finder(A, *, size, n_iter, @_deprecate_positional_args def randomized_svd(M, n_components, *, n_oversamples=10, n_iter='auto', power_iteration_normalizer='auto', transpose='auto', - flip_sign=True, random_state=0): + flip_sign=True, random_state='warn'): """Computes a truncated randomized SVD. Parameters @@ -296,11 +296,17 @@ def randomized_svd(M, n_components, *, n_oversamples=10, n_iter='auto', set to `True`, the sign ambiguity is resolved by making the largest loadings for each component in the left singular vectors positive. - random_state : int, RandomState instance or None, default=0 - The seed of the pseudo random number generator to use when shuffling - the data, i.e. getting the random vectors to initialize the algorithm. - Pass an int for reproducible results across multiple function calls. - See :term:`Glossary `. + random_state : int, RandomState instance or None, default='warn' + The seed of the pseudo random number generator to use when + shuffling the data, i.e. getting the random vectors to initialize + the algorithm. Pass an int for reproducible results across multiple + function calls. 
See :term:`Glossary `. + + .. versionchanged:: 1.2 + The previous behavior (`random_state=0`) is deprecated, and + from v1.2 the default value will be `random_state=None`. Set + the value of `random_state` explicitly to suppress the deprecation + warning. Notes ----- @@ -326,10 +332,22 @@ def randomized_svd(M, n_components, *, n_oversamples=10, n_iter='auto', """ if isinstance(M, (sparse.lil_matrix, sparse.dok_matrix)): warnings.warn("Calculating SVD of a {} is expensive. " - "csr_matrix is more efficient.".format( - type(M).__name__), + "csr_matrix is more efficient.".format(type(M).__name__), sparse.SparseEfficiencyWarning) + if random_state == 'warn': + warnings.warn( + "If 'random_state' is not supplied, the current default " + "is to use 0 as a fixed seed. This will change to " + "None in version 1.2 leading to non-deterministic results " + "that better reflect nature of the randomized_svd solver. " + "If you want to silence this warning, set 'random_state' " + "to an integer seed or to None explicitly depending " + "if you want your code to be deterministic or not.", + FutureWarning + ) + random_state = 0 + random_state = check_random_state(random_state) n_random = n_components + n_oversamples n_samples, n_features = M.shape diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index cee4870b087c2..8e53d94d911f0 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -249,7 +249,8 @@ def test_randomized_svd_infinite_rank(): # compute the singular values of X using the fast approximate method # without the iterated power method _, sa, _ = randomized_svd(X, k, n_iter=0, - power_iteration_normalizer=normalizer) + power_iteration_normalizer=normalizer, + random_state=0) # the approximation does not tolerate the noise: assert np.abs(s[:k] - sa).max() > 0.1 @@ -257,7 +258,8 @@ def test_randomized_svd_infinite_rank(): # compute the singular values of X using the fast approximate method # with iterated power method _, sap, _ = randomized_svd(X, k, n_iter=5, - power_iteration_normalizer=normalizer) + power_iteration_normalizer=normalizer, + random_state=0) # the iterated power method is still managing to get most of the # structure at the requested rank @@ -307,11 +309,13 @@ def test_randomized_svd_power_iteration_normalizer(): # Check that it diverges with many (non-normalized) power iterations U, s, Vt = randomized_svd(X, n_components, n_iter=2, - power_iteration_normalizer='none') + power_iteration_normalizer='none', + random_state=0) A = X - U.dot(np.diag(s).dot(Vt)) error_2 = linalg.norm(A, ord='fro') U, s, Vt = randomized_svd(X, n_components, n_iter=20, - power_iteration_normalizer='none') + power_iteration_normalizer='none', + random_state=0) A = X - U.dot(np.diag(s).dot(Vt)) error_20 = linalg.norm(A, ord='fro') assert np.abs(error_2 - error_20) > 100 @@ -401,14 +405,15 @@ def max_loading_is_positive(u, v): mat = np.arange(10 * 8).reshape(10, -1) # Without transpose - u_flipped, _, v_flipped = randomized_svd(mat, 3, flip_sign=True) + u_flipped, _, v_flipped = randomized_svd(mat, 3, flip_sign=True, + random_state=0) u_based, v_based = max_loading_is_positive(u_flipped, v_flipped) assert u_based assert not v_based # With transpose u_flipped_with_transpose, _, v_flipped_with_transpose = randomized_svd( - mat, 3, flip_sign=True, transpose=True) + mat, 3, flip_sign=True, transpose=True, random_state=0) u_based, v_based = max_loading_is_positive( u_flipped_with_transpose, v_flipped_with_transpose) assert u_based From 
ac6dea5b7ebcd7a6b8d8b0d499d9c57d6a7d8939 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Mon, 15 Mar 2021 12:25:20 +0100 Subject: [PATCH 243/478] MNT Fix error message for Minkowski metric parameter (#19671) Co-authored-by: Thomas J. Fan --- sklearn/neighbors/_base.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 820b83eca1845..eb14e8ef0a900 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -355,7 +355,8 @@ def _check_algorithm_metric(self): effective_p = self.p if self.metric in ['wminkowski', 'minkowski'] and effective_p < 1: - raise ValueError("p must be greater than one for minkowski metric") + raise ValueError("p must be greater or equal to one for " + "minkowski metric") def _fit(self, X, y=None): if self._get_tags()["requires_y"]: @@ -411,8 +412,8 @@ def _fit(self, X, y=None): if self.metric == 'minkowski': p = self.effective_metric_params_.pop('p', 2) if p < 1: - raise ValueError("p must be greater than one " - "for minkowski metric") + raise ValueError("p must be greater or equal to one for " + "minkowski metric") elif p == 1: self.effective_metric_ = 'manhattan' elif p == 2: From 302106bcac4476ecdd76b8c03fddb454edbcad96 Mon Sep 17 00:00:00 2001 From: LSturtew <56136443+LSturtew@users.noreply.github.com> Date: Mon, 15 Mar 2021 20:00:14 +0100 Subject: [PATCH 244/478] FIX RuntimeWarning by dividing by zero in test_iforest_with_uniform_data (#19622) --- sklearn/ensemble/_iforest.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 588b1bbef299c..9c3f547f23459 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -450,11 +450,14 @@ def _compute_score_samples(self, X, subsample_features): + _average_path_length(n_samples_leaf) - 1.0 ) - + denominator = ( + len(self.estimators_) * _average_path_length([self.max_samples_]) + ) scores = 2 ** ( - -depths - / (len(self.estimators_) - * _average_path_length([self.max_samples_])) + # For a single training sample, denominator and depth are 0. + # Therefore, we set the score manually to 1. + -np.divide(depths, denominator, out=np.ones_like(depths), + where=denominator != 0) ) return scores From d996eaf088eda47a57aa64ae457d37b8fdfb499e Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 16 Mar 2021 10:14:25 -0400 Subject: [PATCH 245/478] ENH Adds _num_features for array-likes (#19633) Co-authored-by: Olivier Grisel Co-authored-by: Christian Lorentzen --- sklearn/base.py | 14 ++++++- sklearn/tests/test_base.py | 26 ++++++++++++ sklearn/utils/tests/test_validation.py | 57 ++++++++++++++++++++++++++ sklearn/utils/validation.py | 57 ++++++++++++++++++++++++++ 4 files changed, 153 insertions(+), 1 deletion(-) diff --git a/sklearn/base.py b/sklearn/base.py index 3626e931aa9cf..ec264b0cf5edc 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -21,6 +21,7 @@ ) from .utils.validation import check_X_y from .utils.validation import check_array +from .utils.validation import _num_features from .utils._estimator_html_repr import estimator_html_repr from .utils.validation import _deprecate_positional_args @@ -349,7 +350,18 @@ def _check_n_features(self, X, reset): call to `partial_fit`. All other methods that validate `X` should set `reset=False`. 
""" - n_features = X.shape[1] + try: + n_features = _num_features(X) + except TypeError as e: + if not reset and hasattr(self, "n_features_in_"): + raise ValueError( + "X does not contain any features, but " + f"{self.__class__.__name__} is expecting " + f"{self.n_features_in_} features" + ) from e + # If the number of features is not defined and reset=True, + # then we skip this check + return if reset: self.n_features_in_ = n_features diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 666df1499d7dc..c91419bf10a0e 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -581,3 +581,29 @@ class TruePairwise(BaseEstimator): with pytest.warns(None) as record: assert not _is_pairwise(est) assert not record + + +def test_n_features_in_validation(): + """Check that `_check_n_features` validates data when reset=False""" + est = MyEstimator() + X_train = [[1, 2, 3], [4, 5, 6]] + est._check_n_features(X_train, reset=True) + + assert est.n_features_in_ == 3 + + msg = ("X does not contain any features, but MyEstimator is expecting " + "3 features") + with pytest.raises(ValueError, match=msg): + est._check_n_features("invalid X", reset=False) + + +def test_n_features_in_no_validation(): + """Check that `_check_n_features` does not validate data when + n_features_in_ is not defined.""" + est = MyEstimator() + est._check_n_features("invalid X", reset=True) + + assert not hasattr(est, "n_features_in_") + + # does not raise + est._check_n_features("invalid X", reset=False) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index f05bd4656cbd9..f3db51e694b52 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -2,6 +2,7 @@ import warnings import os +import re from tempfile import NamedTemporaryFile from itertools import product @@ -18,6 +19,7 @@ from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_allclose_dense_sparse from sklearn.utils._testing import assert_allclose +from sklearn.utils._testing import _convert_container from sklearn.utils import as_float_array, check_array, check_symmetric from sklearn.utils import check_X_y from sklearn.utils import deprecated @@ -44,6 +46,7 @@ _deprecate_positional_args, _check_sample_weight, _allclose_dense_sparse, + _num_features, FLOAT_DTYPES) from sklearn.utils.validation import _check_fit_params from sklearn.utils.fixes import parse_version @@ -1324,3 +1327,57 @@ def test_check_pandas_sparse_valid(ntype1, ntype2, expected_subtype): dtype=ntype2)}) arr = check_array(df, accept_sparse=['csr', 'csc']) assert np.issubdtype(arr.dtype, expected_subtype) + + +@pytest.mark.parametrize("constructor_name", [ + "list", "tuple", "array", "dataframe", "sparse_csr", "sparse_csc" +]) +def test_num_features(constructor_name): + """Check _num_features for array-likes.""" + X = [[1, 2, 3], [4, 5, 6]] + X = _convert_container(X, constructor_name) + assert _num_features(X) == 3 + + +@pytest.mark.parametrize( + "X", + [ + [1, 2, 3], + ["a", "b", "c"], + [False, True, False], + [1.0, 3.4, 4.0] + ], + ids=["int", "str", "bool", "float"] +) +@pytest.mark.parametrize("constructor_name", [ + "list", "tuple", "array", "series" +]) +def test_num_features_errors_1d_containers(X, constructor_name): + X = _convert_container(X, constructor_name) + if constructor_name == "array": + expected_type_name = "numpy.ndarray" + elif constructor_name == "series": + expected_type_name = "pandas.core.series.Series" + else: + 
expected_type_name = constructor_name + message = ( + "Unable to find the number of features from X of type " + f"{expected_type_name}" + ) + if hasattr(X, "shape"): + message += " with shape (3,)" + elif isinstance(X[0], str): + message += " where the samples are of type str" + with pytest.raises(TypeError, match=re.escape(message)): + _num_features(X) + + +@pytest.mark.parametrize("X", [1, 'b', False, 3.0], + ids=["int", "str", "bool", "float"]) +def test_num_features_errors_scalars(X): + msg = ( + "Unable to find the number of features from X of type " + f"{type(X).__qualname__}" + ) + with pytest.raises(TypeError, match=msg): + _num_features(X) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 273a0cb2ab04c..d0f410dd7f5d8 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -184,6 +184,63 @@ def _is_arraylike(x): hasattr(x, '__array__')) +def _num_features(X): + """Return the number of features in an array-like X. + + This helper function tries hard to avoid to materialize an array version + of X unless necessary. For instance, if X is a list of lists, + this function will return the length of the first element, assuming + that subsequent elements are all lists of the same length without + checking. + Parameters + ---------- + X : array-like + array-like to get the number of features. + + Returns + ------- + features : int + Number of features + """ + type_ = type(X) + if type_.__module__ == "builtins": + type_name = type_.__qualname__ + else: + type_name = f"{type_.__module__}.{type_.__qualname__}" + message = ( + "Unable to find the number of features from X of type " + f"{type_name}" + ) + if not hasattr(X, '__len__') and not hasattr(X, 'shape'): + if not hasattr(X, '__array__'): + raise TypeError(message) + # Only convert X to a numpy array if there is no cheaper, heuristic + # option. + X = np.asarray(X) + + if hasattr(X, 'shape'): + if not hasattr(X.shape, '__len__') or len(X.shape) <= 1: + message += f" with shape {X.shape}" + raise TypeError(message) + return X.shape[1] + + first_sample = X[0] + + # Do not consider an array-like of strings to be a 2D array + if isinstance(first_sample, (str, bytes)): + message += (f" where the samples are of type " + f"{type(first_sample).__qualname__}") + raise TypeError(message) + + try: + # If X is a list of lists, for instance, we assume that all nested + # lists have the same length without checking or converting to + # a numpy array to keep this function call as cheap as possible. 
+ return len(first_sample) + except Exception as err: + raise TypeError(message) from err + + def _num_samples(x): """Return number of samples in array-like x.""" message = 'Expected sequence or array-like, got %s' % type(x) From edc4f15f0d46b4d26c107894b80548474f25931b Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Tue, 16 Mar 2021 20:21:22 +0000 Subject: [PATCH 246/478] TST Change assert from sklearn to pytest style in tests/test_pipeline.py (#19678) --- sklearn/tests/test_pipeline.py | 215 +++++++++++++++++++-------------- 1 file changed, 124 insertions(+), 91 deletions(-) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 7989394d0a65e..85d2f7b6e07ca 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -14,13 +14,9 @@ from sklearn.utils.fixes import parse_version from sklearn.utils._testing import ( - assert_raises, - assert_raises_regex, - assert_raise_message, assert_allclose, assert_array_equal, assert_array_almost_equal, - assert_no_warnings, MinimalClassifier, MinimalRegressor, MinimalTransformer, @@ -167,20 +163,23 @@ def predict(self, X, got_attribute=False): def test_pipeline_init(): # Test the various init parameters of the pipeline. - assert_raises(TypeError, Pipeline) + with pytest.raises(TypeError): + Pipeline() + # Check that we can't instantiate pipelines with objects without fit # method - assert_raises_regex(TypeError, - 'Last step of Pipeline should implement fit ' - 'or be the string \'passthrough\'' - '.*NoFit.*', - Pipeline, [('clf', NoFit())]) + msg = ('Last step of Pipeline should implement fit ' + 'or be the string \'passthrough\'' + '.*NoFit.*') + with pytest.raises(TypeError, match=msg): + Pipeline([('clf', NoFit())]) + # Smoke test with only an estimator clf = NoTrans() pipe = Pipeline([('svc', clf)]) assert (pipe.get_params(deep=True) == - dict(svc__a=None, svc__b=None, svc=clf, - **pipe.get_params(deep=False))) + dict(svc__a=None, svc__b=None, svc=clf, + **pipe.get_params(deep=False))) # Check that params are set pipe.set_params(svc__a=0.1) @@ -200,10 +199,9 @@ def test_pipeline_init(): # Check that we can't instantiate with non-transformers on the way # Note that NoTrans implements fit, but not transform - assert_raises_regex(TypeError, - 'All intermediate steps should be transformers' - '.*\\bNoTrans\\b.*', - Pipeline, [('t', NoTrans()), ('svc', clf)]) + msg = 'All intermediate steps should be transformers.*\\bNoTrans\\b.*' + with pytest.raises(TypeError, match=msg): + Pipeline([('t', NoTrans()), ('svc', clf)]) # Check that params are set pipe.set_params(svc__C=0.1) @@ -212,10 +210,13 @@ def test_pipeline_init(): repr(pipe) # Check that params are not set when naming them wrong - assert_raises(ValueError, pipe.set_params, anova__C=0.1) + msg = 'Invalid parameter C for estimator SelectKBest' + with pytest.raises(ValueError, match=msg): + pipe.set_params(anova__C=0.1) # Test clone - pipe2 = assert_no_warnings(clone, pipe) + with pytest.warns(None): + pipe2 = clone(pipe) assert not pipe.named_steps['svc'] is pipe2.named_steps['svc'] # Check that apart from estimators, the parameters are the same @@ -273,11 +274,10 @@ def test_pipeline_fit_params(): assert pipe.named_steps['transf'].a is None assert pipe.named_steps['transf'].b is None # invalid parameters should raise an error message - assert_raise_message( - TypeError, - "fit() got an unexpected keyword argument 'bad'", - pipe.fit, None, None, clf__bad=True - ) + + msg = re.escape("fit() got an unexpected keyword argument 'bad'") + with 
pytest.raises(TypeError, match=msg): + pipe.fit(None, None, clf__bad=True) def test_pipeline_sample_weight_supported(): @@ -298,11 +298,12 @@ def test_pipeline_sample_weight_unsupported(): pipe.fit(X, y=None) assert pipe.score(X) == 3 assert pipe.score(X, sample_weight=None) == 3 - assert_raise_message( - TypeError, - "score() got an unexpected keyword argument 'sample_weight'", - pipe.score, X, sample_weight=np.array([2, 3]) + + msg = re.escape( + "score() got an unexpected keyword argument 'sample_weight'" ) + with pytest.raises(TypeError, match=msg): + pipe.score(X, sample_weight=np.array([2, 3])) def test_pipeline_raise_set_params_error(): @@ -310,20 +311,18 @@ def test_pipeline_raise_set_params_error(): pipe = Pipeline([('cls', LinearRegression())]) # expected error message - error_msg = ('Invalid parameter %s for estimator %s. ' - 'Check the list of available parameters ' - 'with `estimator.get_params().keys()`.') + error_msg = re.escape( + f"Invalid parameter fake for estimator {pipe}. " + 'Check the list of available parameters ' + 'with `estimator.get_params().keys()`.' + ) - assert_raise_message(ValueError, - error_msg % ('fake', pipe), - pipe.set_params, - fake='nope') + with pytest.raises(ValueError, match=error_msg): + pipe.set_params(fake='nope') # nested model check - assert_raise_message(ValueError, - error_msg % ("fake", pipe), - pipe.set_params, - fake__estimator='nope') + with pytest.raises(ValueError, match=error_msg): + pipe.set_params(fake__estimator='nope') def test_pipeline_methods_pca_svm(): @@ -431,9 +430,10 @@ def test_fit_predict_on_pipeline_without_fit_predict(): scaler = StandardScaler() pca = PCA(svd_solver='full') pipe = Pipeline([('scaler', scaler), ('pca', pca)]) - assert_raises_regex(AttributeError, - "'PCA' object has no attribute 'fit_predict'", - getattr, pipe, 'fit_predict') + + msg = "'PCA' object has no attribute 'fit_predict'" + with pytest.raises(AttributeError, match=msg): + getattr(pipe, 'fit_predict') def test_fit_predict_with_intermediate_fit_params(): @@ -484,7 +484,8 @@ def test_feature_union(): assert_array_almost_equal(X_transformed, X_sp_transformed.toarray()) # Test clone - fs2 = assert_no_warnings(clone, fs) + with pytest.warns(None): + fs2 = clone(fs) assert fs.transformer_list[0][1] is not fs2.transformer_list[0][1] # test setting parameters @@ -497,11 +498,9 @@ def test_feature_union(): assert X_transformed.shape == (X.shape[0], 8) # test error if some elements do not support transform - assert_raises_regex(TypeError, - 'All estimators should implement fit and ' - 'transform.*\\bNoTrans\\b', - FeatureUnion, - [("transform", Transf()), ("no_transform", NoTrans())]) + msg = 'All estimators should implement fit and transform.*\\bNoTrans\\b' + with pytest.raises(TypeError, match=msg): + FeatureUnion([("transform", Transf()), ("no_transform", NoTrans())]) # test that init accepts tuples fs = FeatureUnion((("svd", svd), ("select", select))) @@ -523,13 +522,13 @@ def test_make_union_kwargs(): fu = make_union(pca, mock, n_jobs=3) assert fu.transformer_list == make_union(pca, mock).transformer_list assert 3 == fu.n_jobs + # invalid keyword parameters should raise an error message - assert_raise_message( - TypeError, - "make_union() got an unexpected " - "keyword argument 'transformer_weights'", - make_union, pca, mock, transformer_weights={'pca': 10, 'Transf': 1} + msg = re.escape( + "make_union() got an unexpected keyword argument 'transformer_weights'" ) + with pytest.raises(TypeError, match=msg): + make_union(pca, mock, 
transformer_weights={'pca': 10, 'Transf': 1}) def test_pipeline_transform(): @@ -600,8 +599,14 @@ def test_pipeline_index(): assert pipe['transf'] == transf assert pipe[-1] == clf assert pipe['clf'] == clf - assert_raises(IndexError, lambda: pipe[3]) - assert_raises(KeyError, lambda: pipe['foobar']) + + # should raise an error if slicing out of range + with pytest.raises(IndexError): + pipe[3] + + # should raise an error if indexing with wrong element name + with pytest.raises(KeyError): + pipe['foobar'] def test_set_pipeline_steps(): @@ -626,8 +631,15 @@ def test_set_pipeline_steps(): # With invalid data pipeline.set_params(steps=[('junk', ())]) - assert_raises(TypeError, pipeline.fit, [[1]], [1]) - assert_raises(TypeError, pipeline.fit_transform, [[1]], [1]) + msg = re.escape( + "Last step of Pipeline should implement fit or be the " + "string 'passthrough'." + ) + with pytest.raises(TypeError, match=msg): + pipeline.fit([[1]], [1]) + + with pytest.raises(TypeError, match=msg): + pipeline.fit_transform([[1]], [1]) def test_pipeline_named_steps(): @@ -692,15 +704,15 @@ def make(): assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) assert (pipeline.get_params(deep=True) == - {'steps': pipeline.steps, - 'm2': mult2, - 'm3': passthrough, - 'last': mult5, - 'memory': None, - 'm2__mult': 2, - 'last__mult': 5, - 'verbose': False - }) + {'steps': pipeline.steps, + 'm2': mult2, + 'm3': passthrough, + 'last': mult5, + 'memory': None, + 'm2__mult': 2, + 'last__mult': 5, + 'verbose': False + }) pipeline.set_params(m2=passthrough) exp = 5 @@ -727,9 +739,10 @@ def make(): assert_array_equal([[exp]], pipeline.fit(X, y).transform(X)) assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) - assert_raise_message(AttributeError, - "'str' object has no attribute 'predict'", - getattr, pipeline, 'predict') + + msg = "'str' object has no attribute 'predict'" + with pytest.raises(AttributeError, match=msg): + getattr(pipeline, 'predict') # Check 'passthrough' step at construction time exp = 2 * 5 @@ -872,9 +885,12 @@ def test_feature_union_feature_names(): assert len(feature_names) == 35 ft = FeatureUnion([("tr1", Transf())]).fit([[1]]) - assert_raise_message(AttributeError, - 'Transformer tr1 (type Transf) does not provide ' - 'get_feature_names', ft.get_feature_names) + + msg = re.escape( + 'Transformer tr1 (type Transf) does not provide get_feature_names' + ) + with pytest.raises(AttributeError, match=msg): + ft.get_feature_names() def test_classes_property(): @@ -883,10 +899,12 @@ def test_classes_property(): reg = make_pipeline(SelectKBest(k=1), LinearRegression()) reg.fit(X, y) - assert_raises(AttributeError, getattr, reg, "classes_") + with pytest.raises(AttributeError): + getattr(reg, 'classes_') clf = make_pipeline(SelectKBest(k=1), LogisticRegression(random_state=0)) - assert_raises(AttributeError, getattr, clf, "classes_") + with pytest.raises(AttributeError): + getattr(clf, 'classes_') clf.fit(X, y) assert_array_equal(clf.classes_, np.unique(y)) @@ -961,6 +979,11 @@ def test_set_feature_union_step_drop(): def test_step_name_validation(): + error_message_1 = r"Estimator names must not contain __: got \['a__q'\]" + error_message_2 = r"Names provided are not unique: \['a', 'a'\]" + error_message_3 = ( + r"Estimator names conflict with constructor arguments: \['%s'\]" + ) bad_steps1 = [('a__q', Mult(2)), ('b', Mult(3))] bad_steps2 = [('a', Mult(2)), ('a', Mult(3))] for cls, param 
in [(Pipeline, 'steps'), @@ -968,29 +991,32 @@ def test_step_name_validation(): # we validate in construction (despite scikit-learn convention) bad_steps3 = [('a', Mult(2)), (param, Mult(3))] for bad_steps, message in [ - (bad_steps1, "Estimator names must not contain __: got ['a__q']"), - (bad_steps2, "Names provided are not unique: ['a', 'a']"), - (bad_steps3, "Estimator names conflict with constructor " - "arguments: ['%s']" % param), + (bad_steps1, error_message_1), + (bad_steps2, error_message_2), + (bad_steps3, error_message_3 % param), ]: # three ways to make invalid: # - construction - assert_raise_message(ValueError, message, cls, - **{param: bad_steps}) + with pytest.raises(ValueError, match=message): + cls(**{param: bad_steps}) # - setattr est = cls(**{param: [('a', Mult(1))]}) setattr(est, param, bad_steps) - assert_raise_message(ValueError, message, est.fit, [[1]], [1]) - assert_raise_message(ValueError, message, est.fit_transform, - [[1]], [1]) + with pytest.raises(ValueError, match=message): + est.fit([[1]], [1]) + + with pytest.raises(ValueError, match=message): + est.fit_transform([[1]], [1]) # - set_params est = cls(**{param: [('a', Mult(1))]}) est.set_params(**{param: bad_steps}) - assert_raise_message(ValueError, message, est.fit, [[1]], [1]) - assert_raise_message(ValueError, message, est.fit_transform, - [[1]], [1]) + with pytest.raises(ValueError, match=message): + est.fit([[1]], [1]) + + with pytest.raises(ValueError, match=message): + est.fit_transform([[1]], [1]) def test_set_params_nested_pipeline(): @@ -1012,9 +1038,13 @@ def test_pipeline_wrong_memory(): memory = 1 cached_pipe = Pipeline([('transf', DummyTransf()), ('svc', SVC())], memory=memory) - assert_raises_regex(ValueError, "'memory' should be None, a string or" - " have the same interface as joblib.Memory." - " Got memory='1' instead.", cached_pipe.fit, X, y) + + msg = re.escape( + "'memory' should be None, a string or have the same interface " + "as joblib.Memory. Got memory='1' instead." + ) + with pytest.raises(ValueError, match=msg): + cached_pipe.fit(X, y) class DummyMemory: @@ -1034,9 +1064,12 @@ def test_pipeline_with_cache_attribute(): dummy = WrongDummyMemory() pipe = Pipeline([('transf', Transf()), ('clf', Mult())], memory=dummy) - assert_raises_regex(ValueError, "'memory' should be None, a string or" - " have the same interface as joblib.Memory." - " Got memory='{}' instead.".format(dummy), pipe.fit, X) + msg = re.escape( + "'memory' should be None, a string or have the same interface " + f"as joblib.Memory. Got memory='{dummy}' instead." 
+ ) + with pytest.raises(ValueError, match=msg): + pipe.fit(X) def test_pipeline_memory(): From b7b510f9dbc87500e79301873852c6247c440a3e Mon Sep 17 00:00:00 2001 From: Mathis Batoul Date: Tue, 16 Mar 2021 22:35:46 +0100 Subject: [PATCH 247/478] FIX RuntimeWarning division by zero in check_classifiers_one_label (#19690) --- sklearn/discriminant_analysis.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index c5c18ac9136d2..2e80f94404175 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -476,8 +476,12 @@ def _solve_svd(self, X, y): # (n_classes) centers _, S, Vt = linalg.svd(X, full_matrices=0) - self.explained_variance_ratio_ = (S**2 / np.sum( - S**2))[:self._max_components] + if self._max_components == 0: + self.explained_variance_ratio_ = np.empty((0,), dtype=S.dtype) + else: + self.explained_variance_ratio_ = (S**2 / np.sum( + S**2))[:self._max_components] + rank = np.sum(S > self.tol * S[0]) self.scalings_ = np.dot(scalings, Vt.T[:, :rank]) coef = np.dot(self.means_ - self.xbar_, self.scalings_) From fcf4740b4538657997b0f4b8015728d64e2d563e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 17 Mar 2021 02:58:43 +0100 Subject: [PATCH 248/478] TST Add a test to check the consistency of the Ridge and ElasticNet(l1_ratio=0) solutions (#19620) --- sklearn/linear_model/_cd_fast.pyx | 18 +++++-- .../tests/test_coordinate_descent.py | 50 +++++++++++++++++++ 2 files changed, 64 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_cd_fast.pyx b/sklearn/linear_model/_cd_fast.pyx index 84e4b4a49df01..4841809ac7aa7 100644 --- a/sklearn/linear_model/_cd_fast.pyx +++ b/sklearn/linear_model/_cd_fast.pyx @@ -244,10 +244,20 @@ def enet_coordinate_descent(floating[::1] w, else: # for/else, runs if for doesn't end with a `break` with gil: - warnings.warn("Objective did not converge. You might want to " - "increase the number of iterations. Duality " - "gap: {}, tolerance: {}".format(gap, tol), - ConvergenceWarning) + message = ( + "Objective did not converge. You might want to increase " + "the number of iterations, check the scale of the " + "features or consider increasing regularisation. " + f"Duality gap: {gap:.3e}, tolerance: {tol:.3e}" + ) + if alpha < np.finfo(np.float64).eps: + message += ( + " Linear regression models with null weight for the " + "l1 regularization term are more efficiently fitted " + "using one of the solvers implemented in " + "sklearn.linear_model.Ridge/RidgeCV instead." + ) + warnings.warn(message, ConvergenceWarning) return w, gap, tol, n_iter + 1 diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index ebddb6a7e47c6..d63211d6050bc 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -1419,3 +1419,53 @@ def test_enet_sample_weight_does_not_overwrite_sample_weight(check_input): reg.fit(X, y, sample_weight=sample_weight, check_input=check_input) assert_array_equal(sample_weight, sample_weight_1_25) + + +@pytest.mark.parametrize("ridge_alpha", [1e-1, 1., 1e6]) +@pytest.mark.parametrize("normalize", [True, False]) +def test_enet_ridge_consistency(normalize, ridge_alpha): + # Check that ElasticNet(l1_ratio=0) converges to the same solution as Ridge + # provided that the value of alpha is adapted. 
+ # + # XXX: this test does not pass for weaker regularization (lower values of + # ridge_alpha): it could be either a problem of ElasticNet or Ridge (less + # likely) and depends on the dataset statistics: lower values for + # effective_rank are more problematic in particular. + + rng = np.random.RandomState(42) + X, y = make_regression( + n_samples=100, + n_features=300, + effective_rank=100, + n_informative=50, + random_state=rng, + ) + sw = rng.uniform(low=0.01, high=2, size=X.shape[0]) + + ridge = Ridge( + alpha=ridge_alpha, + normalize=normalize, + ).fit(X, y, sample_weight=sw) + + enet = ElasticNet( + alpha=ridge_alpha / sw.sum(), + normalize=normalize, + l1_ratio=0., + max_iter=1000, + ) + # Even when the ElasticNet model has actually converged, the duality gap + # convergence criterion is never met when l1_ratio is 0 and for any value + # of the `tol` parameter. The convergence message should point the user to + # Ridge instead: + expected_msg = ( + r"Objective did not converge\. .* " + r"Linear regression models with null weight for the " + r"l1 regularization term are more efficiently fitted " + r"using one of the solvers implemented in " + r"sklearn\.linear_model\.Ridge/RidgeCV instead\." + ) + with pytest.warns(ConvergenceWarning, match=expected_msg): + enet.fit(X, y, sample_weight=sw) + + assert_allclose(ridge.coef_, enet.coef_) + assert_allclose(ridge.intercept_, enet.intercept_) From 36e43582c03f5933da15d833b71dc37eaafb436e Mon Sep 17 00:00:00 2001 From: Steve Stagg Date: Wed, 17 Mar 2021 05:21:07 +0000 Subject: [PATCH 249/478] [MRG] Fix documentation for russelrao formula (#19695) --- sklearn/neighbors/_dist_metrics.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_dist_metrics.pyx b/sklearn/neighbors/_dist_metrics.pyx index 8bee948eeaeba..4cc41d7136586 100755 --- a/sklearn/neighbors/_dist_metrics.pyx +++ b/sklearn/neighbors/_dist_metrics.pyx @@ -183,7 +183,7 @@ cdef class DistanceMetric: "dice" DiceDistance NNEQ / (NTT + NNZ) "kulsinski" KulsinskiDistance (NNEQ + N - NTT) / (NNEQ + N) "rogerstanimoto" RogersTanimotoDistance 2 * NNEQ / (N + NNEQ) - "russellrao" RussellRaoDistance NNZ / N + "russellrao" RussellRaoDistance (N - NTT) / N "sokalmichener" SokalMichenerDistance 2 * NNEQ / (N + NNEQ) "sokalsneath" SokalSneathDistance NNEQ / (NNEQ + 0.5 * NTT) ================= ======================= =============================== From 95b7c680ab027fcd23bcbf47ebae58ee3e130ec9 Mon Sep 17 00:00:00 2001 From: mlondschien <61679398+mlondschien@users.noreply.github.com> Date: Wed, 17 Mar 2021 13:46:45 +0100 Subject: [PATCH 250/478] ENH Add periodic extrapolation to SplineTransformer (#19483) Co-authored-by: Olivier Grisel Co-authored-by: Christian Lorentzen --- doc/whats_new/v1.0.rst | 3 + .../plot_polynomial_interpolation.py | 68 ++++++ sklearn/preprocessing/_polynomial.py | 135 +++++++---- .../preprocessing/tests/test_polynomial.py | 209 +++++++++++++++++- 4 files changed, 363 insertions(+), 52 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 89280c7f01d0d..34f39ca48f20a 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -206,6 +206,9 @@ Changelog polynomial ``degree`` of the splines, number of knots ``n_knots`` and knot positioning strategy ``knots``. :pr:`18368` by :user:`Christian Lorentzen `. + :class:`preprocessing.SplineTransformer` also supports periodic + splines via the ``extrapolation`` argument. + :pr:`19483` by :user:`Malte Londschien `. 
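A minimal usage sketch of this new option (the day-of-year feature and its period below are illustrative assumptions)::

    import numpy as np
    from sklearn.preprocessing import SplineTransformer

    # Encode a naturally periodic "day of year" feature with period 365.
    day_of_year = np.arange(365, dtype=float)[:, None]
    spline = SplineTransformer(
        degree=3,
        knots=np.linspace(0, 365, 13)[:, None],  # period = last - first knot
        extrapolation="periodic",
    )
    features = spline.fit_transform(day_of_year)
    print(features.shape)  # (365, 12): n_knots - 1 basis functions per feature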
- |Fix| :func:`preprocessing.scale`, :class:`preprocessing.StandardScaler` and similar scalers detect near-constant features to avoid scaling them to diff --git a/examples/linear_model/plot_polynomial_interpolation.py b/examples/linear_model/plot_polynomial_interpolation.py index cfa684ffd79ca..34972b9522c68 100644 --- a/examples/linear_model/plot_polynomial_interpolation.py +++ b/examples/linear_model/plot_polynomial_interpolation.py @@ -39,6 +39,7 @@ # Author: Mathieu Blondel # Jake Vanderplas # Christian Lorentzen +# Malte Londschien # License: BSD 3 clause import numpy as np @@ -145,3 +146,70 @@ def f(x): # function has local support and is continued as a constant beyond the fitted # range. This extrapolating behaviour could be changed by the argument # ``extrapolation``. + +# %% +# Periodic Splines +# ---------------- +# In the previous example we saw the limitations of polynomials and splines for +# extrapolation beyond the range of the training observations. In some +# settings, e.g. with seasonal effects, we expect a periodic continuation of +# the underlying signal. Such effects can be modelled using periodic splines, +# which have equal function value and equal derivatives at the first and last +# knot. In the following case we show how periodic splines provide a better fit +# both within and outside of the range of training data given the additional +# information of periodicity. The splines period is the distance between +# the first and last knot, which we specify manually. +# +# Periodic splines can also be useful for naturally periodic features (such as +# day of the year), as the smoothness at the boundary knots prevents a jump in +# the transformed values (e.g. from Dec 31st to Jan 1st). For such naturally +# periodic features or more generally features where the period is known, it is +# advised to explicitly pass this information to the `SplineTransformer` by +# setting the knots manually. + + +# %% +def g(x): + """Function to be approximated by periodic spline interpolation.""" + return np.sin(x) - 0.7 * np.cos(x * 3) + + +y_train = g(x_train) + +# Extend the test data into the future: +x_plot_ext = np.linspace(-1, 21, 200) +X_plot_ext = x_plot_ext[:, np.newaxis] + +lw = 2 +fig, ax = plt.subplots() +ax.set_prop_cycle(color=["black", "tomato", "teal"]) +ax.plot(x_plot_ext, g(x_plot_ext), linewidth=lw, label="ground truth") +ax.scatter(x_train, y_train, label="training points") + +for transformer, label in [ + (SplineTransformer(degree=3, n_knots=10), "spline"), + (SplineTransformer( + degree=3, + knots=np.linspace(0, 2 * np.pi, 10)[:, None], + extrapolation="periodic" + ), "periodic spline") +]: + model = make_pipeline(transformer, Ridge(alpha=1e-3)) + model.fit(X_train, y_train) + y_plot_ext = model.predict(X_plot_ext) + ax.plot(x_plot_ext, y_plot_ext, label=label) + +ax.legend() +fig.show() + +# %% We again plot the underlying splines. +fig, ax = plt.subplots() +knots = np.linspace(0, 2 * np.pi, 4) +splt = SplineTransformer( + knots=knots[:, None], + degree=3, + extrapolation="periodic" +).fit(X_train) +ax.plot(x_plot_ext, splt.transform(X_plot_ext)) +ax.legend(ax.lines, [f"spline {n}" for n in range(3)]) +plt.show() diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 26587e7f05823..ad358e50c4681 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -19,13 +19,13 @@ # TODO: # - sparse support (either scipy or own cython solution)? 
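A quick check of the periodicity described in the example above, reusing its knot values (this snippet is only an illustration): shifting the input by one period leaves the transformed features unchanged.

    import numpy as np
    from sklearn.preprocessing import SplineTransformer

    X = np.linspace(0.0, 1.0, 20)[:, None]
    period = 2 * np.pi  # distance between the first and last knot below
    spline = SplineTransformer(
        degree=3,
        knots=np.linspace(0, 2 * np.pi, 10)[:, None],
        extrapolation="periodic",
    ).fit(X)
    print(np.allclose(spline.transform(X), spline.transform(X + period)))  # True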
-# - extrapolation (cyclic) class SplineTransformer(TransformerMixin, BaseEstimator): """Generate univariate B-spline bases for features. Generate a new feature matrix consisting of - `n_splines=n_knots + degree - 1` spline basis functions (B-splines) of - polynomial order=`degree` for each feature. + `n_splines=n_knots + degree - 1` (`n_knots - 1` for + `extrapolation="periodic"`) spline basis functions + (B-splines) of polynomial order=`degree` for each feature. Read more in the :ref:`User Guide `. @@ -54,14 +54,21 @@ class SplineTransformer(TransformerMixin, BaseEstimator): `degree` number of knots are added before the first knot, the same after the last knot. - extrapolation : {'error', 'constant', 'linear', 'continue'}, \ + extrapolation : {'error', 'constant', 'linear', 'continue', 'periodic'}, \ default='constant' If 'error', values outside the min and max values of the training features raises a `ValueError`. If 'constant', the value of the splines at minimum and maximum value of the features is used as constant extrapolation. If 'linear', a linear extrapolation is used. If 'continue', the splines are extrapolated as is, i.e. option - `extrapolate=True` in :class:`scipy.interpolate.BSpline`. + `extrapolate=True` in :class:`scipy.interpolate.BSpline`. If + 'periodic', periodic splines with a periodicity equal to the distance + between the first and last knot are used. Periodic splines enforce + equal function values and derivatives at the first and last knot. + For example, this makes it possible to avoid introducing an arbitrary + jump between Dec 31st and Jan 1st in spline features derived from a + naturally periodic "day-of-year" input feature. In this case it is + recommended to manually set the knot values to control the period. include_bias : bool, default=True If True (default), then the last spline element inside the data range @@ -84,7 +91,9 @@ class SplineTransformer(TransformerMixin, BaseEstimator): n_features_out_ : int The total number of output features, which is computed as `n_features * n_splines`, where `n_splines` is - the number of bases elements of the B-splines, `n_knots + degree - 1`. + the number of bases elements of the B-splines, + `n_knots + degree - 1` for non-periodic splines and + `n_knots - 1` for periodic ones. If `include_bias=False`, then it is only `n_features * (n_splines - 1)`. @@ -235,7 +244,7 @@ def fit(self, X, y=None): X, n_knots=self.n_knots, knots=self.knots ) else: - base_knots = check_array(self.knots) + base_knots = check_array(self.knots, dtype=np.float64) if base_knots.shape[0] < 2: raise ValueError( "Number of knots, knots.shape[0], must be >= " "2." @@ -250,10 +259,11 @@ def fit(self, X, y=None): "constant", "linear", "continue", + "periodic", ): raise ValueError( "extrapolation must be one of 'error', " - "'constant', 'linear' or 'continue'." + "'constant', 'linear', 'continue' or 'periodic'." ) if not isinstance(self.include_bias, (bool, np.bool_)): @@ -261,44 +271,74 @@ def fit(self, X, y=None): # number of knots for base interval n_knots = base_knots.shape[0] + + if self.extrapolation == "periodic" and n_knots <= self.degree: + raise ValueError( + "Periodic splines require degree < n_knots. Got n_knots=" + f"{n_knots} and degree={self.degree}." 
+ ) + # number of splines basis functions - n_splines = n_knots + self.degree - 1 + if self.extrapolation != "periodic": + n_splines = n_knots + self.degree - 1 + else: + # periodic splines have self.degree less degrees of freedom + n_splines = n_knots - 1 + degree = self.degree n_out = n_features * n_splines # We have to add degree number of knots below, and degree number knots # above the base knots in order to make the spline basis complete. - # Eilers & Marx in "Flexible smoothing with B-splines and penalties" - # https://doi.org/10.1214/ss/1038425655 advice against repeating first - # and last knot several times, which would have inferior behaviour at - # boundaries if combined with a penalty (hence P-Spline). We follow - # this advice even if our splines are unpenalized. - # Meaning we do not: - # knots = np.r_[np.tile(base_knots.min(axis=0), reps=[degree, 1]), - # base_knots, - # np.tile(base_knots.max(axis=0), reps=[degree, 1]) - # ] - # Instead, we reuse the distance of the 2 fist/last knots. - dist_min = base_knots[1] - base_knots[0] - dist_max = base_knots[-1] - base_knots[-2] - knots = np.r_[ - linspace( - base_knots[0] - degree * dist_min, - base_knots[0] - dist_min, - num=degree, - ), - base_knots, - linspace( - base_knots[-1] + dist_max, - base_knots[-1] + degree * dist_max, - num=degree, - ), - ] + if self.extrapolation == "periodic": + # For periodic splines the spacing of the first / last degree knots + # needs to be a continuation of the spacing of the last / first + # base knots. + period = base_knots[-1] - base_knots[0] + knots = np.r_[ + base_knots[-(degree + 1): -1] - period, + base_knots, + base_knots[1: (degree + 1)] + period + ] + + else: + # Eilers & Marx in "Flexible smoothing with B-splines and + # penalties" https://doi.org/10.1214/ss/1038425655 advice + # against repeating first and last knot several times, which + # would have inferior behaviour at boundaries if combined with + # a penalty (hence P-Spline). We follow this advice even if our + # splines are unpenalized. Meaning we do not: + # knots = np.r_[ + # np.tile(base_knots.min(axis=0), reps=[degree, 1]), + # base_knots, + # np.tile(base_knots.max(axis=0), reps=[degree, 1]) + # ] + # Instead, we reuse the distance of the 2 fist/last knots. + dist_min = base_knots[1] - base_knots[0] + dist_max = base_knots[-1] - base_knots[-2] + + knots = np.r_[ + linspace( + base_knots[0] - degree * dist_min, + base_knots[0] - dist_min, + num=degree, + ), + base_knots, + linspace( + base_knots[-1] + dist_max, + base_knots[-1] + degree * dist_max, + num=degree, + ), + ] # With a diagonal coefficient matrix, we get back the spline basis # elements, i.e. the design matrix of the spline. # Note, BSpline appreciates C-contiguous float64 arrays as c=coef. 
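A small numeric illustration of the periodic knot augmentation in the branch above, using the knot values that also appear in the new tests: the spacings before the first and after the last base knot repeat the spacings at the opposite end of the period.

    import numpy as np

    base_knots = np.array([0.0, 1.0, 3.0, 4.0, 5.0, 8.0])
    degree = 3
    period = base_knots[-1] - base_knots[0]  # 8.0
    knots = np.r_[
        base_knots[-(degree + 1):-1] - period,  # [-5., -4., -3.]
        base_knots,
        base_knots[1:degree + 1] + period,      # [ 9., 11., 12.]
    ]
    print(knots)  # [-5. -4. -3.  0.  1.  3.  4.  5.  8.  9. 11. 12.]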
- coef = np.eye(n_knots + self.degree - 1, dtype=np.float64) - extrapolate = self.extrapolation == "continue" + coef = np.eye(n_splines, dtype=np.float64) + if self.extrapolation == "periodic": + coef = np.concatenate((coef, coef[:degree, :])) + + extrapolate = self.extrapolation in ["periodic", "continue"] + bsplines = [ BSpline.construct_fast( knots[:, i], coef, self.degree, extrapolate=extrapolate @@ -331,7 +371,7 @@ def transform(self, X): ) n_samples, n_features = X.shape - n_splines = self.bsplines_[0].c.shape[0] + n_splines = self.bsplines_[0].c.shape[1] degree = self.degree # Note that scipy BSpline returns float64 arrays and converts input @@ -346,8 +386,23 @@ def transform(self, X): for i in range(n_features): spl = self.bsplines_[i] - if self.extrapolation in ("continue", "error"): - XBS[:, (i * n_splines):((i + 1) * n_splines)] = spl(X[:, i]) + if self.extrapolation in ("continue", "error", "periodic"): + + if self.extrapolation == "periodic": + # With periodic extrapolation we map x to the segment + # [spl.t[k], spl.t[n]]. + # This is equivalent to BSpline(.., extrapolate="periodic") + # for scipy>=1.0.0. + n = spl.t.size - spl.k - 1 + # Assign to new array to avoid inplace operation + x = spl.t[spl.k] + (X[:, i] - spl.t[spl.k]) % ( + spl.t[n] - spl.t[spl.k] + ) + else: + x = X[:, i] + + XBS[:, (i * n_splines):((i + 1) * n_splines)] = spl(x) + else: xmin = spl.t[degree] xmax = spl.t[-degree - 1] diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py index 2ca3260f7c05e..b1908bf9fe12a 100644 --- a/sklearn/preprocessing/tests/test_polynomial.py +++ b/sklearn/preprocessing/tests/test_polynomial.py @@ -1,10 +1,13 @@ import numpy as np -from numpy.testing import assert_allclose, assert_array_equal import pytest - +from numpy.testing import assert_allclose, assert_array_equal +from scipy.interpolate import BSpline from sklearn.linear_model import LinearRegression from sklearn.pipeline import Pipeline from sklearn.preprocessing import KBinsDiscretizer, SplineTransformer +from sklearn.utils.fixes import linspace, sp_version + +from pkg_resources import parse_version # TODO: add PolynomialFeatures if it moves to _polynomial.py @@ -31,7 +34,7 @@ def is_c_contiguous(a): ({"n_knots": 1}, "n_knots must be a positive integer >= 2."), ({"n_knots": 2.5}, "n_knots must be a positive integer >= 2."), ({"n_knots": "string"}, "n_knots must be a positive integer >= 2."), - ({"knots": "string"}, "Expected 2D array, got scalar array instead:"), + ({"knots": 1}, "Expected 2D array, got scalar array instead:"), ({"knots": [1, 2]}, "Expected 2D array, got 1D array instead:"), ( {"knots": [[1]]}, @@ -48,22 +51,32 @@ def is_c_contiguous(a): ({"knots": [[2], [1]]}, "knots must be sorted without duplicates."), ( {"extrapolation": None}, - "extrapolation must be one of 'error', 'constant', 'linear' or " - "'continue'.", + "extrapolation must be one of 'error', 'constant', 'linear', " + "'continue' or 'periodic'.", ), ( {"extrapolation": 1}, - "extrapolation must be one of 'error', 'constant', 'linear' or " - "'continue'.", + "extrapolation must be one of 'error', 'constant', 'linear', " + "'continue' or 'periodic'.", ), ( {"extrapolation": "string"}, - "extrapolation must be one of 'error', 'constant', 'linear' or " - "'continue'.", + "extrapolation must be one of 'error', 'constant', 'linear', " + "'continue' or 'periodic'.", ), ({"include_bias": None}, "include_bias must be bool."), ({"include_bias": 1}, "include_bias must be bool."), ({"include_bias": 
"string"}, "include_bias must be bool."), + ( + {"extrapolation": "periodic", "n_knots": 3, "degree": 3}, + "Periodic splines require degree < n_knots. Got n_knots=" + "3 and degree=3." + ), + ( + {"extrapolation": "periodic", "knots": [[0], [1]], "degree": 2}, + "Periodic splines require degree < n_knots. Got n_knots=2 and " + "degree=2." + ) ], ) def test_spline_transformer_input_validation(params, err_msg): @@ -75,7 +88,8 @@ def test_spline_transformer_input_validation(params, err_msg): def test_spline_transformer_manual_knot_input(): - """Test that array-like knot positions in SplineTransformer are accepted. + """ + Test that array-like knot positions in SplineTransformer are accepted. """ X = np.arange(20).reshape(10, 2) knots = [[0.5, 1], [1.5, 2], [5, 10]] @@ -86,6 +100,18 @@ def test_spline_transformer_manual_knot_input(): assert_allclose(st1.bsplines_[i].t, st2.bsplines_[i].t) +@pytest.mark.parametrize("extrapolation", ["continue", "periodic"]) +def test_spline_transformer_integer_knots(extrapolation): + """Test that SplineTransformer accepts integer value knot positions.""" + X = np.arange(20).reshape(10, 2) + knots = [[0, 1], [1, 2], [5, 5], [11, 10], [12, 11]] + _ = SplineTransformer( + degree=3, + knots=knots, + extrapolation=extrapolation + ).fit_transform(X) + + def test_spline_transformer_feature_names(): """Test that SplineTransformer generates correct features name.""" X = np.arange(20).reshape(10, 2) @@ -127,7 +153,13 @@ def test_spline_transformer_feature_names(): @pytest.mark.parametrize("degree", range(1, 5)) @pytest.mark.parametrize("n_knots", range(3, 5)) @pytest.mark.parametrize("knots", ["uniform", "quantile"]) -def test_spline_transformer_unity_decomposition(degree, n_knots, knots): +@pytest.mark.parametrize("extrapolation", ["constant", "periodic"]) +def test_spline_transformer_unity_decomposition( + degree, + n_knots, + knots, + extrapolation +): """Test that B-splines are indeed a decomposition of unity. Splines basis functions must sum up to 1 per row, if we stay in between @@ -137,8 +169,16 @@ def test_spline_transformer_unity_decomposition(degree, n_knots, knots): # make the boundaries 0 and 1 part of X_train, for sure. X_train = np.r_[[[0]], X[::2, :], [[1]]] X_test = X[1::2, :] + + if extrapolation == "periodic": + n_knots = n_knots + degree # periodic splines require degree < n_knots + splt = SplineTransformer( - n_knots=n_knots, degree=degree, knots=knots, include_bias=True + n_knots=n_knots, + degree=degree, + knots=knots, + include_bias=True, + extrapolation=extrapolation ) splt.fit(X_train) for X in [X_train, X_test]: @@ -168,6 +208,151 @@ def test_spline_transformer_linear_regression(bias, intercept): assert_allclose(pipe.predict(X), y, rtol=1e-3) +@pytest.mark.parametrize("knots, n_knots, degree", [ + ("uniform", 5, 3), + ("uniform", 12, 8), + ( + [[-1.0, 0.0], [0, 1.0], [0.1, 2.0], [0.2, 3.0], [0.3, 4.0], [1, 5.0]], + 100, # this gets ignored. 
+ 3 + ) +]) +def test_spline_transformer_periodicity_of_extrapolation( + knots, n_knots, degree +): + """Test that the SplineTransformer is periodic for multiple features.""" + X_1 = linspace((-1, 0), (1, 5), 10) + X_2 = linspace((1, 5), (3, 10), 10) + + splt = SplineTransformer( + knots=knots, + n_knots=n_knots, + degree=degree, + extrapolation="periodic" + ) + splt.fit(X_1) + + assert_allclose(splt.transform(X_1), splt.transform(X_2)) + + +@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)]) +def test_spline_transformer_periodic_linear_regression(bias, intercept): + """Test that B-splines fit a periodic curve pretty well.""" + # "+ 3" to avoid the value 0 in assert_allclose + def f(x): + return np.sin(2 * np.pi * x) - np.sin(8 * np.pi * x) + 3 + + X = np.linspace(0, 1, 101)[:, None] + pipe = Pipeline( + steps=[ + ( + "spline", + SplineTransformer( + n_knots=20, + degree=3, + include_bias=bias, + extrapolation="periodic", + ), + ), + ("ols", LinearRegression(fit_intercept=intercept)), + ] + ) + pipe.fit(X, f(X[:, 0])) + + # Generate larger array to check periodic extrapolation + X_ = np.linspace(-1, 2, 301)[:, None] + predictions = pipe.predict(X_) + assert_allclose(predictions, f(X_[:, 0]), atol=0.01, rtol=0.01) + assert_allclose(predictions[0:100], predictions[100:200], rtol=1e-3) + + +@pytest.mark.skipif( + sp_version < parse_version("1.0.0"), + reason="Periodic extrapolation not yet implemented for BSpline.", +) +def test_spline_transformer_periodic_spline_backport(): + """Test that the backport of extrapolate="periodic" works correctly""" + X = np.linspace(-2, 3.5, 10)[:, None] + degree = 2 + + # Use periodic extrapolation backport in SplineTransformer + transformer = SplineTransformer( + degree=degree, + extrapolation="periodic", + knots=[[-1.0], [0.0], [1.0]] + ) + Xt = transformer.fit_transform(X) + + # Use periodic extrapolation in BSpline + coef = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0]]) + spl = BSpline(np.arange(-3, 4), coef, degree, "periodic") + Xspl = spl(X[:, 0]) + assert_allclose(Xt, Xspl) + + +def test_spline_transformer_periodic_splines_periodicity(): + """ + Test if shifted knots result in the same transformation up to permutation. + """ + X = np.linspace(0, 10, 101)[:, None] + + transformer_1 = SplineTransformer( + degree=3, + extrapolation="periodic", + knots=[[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]] + ) + + transformer_2 = SplineTransformer( + degree=3, + extrapolation="periodic", + knots=[[1.0], [3.0], [4.0], [5.0], [8.0], [9.0]] + ) + + Xt_1 = transformer_1.fit_transform(X) + Xt_2 = transformer_2.fit_transform(X) + + assert_allclose(Xt_1, Xt_2[:, [4, 0, 1, 2, 3]]) + + +@pytest.mark.parametrize("degree", [3, 5]) +def test_spline_transformer_periodic_splines_smoothness(degree): + """Test that spline transformation is smooth at first / last knot.""" + X = np.linspace(-2, 10, 10_000)[:, None] + + transformer = SplineTransformer( + degree=degree, + extrapolation="periodic", + knots=[[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]] + ) + Xt = transformer.fit_transform(X) + + delta = (X.max() - X.min()) / len(X) + tol = 10 * delta + + dXt = Xt + # We expect splines of degree `degree` to be (`degree`-1) times + # continuously differentiable. I.e. for d = 0, ..., `degree` - 1 the d-th + # derivative should be continous. This is the case if the (d+1)-th + # numerical derivative is reasonably small (smaller than `tol` in absolute + # value). We thus compute d-th numeric derivatives for d = 1, ..., `degree` + # and compare them to `tol`. 
+ # + # Note that the 0-th derivative is the function itself, such that we are + # also checking its continuity. + for d in range(1, degree + 1): + # Check continuity of the (d-1)-th derivative + diff = np.diff(dXt, axis=0) + assert np.abs(diff).max() < tol + # Compute d-th numeric derivative + dXt = diff / delta + + # As degree `degree` splines are not `degree` times continously + # differentiable at the knots, the `degree + 1`-th numeric derivative + # should have spikes at the knots. + diff = np.diff(dXt, axis=0) + assert np.abs(diff).max() > 1 + + @pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)]) @pytest.mark.parametrize("degree", [1, 2, 3, 4, 5]) def test_spline_transformer_extrapolation(bias, intercept, degree): From edc69954771a6390b0e10be8309e0a47df5c0189 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Carlos=20Alfaro=20Jim=C3=A9nez?= Date: Wed, 17 Mar 2021 14:15:27 +0100 Subject: [PATCH 251/478] CI Use conda instead of pip to install anaconda-client (#19528) --- build_tools/github/upload_anaconda.sh | 6 +++++- build_tools/travis/after_success.sh | 11 ++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/build_tools/github/upload_anaconda.sh b/build_tools/github/upload_anaconda.sh index 7651576cf558e..13e8420e3cc5a 100644 --- a/build_tools/github/upload_anaconda.sh +++ b/build_tools/github/upload_anaconda.sh @@ -11,7 +11,11 @@ else ANACONDA_TOKEN="$SCIKIT_LEARN_STAGING_UPLOAD_TOKEN" fi -pip install git+https://github.com/Anaconda-Server/anaconda-client +# Install Python 3.8 because of a bug with Python 3.9 +export PATH=$CONDA/bin:$PATH +conda create -n upload -y python=3.8 +source activate upload +conda install -y anaconda-client # Force a replacement if the remote file already exists anaconda -t $ANACONDA_TOKEN upload --force -u $ANACONDA_ORG dist/artifact/* diff --git a/build_tools/travis/after_success.sh b/build_tools/travis/after_success.sh index 2123f7efafc22..a09a4013ed946 100755 --- a/build_tools/travis/after_success.sh +++ b/build_tools/travis/after_success.sh @@ -18,7 +18,16 @@ if [[ $BUILD_WHEEL == true && $TRAVIS_EVENT_TYPE != pull_request ]]; then ANACONDA_TOKEN="$SCIKIT_LEARN_STAGING_UPLOAD_TOKEN" fi - pip install git+https://github.com/Anaconda-Server/anaconda-client + MINICONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-aarch64.sh" + wget $MINICONDA_URL -O miniconda.sh + MINICONDA_PATH=$HOME/miniconda + chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH + + # Install Python 3.8 because of a bug with Python 3.9 + export PATH=$MINICONDA_PATH/bin:$PATH + conda create -n upload -y python=3.8 + source activate upload + conda install -y anaconda-client # Force a replacement if the remote file already exists anaconda -t $ANACONDA_TOKEN upload --force -u $ANACONDA_ORG wheelhouse/*.whl From 9a186a599d5b4f75f9798211a04b4dc88b4f926a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 17 Mar 2021 14:52:03 +0100 Subject: [PATCH 252/478] MAINT Python 3.9 in badge on README.rst (#19702) --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 68f9ffee17d03..ebc4339b2ab58 100644 --- a/README.rst +++ b/README.rst @@ -17,8 +17,8 @@ .. |Nightly wheels| image:: https://github.com/scikit-learn/scikit-learn/workflows/Wheel%20builder/badge.svg?event=schedule .. 
_`Nightly wheels`: https://github.com/scikit-learn/scikit-learn/actions?query=workflow%3A%22Wheel+builder%22+event%3Aschedule -.. |PythonVersion| image:: https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-blue -.. _PythonVersion: https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-blue +.. |PythonVersion| image:: https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9-blue +.. _PythonVersion: https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9-blue .. |PyPi| image:: https://badge.fury.io/py/scikit-learn.svg .. _PyPi: https://badge.fury.io/py/scikit-learn From 04f84c6d082864c208682d27256ff74b7b488734 Mon Sep 17 00:00:00 2001 From: Sean Benhur J <43300345+seanbenhur@users.noreply.github.com> Date: Wed, 17 Mar 2021 20:23:00 +0530 Subject: [PATCH 253/478] DOC Added utils.gen_batches in documentation (#19688) --- doc/developers/utilities.rst | 3 +++ doc/modules/classes.rst | 1 + sklearn/utils/__init__.py | 9 +++++++++ 3 files changed, 13 insertions(+) diff --git a/doc/developers/utilities.rst b/doc/developers/utilities.rst index 1ca36d473a925..39c0c889afc95 100644 --- a/doc/developers/utilities.rst +++ b/doc/developers/utilities.rst @@ -196,6 +196,9 @@ Helper Functions to ``n``. Used in :func:`~sklearn.decomposition.dict_learning` and :func:`~sklearn.cluster.k_means`. +- :class:`gen_batches`: generator to create slices containing batch size elements + from 0 to ``n`` + - :func:`safe_mask`: Helper function to convert a mask to the format expected by the numpy array or scipy sparse matrix on which to use it (sparse matrices support integer indices only while numpy arrays support both diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index c658bc6b12452..0cd5abb16829d 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1595,6 +1595,7 @@ Plotting utils.extmath.fast_logdet utils.extmath.density utils.extmath.weighted_mode + utils.gen_batches utils.gen_even_slices utils.graph.single_source_shortest_path_length utils.graph_shortest_path.graph_shortest_path diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index ca2be9d14fe29..972d56f66d900 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -691,6 +691,10 @@ def gen_batches(n, batch_size, *, min_batch_size=0): ------ slice of batch_size elements + See Also + -------- + gen_even_slices: Generator to create n_packs slices going up to n. + Examples -------- >>> from sklearn.utils import gen_batches @@ -740,6 +744,11 @@ def gen_even_slices(n, n_packs, *, n_samples=None): ------ slice + See Also + -------- + gen_batches: Generator to create slices containing batch_size elements + from 0 to n. + Examples -------- >>> from sklearn.utils import gen_even_slices From 2e7009bc69f6fec93e8f3c59dd76b082b473148d Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 18 Mar 2021 09:15:50 -0400 Subject: [PATCH 254/478] ENH Better error for corrupted files in fetch_kddcup99 (#19669) --- doc/whats_new/v1.0.rst | 3 +++ sklearn/conftest.py | 7 +++++-- sklearn/datasets/_kddcup99.py | 23 +++++++++++++---------- sklearn/datasets/tests/test_kddcup99.py | 16 ++++++++++++++++ 4 files changed, 37 insertions(+), 12 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 34f39ca48f20a..c7b786ea6d1bf 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -68,6 +68,9 @@ Changelog `Thomas Fan`_ and :user:`Amanda Dsouza ` and :user:`EL-ATEIF Sara `. 
+- |Enhancement| :func:`datasets.fetch_kddcup99` raises a better message + when the cached file is invalid. :pr:`19669` `Thomas Fan`_. + :mod:`sklearn.decomposition` ............................ diff --git a/sklearn/conftest.py b/sklearn/conftest.py index 2978115e3091c..70fec749b7c8e 100644 --- a/sklearn/conftest.py +++ b/sklearn/conftest.py @@ -35,8 +35,11 @@ def wrapped(*args, **kwargs): kwargs['download_if_missing'] = download_if_missing try: return f(*args, **kwargs) - except IOError: - pytest.skip("test is enabled when SKLEARN_SKIP_NETWORK_TESTS=0") + except IOError as e: + if str(e) != "Data not found and `download_if_missing` is False": + raise + pytest.skip("test is enabled when " + "SKLEARN_SKIP_NETWORK_TESTS=0") return pytest.fixture(lambda: wrapped) diff --git a/sklearn/datasets/_kddcup99.py b/sklearn/datasets/_kddcup99.py index 539b7ffaf862e..26fb14197a211 100644 --- a/sklearn/datasets/_kddcup99.py +++ b/sklearn/datasets/_kddcup99.py @@ -315,7 +315,17 @@ def _fetch_brute_kddcup99(data_home=None, column_names = [c[0] for c in dt] target_names = column_names[-1] feature_names = column_names[:-1] - if download_if_missing and not available: + + if available: + try: + X = joblib.load(samples_path) + y = joblib.load(targets_path) + except Exception as e: + raise IOError( + "The cache for fetch_kddcup99 is invalid, please delete " + f"{str(kddcup_dir)} and run the fetch_kddcup99 again") from e + + elif download_if_missing: _mkdirp(kddcup_dir) logger.info("Downloading %s" % archive.url) _fetch_remote(archive, dirname=kddcup_dir) @@ -343,15 +353,8 @@ def _fetch_brute_kddcup99(data_home=None, joblib.dump(X, samples_path, compress=0) joblib.dump(y, targets_path, compress=0) - elif not available: - if not download_if_missing: - raise IOError("Data not found and `download_if_missing` is False") - - try: - X, y - except NameError: - X = joblib.load(samples_path) - y = joblib.load(targets_path) + else: + raise IOError("Data not found and `download_if_missing` is False") return Bunch( data=X, diff --git a/sklearn/datasets/tests/test_kddcup99.py b/sklearn/datasets/tests/test_kddcup99.py index 5119d0cda13a2..08017298d20e8 100644 --- a/sklearn/datasets/tests/test_kddcup99.py +++ b/sklearn/datasets/tests/test_kddcup99.py @@ -58,3 +58,19 @@ def test_fetch_kddcup99_shuffle(fetch_kddcup99_fxt): def test_pandas_dependency_message(fetch_kddcup99_fxt, hide_available_pandas): check_pandas_dependency_message(fetch_kddcup99_fxt) + + +def test_corrupted_file_error_message(fetch_kddcup99_fxt, tmp_path): + """Check that a nice error message is raised when cache is corrupted.""" + kddcup99_dir = tmp_path / "kddcup99_10-py3" + kddcup99_dir.mkdir() + samples_path = kddcup99_dir / "samples" + + with samples_path.open("wb") as f: + f.write(b"THIS IS CORRUPTED") + + msg = (f"The cache for fetch_kddcup99 is invalid, please " + f"delete {str(kddcup99_dir)} and run the fetch_kddcup99 again") + + with pytest.raises(IOError, match=msg): + fetch_kddcup99_fxt(data_home=str(tmp_path)) From ca9618c0e228b67293c422e99a0f133c3384f7b2 Mon Sep 17 00:00:00 2001 From: Avi Gupta <33635739+avigupta2612@users.noreply.github.com> Date: Thu, 18 Mar 2021 19:28:51 +0530 Subject: [PATCH 255/478] MNT move PolynomialFeatures from _data.py to _polynomial.py (#19611) Co-authored-by: Roman Yurchak --- sklearn/preprocessing/__init__.py | 2 +- sklearn/preprocessing/_data.py | 290 ----------------- sklearn/preprocessing/_polynomial.py | 294 +++++++++++++++++- sklearn/preprocessing/tests/test_data.py | 199 ------------ 
.../preprocessing/tests/test_polynomial.py | 196 +++++++++++- 5 files changed, 487 insertions(+), 494 deletions(-) diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index 076b9e85e1150..6653088ba85a7 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -23,7 +23,6 @@ from ._data import quantile_transform from ._data import power_transform from ._data import PowerTransformer -from ._data import PolynomialFeatures from ._encoders import OneHotEncoder from ._encoders import OrdinalEncoder @@ -35,6 +34,7 @@ from ._discretization import KBinsDiscretizer +from ._polynomial import PolynomialFeatures from ._polynomial import SplineTransformer diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 29190dd6e2b67..5e85b932a1e39 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -8,9 +8,7 @@ # License: BSD 3 clause -from itertools import chain, combinations import warnings -from itertools import combinations_with_replacement as combinations_w_r import numpy as np from scipy import sparse @@ -31,7 +29,6 @@ from ..utils.validation import (check_is_fitted, check_random_state, _check_sample_weight, FLOAT_DTYPES, _deprecate_positional_args) -from ._csr_polynomial_expansion import _csr_polynomial_expansion from ._encoders import OneHotEncoder @@ -1570,293 +1567,6 @@ def robust_scale(X, *, axis=0, with_centering=True, with_scaling=True, return X -class PolynomialFeatures(TransformerMixin, BaseEstimator): - """Generate polynomial and interaction features. - - Generate a new feature matrix consisting of all polynomial combinations - of the features with degree less than or equal to the specified degree. - For example, if an input sample is two dimensional and of the form - [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2]. - - Parameters - ---------- - degree : int, default=2 - The degree of the polynomial features. - - interaction_only : bool, default=False - If true, only interaction features are produced: features that are - products of at most ``degree`` *distinct* input features (so not - ``x[1] ** 2``, ``x[0] * x[2] ** 3``, etc.). - - include_bias : bool, default=True - If True (default), then include a bias column, the feature in which - all polynomial powers are zero (i.e. a column of ones - acts as an - intercept term in a linear model). - - order : {'C', 'F'}, default='C' - Order of output array in the dense case. 'F' order is faster to - compute, but may slow down subsequent estimators. - - .. versionadded:: 0.21 - - Examples - -------- - >>> import numpy as np - >>> from sklearn.preprocessing import PolynomialFeatures - >>> X = np.arange(6).reshape(3, 2) - >>> X - array([[0, 1], - [2, 3], - [4, 5]]) - >>> poly = PolynomialFeatures(2) - >>> poly.fit_transform(X) - array([[ 1., 0., 1., 0., 0., 1.], - [ 1., 2., 3., 4., 6., 9.], - [ 1., 4., 5., 16., 20., 25.]]) - >>> poly = PolynomialFeatures(interaction_only=True) - >>> poly.fit_transform(X) - array([[ 1., 0., 1., 0.], - [ 1., 2., 3., 6.], - [ 1., 4., 5., 20.]]) - - Attributes - ---------- - powers_ : ndarray of shape (n_output_features, n_input_features) - powers_[i, j] is the exponent of the jth input in the ith output. - - n_input_features_ : int - The total number of input features. - - n_output_features_ : int - The total number of polynomial output features. The number of output - features is computed by iterating over all suitably sized combinations - of input features. 
- - See Also - -------- - SplineTransformer : Transformer that generates univariate B-spline bases - for features - - Notes - ----- - Be aware that the number of features in the output array scales - polynomially in the number of features of the input array, and - exponentially in the degree. High degrees can cause overfitting. - - See :ref:`examples/linear_model/plot_polynomial_interpolation.py - ` - """ - @_deprecate_positional_args - def __init__(self, degree=2, *, interaction_only=False, include_bias=True, - order='C'): - self.degree = degree - self.interaction_only = interaction_only - self.include_bias = include_bias - self.order = order - - @staticmethod - def _combinations(n_features, degree, interaction_only, include_bias): - comb = (combinations if interaction_only else combinations_w_r) - start = int(not include_bias) - return chain.from_iterable(comb(range(n_features), i) - for i in range(start, degree + 1)) - - @property - def powers_(self): - check_is_fitted(self) - - combinations = self._combinations(self.n_input_features_, self.degree, - self.interaction_only, - self.include_bias) - return np.vstack([np.bincount(c, minlength=self.n_input_features_) - for c in combinations]) - - def get_feature_names(self, input_features=None): - """ - Return feature names for output features - - Parameters - ---------- - input_features : list of str of shape (n_features,), default=None - String names for input features if available. By default, - "x0", "x1", ... "xn_features" is used. - - Returns - ------- - output_feature_names : list of str of shape (n_output_features,) - """ - powers = self.powers_ - if input_features is None: - input_features = ['x%d' % i for i in range(powers.shape[1])] - feature_names = [] - for row in powers: - inds = np.where(row)[0] - if len(inds): - name = " ".join("%s^%d" % (input_features[ind], exp) - if exp != 1 else input_features[ind] - for ind, exp in zip(inds, row[inds])) - else: - name = "1" - feature_names.append(name) - return feature_names - - def fit(self, X, y=None): - """ - Compute number of output features. - - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The data. - - y : None - Ignored. - - Returns - ------- - self : object - Fitted transformer. - """ - n_samples, n_features = self._validate_data( - X, accept_sparse=True).shape - combinations = self._combinations(n_features, self.degree, - self.interaction_only, - self.include_bias) - self.n_input_features_ = n_features - self.n_output_features_ = sum(1 for _ in combinations) - return self - - def transform(self, X): - """Transform data to polynomial features. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The data to transform, row by row. - - Prefer CSR over CSC for sparse input (for speed), but CSC is - required if the degree is 4 or higher. If the degree is less than - 4 and the input format is CSC, it will be converted to CSR, have - its polynomial features generated, then converted back to CSC. - - If the degree is 2 or 3, the method described in "Leveraging - Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices - Using K-Simplex Numbers" by Andrew Nystrom and John Hughes is - used, which is much faster than the method used on CSC input. For - this reason, a CSC input will be converted to CSR, and the output - will be converted back to CSC prior to being returned, hence the - preference of CSR. 
- - Returns - ------- - XP : {ndarray, sparse matrix} of shape (n_samples, NP) - The matrix of features, where NP is the number of polynomial - features generated from the combination of inputs. If a sparse - matrix is provided, it will be converted into a sparse - ``csr_matrix``. - """ - check_is_fitted(self) - - X = self._validate_data(X, order='F', dtype=FLOAT_DTYPES, reset=False, - accept_sparse=('csr', 'csc')) - - n_samples, n_features = X.shape - - if n_features != self.n_input_features_: - raise ValueError("X shape does not match training shape") - - if sparse.isspmatrix_csr(X): - if self.degree > 3: - return self.transform(X.tocsc()).tocsr() - to_stack = [] - if self.include_bias: - to_stack.append(np.ones(shape=(n_samples, 1), dtype=X.dtype)) - to_stack.append(X) - for deg in range(2, self.degree+1): - Xp_next = _csr_polynomial_expansion(X.data, X.indices, - X.indptr, X.shape[1], - self.interaction_only, - deg) - if Xp_next is None: - break - to_stack.append(Xp_next) - XP = sparse.hstack(to_stack, format='csr') - elif sparse.isspmatrix_csc(X) and self.degree < 4: - return self.transform(X.tocsr()).tocsc() - else: - if sparse.isspmatrix(X): - combinations = self._combinations(n_features, self.degree, - self.interaction_only, - self.include_bias) - columns = [] - for comb in combinations: - if comb: - out_col = 1 - for col_idx in comb: - out_col = X[:, col_idx].multiply(out_col) - columns.append(out_col) - else: - bias = sparse.csc_matrix(np.ones((X.shape[0], 1))) - columns.append(bias) - XP = sparse.hstack(columns, dtype=X.dtype).tocsc() - else: - XP = np.empty((n_samples, self.n_output_features_), - dtype=X.dtype, order=self.order) - - # What follows is a faster implementation of: - # for i, comb in enumerate(combinations): - # XP[:, i] = X[:, comb].prod(1) - # This implementation uses two optimisations. - # First one is broadcasting, - # multiply ([X1, ..., Xn], X1) -> [X1 X1, ..., Xn X1] - # multiply ([X2, ..., Xn], X2) -> [X2 X2, ..., Xn X2] - # ... - # multiply ([X[:, start:end], X[:, start]) -> ... - # Second optimisation happens for degrees >= 3. - # Xi^3 is computed reusing previous computation: - # Xi^3 = Xi^2 * Xi. - - if self.include_bias: - XP[:, 0] = 1 - current_col = 1 - else: - current_col = 0 - - # d = 0 - XP[:, current_col:current_col + n_features] = X - index = list(range(current_col, - current_col + n_features)) - current_col += n_features - index.append(current_col) - - # d >= 1 - for _ in range(1, self.degree): - new_index = [] - end = index[-1] - for feature_idx in range(n_features): - start = index[feature_idx] - new_index.append(current_col) - if self.interaction_only: - start += (index[feature_idx + 1] - - index[feature_idx]) - next_col = current_col + end - start - if next_col <= current_col: - break - # XP[:, start:end] are terms of degree d - 1 - # that exclude feature #feature_idx. - np.multiply(XP[:, start:end], - X[:, feature_idx:feature_idx + 1], - out=XP[:, current_col:next_col], - casting='no') - current_col = next_col - - new_index.append(current_col) - index = new_index - - return XP - - @_deprecate_positional_args def normalize(X, norm='l2', *, axis=1, copy=True, return_norm=False): """Scale input vectors individually to unit norm (vector length). diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index ad358e50c4681..3f4ccc2fa05d4 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -2,14 +2,19 @@ This file contains preprocessing tools based on polynomials. 
""" import numbers +from itertools import chain, combinations +from itertools import combinations_with_replacement as combinations_w_r import numpy as np +from scipy import sparse from scipy.interpolate import BSpline from ..base import BaseEstimator, TransformerMixin from ..utils import check_array from ..utils.fixes import linspace -from ..utils.validation import check_is_fitted, FLOAT_DTYPES +from ..utils.validation import (check_is_fitted, FLOAT_DTYPES, + _deprecate_positional_args) +from ._csr_polynomial_expansion import _csr_polynomial_expansion __all__ = [ @@ -17,6 +22,293 @@ ] +class PolynomialFeatures(TransformerMixin, BaseEstimator): + """Generate polynomial and interaction features. + + Generate a new feature matrix consisting of all polynomial combinations + of the features with degree less than or equal to the specified degree. + For example, if an input sample is two dimensional and of the form + [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2]. + + Parameters + ---------- + degree : int, default=2 + The degree of the polynomial features. + + interaction_only : bool, default=False + If true, only interaction features are produced: features that are + products of at most ``degree`` *distinct* input features (so not + ``x[1] ** 2``, ``x[0] * x[2] ** 3``, etc.). + + include_bias : bool, default=True + If True (default), then include a bias column, the feature in which + all polynomial powers are zero (i.e. a column of ones - acts as an + intercept term in a linear model). + + order : {'C', 'F'}, default='C' + Order of output array in the dense case. 'F' order is faster to + compute, but may slow down subsequent estimators. + + .. versionadded:: 0.21 + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import PolynomialFeatures + >>> X = np.arange(6).reshape(3, 2) + >>> X + array([[0, 1], + [2, 3], + [4, 5]]) + >>> poly = PolynomialFeatures(2) + >>> poly.fit_transform(X) + array([[ 1., 0., 1., 0., 0., 1.], + [ 1., 2., 3., 4., 6., 9.], + [ 1., 4., 5., 16., 20., 25.]]) + >>> poly = PolynomialFeatures(interaction_only=True) + >>> poly.fit_transform(X) + array([[ 1., 0., 1., 0.], + [ 1., 2., 3., 6.], + [ 1., 4., 5., 20.]]) + + Attributes + ---------- + powers_ : ndarray of shape (n_output_features, n_input_features) + powers_[i, j] is the exponent of the jth input in the ith output. + + n_input_features_ : int + The total number of input features. + + n_output_features_ : int + The total number of polynomial output features. The number of output + features is computed by iterating over all suitably sized combinations + of input features. + + See Also + -------- + SplineTransformer : Transformer that generates univariate B-spline bases + for features + + Notes + ----- + Be aware that the number of features in the output array scales + polynomially in the number of features of the input array, and + exponentially in the degree. High degrees can cause overfitting. 
+ + See :ref:`examples/linear_model/plot_polynomial_interpolation.py + ` + """ + @_deprecate_positional_args + def __init__(self, degree=2, *, interaction_only=False, include_bias=True, + order='C'): + self.degree = degree + self.interaction_only = interaction_only + self.include_bias = include_bias + self.order = order + + @staticmethod + def _combinations(n_features, degree, interaction_only, include_bias): + comb = (combinations if interaction_only else combinations_w_r) + start = int(not include_bias) + return chain.from_iterable(comb(range(n_features), i) + for i in range(start, degree + 1)) + + @property + def powers_(self): + check_is_fitted(self) + + combinations = self._combinations(self.n_input_features_, self.degree, + self.interaction_only, + self.include_bias) + return np.vstack([np.bincount(c, minlength=self.n_input_features_) + for c in combinations]) + + def get_feature_names(self, input_features=None): + """ + Return feature names for output features + + Parameters + ---------- + input_features : list of str of shape (n_features,), default=None + String names for input features if available. By default, + "x0", "x1", ... "xn_features" is used. + + Returns + ------- + output_feature_names : list of str of shape (n_output_features,) + """ + powers = self.powers_ + if input_features is None: + input_features = ['x%d' % i for i in range(powers.shape[1])] + feature_names = [] + for row in powers: + inds = np.where(row)[0] + if len(inds): + name = " ".join("%s^%d" % (input_features[ind], exp) + if exp != 1 else input_features[ind] + for ind, exp in zip(inds, row[inds])) + else: + name = "1" + feature_names.append(name) + return feature_names + + def fit(self, X, y=None): + """ + Compute number of output features. + + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted transformer. + """ + n_samples, n_features = self._validate_data( + X, accept_sparse=True).shape + combinations = self._combinations(n_features, self.degree, + self.interaction_only, + self.include_bias) + self.n_input_features_ = n_features + self.n_output_features_ = sum(1 for _ in combinations) + return self + + def transform(self, X): + """Transform data to polynomial features. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to transform, row by row. + + Prefer CSR over CSC for sparse input (for speed), but CSC is + required if the degree is 4 or higher. If the degree is less than + 4 and the input format is CSC, it will be converted to CSR, have + its polynomial features generated, then converted back to CSC. + + If the degree is 2 or 3, the method described in "Leveraging + Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices + Using K-Simplex Numbers" by Andrew Nystrom and John Hughes is + used, which is much faster than the method used on CSC input. For + this reason, a CSC input will be converted to CSR, and the output + will be converted back to CSC prior to being returned, hence the + preference of CSR. + + Returns + ------- + XP : {ndarray, sparse matrix} of shape (n_samples, NP) + The matrix of features, where NP is the number of polynomial + features generated from the combination of inputs. If a sparse + matrix is provided, it will be converted into a sparse + ``csr_matrix``. 
+ """ + check_is_fitted(self) + + X = self._validate_data(X, order='F', dtype=FLOAT_DTYPES, reset=False, + accept_sparse=('csr', 'csc')) + + n_samples, n_features = X.shape + + if n_features != self.n_input_features_: + raise ValueError("X shape does not match training shape") + + if sparse.isspmatrix_csr(X): + if self.degree > 3: + return self.transform(X.tocsc()).tocsr() + to_stack = [] + if self.include_bias: + to_stack.append(np.ones(shape=(n_samples, 1), dtype=X.dtype)) + to_stack.append(X) + for deg in range(2, self.degree+1): + Xp_next = _csr_polynomial_expansion(X.data, X.indices, + X.indptr, X.shape[1], + self.interaction_only, + deg) + if Xp_next is None: + break + to_stack.append(Xp_next) + XP = sparse.hstack(to_stack, format='csr') + elif sparse.isspmatrix_csc(X) and self.degree < 4: + return self.transform(X.tocsr()).tocsc() + else: + if sparse.isspmatrix(X): + combinations = self._combinations(n_features, self.degree, + self.interaction_only, + self.include_bias) + columns = [] + for comb in combinations: + if comb: + out_col = 1 + for col_idx in comb: + out_col = X[:, col_idx].multiply(out_col) + columns.append(out_col) + else: + bias = sparse.csc_matrix(np.ones((X.shape[0], 1))) + columns.append(bias) + XP = sparse.hstack(columns, dtype=X.dtype).tocsc() + else: + XP = np.empty((n_samples, self.n_output_features_), + dtype=X.dtype, order=self.order) + + # What follows is a faster implementation of: + # for i, comb in enumerate(combinations): + # XP[:, i] = X[:, comb].prod(1) + # This implementation uses two optimisations. + # First one is broadcasting, + # multiply ([X1, ..., Xn], X1) -> [X1 X1, ..., Xn X1] + # multiply ([X2, ..., Xn], X2) -> [X2 X2, ..., Xn X2] + # ... + # multiply ([X[:, start:end], X[:, start]) -> ... + # Second optimisation happens for degrees >= 3. + # Xi^3 is computed reusing previous computation: + # Xi^3 = Xi^2 * Xi. + + if self.include_bias: + XP[:, 0] = 1 + current_col = 1 + else: + current_col = 0 + + # d = 0 + XP[:, current_col:current_col + n_features] = X + index = list(range(current_col, + current_col + n_features)) + current_col += n_features + index.append(current_col) + + # d >= 1 + for _ in range(1, self.degree): + new_index = [] + end = index[-1] + for feature_idx in range(n_features): + start = index[feature_idx] + new_index.append(current_col) + if self.interaction_only: + start += (index[feature_idx + 1] - + index[feature_idx]) + next_col = current_col + end - start + if next_col <= current_col: + break + # XP[:, start:end] are terms of degree d - 1 + # that exclude feature #feature_idx. + np.multiply(XP[:, start:end], + X[:, feature_idx:feature_idx + 1], + out=XP[:, current_col:next_col], + casting='no') + current_col = next_col + + new_index.append(current_col) + index = new_index + + return XP + + # TODO: # - sparse support (either scipy or own cython solution)? 
class SplineTransformer(TransformerMixin, BaseEstimator): diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index fdd88be0ccff4..196060388ddd2 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -10,7 +10,6 @@ import numpy as np import numpy.linalg as la from scipy import sparse, stats -from scipy.sparse import random as sparse_random import pytest @@ -43,7 +42,6 @@ from sklearn.preprocessing import RobustScaler from sklearn.preprocessing import robust_scale from sklearn.preprocessing import add_dummy_feature -from sklearn.preprocessing import PolynomialFeatures from sklearn.preprocessing import PowerTransformer from sklearn.preprocessing import power_transform from sklearn.preprocessing._data import _handle_zeros_in_scale @@ -94,203 +92,6 @@ def assert_correct_incr(i, batch_start, batch_stop, n, chunk_size, n_samples_seen) -def test_polynomial_features(): - # Test Polynomial Features - X1 = np.arange(6)[:, np.newaxis] - P1 = np.hstack([np.ones_like(X1), - X1, X1 ** 2, X1 ** 3]) - deg1 = 3 - - X2 = np.arange(6).reshape((3, 2)) - x1 = X2[:, :1] - x2 = X2[:, 1:] - P2 = np.hstack([x1 ** 0 * x2 ** 0, - x1 ** 1 * x2 ** 0, - x1 ** 0 * x2 ** 1, - x1 ** 2 * x2 ** 0, - x1 ** 1 * x2 ** 1, - x1 ** 0 * x2 ** 2]) - deg2 = 2 - - for (deg, X, P) in [(deg1, X1, P1), (deg2, X2, P2)]: - P_test = PolynomialFeatures(deg, include_bias=True).fit_transform(X) - assert_array_almost_equal(P_test, P) - - P_test = PolynomialFeatures(deg, include_bias=False).fit_transform(X) - assert_array_almost_equal(P_test, P[:, 1:]) - - interact = PolynomialFeatures(2, interaction_only=True, include_bias=True) - X_poly = interact.fit_transform(X) - assert_array_almost_equal(X_poly, P2[:, [0, 1, 2, 4]]) - - assert interact.powers_.shape == (interact.n_output_features_, - interact.n_input_features_) - - -def test_polynomial_feature_names(): - X = np.arange(30).reshape(10, 3) - poly = PolynomialFeatures(degree=2, include_bias=True).fit(X) - feature_names = poly.get_feature_names() - assert_array_equal(['1', 'x0', 'x1', 'x2', 'x0^2', 'x0 x1', - 'x0 x2', 'x1^2', 'x1 x2', 'x2^2'], - feature_names) - - poly = PolynomialFeatures(degree=3, include_bias=False).fit(X) - feature_names = poly.get_feature_names(["a", "b", "c"]) - assert_array_equal(['a', 'b', 'c', 'a^2', 'a b', 'a c', 'b^2', - 'b c', 'c^2', 'a^3', 'a^2 b', 'a^2 c', - 'a b^2', 'a b c', 'a c^2', 'b^3', 'b^2 c', - 'b c^2', 'c^3'], feature_names) - # test some unicode - poly = PolynomialFeatures(degree=1, include_bias=True).fit(X) - feature_names = poly.get_feature_names( - ["\u0001F40D", "\u262E", "\u05D0"]) - assert_array_equal(["1", "\u0001F40D", "\u262E", "\u05D0"], - feature_names) - - -def test_polynomial_feature_array_order(): - """Test that output array has the given order.""" - X = np.arange(10).reshape(5, 2) - - def is_c_contiguous(a): - return np.isfortran(a.T) - - assert is_c_contiguous(PolynomialFeatures().fit_transform(X)) - assert is_c_contiguous(PolynomialFeatures(order='C').fit_transform(X)) - assert np.isfortran(PolynomialFeatures(order='F').fit_transform(X)) - - -@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], - [(1, True, False, int), - (2, True, False, int), - (2, True, False, np.float32), - (2, True, False, np.float64), - (3, False, False, np.float64), - (3, False, True, np.float64), - (4, False, False, np.float64), - (4, False, True, np.float64)]) -def test_polynomial_features_csc_X(deg, include_bias, interaction_only, dtype): - 
rng = np.random.RandomState(0) - X = rng.randint(0, 2, (100, 2)) - X_csc = sparse.csc_matrix(X) - - est = PolynomialFeatures(deg, include_bias=include_bias, - interaction_only=interaction_only) - Xt_csc = est.fit_transform(X_csc.astype(dtype)) - Xt_dense = est.fit_transform(X.astype(dtype)) - - assert isinstance(Xt_csc, sparse.csc_matrix) - assert Xt_csc.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csc.A, Xt_dense) - - -@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], - [(1, True, False, int), - (2, True, False, int), - (2, True, False, np.float32), - (2, True, False, np.float64), - (3, False, False, np.float64), - (3, False, True, np.float64)]) -def test_polynomial_features_csr_X(deg, include_bias, interaction_only, dtype): - rng = np.random.RandomState(0) - X = rng.randint(0, 2, (100, 2)) - X_csr = sparse.csr_matrix(X) - - est = PolynomialFeatures(deg, include_bias=include_bias, - interaction_only=interaction_only) - Xt_csr = est.fit_transform(X_csr.astype(dtype)) - Xt_dense = est.fit_transform(X.astype(dtype, copy=False)) - - assert isinstance(Xt_csr, sparse.csr_matrix) - assert Xt_csr.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csr.A, Xt_dense) - - -@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], - [(2, True, False, np.float32), - (2, True, False, np.float64), - (3, False, False, np.float64), - (3, False, True, np.float64)]) -def test_polynomial_features_csr_X_floats(deg, include_bias, - interaction_only, dtype): - X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr() - X = X_csr.toarray() - - est = PolynomialFeatures(deg, include_bias=include_bias, - interaction_only=interaction_only) - Xt_csr = est.fit_transform(X_csr.astype(dtype)) - Xt_dense = est.fit_transform(X.astype(dtype)) - - assert isinstance(Xt_csr, sparse.csr_matrix) - assert Xt_csr.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csr.A, Xt_dense) - - -@pytest.mark.parametrize(['zero_row_index', 'deg', 'interaction_only'], - [(0, 2, True), (1, 2, True), (2, 2, True), - (0, 3, True), (1, 3, True), (2, 3, True), - (0, 2, False), (1, 2, False), (2, 2, False), - (0, 3, False), (1, 3, False), (2, 3, False)]) -def test_polynomial_features_csr_X_zero_row(zero_row_index, deg, - interaction_only): - X_csr = sparse_random(3, 10, 1.0, random_state=0).tocsr() - X_csr[zero_row_index, :] = 0.0 - X = X_csr.toarray() - - est = PolynomialFeatures(deg, include_bias=False, - interaction_only=interaction_only) - Xt_csr = est.fit_transform(X_csr) - Xt_dense = est.fit_transform(X) - - assert isinstance(Xt_csr, sparse.csr_matrix) - assert Xt_csr.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csr.A, Xt_dense) - - -# This degree should always be one more than the highest degree supported by -# _csr_expansion. 
-@pytest.mark.parametrize(['include_bias', 'interaction_only'], - [(True, True), (True, False), - (False, True), (False, False)]) -def test_polynomial_features_csr_X_degree_4(include_bias, interaction_only): - X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr() - X = X_csr.toarray() - - est = PolynomialFeatures(4, include_bias=include_bias, - interaction_only=interaction_only) - Xt_csr = est.fit_transform(X_csr) - Xt_dense = est.fit_transform(X) - - assert isinstance(Xt_csr, sparse.csr_matrix) - assert Xt_csr.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csr.A, Xt_dense) - - -@pytest.mark.parametrize(['deg', 'dim', 'interaction_only'], - [(2, 1, True), - (2, 2, True), - (3, 1, True), - (3, 2, True), - (3, 3, True), - (2, 1, False), - (2, 2, False), - (3, 1, False), - (3, 2, False), - (3, 3, False)]) -def test_polynomial_features_csr_X_dim_edges(deg, dim, interaction_only): - X_csr = sparse_random(1000, dim, 0.5, random_state=0).tocsr() - X = X_csr.toarray() - - est = PolynomialFeatures(deg, interaction_only=interaction_only) - Xt_csr = est.fit_transform(X_csr) - Xt_dense = est.fit_transform(X) - - assert isinstance(Xt_csr, sparse.csr_matrix) - assert Xt_csr.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csr.A, Xt_dense) - - def test_raises_value_error_if_sample_weights_greater_than_1d(): # Sample weights must be either scalar or 1D diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py index b1908bf9fe12a..5068a8c7d8bdd 100644 --- a/sklearn/preprocessing/tests/test_polynomial.py +++ b/sklearn/preprocessing/tests/test_polynomial.py @@ -1,17 +1,22 @@ import numpy as np import pytest +from scipy import sparse +from scipy.sparse import random as sparse_random +from sklearn.utils._testing import assert_array_almost_equal + from numpy.testing import assert_allclose, assert_array_equal from scipy.interpolate import BSpline from sklearn.linear_model import LinearRegression from sklearn.pipeline import Pipeline -from sklearn.preprocessing import KBinsDiscretizer, SplineTransformer +from sklearn.preprocessing import ( + KBinsDiscretizer, PolynomialFeatures, SplineTransformer +) from sklearn.utils.fixes import linspace, sp_version from pkg_resources import parse_version -# TODO: add PolynomialFeatures if it moves to _polynomial.py -@pytest.mark.parametrize("est", (SplineTransformer,)) +@pytest.mark.parametrize("est", (PolynomialFeatures, SplineTransformer)) def test_polynomial_and_spline_array_order(est): """Test that output array has the given order.""" X = np.arange(10).reshape(5, 2) @@ -444,3 +449,188 @@ def test_spline_transformer_n_features_out(n_knots, include_bias, degree): splt.fit(X) assert splt.transform(X).shape[1] == splt.n_features_out_ + + +def test_polynomial_features(): + # Test Polynomial Features + X1 = np.arange(6)[:, np.newaxis] + P1 = np.hstack([np.ones_like(X1), + X1, X1 ** 2, X1 ** 3]) + deg1 = 3 + + X2 = np.arange(6).reshape((3, 2)) + x1 = X2[:, :1] + x2 = X2[:, 1:] + P2 = np.hstack([x1 ** 0 * x2 ** 0, + x1 ** 1 * x2 ** 0, + x1 ** 0 * x2 ** 1, + x1 ** 2 * x2 ** 0, + x1 ** 1 * x2 ** 1, + x1 ** 0 * x2 ** 2]) + deg2 = 2 + + for (deg, X, P) in [(deg1, X1, P1), (deg2, X2, P2)]: + P_test = PolynomialFeatures(deg, include_bias=True).fit_transform(X) + assert_array_almost_equal(P_test, P) + + P_test = PolynomialFeatures(deg, include_bias=False).fit_transform(X) + assert_array_almost_equal(P_test, P[:, 1:]) + + interact = PolynomialFeatures(2, interaction_only=True, include_bias=True) + X_poly = 
interact.fit_transform(X) + assert_array_almost_equal(X_poly, P2[:, [0, 1, 2, 4]]) + + assert interact.powers_.shape == (interact.n_output_features_, + interact.n_input_features_) + + +def test_polynomial_feature_names(): + X = np.arange(30).reshape(10, 3) + poly = PolynomialFeatures(degree=2, include_bias=True).fit(X) + feature_names = poly.get_feature_names() + assert_array_equal(['1', 'x0', 'x1', 'x2', 'x0^2', 'x0 x1', + 'x0 x2', 'x1^2', 'x1 x2', 'x2^2'], + feature_names) + + poly = PolynomialFeatures(degree=3, include_bias=False).fit(X) + feature_names = poly.get_feature_names(["a", "b", "c"]) + assert_array_equal(['a', 'b', 'c', 'a^2', 'a b', 'a c', 'b^2', + 'b c', 'c^2', 'a^3', 'a^2 b', 'a^2 c', + 'a b^2', 'a b c', 'a c^2', 'b^3', 'b^2 c', + 'b c^2', 'c^3'], feature_names) + # test some unicode + poly = PolynomialFeatures(degree=1, include_bias=True).fit(X) + feature_names = poly.get_feature_names( + ["\u0001F40D", "\u262E", "\u05D0"]) + assert_array_equal(["1", "\u0001F40D", "\u262E", "\u05D0"], + feature_names) + + +@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], + [(1, True, False, int), + (2, True, False, int), + (2, True, False, np.float32), + (2, True, False, np.float64), + (3, False, False, np.float64), + (3, False, True, np.float64), + (4, False, False, np.float64), + (4, False, True, np.float64)]) +def test_polynomial_features_csc_X(deg, include_bias, interaction_only, dtype): + rng = np.random.RandomState(0) + X = rng.randint(0, 2, (100, 2)) + X_csc = sparse.csc_matrix(X) + + est = PolynomialFeatures(deg, include_bias=include_bias, + interaction_only=interaction_only) + Xt_csc = est.fit_transform(X_csc.astype(dtype)) + Xt_dense = est.fit_transform(X.astype(dtype)) + + assert isinstance(Xt_csc, sparse.csc_matrix) + assert Xt_csc.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csc.A, Xt_dense) + + +@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], + [(1, True, False, int), + (2, True, False, int), + (2, True, False, np.float32), + (2, True, False, np.float64), + (3, False, False, np.float64), + (3, False, True, np.float64)]) +def test_polynomial_features_csr_X(deg, include_bias, interaction_only, dtype): + rng = np.random.RandomState(0) + X = rng.randint(0, 2, (100, 2)) + X_csr = sparse.csr_matrix(X) + + est = PolynomialFeatures(deg, include_bias=include_bias, + interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr.astype(dtype)) + Xt_dense = est.fit_transform(X.astype(dtype, copy=False)) + + assert isinstance(Xt_csr, sparse.csr_matrix) + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.A, Xt_dense) + + +@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], + [(2, True, False, np.float32), + (2, True, False, np.float64), + (3, False, False, np.float64), + (3, False, True, np.float64)]) +def test_polynomial_features_csr_X_floats(deg, include_bias, + interaction_only, dtype): + X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr() + X = X_csr.toarray() + + est = PolynomialFeatures(deg, include_bias=include_bias, + interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr.astype(dtype)) + Xt_dense = est.fit_transform(X.astype(dtype)) + + assert isinstance(Xt_csr, sparse.csr_matrix) + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.A, Xt_dense) + + +@pytest.mark.parametrize(['zero_row_index', 'deg', 'interaction_only'], + [(0, 2, True), (1, 2, True), (2, 2, True), + (0, 3, True), (1, 3, True), (2, 3, True), 
+ (0, 2, False), (1, 2, False), (2, 2, False), + (0, 3, False), (1, 3, False), (2, 3, False)]) +def test_polynomial_features_csr_X_zero_row(zero_row_index, deg, + interaction_only): + X_csr = sparse_random(3, 10, 1.0, random_state=0).tocsr() + X_csr[zero_row_index, :] = 0.0 + X = X_csr.toarray() + + est = PolynomialFeatures(deg, include_bias=False, + interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr) + Xt_dense = est.fit_transform(X) + + assert isinstance(Xt_csr, sparse.csr_matrix) + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.A, Xt_dense) + + +# This degree should always be one more than the highest degree supported by +# _csr_expansion. +@pytest.mark.parametrize(['include_bias', 'interaction_only'], + [(True, True), (True, False), + (False, True), (False, False)]) +def test_polynomial_features_csr_X_degree_4(include_bias, interaction_only): + X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr() + X = X_csr.toarray() + + est = PolynomialFeatures(4, include_bias=include_bias, + interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr) + Xt_dense = est.fit_transform(X) + + assert isinstance(Xt_csr, sparse.csr_matrix) + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.A, Xt_dense) + + +@pytest.mark.parametrize(['deg', 'dim', 'interaction_only'], + [(2, 1, True), + (2, 2, True), + (3, 1, True), + (3, 2, True), + (3, 3, True), + (2, 1, False), + (2, 2, False), + (3, 1, False), + (3, 2, False), + (3, 3, False)]) +def test_polynomial_features_csr_X_dim_edges(deg, dim, interaction_only): + X_csr = sparse_random(1000, dim, 0.5, random_state=0).tocsr() + X = X_csr.toarray() + + est = PolynomialFeatures(deg, interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr) + Xt_dense = est.fit_transform(X) + + assert isinstance(Xt_csr, sparse.csr_matrix) + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.A, Xt_dense) From 0d7d46f3bef0a2f943ee321f0f979ced165e0477 Mon Sep 17 00:00:00 2001 From: Mathieu Blondel Date: Thu, 18 Mar 2021 18:42:47 +0100 Subject: [PATCH 256/478] Fix typo in elastic net docstring. (#19711) * Fix typo in elastic net docstring. * Use norms more explicitly. --- sklearn/linear_model/_coordinate_descent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 4fdeb783db194..6a23fedd9902e 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -570,7 +570,7 @@ class ElasticNet(MultiOutputMixin, RegressorMixin, LinearModel): If you are interested in controlling the L1 and L2 penalty separately, keep in mind that this is equivalent to:: - a * L1 + b * L2 + a * ||w||_1 + 0.5 * b * ||w||_2^2 where:: From bf4049cbef568fa211ec155cb724001fff742dbd Mon Sep 17 00:00:00 2001 From: flyingdutchman23 Date: Fri, 19 Mar 2021 14:51:00 +0100 Subject: [PATCH 257/478] DOC Correct scorer documentation (#19720) Co-authored-by: Joris Clement --- sklearn/metrics/_scorer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index c686d3b7c0b34..8a814242cb6f1 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -215,7 +215,7 @@ def _score(self, method_caller, estimator, X, y_true, sample_weight=None): arguments, potentially caching results. estimator : object - Trained estimator to use for scoring. 
Must have a predict_proba + Trained estimator to use for scoring. Must have a `predict` method; the output of that is used to compute the score. X : {array-like, sparse matrix} @@ -254,7 +254,7 @@ def _score(self, method_caller, clf, X, y, sample_weight=None): arguments, potentially caching results. clf : object - Trained classifier to use for scoring. Must have a predict_proba + Trained classifier to use for scoring. Must have a `predict_proba` method; the output of that is used to compute the score. X : {array-like, sparse matrix} From b9d6db81ec2e75ec40404db49f97999a08f00c55 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 19 Mar 2021 15:21:34 +0100 Subject: [PATCH 258/478] [MRG] ENH Consistent loss name for squared error (#19310) --- benchmarks/bench_hist_gradient_boosting.py | 2 +- .../bench_hist_gradient_boosting_threading.py | 2 +- doc/modules/ensemble.rst | 15 ++-- doc/modules/sgd.rst | 8 +- doc/whats_new/v1.0.rst | 31 ++++++++ .../plot_model_complexity_influence.py | 2 +- .../plot_gradient_boosting_quantile.py | 24 +++--- .../plot_gradient_boosting_regression.py | 2 +- sklearn/ensemble/_base.py | 10 +++ sklearn/ensemble/_forest.py | 41 +++++++--- sklearn/ensemble/_gb.py | 79 +++++++++++++------ sklearn/ensemble/_gb_losses.py | 2 + .../gradient_boosting.py | 23 ++++-- .../ensemble/_hist_gradient_boosting/loss.py | 2 +- .../tests/test_gradient_boosting.py | 20 ++++- .../tests/test_loss.py | 18 ++--- .../_hist_gradient_boosting/utils.pyx | 6 +- sklearn/ensemble/tests/test_forest.py | 18 ++++- .../ensemble/tests/test_gradient_boosting.py | 37 ++++++++- .../tests/test_partial_dependence.py | 3 +- sklearn/linear_model/_ransac.py | 27 ++++--- sklearn/linear_model/_stochastic_gradient.py | 34 ++++++-- sklearn/linear_model/tests/test_ransac.py | 15 +++- sklearn/linear_model/tests/test_sgd.py | 45 ++++++++--- sklearn/neural_network/_base.py | 2 +- .../neural_network/_multilayer_perceptron.py | 4 +- sklearn/tree/_classes.py | 50 ++++++++---- sklearn/tree/_export.py | 3 + sklearn/tree/tests/test_export.py | 42 +++++----- sklearn/tree/tests/test_tree.py | 40 +++++++--- 30 files changed, 444 insertions(+), 163 deletions(-) diff --git a/benchmarks/bench_hist_gradient_boosting.py b/benchmarks/bench_hist_gradient_boosting.py index 158b6fbb22d2b..82eb64faeb462 100644 --- a/benchmarks/bench_hist_gradient_boosting.py +++ b/benchmarks/bench_hist_gradient_boosting.py @@ -110,7 +110,7 @@ def one_run(n_samples): else: # regression if loss == 'default': - loss = 'least_squares' + loss = 'squared_error' est.set_params(loss=loss) est.fit(X_train, y_train, sample_weight=sample_weight_train) sklearn_fit_duration = time() - tic diff --git a/benchmarks/bench_hist_gradient_boosting_threading.py b/benchmarks/bench_hist_gradient_boosting_threading.py index 3cc6afa3871c6..61803fb5cb9cc 100644 --- a/benchmarks/bench_hist_gradient_boosting_threading.py +++ b/benchmarks/bench_hist_gradient_boosting_threading.py @@ -112,7 +112,7 @@ def get_estimator_and_data(): else: # regression if loss == 'default': - loss = 'least_squares' + loss = 'squared_error' sklearn_est.set_params(loss=loss) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 0e0aaaafaffba..c891b4d275b9a 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -537,7 +537,8 @@ Regression :class:`GradientBoostingRegressor` supports a number of :ref:`different loss functions ` for regression which can be specified via the argument -``loss``; the default loss function for regression is least squares (``'ls'``). 
+``loss``; the default loss function for regression is squared error +(``'squared_error'``). :: @@ -549,8 +550,10 @@ for regression which can be specified via the argument >>> X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0) >>> X_train, X_test = X[:200], X[200:] >>> y_train, y_test = y[:200], y[200:] - >>> est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, - ... max_depth=1, random_state=0, loss='ls').fit(X_train, y_train) + >>> est = GradientBoostingRegressor( + ... n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0, + ... loss='squared_error' + ... ).fit(X_train, y_train) >>> mean_squared_error(y_test, est.predict(X_test)) 5.00... @@ -741,8 +744,8 @@ the parameter ``loss``: * Regression - * Least squares (``'ls'``): The natural choice for regression due - to its superior computational properties. The initial model is + * Squared error (``'squared_error'``): The natural choice for regression + due to its superior computational properties. The initial model is given by the mean of the target values. * Least absolute deviation (``'lad'``): A robust loss function for regression. The initial model is given by the median of the @@ -950,7 +953,7 @@ controls the number of iterations of the boosting process:: >>> clf.score(X_test, y_test) 0.8965 -Available losses for regression are 'least_squares', +Available losses for regression are 'squared_error', 'least_absolute_deviation', which is less sensitive to outliers, and 'poisson', which is well suited to model counts and frequencies. For classification, 'binary_crossentropy' is used for binary classification and diff --git a/doc/modules/sgd.rst b/doc/modules/sgd.rst index 95a5111747509..1376947540e78 100644 --- a/doc/modules/sgd.rst +++ b/doc/modules/sgd.rst @@ -30,7 +30,7 @@ For example, using `SGDClassifier(loss='log')` results in logistic regression, i.e. a model equivalent to :class:`~sklearn.linear_model.LogisticRegression` which is fitted via SGD instead of being fitted by one of the other solvers in :class:`~sklearn.linear_model.LogisticRegression`. Similarly, -`SGDRegressor(loss='squared_loss', penalty='l2')` and +`SGDRegressor(loss='squared_error', penalty='l2')` and :class:`~sklearn.linear_model.Ridge` solve the same optimization problem, via different means. @@ -211,7 +211,7 @@ samples (> 10.000), for other problems we recommend :class:`Ridge`, The concrete loss function can be set via the ``loss`` parameter. :class:`SGDRegressor` supports the following loss functions: - * ``loss="squared_loss"``: Ordinary least squares, + * ``loss="squared_error"``: Ordinary least squares, * ``loss="huber"``: Huber loss for robust regression, * ``loss="epsilon_insensitive"``: linear Support Vector Regression. @@ -362,9 +362,9 @@ Different choices for :math:`L` entail different classifiers or regressors: - Hinge (soft-margin): equivalent to Support Vector Classification. :math:`L(y_i, f(x_i)) = \max(0, 1 - y_i f(x_i))`. -- Perceptron: +- Perceptron: :math:`L(y_i, f(x_i)) = \max(0, - y_i f(x_i))`. -- Modified Huber: +- Modified Huber: :math:`L(y_i, f(x_i)) = \max(0, 1 - y_i f(x_i))^2` if :math:`y_i f(x_i) > 1`, and :math:`L(y_i, f(x_i)) = -4 y_i f(x_i)` otherwise. - Log: equivalent to Logistic Regression. diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index c7b786ea6d1bf..b4ee0c57b97fc 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -45,6 +45,37 @@ Changelog :pr:`123456` by :user:`Joe Bloggs `. where 123456 is the *pull request* number, not the issue number. 
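A minimal sketch of the new spellings introduced for the squared error loss; the old names are still valid and produce the same models, but are deprecated (see the entries below)::

    >>> from sklearn.linear_model import SGDRegressor
    >>> from sklearn.ensemble import RandomForestRegressor
    >>> reg = SGDRegressor(loss="squared_error", penalty="l2")     # was loss="squared_loss"
    >>> forest = RandomForestRegressor(criterion="squared_error")  # was criterion="mse"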
+- |API| The option for using the squared error via ``loss`` and + ``criterion`` parameters was made more consistent. The preferred way is by + setting the value to `"squared_error"`. Old option names are still valid, + produce the same models, but are deprecated and will be removed in version + 1.2. + :pr:`19310` by :user:`Christian Lorentzen `. + + - For :class:`ensemble.ExtraTreesRegressor`, `criterion="mse"` is deprecated, + use `"squared_error"` instead which is now the default. + + - For :class:`ensemble.GradientBoostingRegressor`, `loss="ls"` is deprecated, + use `"squared_error"` instead which is now the default. + + - For :class:`ensemble.RandomForestRegressor`, `criterion="mse"` is deprecated, + use `"squared_error"` instead which is now the default. + + - For :class:`ensemble.HistGradientBoostingRegressor`, `loss="least_squares"` + is deprecated, use `"squared_error"` instead which is now the default. + + - For :class:`linear_model.RANSACRegressor`, `loss="squared_loss"` is + deprecated, use `"squared_error"` instead. + + - For :class:`linear_model.SGDRegressor`, `loss="squared_loss"` is + deprecated, use `"squared_error"` instead which is now the default. + + - For :class:`tree.DecisionTreeRegressor`, `criterion="mse"` is deprecated, + use `"squared_error"` instead which is now the default. + + - For :class:`tree.ExtraTreeRegressor`, `criterion="mse"` is deprecated, + use `"squared_error"` instead which is now the default. + :mod:`sklearn.cluster` ...................... diff --git a/examples/applications/plot_model_complexity_influence.py b/examples/applications/plot_model_complexity_influence.py index 927fcd8e85e15..5748a546bdaad 100644 --- a/examples/applications/plot_model_complexity_influence.py +++ b/examples/applications/plot_model_complexity_influence.py @@ -177,7 +177,7 @@ def _count_nonzero_coefficients(estimator): 'prediction_performance_label': 'MSE', 'n_samples': 30}, {'estimator': GradientBoostingRegressor, - 'tuned_params': {'loss': 'ls'}, + 'tuned_params': {'loss': 'squared_error'}, 'changing_param': 'n_estimators', 'changing_param_values': [10, 50, 100, 200, 500], 'complexity_label': 'n_trees', diff --git a/examples/ensemble/plot_gradient_boosting_quantile.py b/examples/ensemble/plot_gradient_boosting_quantile.py index f29a87fe6cff7..00be70721c1da 100644 --- a/examples/ensemble/plot_gradient_boosting_quantile.py +++ b/examples/ensemble/plot_gradient_boosting_quantile.py @@ -71,10 +71,10 @@ def f(x): all_models["q %1.2f" % alpha] = gbr.fit(X_train, y_train) # %% -# For the sake of comparison, also fit a baseline model trained with the usual -# least squares loss (ls), also known as the mean squared error (MSE). -gbr_ls = GradientBoostingRegressor(loss='ls', **common_params) -all_models["ls"] = gbr_ls.fit(X_train, y_train) +# For the sake of comparison, we also fit a baseline model trained with the +# usual (mean) squared error (MSE). +gbr_ls = GradientBoostingRegressor(loss='squared_error', **common_params) +all_models["mse"] = gbr_ls.fit(X_train, y_train) # %% # Create an evenly spaced evaluation set of input values spanning the [0, 10] @@ -82,13 +82,13 @@ def f(x): xx = np.atleast_2d(np.linspace(0, 10, 1000)).T # %% -# Plot the true conditional mean function f, the prediction of the conditional -# mean (least squares loss), the conditional median and the conditional 90% -# interval (from 5th to 95th conditional percentiles). 
+# Plot the true conditional mean function f, the predictions of the conditional +# mean (loss equals squared error), the conditional median and the conditional +# 90% interval (from 5th to 95th conditional percentiles). import matplotlib.pyplot as plt -y_pred = all_models['ls'].predict(xx) +y_pred = all_models['mse'].predict(xx) y_lower = all_models['q 0.05'].predict(xx) y_upper = all_models['q 0.95'].predict(xx) y_med = all_models['q 0.50'].predict(xx) @@ -153,7 +153,7 @@ def highlight_min(x): # # Note that because the target distribution is asymmetric, the expected # conditional mean and conditional median are signficiantly different and -# therefore one could not use the least squares model get a good estimation of +# therefore one could not use the squared error model get a good estimation of # the conditional median nor the converse. # # If the target distribution were symmetric and had no outliers (e.g. with a @@ -179,9 +179,9 @@ def highlight_min(x): # shows that the best test metric is obtained when the model is trained by # minimizing this same metric. # -# Note that the conditional median estimator is competitive with the least -# squares estimator in terms of MSE on the test set: this can be explained by -# the fact the least squares estimator is very sensitive to large outliers +# Note that the conditional median estimator is competitive with the squared +# error estimator in terms of MSE on the test set: this can be explained by +# the fact the squared error estimator is very sensitive to large outliers # which can cause significant overfitting. This can be seen on the right hand # side of the previous plot. The conditional median estimator is biased # (underestimation for this asymetric noise) but is also naturally robust to diff --git a/examples/ensemble/plot_gradient_boosting_regression.py b/examples/ensemble/plot_gradient_boosting_regression.py index 92d35b26deb9c..3722f4bf2066f 100644 --- a/examples/ensemble/plot_gradient_boosting_regression.py +++ b/examples/ensemble/plot_gradient_boosting_regression.py @@ -67,7 +67,7 @@ 'max_depth': 4, 'min_samples_split': 5, 'learning_rate': 0.01, - 'loss': 'ls'} + 'loss': 'squared_error'} # %% # Fit regression model diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 93891a2b719ab..095d801de166d 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -15,6 +15,7 @@ from ..base import is_classifier, is_regressor from ..base import BaseEstimator from ..base import MetaEstimatorMixin +from ..tree import DecisionTreeRegressor, ExtraTreeRegressor from ..utils import Bunch, _print_elapsed_time from ..utils import check_random_state from ..utils.metaestimators import _BaseComposition @@ -151,6 +152,15 @@ def _make_estimator(self, append=True, random_state=None): estimator.set_params(**{p: getattr(self, p) for p in self.estimator_params}) + # TODO: Remove in v1.2 + # criterion "mse" would cause warnings in every call to + # DecisionTreeRegressor.fit(..) 
+ if ( + isinstance(estimator, (DecisionTreeRegressor, ExtraTreeRegressor)) + and getattr(estimator, "criterion", None) == "mse" + ): + estimator.set_params(criterion="squared_error") + if random_state is not None: _set_random_states(estimator, random_state) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index a93e9b7ee877e..140c1c93e8eef 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -345,6 +345,17 @@ def fit(self, X, y, sample_weight=None): # Check parameters self._validate_estimator() + # TODO: Remove in v1.2 + if ( + isinstance(self, (RandomForestRegressor, ExtraTreesRegressor)) + and self.criterion == "mse" + ): + warn( + "Criterion 'mse' was deprecated in v1.0 and will be " + "removed in version 1.2. Use `criterion='squared_error'` " + "which is equivalent.", + FutureWarning + ) if not self.bootstrap and self.oob_score: raise ValueError("Out of bag estimation only available" @@ -1310,15 +1321,19 @@ class RandomForestRegressor(ForestRegressor): The default value of ``n_estimators`` changed from 10 to 100 in 0.22. - criterion : {"mse", "mae"}, default="mse" + criterion : {"squared_error", "mse", "mae"}, default="squared_error" The function to measure the quality of a split. Supported criteria - are "mse" for the mean squared error, which is equal to variance - reduction as feature selection criterion, and "mae" for the mean - absolute error. + are "squared_error" for the mean squared error, which is equal to + variance reduction as feature selection criterion, and "mae" for the + mean absolute error. .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion. + .. deprecated:: 1.0 + Criterion "mse" was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion="squared_error"` which is equivalent. + max_depth : int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than @@ -1537,7 +1552,7 @@ class RandomForestRegressor(ForestRegressor): @_deprecate_positional_args def __init__(self, n_estimators=100, *, - criterion="mse", + criterion="squared_error", max_depth=None, min_samples_split=2, min_samples_leaf=1, @@ -1921,15 +1936,19 @@ class ExtraTreesRegressor(ForestRegressor): The default value of ``n_estimators`` changed from 10 to 100 in 0.22. - criterion : {"mse", "mae"}, default="mse" + criterion : {"squared_error", "mse", "mae"}, default="squared_error" The function to measure the quality of a split. Supported criteria - are "mse" for the mean squared error, which is equal to variance - reduction as feature selection criterion, and "mae" for the mean - absolute error. + are "squared_error" and "mse" for the mean squared error, which is + equal to variance reduction as feature selection criterion, and "mae" + for the mean absolute error. .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion. + .. deprecated:: 1.0 + Criterion "mse" was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion="squared_error"` which is equivalent. + max_depth : int, default=None The maximum depth of the tree. 
If None, then nodes are expanded until all leaves are pure or until all leaves contain less than @@ -2141,7 +2160,7 @@ class ExtraTreesRegressor(ForestRegressor): @_deprecate_positional_args def __init__(self, n_estimators=100, *, - criterion="mse", + criterion="squared_error", max_depth=None, min_samples_split=2, min_samples_leaf=1, @@ -2353,7 +2372,7 @@ class RandomTreesEmbedding(BaseForest): [0., 1., 1., 0., 1., 0., 0., 1., 1., 0.]]) """ - criterion = 'mse' + criterion = "squared_error" max_features = 1 @_deprecate_positional_args diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index e9f7402188860..4984575bce8c3 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -238,6 +238,12 @@ def _check_params(self): or self.loss not in _gb_losses.LOSS_FUNCTIONS): raise ValueError("Loss '{0:s}' not supported. ".format(self.loss)) + if self.loss == "ls": + warnings.warn("The loss 'ls' was deprecated in v1.0 and " + "will be removed in version 1.2. Use 'squared_error'" + " which is equivalent.", + FutureWarning) + if self.loss == 'deviance': loss_class = (_gb_losses.MultinomialDeviance if len(self.classes_) > 2 @@ -401,6 +407,15 @@ def fit(self, X, y, sample_weight=None, monitor=None): # TODO: This should raise an error from 1.1 self._warn_mae_for_criterion() + if self.criterion == 'mse': + # TODO: Remove in v1.2. By then it should raise an error. + warnings.warn( + "Criterion 'mse' was deprecated in v1.0 and will be " + "removed in version 1.2. Use `criterion='squared_error'` " + "which is equivalent.", + FutureWarning + ) + # if not warmstart - clear the estimator state if not self.warm_start: self._clear_state() @@ -808,20 +823,26 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): Choosing `subsample < 1.0` leads to a reduction of variance and an increase in bias. - criterion : {'friedman_mse', 'mse', 'mae'}, default='friedman_mse' + criterion : {'friedman_mse', 'squared_error', 'mse', 'mae'}, \ + default='friedman_mse' The function to measure the quality of a split. Supported criteria are 'friedman_mse' for the mean squared error with improvement - score by Friedman, 'mse' for mean squared error, and 'mae' for - the mean absolute error. The default value of 'friedman_mse' is - generally the best as it can provide a better approximation in - some cases. + score by Friedman, 'squared_error' for mean squared error, and 'mae' + for the mean absolute error. The default value of 'friedman_mse' is + generally the best as it can provide a better approximation in some + cases. .. versionadded:: 0.18 + .. deprecated:: 0.24 `criterion='mae'` is deprecated and will be removed in version - 1.1 (renaming of 0.26). Use `criterion='friedman_mse'` or `'mse'` - instead, as trees should use a least-square criterion in - Gradient Boosting. + 1.1 (renaming of 0.26). Use `criterion='friedman_mse'` or + `'squared_error'` instead, as trees should use a squared error + criterion in Gradient Boosting. + + .. deprecated:: 1.0 + Criterion 'mse' was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion='squared_error'` which is equivalent. min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: @@ -1128,9 +1149,9 @@ def _warn_mae_for_criterion(self): # TODO: This should raise an error from 1.1 warnings.warn("criterion='mae' was deprecated in version 0.24 and " "will be removed in version 1.1 (renaming of 0.26). 
Use " - "criterion='friedman_mse' or 'mse' instead, as trees " - "should use a least-square criterion in Gradient " - "Boosting.", FutureWarning) + "criterion='friedman_mse' or 'squared_error' instead, as" + " trees should use a squared error criterion in Gradient" + " Boosting.", FutureWarning) def decision_function(self, X): """Compute the decision function of ``X``. @@ -1319,13 +1340,19 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): Parameters ---------- - loss : {'ls', 'lad', 'huber', 'quantile'}, default='ls' - Loss function to be optimized. 'ls' refers to least squares - regression. 'lad' (least absolute deviation) is a highly robust + loss : {'squared_error', 'ls', 'lad', 'huber', 'quantile'}, \ + default='squared_error' + Loss function to be optimized. 'squared_error' refers to the squared + error for regression. + 'lad' (least absolute deviation) is a highly robust loss function solely based on order information of the input variables. 'huber' is a combination of the two. 'quantile' allows quantile regression (use `alpha` to specify the quantile). + .. deprecated:: 1.0 + The loss 'ls' was deprecated in v1.0 and will be removed in + version 1.2. Use `loss='squared_error'` which is equivalent. + learning_rate : float, default=0.1 Learning rate shrinks the contribution of each tree by `learning_rate`. There is a trade-off between learning_rate and n_estimators. @@ -1342,20 +1369,26 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): Choosing `subsample < 1.0` leads to a reduction of variance and an increase in bias. - criterion : {'friedman_mse', 'mse', 'mae'}, default='friedman_mse' + criterion : {'friedman_mse', 'squared_error', 'mse', 'mae'}, \ + default='friedman_mse' The function to measure the quality of a split. Supported criteria are "friedman_mse" for the mean squared error with improvement - score by Friedman, "mse" for mean squared error, and "mae" for - the mean absolute error. The default value of "friedman_mse" is - generally the best as it can provide a better approximation in - some cases. + score by Friedman, "squared_error" for mean squared error, and "mae" + for the mean absolute error. The default value of "friedman_mse" is + generally the best as it can provide a better approximation in some + cases. .. versionadded:: 0.18 + .. deprecated:: 0.24 `criterion='mae'` is deprecated and will be removed in version 1.1 (renaming of 0.26). The correct way of minimizing the absolute error is to use `loss='lad'` instead. + .. deprecated:: 1.0 + Criterion 'mse' was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion='squared_error'` which is equivalent. + min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: @@ -1427,7 +1460,7 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): ``init`` has to provide :term:`fit` and :term:`predict`. If 'zero', the initial raw predictions are set to zero. By default a ``DummyEstimator`` is used, predicting either the average target value - (for loss='ls'), or a quantile for the other losses. + (for loss='squared_error'), or a quantile for the other losses. random_state : int, RandomState instance or None, default=None Controls the random seed given to each Tree estimator at each @@ -1610,10 +1643,12 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): Elements of Statistical Learning Ed. 2, Springer, 2009. 
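A minimal sketch of the renamed default loss for this estimator; fitting with the old spelling ``'ls'`` still works but raises a FutureWarning, as exercised in the tests further below::

    >>> from sklearn.datasets import make_regression
    >>> from sklearn.ensemble import GradientBoostingRegressor
    >>> X, y = make_regression(n_samples=50, random_state=0)
    >>> est = GradientBoostingRegressor(loss="squared_error").fit(X, y)
    >>> # GradientBoostingRegressor(loss="ls") would also fit, with a FutureWarning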
""" - _SUPPORTED_LOSS = ('ls', 'lad', 'huber', 'quantile') + # TODO: remove "ls" in verion 1.2 + _SUPPORTED_LOSS = ("squared_error", 'ls', 'lad', 'huber', 'quantile') @_deprecate_positional_args - def __init__(self, *, loss='ls', learning_rate=0.1, n_estimators=100, + def __init__(self, *, loss="squared_error", learning_rate=0.1, + n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_depth=3, min_impurity_decrease=0., diff --git a/sklearn/ensemble/_gb_losses.py b/sklearn/ensemble/_gb_losses.py index 82b496ae8109d..f33c7086b596b 100644 --- a/sklearn/ensemble/_gb_losses.py +++ b/sklearn/ensemble/_gb_losses.py @@ -856,7 +856,9 @@ def get_init_raw_predictions(self, X, estimator): return raw_predictions.reshape(-1, 1).astype(np.float64) +# TODO: Remove entry 'ls' in version 1.2. LOSS_FUNCTIONS = { + "squared_error": LeastSquaresError, 'ls': LeastSquaresError, 'lad': LeastAbsoluteError, 'huber': HuberLossFunction, diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 4fff6030b0d5a..c35f79bd79251 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -3,6 +3,7 @@ from abc import ABC, abstractmethod from functools import partial +import warnings import numpy as np from timeit import default_timer as time @@ -903,8 +904,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): Parameters ---------- - loss : {'least_squares', 'least_absolute_deviation', 'poisson'}, \ - default='least_squares' + loss : {'squared_error', 'least_squares', 'least_absolute_deviation', \ + 'poisson'}, default='squared_error' The loss function to use in the boosting process. Note that the "least squares" and "poisson" losses actually implement "half least squares loss" and "half poisson deviance" to simplify the @@ -914,6 +915,10 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): .. versionchanged:: 0.23 Added option 'poisson'. + .. deprecated:: 1.0 + The loss 'least_squares' was deprecated in v1.0 and will be removed + in version 1.2. Use `loss='squared_error'` which is equivalent. + learning_rate : float, default=0.1 The learning rate, also known as *shrinkage*. This is used as a multiplicative factor for the leaves values. Use ``1`` for no @@ -1045,11 +1050,11 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): 0.92... """ - _VALID_LOSSES = ('least_squares', 'least_absolute_deviation', - 'poisson') + _VALID_LOSSES = ('squared_error', 'least_squares', + 'least_absolute_deviation', 'poisson') @_deprecate_positional_args - def __init__(self, loss='least_squares', *, learning_rate=0.1, + def __init__(self, loss='squared_error', *, learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0., max_bins=255, categorical_features=None, monotonic_cst=None, @@ -1121,6 +1126,14 @@ def _encode_y(self, y): return y def _get_loss(self, sample_weight): + if self.loss == "least_squares": + warnings.warn( + "The loss 'least_squares' was deprecated in v1.0 and will be " + "removed in version 1.2. 
Use 'squared_error' which is " + "equivalent.", + FutureWarning) + return _LOSSES["squared_error"](sample_weight=sample_weight) + return _LOSSES[self.loss](sample_weight=sample_weight) diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index 4bbf59dc01088..c336bd347e4cf 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -419,7 +419,7 @@ def predict_proba(self, raw_predictions): _LOSSES = { - 'least_squares': LeastSquares, + 'squared_error': LeastSquares, 'least_absolute_deviation': LeastAbsoluteDeviation, 'binary_crossentropy': BinaryCrossEntropy, 'categorical_crossentropy': CategoricalCrossEntropy, diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 0e5d1e91c3dd0..265b4cf20f8f3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -240,7 +240,7 @@ def test_poisson(): X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test, random_state=rng) gbdt_pois = HistGradientBoostingRegressor(loss='poisson', random_state=rng) - gbdt_ls = HistGradientBoostingRegressor(loss='least_squares', + gbdt_ls = HistGradientBoostingRegressor(loss='squared_error', random_state=rng) gbdt_pois.fit(X_train, y_train) gbdt_ls.fit(X_train, y_train) @@ -248,7 +248,7 @@ def test_poisson(): for X, y in [(X_train, y_train), (X_test, y_test)]: metric_pois = mean_poisson_deviance(y, gbdt_pois.predict(X)) - # least_squares might produce non-positive predictions => clip + # squared_error might produce non-positive predictions => clip metric_ls = mean_poisson_deviance(y, np.clip(gbdt_ls.predict(X), 1e-15, None)) metric_dummy = mean_poisson_deviance(y, dummy.predict(X)) @@ -652,7 +652,7 @@ def test_sample_weight_effect(problem, duplication): est_dup._raw_predict(X_dup)) -@pytest.mark.parametrize('loss_name', ('least_squares', +@pytest.mark.parametrize('loss_name', ('squared_error', 'least_absolute_deviation')) def test_sum_hessians_are_sample_weight(loss_name): # For losses with constant hessians, the sum_hessians field of the @@ -992,3 +992,17 @@ def test_uint8_predict(Est): est = Est() est.fit(X, y) est.predict(X) + + +# TODO: Remove in v1.2 +def test_loss_least_squares_deprecated(): + X, y = make_regression(n_samples=50, random_state=0) + est1 = HistGradientBoostingRegressor(loss="least_squares", random_state=0) + + with pytest.warns(FutureWarning, + match="The loss 'least_squares' was deprecated"): + est1.fit(X, y) + + est2 = HistGradientBoostingRegressor(loss="squared_error", random_state=0) + est2.fit(X, y) + assert_allclose(est1.predict(X), est2.predict(X)) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 221b94183a7ff..ce7b4acedbae5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -47,9 +47,9 @@ def get_hessians(y_true, raw_predictions): @pytest.mark.parametrize('loss, x0, y_true', [ - ('least_squares', -2., 42), - ('least_squares', 117., 1.05), - ('least_squares', 0., 0.), + ("squared_error", -2., 42), + ("squared_error", 117., 1.05), + ("squared_error", 0., 0.), # The argmin of binary_crossentropy for y_true=0 and y_true=1 is resp. -inf # and +inf due to logit, cf. 
"complete separation". Therefore, we use # 0 < y_true < 1. @@ -102,7 +102,7 @@ def fprime2(x: np.ndarray) -> np.ndarray: @pytest.mark.parametrize('loss, n_classes, prediction_dim', [ - ('least_squares', 0, 1), + ("squared_error", 0, 1), ('least_absolute_deviation', 0, 1), ('binary_crossentropy', 2, 1), ('categorical_crossentropy', 3, 3), @@ -118,7 +118,7 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): rng = np.random.RandomState(seed) n_samples = 100 - if loss in ('least_squares', 'least_absolute_deviation'): + if loss in ("squared_error", 'least_absolute_deviation'): y_true = rng.normal(size=n_samples).astype(Y_DTYPE) elif loss in ('poisson'): y_true = rng.poisson(size=n_samples).astype(Y_DTYPE) @@ -161,7 +161,7 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): def test_baseline_least_squares(): rng = np.random.RandomState(0) - loss = _LOSSES['least_squares'](sample_weight=None) + loss = _LOSSES["squared_error"](sample_weight=None) y_train = rng.normal(size=100) baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) assert baseline_prediction.shape == tuple() # scalar @@ -255,7 +255,7 @@ def test_baseline_categorical_crossentropy(): @pytest.mark.parametrize('loss, problem', [ - ('least_squares', 'regression'), + ("squared_error", 'regression'), ('least_absolute_deviation', 'regression'), ('binary_crossentropy', 'classification'), ('categorical_crossentropy', 'classification'), @@ -317,7 +317,7 @@ def test_init_gradient_and_hessians_sample_weight(): prediction_dim = 2 n_samples = 5 sample_weight = None - loss = _LOSSES['least_squares'](sample_weight=sample_weight) + loss = _LOSSES["squared_error"](sample_weight=sample_weight) _, hessians = loss.init_gradients_and_hessians( n_samples=n_samples, prediction_dim=prediction_dim, sample_weight=None) @@ -325,7 +325,7 @@ def test_init_gradient_and_hessians_sample_weight(): assert hessians.shape == (1, 1) sample_weight = np.ones(n_samples) - loss = _LOSSES['least_squares'](sample_weight=sample_weight) + loss = _LOSSES["squared_error"](sample_weight=sample_weight) _, hessians = loss.init_gradients_and_hessians( n_samples=n_samples, prediction_dim=prediction_dim, sample_weight=sample_weight) diff --git a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx index cf2c5a51c90dd..d1168acf94835 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx @@ -42,7 +42,7 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): raise NotImplementedError('Early stopping should be deactivated.') lightgbm_loss_mapping = { - 'least_squares': 'regression_l2', + 'squared_error': 'regression_l2', 'least_absolute_deviation': 'regression_l1', 'binary_crossentropy': 'binary', 'categorical_crossentropy': 'multiclass' @@ -75,7 +75,7 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): # XGB xgboost_loss_mapping = { - 'least_squares': 'reg:linear', + 'squared_error': 'reg:linear', 'least_absolute_deviation': 'LEAST_ABSOLUTE_DEV_NOT_SUPPORTED', 'binary_crossentropy': 'reg:logistic', 'categorical_crossentropy': 'multi:softmax' @@ -99,7 +99,7 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): # Catboost catboost_loss_mapping = { - 'least_squares': 'RMSE', + 'squared_error': 'RMSE', # catboost does not support MAE when leaf_estimation_method is Newton 'least_absolute_deviation': 'LEAST_ASBOLUTE_DEV_NOT_SUPPORTED', 'binary_crossentropy': 'Logloss', diff --git 
a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index efb1a645842bc..b6c1fea0e2f29 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -25,6 +25,7 @@ import pytest import joblib +from numpy.testing import assert_allclose from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal @@ -175,7 +176,7 @@ def check_regression_criterion(name, criterion): @pytest.mark.parametrize('name', FOREST_REGRESSORS) -@pytest.mark.parametrize('criterion', ("mse", "mae", "friedman_mse")) +@pytest.mark.parametrize('criterion', ("squared_error", "mae", "friedman_mse")) def test_regression(name, criterion): check_regression_criterion(name, criterion) @@ -260,7 +261,7 @@ def check_importances(name, criterion, dtype, tolerance): itertools.chain(product(FOREST_CLASSIFIERS, ["gini", "entropy"]), product(FOREST_REGRESSORS, - ["mse", "friedman_mse", "mae"]))) + ["squared_error", "friedman_mse", "mae"]))) def test_importances(dtype, name, criterion): tolerance = 0.01 if name in FOREST_REGRESSORS and criterion == "mae": @@ -1496,6 +1497,19 @@ def test_n_features_deprecation(Estimator): est.n_features_ +# TODO: Remove in v1.2 +def test_mse_deprecated(): + est1 = RandomForestRegressor(criterion="mse", random_state=0) + + with pytest.warns(FutureWarning, + match="Criterion 'mse' was deprecated"): + est1.fit(X, y) + + est2 = RandomForestRegressor(criterion="squared_error", random_state=0) + est2.fit(X, y) + assert_allclose(est1.predict(X), est2.predict(X)) + + @pytest.mark.parametrize('Forest', FOREST_REGRESSORS) def test_mse_criterion_object_segfault_smoke_test(Forest): # This is a smoke test to ensure that passing a mutable criterion diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 63d4e668e674f..166d6bdfc5c11 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -3,6 +3,7 @@ """ import warnings import numpy as np +from numpy.testing import assert_allclose from scipy.sparse import csr_matrix from scipy.sparse import csc_matrix @@ -170,7 +171,7 @@ def test_classification_synthetic(loss): assert error_rate < 0.08 -@pytest.mark.parametrize('loss', ('ls', 'lad', 'huber')) +@pytest.mark.parametrize('loss', ('squared_error', 'lad', 'huber')) @pytest.mark.parametrize('subsample', (1.0, 0.5)) def test_regression_dataset(loss, subsample): # Check consistency on regression dataset with least squares @@ -229,7 +230,7 @@ def test_regression_synthetic(): random_state = check_random_state(1) regression_params = {'n_estimators': 100, 'max_depth': 4, 'min_samples_split': 2, 'learning_rate': 0.1, - 'loss': 'ls'} + 'loss': 'squared_error'} # Friedman1 X, y = datasets.make_friedman1(n_samples=1200, @@ -1066,7 +1067,7 @@ def test_non_uniform_weights_toy_edge_case_reg(): y = [0, 0, 1, 0] # ignore the first 2 training samples by setting their weight to 0 sample_weight = [0, 0, 1, 1] - for loss in ('huber', 'ls', 'lad', 'quantile'): + for loss in ('huber', 'squared_error', 'lad', 'quantile'): gb = GradientBoostingRegressor(learning_rate=1.0, n_estimators=2, loss=loss) gb.fit(X, y, sample_weight=sample_weight) @@ -1369,3 +1370,33 @@ def test_n_features_deprecation(Estimator): with pytest.warns(FutureWarning, match="n_features_ was deprecated"): est.n_features_ + + +# TODO: Remove in v1.2 +@pytest.mark.parametrize("Estimator", GRADIENT_BOOSTING_ESTIMATORS) +def 
test_criterion_mse_deprecated(Estimator): + est1 = Estimator(criterion="mse", random_state=0) + + with pytest.warns(FutureWarning, + match="Criterion 'mse' was deprecated"): + est1.fit(X, y) + + est2 = Estimator(criterion="squared_error", random_state=0) + est2.fit(X, y) + if hasattr(est1, "predict_proba"): + assert_allclose(est1.predict_proba(X), est2.predict_proba(X)) + else: + assert_allclose(est1.predict(X), est2.predict(X)) + + +# TODO: Remove in v1.2 +def test_loss_ls_deprecated(): + est1 = GradientBoostingRegressor(loss="ls", random_state=0) + + with pytest.warns(FutureWarning, + match="The loss 'ls' was deprecated"): + est1.fit(X, y) + + est2 = GradientBoostingRegressor(loss="squared_error", random_state=0) + est2.fit(X, y) + assert_allclose(est1.predict(X), est2.predict(X)) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index f7727210148c6..51dd6e53e4304 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -264,7 +264,8 @@ def test_recursion_decision_tree_vs_forest_and_gbdt(seed): equiv_random_state = check_random_state(tree_seed).randint( np.iinfo(np.int32).max) gbdt = GradientBoostingRegressor(n_estimators=1, learning_rate=1, - criterion='mse', max_depth=max_depth, + criterion='squared_error', + max_depth=max_depth, random_state=equiv_random_state) tree = DecisionTreeRegressor(max_depth=max_depth, random_state=equiv_random_state) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index c9246c121c387..2fc8143f432c8 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -138,9 +138,8 @@ class RANSACRegressor(MetaEstimatorMixin, RegressorMixin, the total number of samples. loss : string, callable, default='absolute_loss' - String inputs, "absolute_loss" and "squared_loss" are supported which - find the absolute loss and squared loss per sample - respectively. + String inputs, 'absolute_loss' and 'squared_error' are supported which + find the absolute loss and squared error per sample respectively. If ``loss`` is a callable, then it should be a function that takes two arrays as inputs, the true and predicted value and returns a 1-D @@ -152,6 +151,10 @@ class RANSACRegressor(MetaEstimatorMixin, RegressorMixin, .. versionadded:: 0.18 + .. deprecated:: 1.0 + The loss 'squared_loss' was deprecated in v1.0 and will be removed + in version 1.2. Use `loss='squared_error'` which is equivalent. + random_state : int, RandomState instance, default=None The generator used to initialize the centers. Pass an int for reproducible output across multiple function calls. @@ -203,7 +206,7 @@ class RANSACRegressor(MetaEstimatorMixin, RegressorMixin, .. [1] https://en.wikipedia.org/wiki/RANSAC .. [2] https://www.sri.com/sites/default/files/publications/ransac-publication.pdf .. [3] http://www.bmva.org/bmvc/2009/Papers/Paper355/Paper355.pdf - """ + """ # noqa: E501 @_deprecate_positional_args def __init__(self, base_estimator=None, *, min_samples=None, residual_threshold=None, is_data_valid=None, @@ -296,8 +299,15 @@ def fit(self, X, y, sample_weight=None): else: loss_function = lambda \ y_true, y_pred: np.sum(np.abs(y_true - y_pred), axis=1) - - elif self.loss == "squared_loss": + # TODO: Remove squared_loss in v1.2. 
+ elif self.loss in ("squared_error", "squared_loss"): + if self.loss == "squared_loss": + warnings.warn( + "The loss 'squared_loss' was deprecated in v1.0 and will " + "be removed in version 1.2. Use `loss='squared_error'` " + "which is equivalent.", + FutureWarning + ) if y.ndim == 1: loss_function = lambda y_true, y_pred: (y_true - y_pred) ** 2 else: @@ -309,9 +319,8 @@ def fit(self, X, y, sample_weight=None): else: raise ValueError( - "loss should be 'absolute_loss', 'squared_loss' or a callable." - "Got %s. " % self.loss) - + "loss should be 'absolute_loss', 'squared_error' or a " + "callable. Got %s. " % self.loss) random_state = check_random_state(self.random_state) diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 65f6cc6966ba4..a426c9a8d95f2 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -155,6 +155,14 @@ def _validate_params(self, for_partial_fit=False): if self.loss not in self.loss_functions: raise ValueError("The loss %s is not supported. " % self.loss) + if self.loss == "squared_loss": + warnings.warn( + "The loss 'squared_loss' was deprecated in v1.0 and will be " + "removed in version 1.2. Use `loss='squared_error'` which is " + "equivalent.", + FutureWarning + ) + def _get_loss_function(self, loss): """Get concrete ``LossFunction`` object for str ``loss``. """ try: @@ -452,12 +460,14 @@ def fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter, class BaseSGDClassifier(LinearClassifierMixin, BaseSGD, metaclass=ABCMeta): + # TODO: Remove squared_loss in v1.2 loss_functions = { "hinge": (Hinge, 1.0), "squared_hinge": (SquaredHinge, 1.0), "perceptron": (Hinge, 0.0), "log": (Log, ), "modified_huber": (ModifiedHuber, ), + "squared_error": (SquaredLoss, ), "squared_loss": (SquaredLoss, ), "huber": (Huber, DEFAULT_EPSILON), "epsilon_insensitive": (EpsilonInsensitive, DEFAULT_EPSILON), @@ -766,7 +776,7 @@ class SGDClassifier(BaseSGDClassifier): linear SVM. The possible options are 'hinge', 'log', 'modified_huber', - 'squared_hinge', 'perceptron', or a regression loss: 'squared_loss', + 'squared_hinge', 'perceptron', or a regression loss: 'squared_error', 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'. The 'log' loss gives logistic regression, a probabilistic classifier. @@ -781,6 +791,10 @@ class SGDClassifier(BaseSGDClassifier): More details about the losses formulas can be found in the :ref:`User Guide `. + .. deprecated:: 1.0 + The loss 'squared_loss' was deprecated in v1.0 and will be removed + in version 1.2. Use `loss='squared_error'` which is equivalent. + penalty : {'l2', 'l1', 'elasticnet'}, default='l2' The penalty (aka regularization term) to be used. Defaults to 'l2' which is the standard regularizer for linear SVM models. 
'l1' and @@ -1117,7 +1131,9 @@ def _more_tags(self): class BaseSGDRegressor(RegressorMixin, BaseSGD): + # TODO: Remove squared_loss in v1.2 loss_functions = { + "squared_error": (SquaredLoss, ), "squared_loss": (SquaredLoss, ), "huber": (Huber, DEFAULT_EPSILON), "epsilon_insensitive": (EpsilonInsensitive, DEFAULT_EPSILON), @@ -1127,7 +1143,7 @@ class BaseSGDRegressor(RegressorMixin, BaseSGD): @abstractmethod @_deprecate_positional_args - def __init__(self, loss="squared_loss", *, penalty="l2", alpha=0.0001, + def __init__(self, loss="squared_error", *, penalty="l2", alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, random_state=None, learning_rate="invscaling", eta0=0.01, @@ -1389,12 +1405,12 @@ class SGDRegressor(BaseSGDRegressor): Parameters ---------- - loss : str, default='squared_loss' - The loss function to be used. The possible values are 'squared_loss', + loss : str, default='squared_error' + The loss function to be used. The possible values are 'squared_error', 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive' - The 'squared_loss' refers to the ordinary least squares fit. - 'huber' modifies 'squared_loss' to focus less on getting outliers + The 'squared_error' refers to the ordinary least squares fit. + 'huber' modifies 'squared_error' to focus less on getting outliers correct by switching from squared to linear loss past a distance of epsilon. 'epsilon_insensitive' ignores errors less than epsilon and is linear past that; this is the loss function used in SVR. @@ -1404,6 +1420,10 @@ class SGDRegressor(BaseSGDRegressor): More details about the losses formulas can be found in the :ref:`User Guide `. + .. deprecated:: 1.0 + The loss 'squared_loss' was deprecated in v1.0 and will be removed + in version 1.2. Use `loss='squared_error'` which is equivalent. + penalty : {'l2', 'l1', 'elasticnet'}, default='l2' The penalty (aka regularization term) to be used. Defaults to 'l2' which is the standard regularizer for linear SVM models. 
'l1' and @@ -1583,7 +1603,7 @@ class SGDRegressor(BaseSGDRegressor): """ @_deprecate_positional_args - def __init__(self, loss="squared_loss", *, penalty="l2", alpha=0.0001, + def __init__(self, loss="squared_error", *, penalty="l2", alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, random_state=None, learning_rate="invscaling", eta0=0.01, diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index f631199a5d268..857696bf387d5 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -406,7 +406,7 @@ def loss_mono(y_true, y_pred): ransac_estimator2.predict(X)) ransac_estimator3 = RANSACRegressor(base_estimator, min_samples=2, residual_threshold=5, random_state=0, - loss="squared_loss") + loss="squared_error") ransac_estimator3.fit(X, y) assert_array_almost_equal(ransac_estimator0.predict(X), ransac_estimator2.predict(X)) @@ -536,3 +536,16 @@ def test_ransac_final_model_fit_sample_weight(): ) assert_allclose(ransac.estimator_.coef_, final_model.coef_, atol=1e-12) + + +# TODO: Remove in v1.2 +def test_loss_squared_loss_deprecated(): + est1 = RANSACRegressor(loss="squared_loss", random_state=0) + + with pytest.warns(FutureWarning, + match="The loss 'squared_loss' was deprecated"): + est1.fit(X, y) + + est2 = RANSACRegressor(loss="squared_error", random_state=0) + est2.fit(X, y) + assert_allclose(est1.predict(X), est2.predict(X)) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 217249631390d..aba043024fea3 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -2,6 +2,7 @@ import pytest import numpy as np +from numpy.testing import assert_allclose import scipy.sparse as sp import joblib @@ -310,10 +311,10 @@ def test_late_onset_averaging_reached(klass): Y_encode[Y_encode == 2] = 1.0 clf1 = klass(average=7, learning_rate="constant", - loss='squared_loss', eta0=eta0, + loss='squared_error', eta0=eta0, alpha=alpha, max_iter=2, shuffle=False) clf2 = klass(average=0, learning_rate="constant", - loss='squared_loss', eta0=eta0, + loss='squared_error', eta0=eta0, alpha=alpha, max_iter=1, shuffle=False) clf1.fit(X, Y_encode) @@ -540,7 +541,7 @@ def test_average_binary_computed_correctly(klass): X = rng.normal(size=(n_samples, n_features)) w = rng.normal(size=n_features) - clf = klass(loss='squared_loss', + clf = klass(loss='squared_error', learning_rate='constant', eta0=eta, alpha=alpha, fit_intercept=True, @@ -611,7 +612,7 @@ def test_sgd_multiclass_average(klass): eta = .001 alpha = .01 # Multi-class average test case - clf = klass(loss='squared_loss', + clf = klass(loss='squared_error', learning_rate='constant', eta0=eta, alpha=alpha, fit_intercept=True, @@ -675,6 +676,8 @@ def test_set_coef_multiclass(klass): clf = klass().fit(X2, Y2, intercept_init=np.zeros((3,))) +# TODO: Remove filterwarnings in v1.2. 
+@pytest.mark.filterwarnings("ignore:.*squared_loss.*:FutureWarning") @pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_sgd_predict_proba_method_access(klass): # Checks that SGDClassifier predict_proba and predict_log_proba methods @@ -1067,7 +1070,7 @@ def test_regression_losses(klass): assert 1.0 == np.mean(clf.predict(X) == Y) clf = klass(alpha=0.01, learning_rate="constant", eta0=0.01, - loss="squared_loss", random_state=random_state) + loss="squared_error", random_state=random_state) clf.fit(X, Y) assert 1.0 == np.mean(clf.predict(X) == Y) @@ -1115,7 +1118,7 @@ def test_sgd_averaged_computed_correctly(klass): # simple linear function without noise y = np.dot(X, w) - clf = klass(loss='squared_loss', + clf = klass(loss='squared_error', learning_rate='constant', eta0=eta, alpha=alpha, fit_intercept=True, @@ -1144,7 +1147,7 @@ def test_sgd_averaged_partial_fit(klass): # simple linear function without noise y = np.dot(X, w) - clf = klass(loss='squared_loss', + clf = klass(loss='squared_error', learning_rate='constant', eta0=eta, alpha=alpha, fit_intercept=True, @@ -1166,7 +1169,7 @@ def test_average_sparse(klass): eta = .001 alpha = .01 - clf = klass(loss='squared_loss', + clf = klass(loss='squared_error', learning_rate='constant', eta0=eta, alpha=alpha, fit_intercept=True, @@ -1194,7 +1197,7 @@ def test_sgd_least_squares_fit(klass): # simple linear function without noise y = 0.5 * X.ravel() - clf = klass(loss='squared_loss', alpha=0.1, max_iter=20, + clf = klass(loss='squared_error', alpha=0.1, max_iter=20, fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) @@ -1203,7 +1206,7 @@ def test_sgd_least_squares_fit(klass): # simple linear function with noise y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() - clf = klass(loss='squared_loss', alpha=0.1, max_iter=20, + clf = klass(loss='squared_error', alpha=0.1, max_iter=20, fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) @@ -1646,3 +1649,25 @@ def test_SGDClassifier_fit_for_all_backends(backend): with joblib.parallel_backend(backend=backend): clf_parallel.fit(X, y) assert_array_almost_equal(clf_sequential.coef_, clf_parallel.coef_) + + +# TODO: Remove in v1.2 +@pytest.mark.parametrize( + 'Estimator', + [linear_model.SGDClassifier, linear_model.SGDRegressor] +) +def test_loss_squared_loss_deprecated(Estimator): + + # Note: class BaseSGD calls self._validate_params() in __init__, therefore + # even instatiation of class raises FutureWarning for squared_loss. 
+ with pytest.warns(FutureWarning, + match="The loss 'squared_loss' was deprecated"): + est1 = Estimator(loss="squared_loss", random_state=0) + est1.fit(X, Y) + + est2 = Estimator(loss="squared_error", random_state=0) + est2.fit(X, Y) + if hasattr(est1, "predict_proba"): + assert_allclose(est1.predict_proba(X), est2.predict_proba(X)) + else: + assert_allclose(est1.predict(X), est2.predict(X)) diff --git a/sklearn/neural_network/_base.py b/sklearn/neural_network/_base.py index 6afe8a23db446..b8b2180bac5e5 100644 --- a/sklearn/neural_network/_base.py +++ b/sklearn/neural_network/_base.py @@ -224,5 +224,5 @@ def binary_log_loss(y_true, y_prob): xlogy(1 - y_true, 1 - y_prob).sum()) / y_prob.shape[0] -LOSS_FUNCTIONS = {'squared_loss': squared_loss, 'log_loss': log_loss, +LOSS_FUNCTIONS = {'squared_error': squared_loss, 'log_loss': log_loss, 'binary_log_loss': binary_log_loss} diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index ae06502d3ce1a..52c94a7129b9f 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -1127,7 +1127,7 @@ def predict_proba(self, X): class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): """Multi-layer Perceptron regressor. - This model optimizes the squared-loss using LBFGS or stochastic gradient + This model optimizes the squared error using LBFGS or stochastic gradient descent. .. versionadded:: 0.18 @@ -1383,7 +1383,7 @@ def __init__(self, hidden_layer_sizes=(100,), activation="relu", *, activation=activation, solver=solver, alpha=alpha, batch_size=batch_size, learning_rate=learning_rate, learning_rate_init=learning_rate_init, power_t=power_t, - max_iter=max_iter, loss='squared_loss', shuffle=shuffle, + max_iter=max_iter, loss='squared_error', shuffle=shuffle, random_state=random_state, tol=tol, verbose=verbose, warm_start=warm_start, momentum=momentum, nesterovs_momentum=nesterovs_momentum, diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index f7ae823c0070f..420292881f7db 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -62,7 +62,9 @@ CRITERIA_CLF = {"gini": _criterion.Gini, "entropy": _criterion.Entropy} -CRITERIA_REG = {"mse": _criterion.MSE, +# TODO: Remove "mse" in version 1.2. +CRITERIA_REG = {"squared_error": _criterion.MSE, + "mse": _criterion.MSE, "friedman_mse": _criterion.FriedmanMSE, "mae": _criterion.MAE, "poisson": _criterion.Poisson} @@ -350,6 +352,14 @@ def fit(self, X, y, sample_weight=None, check_input=True, else: criterion = CRITERIA_REG[self.criterion](self.n_outputs_, n_samples) + # TODO: Remove in v1.2 + if self.criterion == "mse": + warnings.warn( + "Criterion 'mse' was deprecated in v1.0 and will be " + "removed in version 1.2. Use `criterion='squared_error'` " + "which is equivalent.", + FutureWarning + ) else: # Make a deepcopy in case the criterion has mutable attributes that # might be shared and modified concurrently during parallel fitting @@ -991,15 +1001,16 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): Parameters ---------- - criterion : {"mse", "friedman_mse", "mae", "poisson"}, default="mse" + criterion : {"squared_error", "mse", "friedman_mse", "mae", "poisson"}, \ + default="squared_error" The function to measure the quality of a split. 
Supported criteria - are "mse" for the mean squared error, which is equal to variance - reduction as feature selection criterion and minimizes the L2 loss - using the mean of each terminal node, "friedman_mse", which uses mean - squared error with Friedman's improvement score for potential splits, - "mae" for the mean absolute error, which minimizes the L1 loss using - the median of each terminal node, and "poisson" which uses reduction in - Poisson deviance to find splits. + are "squared_error" for the mean squared error, which is equal to + variance reduction as feature selection criterion and minimizes the L2 + loss using the mean of each terminal node, "friedman_mse", which uses + mean squared error with Friedman's improvement score for potential + splits, "mae" for the mean absolute error, which minimizes the L1 loss + using the median of each terminal node, and "poisson" which uses + reduction in Poisson deviance to find splits. .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion. @@ -1007,6 +1018,10 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): .. versionadded:: 0.24 Poisson deviance criterion. + .. deprecated:: 1.0 + Criterion "mse" was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion="squared_error"` which is equivalent. + splitter : {"best", "random"}, default="best" The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose @@ -1187,7 +1202,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): """ @_deprecate_positional_args def __init__(self, *, - criterion="mse", + criterion="squared_error", splitter="best", max_depth=None, min_samples_split=2, @@ -1545,11 +1560,12 @@ class ExtraTreeRegressor(DecisionTreeRegressor): Parameters ---------- - criterion : {"mse", "friedman_mse", "mae"}, default="mse" + criterion : {"squared_error", "mse", "friedman_mse", "mae"}, \ + default="squared_error" The function to measure the quality of a split. Supported criteria - are "mse" for the mean squared error, which is equal to variance - reduction as feature selection criterion and "mae" for the mean - absolute error. + are "squared_error" for the mean squared error, which is equal to + variance reduction as feature selection criterion and "mae" for the + mean absolute error. .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion. @@ -1557,6 +1573,10 @@ class ExtraTreeRegressor(DecisionTreeRegressor): .. versionadded:: 0.24 Poisson deviance criterion. + .. deprecated:: 1.0 + Criterion "mse" was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion="squared_error"` which is equivalent. + splitter : {"random", "best"}, default="random" The strategy used to choose the split at each node. 
Supported strategies are "best" to choose the best split and "random" to choose @@ -1722,7 +1742,7 @@ class ExtraTreeRegressor(DecisionTreeRegressor): """ @_deprecate_positional_args def __init__(self, *, - criterion="mse", + criterion="squared_error", splitter="random", max_depth=None, min_samples_split=2, diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index ff29790e3699e..affe1b68cfe9a 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -299,6 +299,9 @@ def node_to_str(self, tree, node_id, criterion): if self.impurity: if isinstance(criterion, _criterion.FriedmanMSE): criterion = "friedman_mse" + elif (isinstance(criterion, _criterion.MSE) + or criterion == "squared_error"): + criterion = "squared_error" elif not isinstance(criterion, str): criterion = "impurity" if labels: diff --git a/sklearn/tree/tests/test_export.py b/sklearn/tree/tests/test_export.py index 6a7bf33b2143f..7b94fbb527dc9 100644 --- a/sklearn/tree/tests/test_export.py +++ b/sklearn/tree/tests/test_export.py @@ -177,32 +177,34 @@ def test_graphviz_toy(): # Test regression output with plot_options clf = DecisionTreeRegressor(max_depth=3, min_samples_split=2, - criterion="mse", + criterion="squared_error", random_state=2) clf.fit(X, y) contents1 = export_graphviz(clf, filled=True, leaves_parallel=True, out_file=None, rotate=True, rounded=True, fontname="sans") - contents2 = 'digraph Tree {\n' \ - 'node [shape=box, style="filled, rounded", color="black", ' \ - 'fontname="sans"] ;\n' \ - 'graph [ranksep=equally, splines=polyline] ;\n' \ - 'edge [fontname="sans"] ;\n' \ - 'rankdir=LR ;\n' \ - '0 [label="X[0] <= 0.0\\nmse = 1.0\\nsamples = 6\\n' \ - 'value = 0.0", fillcolor="#f2c09c"] ;\n' \ - '1 [label="mse = 0.0\\nsamples = 3\\nvalue = -1.0", ' \ - 'fillcolor="#ffffff"] ;\n' \ - '0 -> 1 [labeldistance=2.5, labelangle=-45, ' \ - 'headlabel="True"] ;\n' \ - '2 [label="mse = 0.0\\nsamples = 3\\nvalue = 1.0", ' \ - 'fillcolor="#e58139"] ;\n' \ - '0 -> 2 [labeldistance=2.5, labelangle=45, ' \ - 'headlabel="False"] ;\n' \ - '{rank=same ; 0} ;\n' \ - '{rank=same ; 1; 2} ;\n' \ - '}' + contents2 = ('digraph Tree {\n' + 'node [shape=box, style="filled, rounded", color="black", ' + 'fontname="sans"] ;\n' + 'graph [ranksep=equally, splines=polyline] ;\n' + 'edge [fontname="sans"] ;\n' + 'rankdir=LR ;\n' + '0 [label="X[0] <= 0.0\\nsquared_error = 1.0\\nsamples = 6\\n' + 'value = 0.0", fillcolor="#f2c09c"] ;\n' + '1 [label="squared_error = 0.0\\nsamples = 3\\' + 'nvalue = -1.0", ' + 'fillcolor="#ffffff"] ;\n' + '0 -> 1 [labeldistance=2.5, labelangle=-45, ' + 'headlabel="True"] ;\n' + '2 [label="squared_error = 0.0\\nsamples = 3\\nvalue = 1.0", ' + 'fillcolor="#e58139"] ;\n' + '0 -> 2 [labeldistance=2.5, labelangle=45, ' + 'headlabel="False"] ;\n' + '{rank=same ; 0} ;\n' + '{rank=same ; 1; 2} ;\n' + '}' + ) assert contents1 == contents2 diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index be66316f7187a..2a1da1e2bfce0 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -51,7 +51,7 @@ from sklearn.utils import compute_sample_weight CLF_CRITERIONS = ("gini", "entropy") -REG_CRITERIONS = ("mse", "mae", "friedman_mse", "poisson") +REG_CRITERIONS = ("squared_error", "mae", "friedman_mse", "poisson") CLF_TREES = { "DecisionTreeClassifier": DecisionTreeClassifier, @@ -293,7 +293,7 @@ def test_diabetes_overfit(name, Tree, criterion): @pytest.mark.parametrize("name, Tree", REG_TREES.items()) @pytest.mark.parametrize( "criterion, max_depth, metric, 
max_loss", - [("mse", 15, mean_squared_error, 60), + [("squared_error", 15, mean_squared_error, 60), ("mae", 20, mean_squared_error, 60), ("friedman_mse", 15, mean_squared_error, 60), ("poisson", 15, mean_poisson_deviance, 30)] @@ -420,8 +420,8 @@ def test_importances_raises(): getattr(clf, 'feature_importances_') -def test_importances_gini_equal_mse(): - # Check that gini is equivalent to mse for binary output variable +def test_importances_gini_equal_squared_error(): + # Check that gini is equivalent to squared_error for binary output variable X, y = datasets.make_classification(n_samples=2000, n_features=10, @@ -436,7 +436,7 @@ def test_importances_gini_equal_mse(): # high tree depth, we restrict this maximal depth. clf = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=0).fit(X, y) - reg = DecisionTreeRegressor(criterion="mse", max_depth=5, + reg = DecisionTreeRegressor(criterion="squared_error", max_depth=5, random_state=0).fit(X, y) assert_almost_equal(clf.feature_importances_, reg.feature_importances_) @@ -1973,7 +1973,9 @@ def test_apply_path_readonly_all_trees(name): check_apply_path_readonly(name) -@pytest.mark.parametrize("criterion", ["mse", "friedman_mse", "poisson"]) +@pytest.mark.parametrize( + "criterion", ["squared_error", "friedman_mse", "poisson"] +) @pytest.mark.parametrize("Tree", REG_TREES.values()) def test_balance_property(criterion, Tree): # Test that sum(y_pred)=sum(y_true) on training set. @@ -1995,7 +1997,7 @@ def test_poisson_zero_nodes(seed): y = [0, 0, 0, 0, 1, 2, 3, 4] # Note that X[:, 0] == 0 is a 100% indicator for y == 0. The tree can # easily learn that: - reg = DecisionTreeRegressor(criterion="mse", random_state=seed) + reg = DecisionTreeRegressor(criterion="squared_error", random_state=seed) reg.fit(X, y) assert np.amin(reg.predict(X)) == 0 # whereas Poisson must predict strictly positive numbers @@ -2023,7 +2025,7 @@ def test_poisson_zero_nodes(seed): def test_poisson_vs_mse(): # For a Poisson distributed target, Poisson loss should give better results - # than least squares measured in Poisson deviance as metric. + # than squared error measured in Poisson deviance as metric. # We have a similar test, test_poisson(), in # sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py # Note: Some fine tuning was needed to have metric_poi < metric_dummy on @@ -2042,7 +2044,7 @@ def test_poisson_vs_mse(): tree_poi = DecisionTreeRegressor(criterion="poisson", min_samples_split=10, random_state=rng) - tree_mse = DecisionTreeRegressor(criterion="mse", + tree_mse = DecisionTreeRegressor(criterion="squared_error", min_samples_split=10, random_state=rng) @@ -2052,12 +2054,13 @@ def test_poisson_vs_mse(): for X, y, val in [(X_train, y_train, "train"), (X_test, y_test, "test")]: metric_poi = mean_poisson_deviance(y, tree_poi.predict(X)) - # mse might produce non-positive predictions => clip + # squared_error might produce non-positive predictions => clip metric_mse = mean_poisson_deviance(y, np.clip(tree_mse.predict(X), 1e-15, None)) metric_dummy = mean_poisson_deviance(y, dummy.predict(X)) - # As MSE might correctly predict 0 in train set, its train score can - # be better than Poisson. This is no longer the case for the test set. + # As squared_error might correctly predict 0 in train set, its train + # score can be better than Poisson. This is no longer the case for the + # test set. 
if val == "test": assert metric_poi < metric_mse assert metric_poi < metric_dummy @@ -2114,3 +2117,16 @@ def test_X_idx_sorted_deprecated(TreeEstimator): with pytest.warns(FutureWarning, match="The parameter 'X_idx_sorted' is deprecated"): tree.fit(X, y, X_idx_sorted=X_idx_sorted) + + +# TODO: Remove in v1.2 +@pytest.mark.parametrize("Tree", REG_TREES.values()) +def test_mse_deprecated(Tree): + tree = Tree(criterion="mse") + + with pytest.warns(FutureWarning, + match="Criterion 'mse' was deprecated"): + tree.fit(X, y) + + tree_sqer = Tree(criterion="squared_error").fit(X, y) + assert_allclose(tree.predict(X), tree_sqer.predict(X)) From 071ddc75e92917d372f84e20a7fca15c1b7c6ca0 Mon Sep 17 00:00:00 2001 From: Avi Gupta <33635739+avigupta2612@users.noreply.github.com> Date: Fri, 19 Mar 2021 20:56:15 +0530 Subject: [PATCH 259/478] Removed assert_warns_message from gaussian_process/tests (#19697) --- sklearn/gaussian_process/tests/test_gpc.py | 18 ++++----- sklearn/gaussian_process/tests/test_gpr.py | 37 ++++++++++--------- .../gaussian_process/tests/test_kernels.py | 12 +++--- 3 files changed, 36 insertions(+), 31 deletions(-) diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py index 76804906f7fb4..57efc34891c51 100644 --- a/sklearn/gaussian_process/tests/test_gpc.py +++ b/sklearn/gaussian_process/tests/test_gpc.py @@ -17,7 +17,7 @@ from sklearn.exceptions import ConvergenceWarning from sklearn.utils._testing \ - import assert_almost_equal, assert_array_equal, assert_warns_message + import assert_almost_equal, assert_array_equal def f(x): @@ -189,14 +189,14 @@ def test_multi_class_n_jobs(kernel): def test_warning_bounds(): kernel = RBF(length_scale_bounds=[1e-5, 1e-3]) gpc = GaussianProcessClassifier(kernel=kernel) - assert_warns_message(ConvergenceWarning, "The optimal value found for " - "dimension 0 of parameter " - "length_scale is close to " - "the specified upper bound " - "0.001. Increasing the bound " - "and calling fit again may " - "find a better value.", - gpc.fit, X, y) + warning_message = ( + "The optimal value found for dimension 0 of parameter " + "length_scale is close to the specified upper bound " + "0.001. Increasing the bound and calling fit again may " + "find a better value." + ) + with pytest.warns(ConvergenceWarning, match=warning_message): + gpc.fit(X, y) kernel_sum = (WhiteKernel(noise_level_bounds=[1e-5, 1e-3]) + RBF(length_scale_bounds=[1e3, 1e5])) diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py index 83c24c7cc8573..a5bfa05c47313 100644 --- a/sklearn/gaussian_process/tests/test_gpr.py +++ b/sklearn/gaussian_process/tests/test_gpr.py @@ -5,6 +5,7 @@ # License: BSD 3 clause import sys +import re import numpy as np import warnings @@ -21,9 +22,8 @@ from sklearn.utils._testing \ import (assert_array_less, - assert_almost_equal, assert_raise_message, - assert_array_almost_equal, assert_array_equal, - assert_allclose, assert_warns_message) + assert_almost_equal, assert_array_almost_equal, + assert_array_equal, assert_allclose) def f(x): @@ -404,12 +404,15 @@ def test_gpr_correct_error_message(): y = np.ones(6) kernel = DotProduct() gpr = GaussianProcessRegressor(kernel=kernel, alpha=0.0) - assert_raise_message(np.linalg.LinAlgError, - "The kernel, %s, is not returning a " - "positive definite matrix. Try gradually increasing " - "the 'alpha' parameter of your " - "GaussianProcessRegressor estimator." 
- % kernel, gpr.fit, X, y) + message = ( + "The kernel, %s, is not returning a " + "positive definite matrix. Try gradually increasing " + "the 'alpha' parameter of your " + "GaussianProcessRegressor estimator." + % kernel + ) + with pytest.raises(np.linalg.LinAlgError, match=re.escape(message)): + gpr.fit(X, y) @pytest.mark.parametrize('kernel', kernels) @@ -474,14 +477,14 @@ def test_K_inv_reset(kernel): def test_warning_bounds(): kernel = RBF(length_scale_bounds=[1e-5, 1e-3]) gpr = GaussianProcessRegressor(kernel=kernel) - assert_warns_message(ConvergenceWarning, "The optimal value found for " - "dimension 0 of parameter " - "length_scale is close to " - "the specified upper bound " - "0.001. Increasing the bound " - "and calling fit again may " - "find a better value.", - gpr.fit, X, y) + warning_message = ( + "The optimal value found for dimension 0 of parameter " + "length_scale is close to the specified upper bound " + "0.001. Increasing the bound and calling fit again may " + "find a better value." + ) + with pytest.warns(ConvergenceWarning, match=warning_message): + gpr.fit(X, y) kernel_sum = (WhiteKernel(noise_level_bounds=[1e-5, 1e-3]) + RBF(length_scale_bounds=[1e3, 1e5])) diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py index 1f8e196104e75..b56c0b06b5fc0 100644 --- a/sklearn/gaussian_process/tests/test_kernels.py +++ b/sklearn/gaussian_process/tests/test_kernels.py @@ -20,7 +20,6 @@ from sklearn.utils._testing import (assert_almost_equal, assert_array_equal, assert_array_almost_equal, assert_allclose, - assert_raise_message, fails_if_pypy) @@ -361,7 +360,10 @@ def test_repr_kernels(kernel): def test_rational_quadratic_kernel(): kernel = RationalQuadratic(length_scale=[1., 1.]) - assert_raise_message(AttributeError, - "RationalQuadratic kernel only supports isotropic " - "version, please use a single " - "scalar for length_scale", kernel, X) + message = ( + "RationalQuadratic kernel only supports isotropic " + "version, please use a single " + "scalar for length_scale" + ) + with pytest.raises(AttributeError, match=message): + kernel(X) From cc1b171af86dee040d933aeeae64439e85a0cd54 Mon Sep 17 00:00:00 2001 From: Avi Gupta <33635739+avigupta2612@users.noreply.github.com> Date: Fri, 19 Mar 2021 22:03:57 +0530 Subject: [PATCH 260/478] Replaced assert_raises from utils/tests/test_estimator_checks (#19709) --- sklearn/utils/tests/test_estimator_checks.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 8fabe5f91ea31..4792f50f2baef 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -1,3 +1,7 @@ +# We can not use pytest here, because we run +# build_tools/azure/test_pytest_soft_dependency.sh on these +# tests to make sure estimator_checks works without pytest. 
+ import unittest import sys @@ -139,6 +143,7 @@ def fit(self, X, y=None): X, y = self._validate_data(X, y) return self + class ModifiesValueInsteadOfRaisingError(BaseEstimator): def __init__(self, p=0): self.p = p From 03edffa25f9250cd2861117c096860b1c9e09d2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20P=C3=B6lsterl?= Date: Fri, 19 Mar 2021 19:16:23 +0000 Subject: [PATCH 261/478] DOC Add scikit-survival to related projects (#19728) --- doc/related_projects.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/related_projects.rst b/doc/related_projects.rst index acc2689388896..2b1d41bf4a5e4 100644 --- a/doc/related_projects.rst +++ b/doc/related_projects.rst @@ -270,6 +270,10 @@ Other packages useful for data analysis and machine learning. - `Seaborn `_ Visualization library based on matplotlib. It provides a high-level interface for drawing attractive statistical graphics. +- `scikit-survival `_ A library implementing + models to learn from censored time-to-event data (also called survival analysis). + Models are fully compatible with scikit-learn. + Recommendation Engine packages ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From fe897c0ba0f00171333dcbdb483ca0d0346fed95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20P=C3=B6lsterl?= Date: Fri, 19 Mar 2021 19:17:47 +0000 Subject: [PATCH 262/478] DOC Move Sacred to "Experimentation frameworks" (#19730) --- doc/related_projects.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/related_projects.rst b/doc/related_projects.rst index 2b1d41bf4a5e4..fb02ea8beaf0d 100644 --- a/doc/related_projects.rst +++ b/doc/related_projects.rst @@ -60,6 +60,9 @@ enhance the functionality of scikit-learn's estimators. **Experimentation frameworks** +- `Sacred `_ Tool to help you configure, + organize, log and reproduce experiments + - `REP `_ Environment for conducting data-driven research in a consistent and reproducible way @@ -264,9 +267,6 @@ Other packages useful for data analysis and machine learning. - `PyMC `_ Bayesian statistical models and fitting algorithms. -- `Sacred `_ Tool to help you configure, - organize, log and reproduce experiments - - `Seaborn `_ Visualization library based on matplotlib. It provides a high-level interface for drawing attractive statistical graphics. From 266400e60ddc0bdba1f0de02ed49f45893e5647c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Fri, 19 Mar 2021 21:44:08 +0100 Subject: [PATCH 263/478] DOC Fix doc regarding required_parameters (#19725) --- doc/developers/develop.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index c68becf18f93c..4956530d2bbf6 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -641,7 +641,7 @@ In addition to the tags, estimators also need to declare any non-optional parameters to ``__init__`` in the ``_required_parameters`` class attribute, which is a list or tuple. If ``_required_parameters`` is only ``["estimator"]`` or ``["base_estimator"]``, then the estimator will be -instantiated with an instance of ``LinearDiscriminantAnalysis`` (or +instantiated with an instance of ``LogisticRegression`` (or ``RidgeRegression`` if the estimator is a regressor) in the tests. The choice of these two models is somewhat idiosyncratic but both should provide robust closed-form solutions. 
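The ``_required_parameters`` convention described in the hunk above can be
illustrated with a minimal sketch. Nothing below is part of any patch in this
series: the ``ClippedRegressor`` name and its clipping behaviour are invented
purely as an example of a meta-estimator whose only mandatory ``__init__``
argument is ``estimator``::

    import numpy as np
    from sklearn.base import (BaseEstimator, MetaEstimatorMixin,
                              RegressorMixin, clone)
    from sklearn.linear_model import Ridge


    class ClippedRegressor(MetaEstimatorMixin, RegressorMixin, BaseEstimator):
        """Hypothetical wrapper used only to illustrate the convention."""

        # Declaring the non-optional constructor argument lets the common
        # test machinery know how to build an instance (with a regressor,
        # as explained in the documentation change above).
        _required_parameters = ["estimator"]

        def __init__(self, estimator):
            self.estimator = estimator

        def fit(self, X, y):
            # Fit a clone so the constructor argument is never mutated.
            self.estimator_ = clone(self.estimator).fit(X, y)
            return self

        def predict(self, X):
            # Arbitrary toy behaviour: clip negative predictions to zero.
            return np.clip(self.estimator_.predict(X), 0.0, None)


    X = np.random.RandomState(0).normal(size=(20, 3))
    y = X @ np.array([1.0, -2.0, 0.5])
    print(ClippedRegressor(estimator=Ridge()).fit(X, y).predict(X)[:3])
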
From 0892a98fc9972b1da88ca25282e70105bd463608 Mon Sep 17 00:00:00 2001 From: Rodion Martynov Date: Sat, 20 Mar 2021 13:57:42 +0300 Subject: [PATCH 264/478] Stratified Group KFold implementation (#18649) * Initial implementation * Forgot to add to second __add__ list * Update split method parameter doc * Added example; changed default test_size to 0.1; added to author list * StratifiedGroupKFold impl and other improvements * Add class to __all__ spec * Remove random_state when no shuffle * Tighter formatting * Update the implementation of StratifiedGroupKFold * Add StratifiedGroupKFold to __init__ * Add y checks to StartifiedGroupKFold * Raise error if n_splits > max num samples in class * Warn if n_splits > mn num samples in class * Add SGKfold to general repr test * Add SGKFold to 2d_y test case * Add SGKfold to value erros test case Parameters are the same as for StratifiedKFold to ensure similar behavior given n_groups == n_samples * Add SGKFold to StratifiedKFold test cases The idea is to ensure similar behavior when groups are trivial (n_groups == n_samples) * Add SGKFold to reproducibility test case * Add SGKFold to GroupKFold test case * Add SGKFold to nested cv test case * Add SGKFold to random_state with shuffle=False test case * Add SGKFold to constant splits test case * Fix repr test case * Fix formatting issues * Add samples to a fold with least num samples Required to produce balanced size folds when the distribution of y is more or less the same * Remove GroupShuffleSplit impl * Add notes to StratifiedGroupKFold * Fix doctest * Added stratified group kfold tests * Better variable naming * Add section to documentation * Remove leftover StratifiedGroupShuffleSplit import * Add changelist and reference to original kernel * Better naming for least populated class check * Better expression for number of labels * Remove use of Counter We already have this data in output of np.unique * Add tests for homogeneous groups * Add StratifiedGroupKFold test against GroupKFold * Add changes to changelist in docstring * Add StratifiedGroupKFold to classes.rst * Fix description of StratifiedGroupKFold * Move license notice out of docstring * Disambiguate labels to classes in doc * Add changelog entry * Fix changelog author entry * Fix StratifiedGroupKFold docstring * Better variable names * Remove defaultdict in favor of numpy indexing * Extracted best_fold search into a separate method * Make use of numpy broadcasting instead of for loop * Encode groups and use arrays instead of dicts * Use numpy sort instead of python * Clarify shuffling behavior of StratifiedGroupKF in docs * Switch name from label_idx to class_idx * Remove accidentally leftover comment * Fix np.sort keyword to support numpy < 1.15 * Fix typo in docstring * Add StratifiedGroupKFold to visualization doc * Add visualization for uneven group as an example * Fix image numbers to match updated example * Add author * Add SGKF visualization to docs * Add comments for groups in stratified CV tests Co-authored-by: Leandro Hermida Co-authored-by: marrodion --- doc/modules/classes.rst | 1 + doc/modules/cross_validation.rst | 64 +++++- doc/whats_new/v1.0.rst | 10 + examples/model_selection/plot_cv_indices.py | 35 +++- sklearn/model_selection/__init__.py | 2 + sklearn/model_selection/_split.py | 192 +++++++++++++++++- sklearn/model_selection/tests/test_split.py | 212 ++++++++++++++++---- 7 files changed, 457 insertions(+), 59 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 0cd5abb16829d..ceebfc337352a 
100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1176,6 +1176,7 @@ Splitter Classes model_selection.ShuffleSplit model_selection.StratifiedKFold model_selection.StratifiedShuffleSplit + model_selection.StratifiedGroupKFold model_selection.TimeSeriesSplit Splitter Functions diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index ae3d38f168f3f..0b090fd7385b6 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -353,7 +353,7 @@ Example of 2-fold cross-validation on a dataset with 4 samples:: Here is a visualization of the cross-validation behavior. Note that :class:`KFold` is not affected by classes or groups. -.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_004.png +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_006.png :target: ../auto_examples/model_selection/plot_cv_indices.html :align: center :scale: 75% @@ -509,7 +509,7 @@ Here is a usage example:: Here is a visualization of the cross-validation behavior. Note that :class:`ShuffleSplit` is not affected by classes or groups. -.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_006.png +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_008.png :target: ../auto_examples/model_selection/plot_cv_indices.html :align: center :scale: 75% @@ -566,7 +566,7 @@ We can see that :class:`StratifiedKFold` preserves the class ratios Here is a visualization of the cross-validation behavior. -.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_007.png +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_009.png :target: ../auto_examples/model_selection/plot_cv_indices.html :align: center :scale: 75% @@ -585,7 +585,7 @@ percentage for each target class as in the complete set. Here is a visualization of the cross-validation behavior. -.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_009.png +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_012.png :target: ../auto_examples/model_selection/plot_cv_indices.html :align: center :scale: 75% @@ -645,6 +645,58 @@ size due to the imbalance in the data. Here is a visualization of the cross-validation behavior. +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_007.png + :target: ../auto_examples/model_selection/plot_cv_indices.html + :align: center + :scale: 75% + +.. _stratified_group_k_fold: + +StratifiedGroupKFold +^^^^^^^^^^^^^^^^^^^^ + +:class:`StratifiedGroupKFold` is a cross-validation scheme that combines both +:class:`StratifiedKFold` and :class:`GroupKFold`. The idea is to try to +preserve the distribution of classes in each split while keeping each group +within a single split. That might be useful when you have an unbalanced +dataset so that using just :class:`GroupKFold` might produce skewed splits. + +Example:: + + >>> from sklearn.model_selection import StratifiedGroupKFold + >>> X = list(range(18)) + >>> y = [1] * 6 + [0] * 12 + >>> groups = [1, 2, 3, 3, 4, 4, 1, 1, 2, 2, 3, 4, 5, 5, 5, 6, 6, 6] + >>> sgkf = StratifiedGroupKFold(n_splits=3) + >>> for train, test in sgkf.split(X, y, groups=groups): + ... 
print("%s %s" % (train, test)) + [ 0 2 3 4 5 6 7 10 11 15 16 17] [ 1 8 9 12 13 14] + [ 0 1 4 5 6 7 8 9 11 12 13 14] [ 2 3 10 15 16 17] + [ 1 2 3 8 9 10 12 13 14 15 16 17] [ 0 4 5 6 7 11] + +Implementation notes: + +- With the current implementation full shuffle is not possible in most + scenarios. When shuffle=True, the following happens: + + 1. All groups a shuffled. + 2. Groups are sorted by standard deviation of classes using stable sort. + 3. Sorted groups are iterated over and assigned to folds. + + That means that only groups with the same standard deviation of class + distribution will be shuffled, which might be useful when each group has only + a single class. +- The algorithm greedily assigns each group to one of n_splits test sets, + choosing the test set that minimises the variance in class distribution + across test sets. Group assignment proceeds from groups with highest to + lowest variance in class frequency, i.e. large groups peaked on one or few + classes are assigned first. +- This split is suboptimal in a sense that it might produce imbalanced splits + even if perfect stratification is possible. If you have relatively close + distribution of classes in each group, using :class:`GroupKFold` is better. + +Here is a visualization of cross-validation behavior for uneven groups: + .. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_005.png :target: ../auto_examples/model_selection/plot_cv_indices.html :align: center @@ -733,7 +785,7 @@ Here is a usage example:: Here is a visualization of the cross-validation behavior. -.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_008.png +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_011.png :target: ../auto_examples/model_selection/plot_cv_indices.html :align: center :scale: 75% @@ -835,7 +887,7 @@ Example of 3-split time series cross-validation on a dataset with 6 samples:: Here is a visualization of the cross-validation behavior. -.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_010.png +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_013.png :target: ../auto_examples/model_selection/plot_cv_indices.html :align: center :scale: 75% diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index b4ee0c57b97fc..521e358ac2f02 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -217,6 +217,16 @@ Changelog are integral. :pr:`9843` by :user:`Jon Crall `. +:mod:`sklearn.model_selection` +.............................. + +- |Feature| added :class:`model_selection.StratifiedGroupKFold`, that combines + :class:`model_selection.StratifiedKFold` and `model_selection.GroupKFold`, + providing an ability to split data preserving the distribution of classes in + each split while keeping each group within a single split. + :pr:`18649` by `Leandro Hermida ` and + `Rodion Martynov `. + :mod:`sklearn.naive_bayes` .......................... 
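The implementation notes added to ``cross_validation.rst`` above can be made
concrete with a small sketch. The data below is synthetic and the exact
per-fold ratios depend on it; the snippet only relies on the public
``GroupKFold`` and ``StratifiedGroupKFold`` APIs introduced or documented in
this patch::

    import numpy as np
    from sklearn.model_selection import GroupKFold, StratifiedGroupKFold

    rng = np.random.RandomState(0)
    n_samples = 300
    # Imbalanced binary target (~10% positives), 30 groups of 10 samples.
    y = (rng.rand(n_samples) < 0.1).astype(int)
    X = rng.normal(size=(n_samples, 2))
    groups = np.repeat(np.arange(30), 10)

    for cv in (GroupKFold(n_splits=5), StratifiedGroupKFold(n_splits=5)):
        ratios = []
        for train, test in cv.split(X, y, groups):
            # Both splitters keep every group on a single side of the split.
            assert np.intersect1d(groups[train], groups[test]).size == 0
            ratios.append(y[test].mean())
        print(type(cv).__name__, np.round(ratios, 3))

With ``GroupKFold`` the positive-class ratio of the test folds typically
drifts away from the overall ~10%, while ``StratifiedGroupKFold`` keeps it
closer, at the price of the suboptimality discussed in the notes above.
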
diff --git a/examples/model_selection/plot_cv_indices.py b/examples/model_selection/plot_cv_indices.py index 91f71b0451cb2..f07fa1595e860 100644 --- a/examples/model_selection/plot_cv_indices.py +++ b/examples/model_selection/plot_cv_indices.py @@ -13,7 +13,8 @@ from sklearn.model_selection import (TimeSeriesSplit, KFold, ShuffleSplit, StratifiedKFold, GroupShuffleSplit, - GroupKFold, StratifiedShuffleSplit) + GroupKFold, StratifiedShuffleSplit, + StratifiedGroupKFold) import numpy as np import matplotlib.pyplot as plt from matplotlib.patches import Patch @@ -113,16 +114,32 @@ def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10): # %% # As you can see, by default the KFold cross-validation iterator does not # take either datapoint class or group into consideration. We can change this -# by using the ``StratifiedKFold`` like so. +# by using either: +# +# - ``StratifiedKFold`` to preserve the percentage of samples for each class. +# - ``GroupKFold`` to ensure that the same group will not appear in two +# different folds. +# - ``StratifiedGroupKFold`` to keep the constraint of ``GroupKFold`` while +# attempting to return stratified folds. -fig, ax = plt.subplots() -cv = StratifiedKFold(n_splits) -plot_cv_indices(cv, X, y, groups, ax, n_splits) +# To better demonstrate the difference, we will assign samples to groups +# unevenly: + +uneven_groups = np.sort(np.random.randint(0, 10, n_points)) + +cvs = [StratifiedKFold, GroupKFold, StratifiedGroupKFold] + +for cv in cvs: + fig, ax = plt.subplots(figsize=(6, 3)) + plot_cv_indices(cv(n_splits), X, y, uneven_groups, ax, n_splits) + ax.legend([Patch(color=cmap_cv(.8)), Patch(color=cmap_cv(.02))], + ['Testing set', 'Training set'], loc=(1.02, .8)) + # Make the legend fit + plt.tight_layout() + fig.subplots_adjust(right=.7) # %% -# In this case, the cross-validation retained the same ratio of classes across -# each CV split. Next we'll visualize this behavior for a number of CV -# iterators. +# Next we'll visualize this behavior for a number of CV iterators. # # Visualize cross-validation indices for many CV objects # ------------------------------------------------------ @@ -133,7 +150,7 @@ def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10): # # Note how some use the group/class information while others do not. -cvs = [KFold, GroupKFold, ShuffleSplit, StratifiedKFold, +cvs = [KFold, GroupKFold, ShuffleSplit, StratifiedKFold, StratifiedGroupKFold, GroupShuffleSplit, StratifiedShuffleSplit, TimeSeriesSplit] diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py index 897183414b5a6..f79db2a5acc17 100644 --- a/sklearn/model_selection/__init__.py +++ b/sklearn/model_selection/__init__.py @@ -14,6 +14,7 @@ from ._split import ShuffleSplit from ._split import GroupShuffleSplit from ._split import StratifiedShuffleSplit +from ._split import StratifiedGroupKFold from ._split import PredefinedSplit from ._split import train_test_split from ._split import check_cv @@ -57,6 +58,7 @@ 'RandomizedSearchCV', 'ShuffleSplit', 'StratifiedKFold', + 'StratifiedGroupKFold', 'StratifiedShuffleSplit', 'check_cv', 'cross_val_predict', diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 244b2b63af449..13edbeef071f5 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -3,13 +3,16 @@ functions to split the data based on a preset strategy. 
""" -# Author: Alexandre Gramfort , -# Gael Varoquaux , +# Author: Alexandre Gramfort +# Gael Varoquaux # Olivier Grisel # Raghav RV +# Leandro Hermida +# Rodion Martynov # License: BSD 3 clause from collections.abc import Iterable +from collections import defaultdict import warnings from itertools import chain, combinations from math import ceil, floor @@ -40,6 +43,7 @@ 'ShuffleSplit', 'GroupShuffleSplit', 'StratifiedKFold', + 'StratifiedGroupKFold', 'StratifiedShuffleSplit', 'PredefinedSplit', 'train_test_split', @@ -732,6 +736,190 @@ def split(self, X, y, groups=None): return super().split(X, y, groups) +class StratifiedGroupKFold(_BaseKFold): + """Stratified K-Folds iterator variant with non-overlapping groups. + + This cross-validation object is a variation of StratifiedKFold attempts to + return stratified folds with non-overlapping groups. The folds are made by + preserving the percentage of samples for each class. + + The same group will not appear in two different folds (the number of + distinct groups has to be at least equal to the number of folds). + + The difference between GroupKFold and StratifiedGroupKFold is that + the former attempts to create balanced folds such that the number of + distinct groups is approximately the same in each fold, whereas + StratifiedGroupKFold attempts to create folds which preserve the + percentage of samples for each class as much as possible given the + constraint of non-overlapping groups between splits. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + shuffle : bool, default=False + Whether to shuffle each class's samples before splitting into batches. + Note that the samples within each split will not be shuffled. + This implementation can only shuffle groups that have approximately the + same y distribution, no global shuffle will be performed. + + random_state : int or RandomState instance, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold for each class. + Otherwise, leave `random_state` as `None`. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import StratifiedGroupKFold + >>> X = np.ones((17, 2)) + >>> y = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + >>> groups = np.array([1, 1, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 8, 8]) + >>> cv = StratifiedGroupKFold(n_splits=3) + >>> for train_idxs, test_idxs in cv.split(X, y, groups): + ... print("TRAIN:", groups[train_idxs]) + ... print(" ", y[train_idxs]) + ... print(" TEST:", groups[test_idxs]) + ... print(" ", y[test_idxs]) + TRAIN: [1 1 2 2 4 5 5 5 5 8 8] + [0 0 1 1 1 0 0 0 0 0 0] + TEST: [3 3 3 6 6 7] + [1 1 1 0 0 0] + TRAIN: [3 3 3 4 5 5 5 5 6 6 7] + [1 1 1 1 0 0 0 0 0 0 0] + TEST: [1 1 2 2 8 8] + [0 0 1 1 0 0] + TRAIN: [1 1 2 2 3 3 3 6 6 7 8 8] + [0 0 1 1 1 1 1 0 0 0 0 0] + TEST: [4 5 5 5 5] + [1 0 0 0 0] + + Notes + ----- + The implementation is designed to: + + * Mimic the behavior of StratifiedKFold as much as possible for trivial + groups (e.g. when each group contains only one sample). + * Be invariant to class label: relabelling ``y = ["Happy", "Sad"]`` to + ``y = [1, 0]`` should not change the indices generated. + * Stratify based on samples as much as possible while keeping + non-overlapping groups constraint. 
That means that in some cases when + there is a small number of groups containing a large number of samples + the stratification will not be possible and the behavior will be close + to GroupKFold. + + See also + -------- + StratifiedKFold: Takes class information into account to build folds which + retain class distributions (for binary or multiclass classification + tasks). + + GroupKFold: K-fold iterator variant with non-overlapping groups. + """ + + def __init__(self, n_splits=5, shuffle=False, random_state=None): + super().__init__(n_splits=n_splits, shuffle=shuffle, + random_state=random_state) + + def _iter_test_indices(self, X, y, groups): + # Implementation is based on this kaggle kernel: + # https://www.kaggle.com/jakubwasikowski/stratified-group-k-fold-cross-validation + # and is a subject to Apache 2.0 License. You may obtain a copy of the + # License at http://www.apache.org/licenses/LICENSE-2.0 + # Changelist: + # - Refactored function to a class following scikit-learn KFold + # interface. + # - Added heuristic for assigning group to the least populated fold in + # cases when all other criteria are equal + # - Swtch from using python ``Counter`` to ``np.unique`` to get class + # distribution + # - Added scikit-learn checks for input: checking that target is binary + # or multiclass, checking passed random state, checking that number + # of splits is less than number of members in each class, checking + # that least populated class has more members than there are splits. + rng = check_random_state(self.random_state) + y = np.asarray(y) + type_of_target_y = type_of_target(y) + allowed_target_types = ('binary', 'multiclass') + if type_of_target_y not in allowed_target_types: + raise ValueError( + 'Supported target types are: {}. Got {!r} instead.'.format( + allowed_target_types, type_of_target_y)) + + y = column_or_1d(y) + _, y_inv, y_cnt = np.unique(y, return_inverse=True, return_counts=True) + if np.all(self.n_splits > y_cnt): + raise ValueError("n_splits=%d cannot be greater than the" + " number of members in each class." + % (self.n_splits)) + n_smallest_class = np.min(y_cnt) + if self.n_splits > n_smallest_class: + warnings.warn(("The least populated class in y has only %d" + " members, which is less than n_splits=%d." 
+ % (n_smallest_class, self.n_splits)), UserWarning) + n_classes = len(y_cnt) + + _, groups_inv, groups_cnt = np.unique( + groups, return_inverse=True, return_counts=True) + y_counts_per_group = np.zeros((len(groups_cnt), n_classes)) + for class_idx, group_idx in zip(y_inv, groups_inv): + y_counts_per_group[group_idx, class_idx] += 1 + + y_counts_per_fold = np.zeros((self.n_splits, n_classes)) + groups_per_fold = defaultdict(set) + + if self.shuffle: + rng.shuffle(y_counts_per_group) + + # Stable sort to keep shuffled order for groups with the same + # class distribution variance + sorted_groups_idx = np.argsort(-np.std(y_counts_per_group, axis=1), + kind='mergesort') + + for group_idx in sorted_groups_idx: + group_y_counts = y_counts_per_group[group_idx] + best_fold = self._find_best_fold( + y_counts_per_fold=y_counts_per_fold, y_cnt=y_cnt, + group_y_counts=group_y_counts) + y_counts_per_fold[best_fold] += group_y_counts + groups_per_fold[best_fold].add(group_idx) + + for i in range(self.n_splits): + test_indices = [idx for idx, group_idx in enumerate(groups_inv) + if group_idx in groups_per_fold[i]] + yield test_indices + + def _find_best_fold( + self, y_counts_per_fold, y_cnt, group_y_counts): + best_fold = None + min_eval = np.inf + min_samples_in_fold = np.inf + for i in range(self.n_splits): + y_counts_per_fold[i] += group_y_counts + # Summarise the distribution over classes in each proposed fold + std_per_class = np.std( + y_counts_per_fold / y_cnt.reshape(1, -1), + axis=0) + y_counts_per_fold[i] -= group_y_counts + fold_eval = np.mean(std_per_class) + samples_in_fold = np.sum(y_counts_per_fold[i]) + is_current_fold_better = ( + fold_eval < min_eval or + np.isclose(fold_eval, min_eval) + and samples_in_fold < min_samples_in_fold + ) + if is_current_fold_better: + min_eval = fold_eval + min_samples_in_fold = samples_in_fold + best_fold = i + return best_fold + + class TimeSeriesSplit(_BaseKFold): """Time Series cross-validator diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 80c19c7f2e08c..c66d8e1836ac9 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -35,6 +35,7 @@ from sklearn.model_selection import GridSearchCV from sklearn.model_selection import RepeatedKFold from sklearn.model_selection import RepeatedStratifiedKFold +from sklearn.model_selection import StratifiedGroupKFold from sklearn.linear_model import Ridge @@ -80,6 +81,7 @@ def test_cross_validator_with_default_params(): lopo = LeavePGroupsOut(p) ss = ShuffleSplit(random_state=0) ps = PredefinedSplit([1, 1, 2, 2]) # n_splits = np of unique folds = 2 + sgkf = StratifiedGroupKFold(n_splits) loo_repr = "LeaveOneOut()" lpo_repr = "LeavePOut(p=2)" @@ -90,15 +92,17 @@ def test_cross_validator_with_default_params(): ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, " "test_size=None, train_size=None)") ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))" + sgkf_repr = ("StratifiedGroupKFold(n_splits=2, random_state=None, " + "shuffle=False)") n_splits_expected = [n_samples, comb(n_samples, p), n_splits, n_splits, n_unique_groups, comb(n_unique_groups, p), - n_shuffle_splits, 2] + n_shuffle_splits, 2, n_splits] for i, (cv, cv_repr) in enumerate(zip( - [loo, lpo, kf, skf, lolo, lopo, ss, ps], + [loo, lpo, kf, skf, lolo, lopo, ss, ps, sgkf], [loo_repr, lpo_repr, kf_repr, skf_repr, lolo_repr, lopo_repr, - ss_repr, ps_repr])): + ss_repr, ps_repr, sgkf_repr])): # Test if get_n_splits works correctly assert 
n_splits_expected[i] == cv.get_n_splits(X, y, groups) @@ -133,10 +137,11 @@ def test_2d_y(): groups = rng.randint(0, 3, size=(n_samples,)) splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(), RepeatedKFold(), RepeatedStratifiedKFold(), - ShuffleSplit(), StratifiedShuffleSplit(test_size=.5), - GroupShuffleSplit(), LeaveOneGroupOut(), - LeavePGroupsOut(n_groups=2), GroupKFold(n_splits=3), - TimeSeriesSplit(), PredefinedSplit(test_fold=groups)] + StratifiedGroupKFold(), ShuffleSplit(), + StratifiedShuffleSplit(test_size=.5), GroupShuffleSplit(), + LeaveOneGroupOut(), LeavePGroupsOut(n_groups=2), + GroupKFold(n_splits=3), TimeSeriesSplit(), + PredefinedSplit(test_fold=groups)] for splitter in splitters: list(splitter.split(X, y, groups)) list(splitter.split(X, y_2d, groups)) @@ -193,6 +198,11 @@ def test_kfold_valueerrors(): with pytest.warns(Warning, match="The least populated class"): next(skf_3.split(X2, y)) + sgkf_3 = StratifiedGroupKFold(3) + naive_groups = np.arange(len(y)) + with pytest.warns(Warning, match="The least populated class"): + next(sgkf_3.split(X2, y, naive_groups)) + # Check that despite the warning the folds are still computed even # though all the classes are not necessarily represented at on each # side of the split at each split @@ -200,12 +210,20 @@ def test_kfold_valueerrors(): warnings.simplefilter("ignore") check_cv_coverage(skf_3, X2, y, groups=None, expected_n_splits=3) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + check_cv_coverage( + sgkf_3, X2, y, groups=naive_groups, expected_n_splits=3 + ) + # Check that errors are raised if all n_groups for individual # classes are less than n_splits. y = np.array([3, 3, -1, -1, 2]) with pytest.raises(ValueError): next(skf_3.split(X2, y)) + with pytest.raises(ValueError): + next(sgkf_3.split(X2, y)) # Error when number of folds is <= 1 with pytest.raises(ValueError): @@ -218,6 +236,10 @@ def test_kfold_valueerrors(): StratifiedKFold(0) with pytest.raises(ValueError, match=error_string): StratifiedKFold(1) + with pytest.raises(ValueError, match=error_string): + StratifiedGroupKFold(0) + with pytest.raises(ValueError, match=error_string): + StratifiedGroupKFold(1) # When n_splits is not integer: with pytest.raises(ValueError): @@ -228,6 +250,10 @@ def test_kfold_valueerrors(): StratifiedKFold(1.5) with pytest.raises(ValueError): StratifiedKFold(2.0) + with pytest.raises(ValueError): + StratifiedGroupKFold(1.5) + with pytest.raises(ValueError): + StratifiedGroupKFold(2.0) # When shuffle is not a bool: with pytest.raises(TypeError): @@ -318,7 +344,8 @@ def test_stratified_kfold_no_shuffle(): @pytest.mark.parametrize('shuffle', [False, True]) @pytest.mark.parametrize('k', [4, 5, 6, 7, 8, 9, 10]) -def test_stratified_kfold_ratios(k, shuffle): +@pytest.mark.parametrize('kfold', [StratifiedKFold, StratifiedGroupKFold]) +def test_stratified_kfold_ratios(k, shuffle, kfold): # Check that stratified kfold preserves class ratios in individual splits # Repeat with shuffling turned off and on n_samples = 1000 @@ -326,12 +353,14 @@ def test_stratified_kfold_ratios(k, shuffle): y = np.array([4] * int(0.10 * n_samples) + [0] * int(0.89 * n_samples) + [1] * int(0.01 * n_samples)) + # ensure perfect stratification with StratifiedGroupKFold + groups = np.arange(len(y)) distr = np.bincount(y) / len(y) test_sizes = [] random_state = None if not shuffle else 0 - skf = StratifiedKFold(k, random_state=random_state, shuffle=shuffle) - for train, test in skf.split(X, y): + skf = kfold(k, 
random_state=random_state, shuffle=shuffle) + for train, test in skf.split(X, y, groups=groups): assert_allclose(np.bincount(y[train]) / len(train), distr, atol=0.02) assert_allclose(np.bincount(y[test]) / len(test), distr, atol=0.02) test_sizes.append(len(test)) @@ -340,20 +369,23 @@ def test_stratified_kfold_ratios(k, shuffle): @pytest.mark.parametrize('shuffle', [False, True]) @pytest.mark.parametrize('k', [4, 6, 7]) -def test_stratified_kfold_label_invariance(k, shuffle): +@pytest.mark.parametrize('kfold', [StratifiedKFold, StratifiedGroupKFold]) +def test_stratified_kfold_label_invariance(k, shuffle, kfold): # Check that stratified kfold gives the same indices regardless of labels n_samples = 100 y = np.array([2] * int(0.10 * n_samples) + [0] * int(0.89 * n_samples) + [1] * int(0.01 * n_samples)) X = np.ones(len(y)) + # ensure perfect stratification with StratifiedGroupKFold + groups = np.arange(len(y)) def get_splits(y): random_state = None if not shuffle else 0 return [(list(train), list(test)) for train, test - in StratifiedKFold(k, random_state=random_state, - shuffle=shuffle).split(X, y)] + in kfold(k, random_state=random_state, + shuffle=shuffle).split(X, y, groups=groups)] splits_base = get_splits(y) for perm in permutations([0, 1, 2]): @@ -372,17 +404,20 @@ def test_kfold_balance(): assert np.sum(sizes) == i -def test_stratifiedkfold_balance(): +@pytest.mark.parametrize('kfold', [StratifiedKFold, StratifiedGroupKFold]) +def test_stratifiedkfold_balance(kfold): # Check that KFold returns folds with balanced sizes (only when # stratification is possible) # Repeat with shuffling turned off and on X = np.ones(17) y = [0] * 3 + [1] * 14 + # ensure perfect stratification with StratifiedGroupKFold + groups = np.arange(len(y)) for shuffle in (True, False): - cv = StratifiedKFold(3, shuffle=shuffle) + cv = kfold(3, shuffle=shuffle) for i in range(11, 17): - skf = cv.split(X[:i], y[:i]) + skf = cv.split(X[:i], y[:i], groups[:i]) sizes = [len(test) for _, test in skf] assert (np.max(sizes) - np.min(sizes)) <= 1 @@ -411,39 +446,39 @@ def test_shuffle_kfold(): assert sum(all_folds) == 300 -def test_shuffle_kfold_stratifiedkfold_reproducibility(): +@pytest.mark.parametrize("kfold", + [KFold, StratifiedKFold, StratifiedGroupKFold]) +def test_shuffle_kfold_stratifiedkfold_reproducibility(kfold): X = np.ones(15) # Divisible by 3 y = [0] * 7 + [1] * 8 + groups_1 = np.arange(len(y)) X2 = np.ones(16) # Not divisible by 3 y2 = [0] * 8 + [1] * 8 + groups_2 = np.arange(len(y2)) # Check that when the shuffle is True, multiple split calls produce the # same split when random_state is int - kf = KFold(3, shuffle=True, random_state=0) - skf = StratifiedKFold(3, shuffle=True, random_state=0) + kf = kfold(3, shuffle=True, random_state=0) - for cv in (kf, skf): - np.testing.assert_equal(list(cv.split(X, y)), list(cv.split(X, y))) - np.testing.assert_equal(list(cv.split(X2, y2)), list(cv.split(X2, y2))) + np.testing.assert_equal( + list(kf.split(X, y, groups_1)), + list(kf.split(X, y, groups_1)) + ) # Check that when the shuffle is True, multiple split calls often # (not always) produce different splits when random_state is # RandomState instance or None - kf = KFold(3, shuffle=True, random_state=np.random.RandomState(0)) - skf = StratifiedKFold(3, shuffle=True, - random_state=np.random.RandomState(0)) - - for cv in (kf, skf): - for data in zip((X, X2), (y, y2)): - # Test if the two splits are different cv - for (_, test_a), (_, test_b) in zip(cv.split(*data), - cv.split(*data)): - # cv.split(...) 
returns an array of tuples, each tuple - # consisting of an array with train indices and test indices - # Ensure that the splits for data are not same - # when random state is not set - with pytest.raises(AssertionError): - np.testing.assert_array_equal(test_a, test_b) + kf = kfold(3, shuffle=True, random_state=np.random.RandomState(0)) + for data in zip((X, X2), (y, y2), (groups_1, groups_2)): + # Test if the two splits are different cv + for (_, test_a), (_, test_b) in zip(kf.split(*data), + kf.split(*data)): + # cv.split(...) returns an array of tuples, each tuple + # consisting of an array with train indices and test indices + # Ensure that the splits for data are not same + # when random state is not set + with pytest.raises(AssertionError): + np.testing.assert_array_equal(test_a, test_b) def test_shuffle_stratifiedkfold(): @@ -514,6 +549,96 @@ def test_kfold_can_detect_dependent_samples_on_digits(): # see #2372 assert mean_score > 0.80 +def test_stratified_group_kfold_trivial(): + sgkf = StratifiedGroupKFold(n_splits=3) + # Trivial example - groups with the same distribution + y = np.array([1] * 6 + [0] * 12) + X = np.ones_like(y).reshape(-1, 1) + groups = np.asarray((1, 2, 3, 4, 5, 6, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6)) + distr = np.bincount(y) / len(y) + test_sizes = [] + for train, test in sgkf.split(X, y, groups): + # check group constraint + assert np.intersect1d(groups[train], groups[test]).size == 0 + # check y distribution + assert_allclose(np.bincount(y[train]) / len(train), distr, atol=0.02) + assert_allclose(np.bincount(y[test]) / len(test), distr, atol=0.02) + test_sizes.append(len(test)) + assert np.ptp(test_sizes) <= 1 + + +def test_stratified_group_kfold_approximate(): + # Not perfect stratification (even though it is possible) because of + # iteration over groups + sgkf = StratifiedGroupKFold(n_splits=3) + y = np.array([1] * 6 + [0] * 12) + X = np.ones_like(y).reshape(-1, 1) + groups = np.array([1, 2, 3, 3, 4, 4, 1, 1, 2, 2, 3, 4, 5, 5, 5, 6, 6, 6]) + expected = np.asarray([[0.833, 0.166], [0.666, 0.333], [0.5, 0.5]]) + test_sizes = [] + for (train, test), expect_dist in zip(sgkf.split(X, y, groups), expected): + # check group constraint + assert np.intersect1d(groups[train], groups[test]).size == 0 + split_dist = np.bincount(y[test]) / len(test) + assert_allclose(split_dist, expect_dist, atol=0.001) + test_sizes.append(len(test)) + assert np.ptp(test_sizes) <= 1 + + +@pytest.mark.parametrize('y, groups, expected', + [(np.array([0] * 6 + [1] * 6), + np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6]), + np.asarray([[.5, .5], + [.5, .5], + [.5, .5]])), + (np.array([0] * 9 + [1] * 3), + np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5, 6]), + np.asarray([[.75, .25], + [.75, .25], + [.75, .25]]))]) +def test_stratified_group_kfold_homogeneous_groups(y, groups, expected): + sgkf = StratifiedGroupKFold(n_splits=3) + X = np.ones_like(y).reshape(-1, 1) + for (train, test), expect_dist in zip(sgkf.split(X, y, groups), expected): + # check group constraint + assert np.intersect1d(groups[train], groups[test]).size == 0 + split_dist = np.bincount(y[test]) / len(test) + assert_allclose(split_dist, expect_dist, atol=0.001) + + +@pytest.mark.parametrize('cls_distr', + [(0.4, 0.6), + (0.3, 0.7), + (0.2, 0.8), + (0.8, 0.2)]) +@pytest.mark.parametrize('n_groups', [5, 30, 70]) +def test_stratified_group_kfold_against_group_kfold(cls_distr, n_groups): + # Check that given sufficient amount of samples StratifiedGroupKFold + # produces better stratified folds than regular GroupKFold + n_splits = 5 + 
sgkf = StratifiedGroupKFold(n_splits=n_splits) + gkf = GroupKFold(n_splits=n_splits) + rng = np.random.RandomState(0) + n_points = 1000 + y = rng.choice(2, size=n_points, p=cls_distr) + X = np.ones_like(y).reshape(-1, 1) + g = rng.choice(n_groups, n_points) + sgkf_folds = sgkf.split(X, y, groups=g) + gkf_folds = gkf.split(X, y, groups=g) + sgkf_entr = 0 + gkf_entr = 0 + for (sgkf_train, sgkf_test), (_, gkf_test) in zip(sgkf_folds, gkf_folds): + # check group constraint + assert np.intersect1d(g[sgkf_train], g[sgkf_test]).size == 0 + sgkf_distr = np.bincount(y[sgkf_test]) / len(sgkf_test) + gkf_distr = np.bincount(y[gkf_test]) / len(gkf_test) + sgkf_entr += stats.entropy(sgkf_distr, qk=cls_distr) + gkf_entr += stats.entropy(gkf_distr, qk=cls_distr) + sgkf_entr /= n_splits + gkf_entr /= n_splits + assert sgkf_entr <= gkf_entr + + def test_shuffle_split(): ss1 = ShuffleSplit(test_size=0.2, random_state=0).split(X) ss2 = ShuffleSplit(test_size=2, random_state=0).split(X) @@ -1310,7 +1435,8 @@ def test_cv_iterable_wrapper(): "successive calls to split should yield different results") -def test_group_kfold(): +@pytest.mark.parametrize('kfold', [GroupKFold, StratifiedGroupKFold]) +def test_group_kfold(kfold): rng = np.random.RandomState(0) # Parameters of the test @@ -1329,7 +1455,7 @@ def test_group_kfold(): len(np.unique(groups)) # Get the test fold indices from the test set indices of each fold folds = np.zeros(n_samples) - lkf = GroupKFold(n_splits=n_splits) + lkf = kfold(n_splits=n_splits) for i, (_, test) in enumerate(lkf.split(X, y, groups)): folds[test] = i @@ -1569,7 +1695,7 @@ def test_nested_cv(): groups = rng.randint(0, 5, 15) cvs = [LeaveOneGroupOut(), LeaveOneOut(), GroupKFold(n_splits=3), - StratifiedKFold(), + StratifiedKFold(), StratifiedGroupKFold(), StratifiedShuffleSplit(n_splits=3, random_state=0)] for inner_cv, outer_cv in combinations_with_replacement(cvs, 2): @@ -1640,7 +1766,8 @@ def test_leave_p_out_empty_trainset(): next(cv.split(X, y, groups=[1, 2])) -@pytest.mark.parametrize('Klass', (KFold, StratifiedKFold)) +@pytest.mark.parametrize('Klass', + (KFold, StratifiedKFold, StratifiedGroupKFold)) def test_random_state_shuffle_false(Klass): # passing a non-default random_state when shuffle=False makes no sense with pytest.raises(ValueError, @@ -1653,6 +1780,8 @@ def test_random_state_shuffle_false(Klass): (KFold(shuffle=True, random_state=123), True), (StratifiedKFold(), True), (StratifiedKFold(shuffle=True, random_state=123), True), + (StratifiedGroupKFold(shuffle=True, random_state=123), True), + (StratifiedGroupKFold(), True), (RepeatedKFold(random_state=123), True), (RepeatedStratifiedKFold(random_state=123), True), (ShuffleSplit(random_state=123), True), @@ -1664,7 +1793,6 @@ def test_random_state_shuffle_false(Klass): (LeaveOneGroupOut(), True), (LeavePGroupsOut(n_groups=2), True), (LeavePOut(p=2), True), - (KFold(shuffle=True, random_state=None), False), (KFold(shuffle=True, random_state=None), False), (StratifiedKFold(shuffle=True, random_state=np.random.RandomState(0)), From 3e45aeef901871b84ce59709e62f3d2245463cd8 Mon Sep 17 00:00:00 2001 From: LSturtew <56136443+LSturtew@users.noreply.github.com> Date: Sat, 20 Mar 2021 15:07:09 +0100 Subject: [PATCH 265/478] TST Remove assert warn from preprocessing tests (#19691) --- sklearn/preprocessing/tests/test_data.py | 37 ++++++++++++------- .../tests/test_discretization.py | 21 ++++++----- .../tests/test_function_transformer.py | 27 ++++++++------ sklearn/preprocessing/tests/test_label.py | 14 +++---- 4 files changed, 
56 insertions(+), 43 deletions(-) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 196060388ddd2..8a30eba27cff7 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -19,8 +19,6 @@ from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_less -from sklearn.utils._testing import assert_warns_message -from sklearn.utils._testing import assert_no_warnings from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_allclose_dense_sparse from sklearn.utils._testing import skip_if_32bit @@ -291,28 +289,37 @@ def test_standard_scaler_numerical_stability(): x = np.full(8, np.log(1e-5), dtype=np.float64) # This does not raise a warning as the number of samples is too low # to trigger the problem in recent numpy - x_scaled = assert_no_warnings(scale, x) + with pytest.warns(None) as record: + scale(x) + assert len(record) == 0 assert_array_almost_equal(scale(x), np.zeros(8)) # with 2 more samples, the std computation run into numerical issues: x = np.full(10, np.log(1e-5), dtype=np.float64) - w = "standard deviation of the data is probably very close to 0" - x_scaled = assert_warns_message(UserWarning, w, scale, x) + warning_message = ( + "standard deviation of the data is probably very close to 0" + ) + with pytest.warns(UserWarning, match=warning_message): + x_scaled = scale(x) assert_array_almost_equal(x_scaled, np.zeros(10)) x = np.full(10, 1e-100, dtype=np.float64) - x_small_scaled = assert_no_warnings(scale, x) + with pytest.warns(None) as record: + x_small_scaled = scale(x) + assert len(record) == 0 assert_array_almost_equal(x_small_scaled, np.zeros(10)) # Large values can cause (often recoverable) numerical stability issues: x_big = np.full(10, 1e100, dtype=np.float64) - w = "Dataset may contain too large values" - x_big_scaled = assert_warns_message(UserWarning, w, scale, x_big) + warning_message = ( + "Dataset may contain too large values" + ) + with pytest.warns(UserWarning, match=warning_message): + x_big_scaled = scale(x_big) assert_array_almost_equal(x_big_scaled, np.zeros(10)) assert_array_almost_equal(x_big_scaled, x_small_scaled) - - x_big_centered = assert_warns_message(UserWarning, w, scale, x_big, - with_std=False) + with pytest.warns(UserWarning, match=warning_message): + x_big_centered = scale(x_big, with_std=False) assert_array_almost_equal(x_big_centered, np.zeros(10)) assert_array_almost_equal(x_big_centered, x_small_scaled) @@ -1239,9 +1246,11 @@ def test_quantile_transform_sparse_ignore_zeros(): n_quantiles=5) # dense case -> warning raise - assert_warns_message(UserWarning, "'ignore_implicit_zeros' takes effect" - " only with sparse matrix. This parameter has no" - " effect.", transformer.fit, X) + warning_message = ("'ignore_implicit_zeros' takes effect" + " only with sparse matrix. 
This parameter has no" + " effect.") + with pytest.warns(UserWarning, match=warning_message): + transformer.fit(X) X_expected = np.array([[0, 0], [0, 0], diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index 9d607c82d5831..87f3de1ce4c6c 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -9,7 +9,6 @@ from sklearn.utils._testing import ( assert_array_almost_equal, assert_array_equal, - assert_warns_message, assert_allclose_dense_sparse ) @@ -109,9 +108,10 @@ def test_same_min_max(strategy): [1, 0], [1, 1]]) est = KBinsDiscretizer(strategy=strategy, n_bins=3, encode='ordinal') - assert_warns_message(UserWarning, - "Feature 0 is constant and will be replaced " - "with 0.", est.fit, X) + warning_message = ("Feature 0 is constant and will be replaced " + "with 0.") + with pytest.warns(UserWarning, match=warning_message): + est.fit(X) assert est.n_bins_[0] == 1 # replace the feature with zeros Xt = est.transform(X) @@ -257,9 +257,9 @@ def test_overwrite(): def test_redundant_bins(strategy, expected_bin_edges): X = [[0], [0], [0], [0], [3], [3]] kbd = KBinsDiscretizer(n_bins=3, strategy=strategy) - msg = ("Bins whose width are too small (i.e., <= 1e-8) in feature 0 " - "are removed. Consider decreasing the number of bins.") - assert_warns_message(UserWarning, msg, kbd.fit, X) + warning_message = ("Consider decreasing the number of bins.") + with pytest.warns(UserWarning, match=warning_message): + kbd.fit(X) assert_array_almost_equal(kbd.bin_edges_[0], expected_bin_edges) @@ -269,9 +269,10 @@ def test_percentile_numeric_stability(): Xt = np.array([0, 0, 4]).reshape(-1, 1) kbd = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile') - msg = ("Bins whose width are too small (i.e., <= 1e-8) in feature 0 " - "are removed. Consider decreasing the number of bins.") - assert_warns_message(UserWarning, msg, kbd.fit, X) + warning_message = ("Consider decreasing the number of bins.") + with pytest.warns(UserWarning, match=warning_message): + kbd.fit(X) + assert_array_almost_equal(kbd.bin_edges_[0], bin_edges) assert_array_almost_equal(kbd.transform(X), Xt) diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index 924975fbed2e1..327bfa95f1160 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py @@ -4,8 +4,7 @@ from sklearn.preprocessing import FunctionTransformer from sklearn.utils._testing import (assert_array_equal, - assert_allclose_dense_sparse) -from sklearn.utils._testing import assert_warns_message, assert_no_warnings + assert_allclose_dense_sparse) def _make_func(args_store, kwargs_store, func=lambda X, *a, **k: X): @@ -127,29 +126,35 @@ def test_check_inverse(): accept_sparse=accept_sparse, check_inverse=True, validate=True) - assert_warns_message(UserWarning, - "The provided functions are not strictly" - " inverse of each other. If you are sure you" - " want to proceed regardless, set" - " 'check_inverse=False'.", - trans.fit, X) + warning_message = ("The provided functions are not strictly" + " inverse of each other. 
If you are sure you" + " want to proceed regardless, set" + " 'check_inverse=False'.") + with pytest.warns(UserWarning, match=warning_message): + trans.fit(X) trans = FunctionTransformer(func=np.expm1, inverse_func=np.log1p, accept_sparse=accept_sparse, check_inverse=True, validate=True) - Xt = assert_no_warnings(trans.fit_transform, X) + with pytest.warns(None) as record: + Xt = trans.fit_transform(X) + assert len(record) == 0 assert_allclose_dense_sparse(X, trans.inverse_transform(Xt)) # check that we don't check inverse when one of the func or inverse is not # provided. trans = FunctionTransformer(func=np.expm1, inverse_func=None, check_inverse=True, validate=True) - assert_no_warnings(trans.fit, X_dense) + with pytest.warns(None) as record: + trans.fit(X_dense) + assert len(record) == 0 trans = FunctionTransformer(func=None, inverse_func=np.expm1, check_inverse=True, validate=True) - assert_no_warnings(trans.fit, X_dense) + with pytest.warns(None) as record: + trans.fit(X_dense) + assert len(record) == 0 def test_function_transformer_frame(): diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index aa9361d9164de..fd396ceb90712 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -12,7 +12,6 @@ from sklearn.utils.multiclass import type_of_target from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import ignore_warnings from sklearn.utils import _to_object_array @@ -351,15 +350,14 @@ def test_multilabel_binarizer_unknown_class(): mlb = MultiLabelBinarizer() y = [[1, 2]] Y = np.array([[1, 0], [0, 1]]) - w = 'unknown class(es) [0, 4] will be ignored' - matrix = assert_warns_message(UserWarning, w, - mlb.fit(y).transform, [[4, 1], [2, 0]]) - assert_array_equal(matrix, Y) + warning_message = 'unknown class.* will be ignored' + with pytest.warns(UserWarning, match=warning_message): + matrix = mlb.fit(y).transform([[4, 1], [2, 0]]) Y = np.array([[1, 0, 0], [0, 1, 0]]) mlb = MultiLabelBinarizer(classes=[1, 2, 3]) - matrix = assert_warns_message(UserWarning, w, - mlb.fit(y).transform, [[4, 1], [2, 0]]) + with pytest.warns(UserWarning, match=warning_message): + matrix = mlb.fit(y).transform([[4, 1], [2, 0]]) assert_array_equal(matrix, Y) @@ -535,7 +533,7 @@ def check_binarized_results(y, classes, pos_label, neg_label, expected): output_type=y_type, classes=classes, threshold=((neg_label + - pos_label) / + pos_label) / 2.)) assert_array_equal(toarray(inversed), toarray(y)) From 97bfa9266956993ed2c487ff085a6cea94b8eb59 Mon Sep 17 00:00:00 2001 From: JohanWork <39947546+JohanWork@users.noreply.github.com> Date: Sun, 21 Mar 2021 14:10:57 +0100 Subject: [PATCH 266/478] DOC Update broken link in conftest.py (#19736) --- conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conftest.py b/conftest.py index aec49c03ae13d..006dc973140a5 100644 --- a/conftest.py +++ b/conftest.py @@ -1,7 +1,7 @@ # Even if empty this file is useful so that when running from the root folder # ./sklearn is added to sys.path by pytest. See -# https://docs.pytest.org/en/latest/pythonpath.html for more details. For -# example, this allows to build extensions in place and run pytest +# https://docs.pytest.org/en/latest/explanation/pythonpath.html for more +# details. 
For example, this allows to build extensions in place and run pytest # doc/modules/clustering.rst and use sklearn from the local folder rather than # the one from site-packages. From e377d858325276ccbe1c5ac19403182c385c2184 Mon Sep 17 00:00:00 2001 From: cliffordEmmanuel <45907515+cliffordEmmanuel@users.noreply.github.com> Date: Sun, 21 Mar 2021 14:04:22 +0000 Subject: [PATCH 267/478] DOC Fix doc for single linkage in feature agglomeration (#19715) --- sklearn/cluster/_agglomerative.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 66342797e33b5..ee0a117824dd8 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -1000,7 +1000,7 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): the two sets. - complete or maximum linkage uses the maximum distances between all features of the two sets. - - single uses the minimum of the distances between all observations + - single uses the minimum of the distances between all features of the two sets. pooling_func : callable, default=np.mean From 40f2dd1be18b70db53c746d2ae02465b62fbe01f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jon=20Haitz=20Legarreta=20Gorro=C3=B1o?= Date: Sun, 21 Mar 2021 13:06:49 -0400 Subject: [PATCH 268/478] DOC Fix typo in KDE metric docstring default value (#19735) --- sklearn/neighbors/_kde.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 57f80f83762fb..5a5ad55d3261c 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -41,7 +41,7 @@ class KernelDensity(BaseEstimator): 'cosine'}, default='gaussian' The kernel to use. - metric : str, default='euclidian' + metric : str, default='euclidean' The distance metric to use. Note that not all metrics are valid with all algorithms. Refer to the documentation of :class:`BallTree` and :class:`KDTree` for a description of From 1db2681a051dc54b3e8b2af23a90830c67c1f56a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mehmet=20Ali=20=C3=96zer?= Date: Sun, 21 Mar 2021 20:07:24 +0300 Subject: [PATCH 269/478] DOC Fix load iris datasets (#19729) --- doc/modules/tree.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index 6d30fdcc6bf2f..d62ca5d8ed3e4 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -130,7 +130,8 @@ Using the Iris dataset, we can construct a tree as follows:: >>> from sklearn.datasets import load_iris >>> from sklearn import tree - >>> X, y = load_iris(return_X_y=True) + >>> iris = load_iris() + >>> X, y = iris.data, iris.target >>> clf = tree.DecisionTreeClassifier() >>> clf = clf.fit(X, y) From 81102146e35c81d7aab16d448f1c2b66d8a67ed9 Mon Sep 17 00:00:00 2001 From: guiweber Date: Sun, 21 Mar 2021 15:06:12 -0400 Subject: [PATCH 270/478] DOC Fixed typo in cross_val_predict docstring (#19739) --- sklearn/model_selection/_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index e61e693b2fa74..5f5338512a0f2 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -784,7 +784,7 @@ def cross_val_predict(estimator, X, y=None, *, groups=None, cv=None, verbose : int, default=0 The verbosity level. - fit_params : dict, defualt=None + fit_params : dict, default=None Parameters to pass to the fit method of the estimator. 
pre_dispatch : int or str, default='2*n_jobs' From c854b83c91dd8c1bf9282112dba10d50e43b59a4 Mon Sep 17 00:00:00 2001 From: Albert Thomas Date: Tue, 23 Mar 2021 10:53:57 +0100 Subject: [PATCH 271/478] [MRG] Linear One-Class SVM using SGD implementation (#10027) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tom Dupré la Tour Co-authored-by: Olivier Grisel --- benchmarks/bench_online_ocsvm.py | 279 ++++++++++ doc/modules/classes.rst | 1 + doc/modules/outlier_detection.rst | 32 +- doc/modules/sgd.rst | 52 ++ doc/whats_new/v1.0.rst | 7 + .../linear_model/plot_sgdocsvm_vs_ocsvm.py | 135 +++++ .../miscellaneous/plot_anomaly_comparison.py | 30 +- sklearn/linear_model/__init__.py | 3 +- sklearn/linear_model/_sgd_fast.pyx | 16 +- sklearn/linear_model/_stochastic_gradient.py | 486 +++++++++++++++++- sklearn/linear_model/tests/test_sgd.py | 384 +++++++++++++- sklearn/svm/_classes.py | 4 + 12 files changed, 1381 insertions(+), 48 deletions(-) create mode 100644 benchmarks/bench_online_ocsvm.py create mode 100644 examples/linear_model/plot_sgdocsvm_vs_ocsvm.py diff --git a/benchmarks/bench_online_ocsvm.py b/benchmarks/bench_online_ocsvm.py new file mode 100644 index 0000000000000..33262e8fcb690 --- /dev/null +++ b/benchmarks/bench_online_ocsvm.py @@ -0,0 +1,279 @@ +""" +===================================== +SGDOneClassSVM benchmark +===================================== +This benchmark compares the :class:`SGDOneClassSVM` with :class:`OneClassSVM`. +The former is an online One-Class SVM implemented with a Stochastic Gradient +Descent (SGD). The latter is based on the LibSVM implementation. The +complexity of :class:`SGDOneClassSVM` is linear in the number of samples +whereas the one of :class:`OneClassSVM` is at best quadratic in the number of +samples. We here compare the performance in terms of AUC and training time on +classical anomaly detection datasets. + +The :class:`OneClassSVM` is applied with a Gaussian kernel and we therefore +use a kernel approximation prior to the application of :class:`SGDOneClassSVM`. +""" + +from time import time +import numpy as np + +from scipy.interpolate import interp1d + +from sklearn.metrics import roc_curve, auc +from sklearn.datasets import fetch_kddcup99, fetch_covtype +from sklearn.preprocessing import LabelBinarizer, StandardScaler +from sklearn.pipeline import make_pipeline +from sklearn.utils import shuffle +from sklearn.kernel_approximation import Nystroem +from sklearn.svm import OneClassSVM +from sklearn.linear_model import SGDOneClassSVM + +import matplotlib.pyplot as plt +import matplotlib + +font = {'weight': 'normal', + 'size': 15} + +matplotlib.rc('font', **font) + +print(__doc__) + + +def print_outlier_ratio(y): + """ + Helper function to show the distinct value count of element in the target. + Useful indicator for the datasets used in bench_isolation_forest.py. 
+ """ + uniq, cnt = np.unique(y, return_counts=True) + print("----- Target count values: ") + for u, c in zip(uniq, cnt): + print("------ %s -> %d occurrences" % (str(u), c)) + print("----- Outlier ratio: %.5f" % (np.min(cnt) / len(y))) + + +# for roc curve computation +n_axis = 1000 +x_axis = np.linspace(0, 1, n_axis) + +datasets = ['http', 'smtp', 'SA', 'SF', 'forestcover'] + +novelty_detection = False # if False, training set polluted by outliers + +random_states = [42] +nu = 0.05 + +results_libsvm = np.empty((len(datasets), n_axis + 5)) +results_online = np.empty((len(datasets), n_axis + 5)) + +for dat, dataset_name in enumerate(datasets): + + print(dataset_name) + + # Loading datasets + if dataset_name in ['http', 'smtp', 'SA', 'SF']: + dataset = fetch_kddcup99(subset=dataset_name, shuffle=False, + percent10=False, random_state=88) + X = dataset.data + y = dataset.target + + if dataset_name == 'forestcover': + dataset = fetch_covtype(shuffle=False) + X = dataset.data + y = dataset.target + # normal data are those with attribute 2 + # abnormal those with attribute 4 + s = (y == 2) + (y == 4) + X = X[s, :] + y = y[s] + y = (y != 2).astype(int) + + # Vectorizing data + if dataset_name == 'SF': + # Casting type of X (object) as string is needed for string categorical + # features to apply LabelBinarizer + lb = LabelBinarizer() + x1 = lb.fit_transform(X[:, 1].astype(str)) + X = np.c_[X[:, :1], x1, X[:, 2:]] + y = (y != b'normal.').astype(int) + + if dataset_name == 'SA': + lb = LabelBinarizer() + # Casting type of X (object) as string is needed for string categorical + # features to apply LabelBinarizer + x1 = lb.fit_transform(X[:, 1].astype(str)) + x2 = lb.fit_transform(X[:, 2].astype(str)) + x3 = lb.fit_transform(X[:, 3].astype(str)) + X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]] + y = (y != b'normal.').astype(int) + + if dataset_name in ['http', 'smtp']: + y = (y != b'normal.').astype(int) + + print_outlier_ratio(y) + + n_samples, n_features = np.shape(X) + if dataset_name == 'SA': # LibSVM too long with n_samples // 2 + n_samples_train = n_samples // 20 + else: + n_samples_train = n_samples // 2 + + n_samples_test = n_samples - n_samples_train + print('n_train: ', n_samples_train) + print('n_features: ', n_features) + + tpr_libsvm = np.zeros(n_axis) + tpr_online = np.zeros(n_axis) + fit_time_libsvm = 0 + fit_time_online = 0 + predict_time_libsvm = 0 + predict_time_online = 0 + + X = X.astype(float) + + gamma = 1 / n_features # OCSVM default parameter + + for random_state in random_states: + + print('random state: %s' % random_state) + + X, y = shuffle(X, y, random_state=random_state) + X_train = X[:n_samples_train] + X_test = X[n_samples_train:] + y_train = y[:n_samples_train] + y_test = y[n_samples_train:] + + if novelty_detection: + X_train = X_train[y_train == 0] + y_train = y_train[y_train == 0] + + std = StandardScaler() + + print('----------- LibSVM OCSVM ------------') + ocsvm = OneClassSVM(kernel='rbf', gamma=gamma, nu=nu) + pipe_libsvm = make_pipeline(std, ocsvm) + + tstart = time() + pipe_libsvm.fit(X_train) + fit_time_libsvm += time() - tstart + + tstart = time() + # scoring such that the lower, the more normal + scoring = -pipe_libsvm.decision_function(X_test) + predict_time_libsvm += time() - tstart + fpr_libsvm_, tpr_libsvm_, _ = roc_curve(y_test, scoring) + + f_libsvm = interp1d(fpr_libsvm_, tpr_libsvm_) + tpr_libsvm += f_libsvm(x_axis) + + print('----------- Online OCSVM ------------') + nystroem = Nystroem(gamma=gamma, random_state=random_state) + online_ocsvm = 
SGDOneClassSVM(nu=nu, random_state=random_state) + pipe_online = make_pipeline(std, nystroem, online_ocsvm) + + tstart = time() + pipe_online.fit(X_train) + fit_time_online += time() - tstart + + tstart = time() + # scoring such that the lower, the more normal + scoring = -pipe_online.decision_function(X_test) + predict_time_online += time() - tstart + fpr_online_, tpr_online_, _ = roc_curve(y_test, scoring) + + f_online = interp1d(fpr_online_, tpr_online_) + tpr_online += f_online(x_axis) + + tpr_libsvm /= len(random_states) + tpr_libsvm[0] = 0. + fit_time_libsvm /= len(random_states) + predict_time_libsvm /= len(random_states) + auc_libsvm = auc(x_axis, tpr_libsvm) + + results_libsvm[dat] = ([fit_time_libsvm, predict_time_libsvm, + auc_libsvm, n_samples_train, + n_features] + list(tpr_libsvm)) + + tpr_online /= len(random_states) + tpr_online[0] = 0. + fit_time_online /= len(random_states) + predict_time_online /= len(random_states) + auc_online = auc(x_axis, tpr_online) + + results_online[dat] = ([fit_time_online, predict_time_online, + auc_online, n_samples_train, + n_features] + list(tpr_libsvm)) + + +# -------- Plotting bar charts ------------- +fit_time_libsvm_all = results_libsvm[:, 0] +predict_time_libsvm_all = results_libsvm[:, 1] +auc_libsvm_all = results_libsvm[:, 2] +n_train_all = results_libsvm[:, 3] +n_features_all = results_libsvm[:, 4] + +fit_time_online_all = results_online[:, 0] +predict_time_online_all = results_online[:, 1] +auc_online_all = results_online[:, 2] + + +width = 0.7 +ind = 2 * np.arange(len(datasets)) +x_tickslabels = [(name + '\n' + r'$n={:,d}$' + '\n' + r'$d={:d}$') + .format(int(n), int(d)) + for name, n, d in zip(datasets, n_train_all, n_features_all)] + + +def autolabel_auc(rects, ax): + """Attach a text label above each bar displaying its height.""" + for rect in rects: + height = rect.get_height() + ax.text(rect.get_x() + rect.get_width() / 2., 1.05 * height, + '%.3f' % height, ha='center', va='bottom') + + +def autolabel_time(rects, ax): + """Attach a text label above each bar displaying its height.""" + for rect in rects: + height = rect.get_height() + ax.text(rect.get_x() + rect.get_width() / 2., 1.05 * height, + '%.1f' % height, ha='center', va='bottom') + + +fig, ax = plt.subplots(figsize=(15, 8)) +ax.set_ylabel('AUC') +ax.set_ylim((0, 1.3)) +rect_libsvm = ax.bar(ind, auc_libsvm_all, width=width, color='r') +rect_online = ax.bar(ind + width, auc_online_all, width=width, color='y') +ax.legend((rect_libsvm[0], rect_online[0]), ('LibSVM', 'Online SVM')) +ax.set_xticks(ind + width / 2) +ax.set_xticklabels(x_tickslabels) +autolabel_auc(rect_libsvm, ax) +autolabel_auc(rect_online, ax) +plt.show() + + +fig, ax = plt.subplots(figsize=(15, 8)) +ax.set_ylabel('Training time (sec) - Log scale') +ax.set_yscale('log') +rect_libsvm = ax.bar(ind, fit_time_libsvm_all, color='r', width=width) +rect_online = ax.bar(ind + width, fit_time_online_all, color='y', width=width) +ax.legend((rect_libsvm[0], rect_online[0]), ('LibSVM', 'Online SVM')) +ax.set_xticks(ind + width / 2) +ax.set_xticklabels(x_tickslabels) +autolabel_time(rect_libsvm, ax) +autolabel_time(rect_online, ax) +plt.show() + + +fig, ax = plt.subplots(figsize=(15, 8)) +ax.set_ylabel('Testing time (sec) - Log scale') +ax.set_yscale('log') +rect_libsvm = ax.bar(ind, predict_time_libsvm_all, color='r', width=width) +rect_online = ax.bar(ind + width, predict_time_online_all, + color='y', width=width) +ax.legend((rect_libsvm[0], rect_online[0]), ('LibSVM', 'Online SVM')) +ax.set_xticks(ind + width / 2) 
+ax.set_xticklabels(x_tickslabels) +autolabel_time(rect_libsvm, ax) +autolabel_time(rect_online, ax) +plt.show() diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index ceebfc337352a..45195dcedec64 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -762,6 +762,7 @@ Linear classifiers linear_model.RidgeClassifier linear_model.RidgeClassifierCV linear_model.SGDClassifier + linear_model.SGDOneClassSVM Classical linear regressors --------------------------- diff --git a/doc/modules/outlier_detection.rst b/doc/modules/outlier_detection.rst index 5d2008f3c3f58..14495bc558dab 100644 --- a/doc/modules/outlier_detection.rst +++ b/doc/modules/outlier_detection.rst @@ -110,9 +110,14 @@ does not perform very well for outlier detection. That being said, outlier detection in high-dimension, or without any assumptions on the distribution of the inlying data is very challenging. :class:`svm.OneClassSVM` may still be used with outlier detection but requires fine-tuning of its hyperparameter -`nu` to handle outliers and prevent overfitting. Finally, -:class:`covariance.EllipticEnvelope` assumes the data is Gaussian and learns -an ellipse. For more details on the different estimators refer to the example +`nu` to handle outliers and prevent overfitting. +:class:`linear_model.SGDOneClassSVM` provides an implementation of a +linear One-Class SVM with a linear complexity in the number of samples. This +implementation is here used with a kernel approximation technique to obtain +results similar to :class:`svm.OneClassSVM` which uses a Gaussian kernel +by default. Finally, :class:`covariance.EllipticEnvelope` assumes the data is +Gaussian and learns an ellipse. For more details on the different estimators +refer to the example :ref:`sphx_glr_auto_examples_miscellaneous_plot_anomaly_comparison.py` and the sections hereunder. @@ -173,6 +178,23 @@ but regular, observation outside the frontier. :scale: 75% +Scaling up the One-Class SVM +---------------------------- + +An online linear version of the One-Class SVM is implemented in +:class:`linear_model.SGDOneClassSVM`. This implementation scales linearly with +the number of samples and can be used with a kernel approximation to +approximate the solution of a kernelized :class:`svm.OneClassSVM` whose +complexity is at best quadratic in the number of samples. See section +:ref:`sgd_online_one_class_svm` for more details. + +.. topic:: Examples: + + * See :ref:`sphx_glr_auto_examples_linear_model_plot_sgdocsvm_vs_ocsvm.py` + for an illustration of the approximation of a kernelized One-Class SVM + with the `linear_model.SGDOneClassSVM` combined with kernel approximation. + + Outlier Detection ================= @@ -278,8 +300,8 @@ allows you to add more trees to an already fitted model:: for a comparison of :class:`ensemble.IsolationForest` with :class:`neighbors.LocalOutlierFactor`, :class:`svm.OneClassSVM` (tuned to perform like an outlier detection - method) and a covariance-based outlier detection with - :class:`covariance.EllipticEnvelope`. + method), :class:`linear_model.SGDOneClassSVM`, and a covariance-based + outlier detection with :class:`covariance.EllipticEnvelope`. .. 
topic:: References: diff --git a/doc/modules/sgd.rst b/doc/modules/sgd.rst index 1376947540e78..0a1d8407e64ae 100644 --- a/doc/modules/sgd.rst +++ b/doc/modules/sgd.rst @@ -232,6 +232,58 @@ For regression with a squared loss and a l2 penalty, another variant of SGD with an averaging strategy is available with Stochastic Average Gradient (SAG) algorithm, available as a solver in :class:`Ridge`. +.. _sgd_online_one_class_svm: + +Online One-Class SVM +==================== + +The class :class:`sklearn.linear_model.SGDOneClassSVM` implements an online +linear version of the One-Class SVM using a stochastic gradient descent. +Combined with kernel approximation techniques, +:class:`sklearn.linear_model.SGDOneClassSVM` can be used to approximate the +solution of a kernelized One-Class SVM, implemented in +:class:`sklearn.svm.OneClassSVM`, with a linear complexity in the number of +samples. Note that the complexity of a kernelized One-Class SVM is at best +quadratic in the number of samples. +:class:`sklearn.linear_model.SGDOneClassSVM` is thus well suited for datasets +with a large number of training samples (> 10,000) for which the SGD +variant can be several orders of magnitude faster. + +Its implementation is based on the implementation of the stochastic +gradient descent. Indeed, the original optimization problem of the One-Class +SVM is given by + +.. math:: + + \begin{aligned} + \min_{w, \rho, \xi} & \quad \frac{1}{2}\Vert w \Vert^2 - \rho + \frac{1}{\nu n} \sum_{i=1}^n \xi_i \\ + \text{s.t.} & \quad \langle w, x_i \rangle \geq \rho - \xi_i \quad 1 \leq i \leq n \\ + & \quad \xi_i \geq 0 \quad 1 \leq i \leq n + \end{aligned} + +where :math:`\nu \in (0, 1]` is the user-specified parameter controlling the +proportion of outliers and the proportion of support vectors. Getting rid of +the slack variables :math:`\xi_i` this problem is equivalent to + +.. math:: + + \min_{w, \rho} \frac{1}{2}\Vert w \Vert^2 - \rho + \frac{1}{\nu n} \sum_{i=1}^n \max(0, \rho - \langle w, x_i \rangle) \, . + +Multiplying by the constant :math:`\nu` and introducing the intercept +:math:`b = 1 - \rho` we obtain the following equivalent optimization problem + +.. math:: + + \min_{w, b} \frac{\nu}{2}\Vert w \Vert^2 + b\nu + \frac{1}{n} \sum_{i=1}^n \max(0, 1 - (\langle w, x_i \rangle + b)) \, . + +This is similar to the optimization problems studied in section +:ref:`sgd_mathematical_formulation` with :math:`y_i = 1, 1 \leq i \leq n` and +:math:`\alpha = \nu/2`, :math:`L` being the hinge loss function and :math:`R` +being the L2 norm. We just need to add the term :math:`b\nu` in the +optimization loop. + +As :class:`SGDClassifier` and :class:`SGDRegressor`, :class:`SGDOneClassSVM` +supports averaged SGD. Averaging can be enabled by setting ``average=True``. Stochastic Gradient Descent for sparse data =========================================== diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 521e358ac2f02..c252f5df1074e 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -147,6 +147,13 @@ Changelog :mod:`sklearn.linear_model` ........................... +- |Feature| The new :class:`linear_model.SGDOneClassSVM` provides an SGD + implementation of the linear One-Class SVM. Combined with kernel + approximation techniques, this implementation approximates the solution of + a kernelized One Class SVM while benefitting from a linear + complexity in the number of samples. + :pr:`10027` by :user:`Albert Thomas `. 
+ - |Efficiency| The implementation of :class:`linear_model.LogisticRegression` has been optimised for dense matrices when using `solver='newton-cg'` and `multi_class!='multinomial'`. diff --git a/examples/linear_model/plot_sgdocsvm_vs_ocsvm.py b/examples/linear_model/plot_sgdocsvm_vs_ocsvm.py new file mode 100644 index 0000000000000..e70694cdb1c1b --- /dev/null +++ b/examples/linear_model/plot_sgdocsvm_vs_ocsvm.py @@ -0,0 +1,135 @@ +""" +==================================================================== +One-Class SVM versus One-Class SVM using Stochastic Gradient Descent +==================================================================== + +This example shows how to approximate the solution of +:class:`sklearn.svm.OneClassSVM` in the case of an RBF kernel with +:class:`sklearn.linear_model.SGDOneClassSVM`, a Stochastic Gradient Descent +(SGD) version of the One-Class SVM. A kernel approximation is first used in +order to apply :class:`sklearn.linear_model.SGDOneClassSVM` which implements a +linear One-Class SVM using SGD. + +Note that :class:`sklearn.linear_model.SGDOneClassSVM` scales linearly with +the number of samples whereas the complexity of a kernelized +:class:`sklearn.svm.OneClassSVM` is at best quadratic with respect to the +number of samples. It is not the purpose of this example to illustrate the +benefits of such an approximation in terms of computation time but rather to +show that we obtain similar results on a toy dataset. +""" +print(__doc__) # noqa + +import numpy as np +import matplotlib.pyplot as plt +import matplotlib +from sklearn.svm import OneClassSVM +from sklearn.linear_model import SGDOneClassSVM +from sklearn.kernel_approximation import Nystroem +from sklearn.pipeline import make_pipeline + +font = {'weight': 'normal', + 'size': 15} + +matplotlib.rc('font', **font) + +random_state = 42 +rng = np.random.RandomState(random_state) + +# Generate train data +X = 0.3 * rng.randn(500, 2) +X_train = np.r_[X + 2, X - 2] +# Generate some regular novel observations +X = 0.3 * rng.randn(20, 2) +X_test = np.r_[X + 2, X - 2] +# Generate some abnormal novel observations +X_outliers = rng.uniform(low=-4, high=4, size=(20, 2)) + +xx, yy = np.meshgrid(np.linspace(-4.5, 4.5, 50), np.linspace(-4.5, 4.5, 50)) + +# OCSVM hyperparameters +nu = 0.05 +gamma = 2. 
+ +# Fit the One-Class SVM +clf = OneClassSVM(gamma=gamma, kernel='rbf', nu=nu) +clf.fit(X_train) +y_pred_train = clf.predict(X_train) +y_pred_test = clf.predict(X_test) +y_pred_outliers = clf.predict(X_outliers) +n_error_train = y_pred_train[y_pred_train == -1].size +n_error_test = y_pred_test[y_pred_test == -1].size +n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size + +Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) +Z = Z.reshape(xx.shape) + + +# Fit the One-Class SVM using a kernel approximation and SGD +transform = Nystroem(gamma=gamma, random_state=random_state) +clf_sgd = SGDOneClassSVM(nu=nu, shuffle=True, fit_intercept=True, + random_state=random_state, tol=1e-4) +pipe_sgd = make_pipeline(transform, clf_sgd) +pipe_sgd.fit(X_train) +y_pred_train_sgd = pipe_sgd.predict(X_train) +y_pred_test_sgd = pipe_sgd.predict(X_test) +y_pred_outliers_sgd = pipe_sgd.predict(X_outliers) +n_error_train_sgd = y_pred_train_sgd[y_pred_train_sgd == -1].size +n_error_test_sgd = y_pred_test_sgd[y_pred_test_sgd == -1].size +n_error_outliers_sgd = y_pred_outliers_sgd[y_pred_outliers_sgd == 1].size + +Z_sgd = pipe_sgd.decision_function(np.c_[xx.ravel(), yy.ravel()]) +Z_sgd = Z_sgd.reshape(xx.shape) + +# plot the level sets of the decision function +plt.figure(figsize=(9, 6)) +plt.title('One Class SVM') +plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu) +a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred') +plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred') + +s = 20 +b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s, edgecolors='k') +b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s, + edgecolors='k') +c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s, + edgecolors='k') +plt.axis('tight') +plt.xlim((-4.5, 4.5)) +plt.ylim((-4.5, 4.5)) +plt.legend([a.collections[0], b1, b2, c], + ["learned frontier", "training observations", + "new regular observations", "new abnormal observations"], + loc="upper left") +plt.xlabel( + "error train: %d/%d; errors novel regular: %d/%d; " + "errors novel abnormal: %d/%d" + % (n_error_train, X_train.shape[0], n_error_test, X_test.shape[0], + n_error_outliers, X_outliers.shape[0])) +plt.show() + +plt.figure(figsize=(9, 6)) +plt.title('Online One-Class SVM') +plt.contourf(xx, yy, Z_sgd, levels=np.linspace(Z_sgd.min(), 0, 7), + cmap=plt.cm.PuBu) +a = plt.contour(xx, yy, Z_sgd, levels=[0], linewidths=2, colors='darkred') +plt.contourf(xx, yy, Z_sgd, levels=[0, Z_sgd.max()], colors='palevioletred') + +s = 20 +b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s, edgecolors='k') +b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s, + edgecolors='k') +c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s, + edgecolors='k') +plt.axis('tight') +plt.xlim((-4.5, 4.5)) +plt.ylim((-4.5, 4.5)) +plt.legend([a.collections[0], b1, b2, c], + ["learned frontier", "training observations", + "new regular observations", "new abnormal observations"], + loc="upper left") +plt.xlabel( + "error train: %d/%d; errors novel regular: %d/%d; " + "errors novel abnormal: %d/%d" + % (n_error_train_sgd, X_train.shape[0], n_error_test_sgd, X_test.shape[0], + n_error_outliers_sgd, X_outliers.shape[0])) +plt.show() diff --git a/examples/miscellaneous/plot_anomaly_comparison.py b/examples/miscellaneous/plot_anomaly_comparison.py index b5ebd96bd8815..c0c3a4f890923 100644 --- a/examples/miscellaneous/plot_anomaly_comparison.py +++ 
b/examples/miscellaneous/plot_anomaly_comparison.py @@ -22,7 +22,17 @@ One-class SVM might give useful results in these situations depending on the value of its hyperparameters. -:class:`~sklearn.covariance.EllipticEnvelope` assumes the data is Gaussian and +The :class:`sklearn.linear_model.SGDOneClassSVM` is an implementation of the +One-Class SVM based on stochastic gradient descent (SGD). Combined with kernel +approximation, this estimator can be used to approximate the solution +of a kernelized :class:`sklearn.svm.OneClassSVM`. We note that, although not +identical, the decision boundaries of the +:class:`sklearn.linear_model.SGDOneClassSVM` and the ones of +:class:`sklearn.svm.OneClassSVM` are very similar. The main advantage of using +:class:`sklearn.linear_model.SGDOneClassSVM` is that it scales linearly with +the number of samples. + +:class:`sklearn.covariance.EllipticEnvelope` assumes the data is Gaussian and learns an ellipse. It thus degrades when the data is not unimodal. Notice however that this estimator is robust to outliers. @@ -66,6 +76,9 @@ from sklearn.covariance import EllipticEnvelope from sklearn.ensemble import IsolationForest from sklearn.neighbors import LocalOutlierFactor +from sklearn.linear_model import SGDOneClassSVM +from sklearn.kernel_approximation import Nystroem +from sklearn.pipeline import make_pipeline print(__doc__) @@ -77,11 +90,18 @@ n_outliers = int(outliers_fraction * n_samples) n_inliers = n_samples - n_outliers -# define outlier/anomaly detection methods to be compared +# define outlier/anomaly detection methods to be compared. +# the SGDOneClassSVM must be used in a pipeline with a kernel approximation +# to give similar results to the OneClassSVM anomaly_algorithms = [ ("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)), ("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)), + ("One-Class SVM (SGD)", make_pipeline( + Nystroem(gamma=0.1, random_state=42, n_components=150), + SGDOneClassSVM(nu=outliers_fraction, shuffle=True, + fit_intercept=True, random_state=42, tol=1e-6) + )), ("Isolation Forest", IsolationForest(contamination=outliers_fraction, random_state=42)), ("Local Outlier Factor", LocalOutlierFactor( @@ -104,7 +124,7 @@ xx, yy = np.meshgrid(np.linspace(-7, 7, 150), np.linspace(-7, 7, 150)) -plt.figure(figsize=(len(anomaly_algorithms) * 2 + 3, 12.5)) +plt.figure(figsize=(len(anomaly_algorithms) * 2 + 4, 12.5)) plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05, hspace=.01) @@ -113,8 +133,8 @@ for i_dataset, X in enumerate(datasets): # Add outliers - X = np.concatenate([X, rng.uniform(low=-6, high=6, - size=(n_outliers, 2))], axis=0) + X = np.concatenate([X, rng.uniform(low=-6, high=6, size=(n_outliers, 2))], + axis=0) for name, algorithm in anomaly_algorithms: t0 = time.time() diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 110e0008bccc9..f715e30795961 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,7 +18,7 @@ GammaRegressor, TweedieRegressor) from ._huber import HuberRegressor from ._sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber -from ._stochastic_gradient import SGDClassifier, SGDRegressor +from ._stochastic_gradient import SGDClassifier, SGDRegressor, SGDOneClassSVM from ._ridge import (Ridge, RidgeCV, RidgeClassifier, RidgeClassifierCV, ridge_regression) from ._logistic import LogisticRegression, LogisticRegressionCV @@ -65,6 +65,7 @@ 'RidgeClassifierCV', 
'SGDClassifier', 'SGDRegressor', + 'SGDOneClassSVM', 'SquaredLoss', 'TheilSenRegressor', 'enet_path', diff --git a/sklearn/linear_model/_sgd_fast.pyx b/sklearn/linear_model/_sgd_fast.pyx index 3940e5d873669..dab7b36b14d0e 100644 --- a/sklearn/linear_model/_sgd_fast.pyx +++ b/sklearn/linear_model/_sgd_fast.pyx @@ -55,7 +55,7 @@ cdef class LossFunction: Parameters ---------- p : double - The prediction, p = w^T x + The prediction, p = w^T x + intercept y : double The true value (aka target) @@ -358,6 +358,7 @@ def _plain_sgd(np.ndarray[double, ndim=1, mode='c'] weights, double weight_pos, double weight_neg, int learning_rate, double eta0, double power_t, + bint one_class, double t=1.0, double intercept_decay=1.0, int average=0): @@ -427,6 +428,8 @@ def _plain_sgd(np.ndarray[double, ndim=1, mode='c'] weights, The initial learning rate. power_t : double The exponent for inverse scaling learning rate. + one_class : boolean + Whether to solve the One-Class SVM optimization problem. t : double Initial state of the learning rate. This value is equal to the iteration count except when the learning rate is set to `optimal`. @@ -435,6 +438,7 @@ def _plain_sgd(np.ndarray[double, ndim=1, mode='c'] weights, The number of iterations before averaging starts. average=1 is equivalent to averaging for all iterations. + Returns ------- weights : array, shape=[n_features] @@ -468,6 +472,7 @@ def _plain_sgd(np.ndarray[double, ndim=1, mode='c'] weights, cdef double eta = 0.0 cdef double p = 0.0 cdef double update = 0.0 + cdef double intercept_update = 0.0 cdef double sumloss = 0.0 cdef double score = 0.0 cdef double best_loss = INFINITY @@ -574,10 +579,15 @@ def _plain_sgd(np.ndarray[double, ndim=1, mode='c'] weights, # do not scale to negative values when eta or alpha are too # big: instead set the weights to zero w.scale(max(0, 1.0 - ((1.0 - l1_ratio) * eta * alpha))) + if update != 0.0: w.add(x_data_ptr, x_ind_ptr, xnnz, update) - if fit_intercept == 1: - intercept += update * intercept_decay + if fit_intercept == 1: + intercept_update = update + if one_class: # specific for One-Class SVM + intercept_update -= 2. * eta * alpha + if intercept_update != 0: + intercept += intercept_update * intercept_decay if 0 < average <= t: # compute the average for the intercept and update the diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index a426c9a8d95f2..44ecf564ffcc5 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -2,7 +2,9 @@ # Mathieu Blondel (partial_fit support) # # License: BSD 3 clause -"""Classification and regression using Stochastic Gradient Descent (SGD).""" +"""Classification, regression and One-Class SVM using Stochastic Gradient +Descent (SGD). +""" import numpy as np import warnings @@ -14,7 +16,7 @@ from ..base import clone, is_classifier from ._base import LinearClassifierMixin, SparseCoefMixin from ._base import make_dataset -from ..base import BaseEstimator, RegressorMixin +from ..base import BaseEstimator, RegressorMixin, OutlierMixin from ..utils import check_random_state from ..utils.extmath import safe_sparse_dot from ..utils.multiclass import _check_partial_fit_first_call @@ -134,7 +136,7 @@ def _validate_params(self, for_partial_fit=False): raise ValueError("max_iter must be > zero. 
Got %f" % self.max_iter) if not (0.0 <= self.l1_ratio <= 1.0): raise ValueError("l1_ratio must be in [0, 1]") - if self.alpha < 0.0: + if not isinstance(self, SGDOneClassSVM) and self.alpha < 0.0: raise ValueError("alpha must be >= 0") if self.n_iter_no_change < 1: raise ValueError("n_iter_no_change must be >= 1") @@ -190,7 +192,7 @@ def _get_penalty_type(self, penalty): raise ValueError("Penalty %s is not supported. " % penalty) from e def _allocate_parameter_mem(self, n_classes, n_features, coef_init=None, - intercept_init=None): + intercept_init=None, one_class=0): """Allocate mem for parameters; initialize if provided.""" if n_classes > 2: # allocate coef_ for multi-class @@ -215,7 +217,7 @@ def _allocate_parameter_mem(self, n_classes, n_features, coef_init=None, self.intercept_ = np.zeros(n_classes, dtype=np.float64, order="C") else: - # allocate coef_ for binary problem + # allocate coef_ if coef_init is not None: coef_init = np.asarray(coef_init, dtype=np.float64, order="C") @@ -229,26 +231,36 @@ def _allocate_parameter_mem(self, n_classes, n_features, coef_init=None, dtype=np.float64, order="C") - # allocate intercept_ for binary problem + # allocate intercept_ if intercept_init is not None: intercept_init = np.asarray(intercept_init, dtype=np.float64) if intercept_init.shape != (1,) and intercept_init.shape != (): raise ValueError("Provided intercept_init " "does not match dataset.") - self.intercept_ = intercept_init.reshape(1,) + if one_class: + self.offset_ = intercept_init.reshape(1,) + else: + self.intercept_ = intercept_init.reshape(1,) else: - self.intercept_ = np.zeros(1, dtype=np.float64, order="C") + if one_class: + self.offset_ = np.zeros(1, dtype=np.float64, order="C") + else: + self.intercept_ = np.zeros(1, dtype=np.float64, order="C") # initialize average parameters if self.average > 0: self._standard_coef = self.coef_ - self._standard_intercept = self.intercept_ self._average_coef = np.zeros(self.coef_.shape, dtype=np.float64, order="C") - self._average_intercept = np.zeros(self._standard_intercept.shape, - dtype=np.float64, - order="C") + if one_class: + self._standard_intercept = 1 - self.offset_ + else: + self._standard_intercept = self.intercept_ + + self._average_intercept = np.zeros( + self._standard_intercept.shape, dtype=np.float64, + order="C") def _make_validation_split(self, y): """Split the dataset between training set and validation set. @@ -447,7 +459,7 @@ def fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter, est.early_stopping, validation_score_cb, int(est.n_iter_no_change), max_iter, tol, int(est.fit_intercept), int(est.verbose), int(est.shuffle), seed, pos_weight, neg_weight, learning_rate_type, - est.eta0, est.power_t, est.t_, intercept_decay, est.average) + est.eta0, est.power_t, 0, est.t_, intercept_decay, est.average) if est.average: if len(est.classes_) == 2: @@ -1363,7 +1375,7 @@ def _fit_regressor(self, X, y, alpha, C, loss, learning_rate, seed, 1.0, 1.0, learning_rate_type, - self.eta0, self.power_t, self.t_, + self.eta0, self.power_t, 0, self.t_, intercept_decay, self.average) self.t_ += self.n_iter_ * X.shape[0] @@ -1626,3 +1638,449 @@ def _more_tags(self): 'zero sample_weight is not equivalent to removing samples', } } + + +class SGDOneClassSVM(BaseSGD, OutlierMixin): + """Solves linear One-Class SVM using Stochastic Gradient Descent. + + This implementation is meant to be used with a kernel approximation + technique (e.g. 
`sklearn.kernel_approximation.Nystroem`) to obtain results + similar to `sklearn.svm.OneClassSVM` which uses a Gaussian kernel by + default. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + nu : float, optional + The nu parameter of the One Class SVM: an upper bound on the + fraction of training errors and a lower bound of the fraction of + support vectors. Should be in the interval (0, 1]. By default 0.5 + will be taken. + + fit_intercept : bool + Whether the intercept should be estimated or not. Defaults to True. + + max_iter : int, optional + The maximum number of passes over the training data (aka epochs). + It only impacts the behavior in the ``fit`` method, and not the + `partial_fit`. Defaults to 1000. + + tol : float or None, optional + The stopping criterion. If it is not None, the iterations will stop + when (loss > previous_loss - tol). Defaults to 1e-3. + + shuffle : bool, optional + Whether or not the training data should be shuffled after each epoch. + Defaults to True. + + verbose : integer, optional + The verbosity level + + random_state : int, RandomState instance or None, optional (default=None) + The seed of the pseudo random number generator to use when shuffling + the data. If int, random_state is the seed used by the random number + generator; If RandomState instance, random_state is the random number + generator; If None, the random number generator is the RandomState + instance used by `np.random`. + + learning_rate : string, optional + The learning rate schedule: + + 'constant': + eta = eta0 + 'optimal': [default] + eta = 1.0 / (alpha * (t + t0)) + where t0 is chosen by a heuristic proposed by Leon Bottou. + 'invscaling': + eta = eta0 / pow(t, power_t) + 'adaptive': + eta = eta0, as long as the training keeps decreasing. + Each time n_iter_no_change consecutive epochs fail to decrease the + training loss by tol or fail to increase validation score by tol if + early_stopping is True, the current learning rate is divided by 5. + + eta0 : double + The initial learning rate for the 'constant', 'invscaling' or + 'adaptive' schedules. The default value is 0.0 as eta0 is not used by + the default schedule 'optimal'. + + power_t : double + The exponent for inverse scaling learning rate [default 0.5]. + + warm_start : bool, optional + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + + Repeatedly calling fit or partial_fit when warm_start is True can + result in a different solution than when calling fit a single time + because of the way the data is shuffled. + If a dynamic learning rate is used, the learning rate is adapted + depending on the number of samples already seen. Calling ``fit`` resets + this counter, while ``partial_fit`` will result in increasing the + existing counter. + + average : bool or int, optional + When set to True, computes the averaged SGD weights and stores the + result in the ``coef_`` attribute. If set to an int greater than 1, + averaging will begin once the total number of samples seen reaches + average. So ``average=10`` will begin averaging after seeing 10 + samples. + + Attributes + ---------- + coef_ : array, shape (1, n_features) + Weights assigned to the features. + + offset_ : array, shape (1,) + Offset used to define the decision function from the raw scores. + We have the relation: decision_function = score_samples - offset. 
+ + n_iter_ : int + The actual number of iterations to reach the stopping criterion. + + t_ : int + Number of weight updates performed during training. + Same as ``(n_iter_ * n_samples)``. + + loss_function_ : concrete ``LossFunction`` + + Examples + -------- + >>> import numpy as np + >>> from sklearn import linear_model + >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) + >>> clf = linear_model.SGDOneClassSVM(random_state=42) + >>> clf.fit(X) + SGDOneClassSVM(random_state=42) + + >>> print(clf.predict([[4, 4]])) + [1] + + See also + -------- + sklearn.svm.OneClassSVM + + Notes + ----- + This estimator has a linear complexity in the number of training samples + and is thus better suited than the `sklearn.svm.OneClassSVM` + implementation for datasets with a large number of training samples (say + > 10,000). + """ + + loss_functions = {"hinge": (Hinge, 1.0)} + + def __init__(self, nu=0.5, fit_intercept=True, max_iter=1000, tol=1e-3, + shuffle=True, verbose=0, random_state=None, + learning_rate="optimal", eta0=0.0, power_t=0.5, + warm_start=False, average=False): + + alpha = nu / 2 + self.nu = nu + super(SGDOneClassSVM, self).__init__( + loss="hinge", penalty='l2', alpha=alpha, C=1.0, l1_ratio=0, + fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, + shuffle=shuffle, verbose=verbose, epsilon=DEFAULT_EPSILON, + random_state=random_state, learning_rate=learning_rate, + eta0=eta0, power_t=power_t, early_stopping=False, + validation_fraction=0.1, n_iter_no_change=5, + warm_start=warm_start, average=average) + + def _validate_params(self, for_partial_fit=False): + """Validate input params. """ + if not(0 < self.nu <= 1): + raise ValueError("nu must be in (0, 1], got nu=%f" % self.nu) + + super(SGDOneClassSVM, self)._validate_params( + for_partial_fit=for_partial_fit) + + def _fit_one_class(self, X, alpha, C, sample_weight, + learning_rate, max_iter): + """Uses SGD implementation with X and y=np.ones(n_samples).""" + + # The One-Class SVM uses the SGD implementation with + # y=np.ones(n_samples). + n_samples = X.shape[0] + y = np.ones(n_samples, dtype=np.float64, order="C") + + dataset, offset_decay = make_dataset(X, y, sample_weight) + + penalty_type = self._get_penalty_type(self.penalty) + learning_rate_type = self._get_learning_rate_type(learning_rate) + + # early stopping is set to False for the One-Class SVM. thus + # validation_mask and validation_score_cb will be set to values + # associated to early_stopping=False in _make_validation_split and + # _make_validation_score_cb respectively. + validation_mask = self._make_validation_split(y) + validation_score_cb = self._make_validation_score_cb( + validation_mask, X, y, sample_weight) + + random_state = check_random_state(self.random_state) + # numpy mtrand expects a C long which is a signed 32 bit integer under + # Windows + seed = random_state.randint(0, np.iinfo(np.int32).max) + + tol = self.tol if self.tol is not None else -np.inf + + one_class = 1 + # There are no class weights for the One-Class SVM and they are + # therefore set to 1. 
+ pos_weight = 1 + neg_weight = 1 + + if self.average: + coef = self._standard_coef + intercept = self._standard_intercept + average_coef = self._average_coef + average_intercept = self._average_intercept + else: + coef = self.coef_ + intercept = 1 - self.offset_ + average_coef = None # Not used + average_intercept = [0] # Not used + + coef, intercept, average_coef, average_intercept, self.n_iter_ = \ + _plain_sgd(coef, + intercept[0], + average_coef, + average_intercept[0], + self.loss_function_, + penalty_type, + alpha, C, + self.l1_ratio, + dataset, + validation_mask, self.early_stopping, + validation_score_cb, + int(self.n_iter_no_change), + max_iter, tol, + int(self.fit_intercept), + int(self.verbose), + int(self.shuffle), + seed, + neg_weight, pos_weight, + learning_rate_type, + self.eta0, self.power_t, + one_class, self.t_, + offset_decay, self.average) + + self.t_ += self.n_iter_ * n_samples + + if self.average > 0: + + self._average_intercept = np.atleast_1d(average_intercept) + self._standard_intercept = np.atleast_1d(intercept) + + if self.average <= self.t_ - 1.0: + # made enough updates for averaging to be taken into account + self.coef_ = average_coef + self.offset_ = 1 - np.atleast_1d(average_intercept) + else: + self.coef_ = coef + self.offset_ = 1 - np.atleast_1d(intercept) + + else: + self.offset_ = 1 - np.atleast_1d(intercept) + + def _partial_fit(self, X, alpha, C, loss, learning_rate, max_iter, + sample_weight, coef_init, offset_init): + first_call = getattr(self, "coef_", None) is None + X = self._validate_data( + X, None, accept_sparse='csr', dtype=np.float64, + order="C", accept_large_sparse=False, + reset=first_call) + + n_features = X.shape[1] + + # Allocate datastructures from input arguments + sample_weight = _check_sample_weight(sample_weight, X) + + # We use intercept = 1 - offset where intercept is the intercept of + # the SGD implementation and offset is the offset of the One-Class SVM + # optimization problem. + if getattr(self, "coef_", None) is None or coef_init is not None: + self._allocate_parameter_mem(1, n_features, + coef_init, offset_init, 1) + elif n_features != self.coef_.shape[-1]: + raise ValueError("Number of features %d does not match previous " + "data %d." % (n_features, self.coef_.shape[-1])) + + if self.average and getattr(self, "_average_coef", None) is None: + self._average_coef = np.zeros(n_features, dtype=np.float64, + order="C") + self._average_intercept = np.zeros(1, dtype=np.float64, order="C") + + self.loss_function_ = self._get_loss_function(loss) + if not hasattr(self, "t_"): + self.t_ = 1.0 + + # delegate to concrete training procedure + self._fit_one_class(X, alpha=alpha, C=C, + learning_rate=learning_rate, + sample_weight=sample_weight, + max_iter=max_iter) + + return self + + def partial_fit(self, X, y=None, sample_weight=None): + """Fit linear One-Class SVM with Stochastic Gradient Descent. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Subset of the training data. + + sample_weight : array-like, shape (n_samples,), optional + Weights applied to individual samples. + If not provided, uniform weights are assumed. + + Returns + ------- + self : returns an instance of self. 
+ """ + + alpha = self.nu / 2 + self._validate_params(for_partial_fit=True) + + return self._partial_fit(X, alpha, C=1.0, loss=self.loss, + learning_rate=self.learning_rate, + max_iter=1, + sample_weight=sample_weight, + coef_init=None, offset_init=None) + + def _fit(self, X, alpha, C, loss, learning_rate, coef_init=None, + offset_init=None, sample_weight=None): + self._validate_params() + + if self.warm_start and hasattr(self, "coef_"): + if coef_init is None: + coef_init = self.coef_ + if offset_init is None: + offset_init = self.offset_ + else: + self.coef_ = None + self.offset_ = None + + # Clear iteration count for multiple call to fit. + self.t_ = 1.0 + + self._partial_fit(X, alpha, C, loss, learning_rate, self.max_iter, + sample_weight, coef_init, offset_init) + + if (self.tol is not None and self.tol > -np.inf + and self.n_iter_ == self.max_iter): + warnings.warn("Maximum number of iteration reached before " + "convergence. Consider increasing max_iter to " + "improve the fit.", + ConvergenceWarning) + + return self + + def fit(self, X, y=None, coef_init=None, offset_init=None, + sample_weight=None): + """Fit linear One-Class SVM with Stochastic Gradient Descent. + + This solves an equivalent optimization problem of the + One-Class SVM primal optimization problem and returns a weight vector + w and an offset rho such that the decision function is given by + - rho. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Training data. + + coef_init : array, shape (n_classes, n_features) + The initial coefficients to warm-start the optimization. + + offset_init : array, shape (n_classes,) + The initial offset to warm-start the optimization. + + sample_weight : array-like, shape (n_samples,), optional + Weights applied to individual samples. + If not provided, uniform weights are assumed. These weights will + be multiplied with class_weight (passed through the + constructor) if class_weight is specified. + + Returns + ------- + self : returns an instance of self. + """ + + alpha = self.nu / 2 + self._fit(X, alpha=alpha, C=1.0, + loss=self.loss, learning_rate=self.learning_rate, + coef_init=coef_init, offset_init=offset_init, + sample_weight=sample_weight) + + return self + + def decision_function(self, X): + """Signed distance to the separating hyperplane. + + Signed distance is positive for an inlier and negative for an + outlier. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Testing data. + + Returns + ------- + dec : array-like, shape (n_samples,) + Decision function values of the samples. + """ + + check_is_fitted(self, "coef_") + + X = self._validate_data(X, accept_sparse='csr', reset=False) + decisions = safe_sparse_dot(X, self.coef_.T, + dense_output=True) - self.offset_ + + return decisions.ravel() + + def score_samples(self, X): + """Raw scoring function of the samples. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Testing data. + + Returns + ------- + score_samples : array-like, shape (n_samples,) + Unshiffted scoring function values of the samples. + """ + score_samples = self.decision_function(X) + self.offset_ + return score_samples + + def predict(self, X): + """Return labels (1 inlier, -1 outlier) of the samples. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Testing data. + + Returns + ------- + y : array, shape (n_samples,) + Labels of the samples. 
+ """ + y = (self.decision_function(X) >= 0).astype(np.int32) + y[y == 0] = -1 # for consistency with outlier detectors + return y + + def _more_tags(self): + return { + '_xfail_checks': { + 'check_sample_weights_invariance': + 'zero sample_weight is not equivalent to removing samples', + } + } diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index aba043024fea3..f943592c02005 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -9,14 +9,16 @@ from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_raises_regexp from sklearn.utils._testing import ignore_warnings from sklearn.utils.fixes import parse_version from sklearn import linear_model, datasets, metrics from sklearn.base import clone, is_classifier +from sklearn.svm import OneClassSVM from sklearn.preprocessing import LabelEncoder, scale, MinMaxScaler from sklearn.preprocessing import StandardScaler +from sklearn.kernel_approximation import Nystroem +from sklearn.pipeline import make_pipeline from sklearn.exceptions import ConvergenceWarning from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit from sklearn.linear_model import _sgd_fast as sgd_fast @@ -67,6 +69,21 @@ def decision_function(self, X, *args, **kw): **kw) +class _SparseSGDOneClassSVM(linear_model.SGDOneClassSVM): + def fit(self, X, *args, **kw): + X = sp.csr_matrix(X) + return linear_model.SGDOneClassSVM.fit(self, X, *args, **kw) + + def partial_fit(self, X, *args, **kw): + X = sp.csr_matrix(X) + return linear_model.SGDOneClassSVM.partial_fit(self, X, *args, **kw) + + def decision_function(self, X, *args, **kw): + X = sp.csr_matrix(X) + return linear_model.SGDOneClassSVM.decision_function(self, X, *args, + **kw) + + def SGDClassifier(**kwargs): _update_kwargs(kwargs) return linear_model.SGDClassifier(**kwargs) @@ -77,6 +94,11 @@ def SGDRegressor(**kwargs): return linear_model.SGDRegressor(**kwargs) +def SGDOneClassSVM(**kwargs): + _update_kwargs(kwargs) + return linear_model.SGDOneClassSVM(**kwargs) + + def SparseSGDClassifier(**kwargs): _update_kwargs(kwargs) return _SparseSGDClassifier(**kwargs) @@ -87,6 +109,11 @@ def SparseSGDRegressor(**kwargs): return _SparseSGDRegressor(**kwargs) +def SparseSGDOneClassSVM(**kwargs): + _update_kwargs(kwargs) + return _SparseSGDOneClassSVM(**kwargs) + + # Test Data # test sample 1 @@ -252,7 +279,8 @@ def test_clone(klass): @pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) + SGDRegressor, SparseSGDRegressor, + SGDOneClassSVM, SparseSGDOneClassSVM]) def test_plain_has_no_average_attr(klass): clf = klass(average=True, eta0=.01) clf.fit(X, Y) @@ -285,7 +313,8 @@ def test_sgd_deprecated_attr(klass): @pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) + SGDRegressor, SparseSGDRegressor, + SGDOneClassSVM, SparseSGDOneClassSVM]) def test_late_onset_averaging_not_reached(klass): clf1 = klass(average=600) clf2 = klass() @@ -298,7 +327,11 @@ def test_late_onset_averaging_not_reached(klass): clf2.partial_fit(X, Y) assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=16) - assert_almost_equal(clf1.intercept_, clf2.intercept_, decimal=16) + if klass in [SGDClassifier, SparseSGDClassifier, SGDRegressor, + SparseSGDRegressor]: + assert_almost_equal(clf1.intercept_, 
clf2.intercept_, decimal=16) + elif klass in [SGDOneClassSVM, SparseSGDOneClassSVM]: + assert_allclose(clf1.offset_, clf2.offset_) @pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, @@ -444,28 +477,32 @@ def test_sgd_bad_l1_ratio(klass): klass(l1_ratio=1.1) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDOneClassSVM, SparseSGDOneClassSVM]) def test_sgd_bad_learning_rate_schedule(klass): # Check whether expected ValueError on bad learning_rate with pytest.raises(ValueError): klass(learning_rate="") -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDOneClassSVM, SparseSGDOneClassSVM]) def test_sgd_bad_eta0(klass): # Check whether expected ValueError on bad eta0 with pytest.raises(ValueError): klass(eta0=0, learning_rate="constant") -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDOneClassSVM, SparseSGDOneClassSVM]) def test_sgd_max_iter_param(klass): # Test parameter validity check with pytest.raises(ValueError): klass(max_iter=-10000) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDOneClassSVM, SparseSGDOneClassSVM]) def test_sgd_shuffle_param(klass): # Test parameter validity check with pytest.raises(ValueError): @@ -493,7 +530,8 @@ def test_sgd_n_iter_no_change(klass): klass(n_iter_no_change=0) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDOneClassSVM, SparseSGDOneClassSVM]) def test_argument_coef(klass): # Checks coef_init not allowed as model argument (only fit) # Provided coef_ does not match dataset @@ -501,7 +539,8 @@ def test_argument_coef(klass): klass(coef_init=np.zeros((3,))) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDOneClassSVM, SparseSGDOneClassSVM]) def test_provide_coef(klass): # Checks coef_init shape for the warm starts # Provided coef_ does not match dataset. @@ -509,12 +548,17 @@ def test_provide_coef(klass): klass().fit(X, Y, coef_init=np.zeros((3,))) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDOneClassSVM, SparseSGDOneClassSVM]) def test_set_intercept(klass): # Checks intercept_ shape for the warm starts # Provided intercept_ does not match dataset. - with pytest.raises(ValueError): - klass().fit(X, Y, intercept_init=np.zeros((3,))) + if klass in [SGDClassifier, SparseSGDClassifier]: + with pytest.raises(ValueError): + klass().fit(X, Y, intercept_init=np.zeros((3,))) + elif klass in [SGDOneClassSVM, SparseSGDOneClassSVM]: + with pytest.raises(ValueError): + klass().fit(X, Y, offset_init=np.zeros((3,))) @pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) @@ -590,10 +634,8 @@ def test_partial_fit_weight_class_balanced(klass): r"estimate the class frequency distributions\. 
" r"Pass the resulting weights as the class_weight " r"parameter\.") - assert_raises_regexp(ValueError, - regex, - klass(class_weight='balanced').partial_fit, - X, Y, classes=np.unique(Y)) + with pytest.raises(ValueError, match=regex): + klass(class_weight='balanced').partial_fit(X, Y, classes=np.unique(Y)) @pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) @@ -947,10 +989,14 @@ def test_sample_weights(klass): assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDOneClassSVM, SparseSGDOneClassSVM]) def test_wrong_sample_weights(klass): # Test if ValueError is raised if sample_weight has wrong shape - clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False) + if klass in [SGDClassifier, SparseSGDClassifier]: + clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False) + elif klass in [SGDOneClassSVM, SparseSGDOneClassSVM]: + clf = klass(nu=0.1, max_iter=1000, fit_intercept=False) # provided sample_weight too long with pytest.raises(ValueError): clf.fit(X, Y, sample_weight=np.arange(7)) @@ -1341,6 +1387,303 @@ def test_loss_function_epsilon(klass): assert clf.loss_functions['huber'][1] == 0.1 +############################################################################### +# SGD One Class SVM Test Case + +# a simple implementation of ASGD to use for testing SGDOneClassSVM +def asgd_oneclass(klass, X, eta, nu, coef_init=None, offset_init=0.0): + if coef_init is None: + coef = np.zeros(X.shape[1]) + else: + coef = coef_init + + average_coef = np.zeros(X.shape[1]) + offset = offset_init + intercept = 1 - offset + average_intercept = 0.0 + decay = 1.0 + + # sparse data has a fixed decay of .01 + if klass == SparseSGDOneClassSVM: + decay = .01 + + for i, entry in enumerate(X): + p = np.dot(entry, coef) + p += intercept + if p <= 1.0: + gradient = -1 + else: + gradient = 0 + coef *= max(0, 1.0 - (eta * nu / 2)) + coef += -(eta * gradient * entry) + intercept += -(eta * (nu + gradient)) * decay + + average_coef *= i + average_coef += coef + average_coef /= i + 1.0 + + average_intercept *= i + average_intercept += intercept + average_intercept /= i + 1.0 + + return average_coef, 1 - average_intercept + + +@pytest.mark.parametrize('klass', [SGDOneClassSVM, SparseSGDOneClassSVM]) +@pytest.mark.parametrize('nu', [-0.5, 2]) +def test_bad_nu_values(klass, nu): + msg = r"nu must be in \(0, 1]" + with pytest.raises(ValueError, match=msg): + klass(nu=nu) + + clf = klass(nu=0.05) + clf2 = clone(clf) + with pytest.raises(ValueError, match=msg): + clf2.set_params(nu=nu) + + +@pytest.mark.parametrize('klass', [SGDOneClassSVM, SparseSGDOneClassSVM]) +def _test_warm_start_oneclass(klass, X, lr): + # Test that explicit warm restart... + clf = klass(nu=0.5, eta0=0.01, shuffle=False, + learning_rate=lr) + clf.fit(X) + + clf2 = klass(nu=0.1, eta0=0.01, shuffle=False, + learning_rate=lr) + clf2.fit(X, coef_init=clf.coef_.copy(), + offset_init=clf.offset_.copy()) + + # ... and implicit warm restart are equivalent. 
+ clf3 = klass(nu=0.5, eta0=0.01, shuffle=False, + warm_start=True, learning_rate=lr) + clf3.fit(X) + + assert clf3.t_ == clf.t_ + assert_allclose(clf3.coef_, clf.coef_) + + clf3.set_params(nu=0.1) + clf3.fit(X) + + assert clf3.t_ == clf2.t_ + assert_allclose(clf3.coef_, clf2.coef_) + + +@pytest.mark.parametrize('klass', [SGDOneClassSVM, SparseSGDOneClassSVM]) +@pytest.mark.parametrize('lr', + ["constant", "optimal", "invscaling", "adaptive"]) +def test_warm_start_oneclass(klass, lr): + _test_warm_start_oneclass(klass, X, lr) + + +@pytest.mark.parametrize('klass', [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_clone_oneclass(klass): + # Test whether clone works ok. + clf = klass(nu=0.5) + clf = clone(clf) + clf.set_params(nu=0.1) + clf.fit(X) + + clf2 = klass(nu=0.1) + clf2.fit(X) + + assert_array_equal(clf.coef_, clf2.coef_) + + +@pytest.mark.parametrize('klass', [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_partial_fit_oneclass(klass): + third = X.shape[0] // 3 + clf = klass(nu=0.1) + + clf.partial_fit(X[:third]) + assert clf.coef_.shape == (X.shape[1], ) + assert clf.offset_.shape == (1,) + assert clf.predict([[0, 0]]).shape == (1, ) + id1 = id(clf.coef_.data) + + clf.partial_fit(X[third:]) + id2 = id(clf.coef_.data) + # check that coef_ haven't been re-allocated + assert id1 == id2 + + # raises ValueError if number of features does not match previous data + with pytest.raises(ValueError): + clf.partial_fit(X[:, 1]) + + +@pytest.mark.parametrize('klass', [SGDOneClassSVM, SparseSGDOneClassSVM]) +@pytest.mark.parametrize('lr', + ["constant", "optimal", "invscaling", "adaptive"]) +def test_partial_fit_equal_fit_oneclass(klass, lr): + clf = klass(nu=0.05, max_iter=2, eta0=0.01, + learning_rate=lr, shuffle=False) + clf.fit(X) + y_scores = clf.decision_function(T) + t = clf.t_ + coef = clf.coef_ + offset = clf.offset_ + + clf = klass(nu=0.05, eta0=0.01, max_iter=1, + learning_rate=lr, shuffle=False) + for _ in range(2): + clf.partial_fit(X) + y_scores2 = clf.decision_function(T) + + assert clf.t_ == t + assert_allclose(y_scores, y_scores2) + assert_allclose(clf.coef_, coef) + assert_allclose(clf.offset_, offset) + + +@pytest.mark.parametrize('klass', [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_late_onset_averaging_reached_oneclass(klass): + # Test average + eta0 = .001 + nu = .05 + + # 2 passes over the training set but average only at second pass + clf1 = klass(average=7, learning_rate="constant", eta0=eta0, + nu=nu, max_iter=2, shuffle=False) + # 1 pass over the training set with no averaging + clf2 = klass(average=0, learning_rate="constant", eta0=eta0, + nu=nu, max_iter=1, shuffle=False) + + clf1.fit(X) + clf2.fit(X) + + # Start from clf2 solution, compute averaging using asgd function and + # compare with clf1 solution + average_coef, average_offset = \ + asgd_oneclass(klass, X, eta0, nu, + coef_init=clf2.coef_.ravel(), + offset_init=clf2.offset_) + + assert_allclose(clf1.coef_.ravel(), average_coef.ravel()) + assert_allclose(clf1.offset_, average_offset) + + +@pytest.mark.parametrize('klass', [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_sgd_averaged_computed_correctly_oneclass(klass): + # Tests the average SGD One-Class SVM matches the naive implementation + eta = .001 + nu = .05 + n_samples = 20 + n_features = 10 + rng = np.random.RandomState(0) + X = rng.normal(size=(n_samples, n_features)) + + clf = klass(learning_rate='constant', + eta0=eta, nu=nu, + fit_intercept=True, + max_iter=1, average=True, shuffle=False) + + clf.fit(X) + average_coef, average_offset = 
asgd_oneclass(klass, X, eta, nu) + + assert_allclose(clf.coef_, average_coef) + assert_allclose(clf.offset_, average_offset) + + +@pytest.mark.parametrize('klass', [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_sgd_averaged_partial_fit_oneclass(klass): + # Tests whether the partial fit yields the same average as the fit + eta = .001 + nu = .05 + n_samples = 20 + n_features = 10 + rng = np.random.RandomState(0) + X = rng.normal(size=(n_samples, n_features)) + + clf = klass(learning_rate='constant', + eta0=eta, nu=nu, + fit_intercept=True, + max_iter=1, average=True, shuffle=False) + + clf.partial_fit(X[:int(n_samples / 2)][:]) + clf.partial_fit(X[int(n_samples / 2):][:]) + average_coef, average_offset = asgd_oneclass(klass, X, eta, nu) + + assert_allclose(clf.coef_, average_coef) + assert_allclose(clf.offset_, average_offset) + + +@pytest.mark.parametrize('klass', [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_average_sparse_oneclass(klass): + # Checks the average coef on data with 0s + eta = .001 + nu = .01 + clf = klass(learning_rate='constant', + eta0=eta, nu=nu, + fit_intercept=True, + max_iter=1, average=True, shuffle=False) + + n_samples = X3.shape[0] + + clf.partial_fit(X3[:int(n_samples / 2)]) + clf.partial_fit(X3[int(n_samples / 2):]) + average_coef, average_offset = asgd_oneclass(klass, X3, eta, nu) + + assert_allclose(clf.coef_, average_coef) + assert_allclose(clf.offset_, average_offset) + + +def test_sgd_oneclass(): + # Test fit, decision_function, predict and score_samples on a toy + # dataset + X_train = np.array([[-2, -1], [-1, -1], [1, 1]]) + X_test = np.array([[0.5, -2], [2, 2]]) + clf = SGDOneClassSVM(nu=0.5, eta0=1, learning_rate='constant', + shuffle=False, max_iter=1) + clf.fit(X_train) + assert_allclose(clf.coef_, np.array([-0.125, 0.4375])) + assert clf.offset_[0] == -0.5 + + scores = clf.score_samples(X_test) + assert_allclose(scores, np.array([-0.9375, 0.625])) + + dec = clf.score_samples(X_test) - clf.offset_ + assert_allclose(clf.decision_function(X_test), dec) + + pred = clf.predict(X_test) + assert_array_equal(pred, np.array([-1, 1])) + + +def test_ocsvm_vs_sgdocsvm(): + # Checks SGDOneClass SVM gives a good approximation of kernelized + # One-Class SVM + nu = 0.05 + gamma = 2. + random_state = 42 + + # Generate train and test data + rng = np.random.RandomState(random_state) + X = 0.3 * rng.randn(500, 2) + X_train = np.r_[X + 2, X - 2] + X = 0.3 * rng.randn(100, 2) + X_test = np.r_[X + 2, X - 2] + + # One-Class SVM + clf = OneClassSVM(gamma=gamma, kernel='rbf', nu=nu) + clf.fit(X_train) + y_pred_ocsvm = clf.predict(X_test) + dec_ocsvm = clf.decision_function(X_test).reshape(1, -1) + + # SGDOneClassSVM using kernel approximation + max_iter = 15 + transform = Nystroem(gamma=gamma, random_state=random_state) + clf_sgd = SGDOneClassSVM(nu=nu, shuffle=True, fit_intercept=True, + max_iter=max_iter, random_state=random_state, + tol=-np.inf) + pipe_sgd = make_pipeline(transform, clf_sgd) + pipe_sgd.fit(X_train) + y_pred_sgdocsvm = pipe_sgd.predict(X_test) + dec_sgdocsvm = pipe_sgd.decision_function(X_test).reshape(1, -1) + + assert np.mean(y_pred_sgdocsvm == y_pred_ocsvm) >= 0.99 + corrcoef = np.corrcoef(np.concatenate((dec_ocsvm, dec_sgdocsvm)))[0, 1] + assert corrcoef >= 0.9 + + def test_l1_ratio(): # Test if l1 ratio extremes match L1 and L2 penalty settings. 
X, y = datasets.make_classification(n_samples=1000, @@ -1396,7 +1739,8 @@ def test_underflow_or_overlow(): msg_regxp = (r"Floating-point under-/overflow occurred at epoch #.*" " Scaling input data with StandardScaler or MinMaxScaler" " might help.") - assert_raises_regexp(ValueError, msg_regxp, model.fit, X, y) + with pytest.raises(ValueError, match=msg_regxp): + model.fit(X, y) def test_numerical_stability_large_gradient(): diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 908ece408bb1d..c402779f4eeb6 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1334,6 +1334,10 @@ class OneClassSVM(OutlierMixin, BaseLibSVM): array([-1, 1, 1, 1, -1]) >>> clf.score_samples(X) array([1.7798..., 2.0547..., 2.0556..., 2.0561..., 1.7332...]) + + See also + -------- + sklearn.linear_model.SGDOneClassSVM """ _impl = 'one_class' From 3e64e9e6ce6f5356c08134dd9538e94dd10302f1 Mon Sep 17 00:00:00 2001 From: James Budarz Date: Tue, 23 Mar 2021 04:46:14 -0700 Subject: [PATCH 272/478] DOC Clarified n_jobs parallelization in plot_partial_dependence (#19750) Co-authored-by: James Michael Budarz --- sklearn/inspection/_plot/partial_dependence.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/inspection/_plot/partial_dependence.py b/sklearn/inspection/_plot/partial_dependence.py index d6604d7ae675f..a2ee1886066e2 100644 --- a/sklearn/inspection/_plot/partial_dependence.py +++ b/sklearn/inspection/_plot/partial_dependence.py @@ -174,6 +174,9 @@ def plot_partial_dependence( n_jobs : int, default=None The number of CPUs to use to compute the partial dependences. + Computation is parallelized over features specified by the `features` + parameter. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. From cf296c74ba91def816045f305dfa6a6dba539ad1 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 23 Mar 2021 09:08:11 -0400 Subject: [PATCH 273/478] ENH Checks n_features_in_ after fitting in mixture (#19540) --- sklearn/mixture/_base.py | 42 +++++-------------- sklearn/mixture/_bayesian_mixture.py | 5 +++ sklearn/mixture/_gaussian_mixture.py | 5 +++ .../mixture/tests/test_gaussian_mixture.py | 24 ----------- sklearn/mixture/tests/test_mixture.py | 16 +++++++ sklearn/tests/test_common.py | 1 - sklearn/tests/test_docstring_parameters.py | 1 - 7 files changed, 36 insertions(+), 58 deletions(-) diff --git a/sklearn/mixture/_base.py b/sklearn/mixture/_base.py index a9aac7f4dac19..6acb6c2e09292 100644 --- a/sklearn/mixture/_base.py +++ b/sklearn/mixture/_base.py @@ -15,7 +15,7 @@ from ..base import BaseEstimator from ..base import DensityMixin from ..exceptions import ConvergenceWarning -from ..utils import check_array, check_random_state +from ..utils import check_random_state from ..utils.validation import check_is_fitted @@ -36,32 +36,6 @@ def _check_shape(param, param_shape, name): "but got %s" % (name, param_shape, param.shape)) -def _check_X(X, n_components=None, n_features=None, ensure_min_samples=1): - """Check the input data X. 
- - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - - n_components : int - - Returns - ------- - X : array, shape (n_samples, n_features) - """ - X = check_array(X, dtype=[np.float64, np.float32], - ensure_min_samples=ensure_min_samples) - if n_components is not None and X.shape[0] < n_components: - raise ValueError('Expected n_samples >= n_components ' - 'but got n_components = %d, n_samples = %d' - % (n_components, X.shape[0])) - if n_features is not None and X.shape[1] != n_features: - raise ValueError("Expected the input data X have %d features, " - "but got %d features" - % (n_features, X.shape[1])) - return X - - class BaseMixture(DensityMixin, BaseEstimator, metaclass=ABCMeta): """Base class for mixture models. @@ -217,8 +191,12 @@ def fit_predict(self, X, y=None): labels : array, shape (n_samples,) Component labels. """ - X = _check_X(X, self.n_components, ensure_min_samples=2) - self._check_n_features(X, reset=True) + X = self._validate_data(X, dtype=[np.float64, np.float32], + ensure_min_samples=2) + if X.shape[0] < self.n_components: + raise ValueError("Expected n_samples >= n_components " + f"but got n_components = {self.n_components}, " + f"n_samples = {X.shape[0]}") self._check_initial_parameters(X) # if we enable warm_start, we will have a unique initialisation @@ -335,7 +313,7 @@ def score_samples(self, X): Log probabilities of each data point in X. """ check_is_fitted(self) - X = _check_X(X, None, self.means_.shape[1]) + X = self._validate_data(X, reset=False) return logsumexp(self._estimate_weighted_log_prob(X), axis=1) @@ -370,7 +348,7 @@ def predict(self, X): Component labels. """ check_is_fitted(self) - X = _check_X(X, None, self.means_.shape[1]) + X = self._validate_data(X, reset=False) return self._estimate_weighted_log_prob(X).argmax(axis=1) def predict_proba(self, X): @@ -389,7 +367,7 @@ def predict_proba(self, X): the model given each sample. """ check_is_fitted(self) - X = _check_X(X, None, self.means_.shape[1]) + X = self._validate_data(X, reset=False) _, log_resp = self._estimate_log_prob_resp(X) return np.exp(log_resp) diff --git a/sklearn/mixture/_bayesian_mixture.py b/sklearn/mixture/_bayesian_mixture.py index 34cef090be22b..bd1954ddc15c8 100644 --- a/sklearn/mixture/_bayesian_mixture.py +++ b/sklearn/mixture/_bayesian_mixture.py @@ -288,6 +288,11 @@ class BayesianGaussianMixture(BaseMixture): (n_features) if 'diag', float if 'spherical' + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/mixture/_gaussian_mixture.py b/sklearn/mixture/_gaussian_mixture.py index f510c81cec2dd..4bb14f9ca5bd7 100644 --- a/sklearn/mixture/_gaussian_mixture.py +++ b/sklearn/mixture/_gaussian_mixture.py @@ -582,6 +582,11 @@ class GaussianMixture(BaseMixture): Lower bound value on the log-likelihood (of the training data with respect to the model) of the best fit of EM. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/mixture/tests/test_gaussian_mixture.py b/sklearn/mixture/tests/test_gaussian_mixture.py index 403fbb2208618..ea5ea0c2eb649 100644 --- a/sklearn/mixture/tests/test_gaussian_mixture.py +++ b/sklearn/mixture/tests/test_gaussian_mixture.py @@ -172,30 +172,6 @@ def test_gaussian_mixture_attributes(): assert gmm.init_params == init_params -def test_check_X(): - from sklearn.mixture._base import _check_X - rng = np.random.RandomState(0) - - n_samples, n_components, n_features = 10, 2, 2 - - X_bad_dim = rng.rand(n_components - 1, n_features) - assert_raise_message(ValueError, - 'Expected n_samples >= n_components ' - 'but got n_components = %d, n_samples = %d' - % (n_components, X_bad_dim.shape[0]), - _check_X, X_bad_dim, n_components) - - X_bad_dim = rng.rand(n_components, n_features + 1) - assert_raise_message(ValueError, - 'Expected the input data X have %d features, ' - 'but got %d features' - % (n_features, X_bad_dim.shape[1]), - _check_X, X_bad_dim, n_components, n_features) - - X = rng.rand(n_samples, n_features) - assert_array_equal(X, _check_X(X, n_components, n_features)) - - def test_check_weights(): rng = np.random.RandomState(0) rand_data = RandomData(rng) diff --git a/sklearn/mixture/tests/test_mixture.py b/sklearn/mixture/tests/test_mixture.py index a79cafe3bccec..7f497cfe76642 100644 --- a/sklearn/mixture/tests/test_mixture.py +++ b/sklearn/mixture/tests/test_mixture.py @@ -21,3 +21,19 @@ def test_gaussian_mixture_n_iter(estimator): estimator.set_params(max_iter=max_iter) estimator.fit(X) assert estimator.n_iter_ == max_iter + + +@pytest.mark.parametrize( + "estimator", + [GaussianMixture(), + BayesianGaussianMixture()] +) +def test_mixture_n_components_greater_than_n_samples_error(estimator): + """Check error when n_components <= n_samples""" + rng = np.random.RandomState(0) + X = rng.rand(10, 5) + estimator.set_params(n_components=12) + + msg = "Expected n_samples >= n_components" + with pytest.raises(ValueError, match=msg): + estimator.fit(X) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index bfd7f98268350..05f45a51de63d 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -264,7 +264,6 @@ def test_search_cv(estimator, check, request): 'calibration', 'compose', 'feature_extraction', - 'mixture', 'model_selection', 'multiclass', 'multioutput', diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index cd2bdba449799..38f22bc667f5b 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -193,7 +193,6 @@ def _construct_searchcv_instance(SearchCV): 'kernel_ridge', 'linear_model', 'manifold', - 'mixture', 'model_selection', 'multiclass', 'multioutput', From 5788d4a69182c6f150286757b7a0105f8adf2b12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 23 Mar 2021 14:09:31 +0100 Subject: [PATCH 274/478] MAINT Improve issue template (#19704) Co-authored-by: Thomas J. 
Fan --- .github/ISSUE_TEMPLATE/config.yml | 9 ++++++--- .github/ISSUE_TEMPLATE/other_template.md | 10 ---------- 2 files changed, 6 insertions(+), 13 deletions(-) delete mode 100644 .github/ISSUE_TEMPLATE/other_template.md diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 7d39c399ca81b..c6af207bba1e8 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,14 +1,17 @@ blank_issues_enabled: false contact_links: - name: Discussions - url: https://github.com/scikit-learn/scikit-learn/discussions + url: https://github.com/scikit-learn/scikit-learn/discussions/new about: Ask questions and discuss with other scikit-learn community members - - name: Stack overflow + - name: Stack Overflow url: https://stackoverflow.com/questions/tagged/scikit-learn - about: Please ask and answer usage questions on stackoverflow + about: Please ask and answer usage questions on Stack Overflow - name: Mailing list url: https://mail.python.org/mailman/listinfo/scikit-learn about: General discussions and announcements on the mailing list - name: Gitter url: https://gitter.im/scikit-learn/scikit-learn about: Users and developers can sometimes be found on the gitter channel + - name: Blank issue + url: https://github.com/scikit-learn/scikit-learn/issues/new + about: Please note that Github Discussions should be used in most cases instead diff --git a/.github/ISSUE_TEMPLATE/other_template.md b/.github/ISSUE_TEMPLATE/other_template.md deleted file mode 100644 index d46ae9e50b18f..0000000000000 --- a/.github/ISSUE_TEMPLATE/other_template.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -name: Other -about: For all other issues to reach the community... -title: '' -labels: '' -assignees: '' - ---- - - From df3f1bda424911e2d746d68a731e188a60de925f Mon Sep 17 00:00:00 2001 From: makoeppel Date: Tue, 23 Mar 2021 14:14:41 +0100 Subject: [PATCH 275/478] FIX Adds check_array to inverse_transform of StandardScaler (#19356) Co-authored-by: Thomas J. Fan --- doc/whats_new/v1.0.rst | 3 +++ sklearn/preprocessing/_data.py | 11 +++-------- sklearn/preprocessing/tests/test_data.py | 20 ++++++++++++++++++++ 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index c252f5df1074e..be894774f5a27 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -268,6 +268,9 @@ Changelog centering is typically disabled. :pr:`19527` by :user:`Oliver Grisel ` and :user:`Maria Telenczuk `. +- |Fix| :meth:`preprocessing.StandardScaler.inverse_transform` now + correctly handles integer dtypes. :pr:`19356` by :user:`makoeppel`. + :mod:`sklearn.tree` ................... 
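A minimal sketch of the scenario targeted by the ``StandardScaler.inverse_transform`` changelog entry above; the toy integer array is arbitrary and the pre-fix failure mode is assumed from the referenced issue rather than reproduced here::

    # Integer input to inverse_transform: with this fix the array is routed
    # through check_array and cast to float before the in-place rescaling.
    import numpy as np
    from sklearn.preprocessing import StandardScaler

    X = np.array([[1, 0], [4, 1], [8, 1]], dtype=np.int32)
    scaler = StandardScaler().fit(X)

    X_back = scaler.inverse_transform(X)  # no error; returns a float array
    print(X_back.dtype)                   # float64
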
diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 5e85b932a1e39..6191fb2fd8bcd 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -926,22 +926,17 @@ def inverse_transform(self, X, copy=None): check_is_fitted(self) copy = copy if copy is not None else self.copy + X = check_array(X, accept_sparse='csr', copy=copy, ensure_2d=False, + dtype=FLOAT_DTYPES, force_all_finite="allow-nan") + if sparse.issparse(X): if self.with_mean: raise ValueError( "Cannot uncenter sparse matrices: pass `with_mean=False` " "instead See docstring for motivation and alternatives.") - if not sparse.isspmatrix_csr(X): - X = X.tocsr() - copy = False - if copy: - X = X.copy() if self.scale_ is not None: inplace_column_scale(X, self.scale_) else: - X = np.asarray(X) - if copy: - X = X.copy() if self.with_std: X *= self.scale_ if self.with_mean: diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 8a30eba27cff7..5557562283850 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -613,6 +613,26 @@ def test_standard_scaler_trasform_with_partial_fit(sample_weight): ) +def test_standard_check_array_of_inverse_transform(): + # Check if StandardScaler inverse_transform is + # converting the integer array to float + x = np.array([ + [1, 1, 1, 0, 1, 0], + [1, 1, 1, 0, 1, 0], + [0, 8, 0, 1, 0, 0], + [1, 4, 1, 1, 0, 0], + [0, 1, 0, 0, 1, 0], + [0, 4, 0, 1, 0, 1]], dtype=np.int32) + + scaler = StandardScaler() + scaler.fit(x) + + # The of inverse_transform should be converted + # to a float array. + # If not X *= self.scale_ will fail. + scaler.inverse_transform(x) + + def test_min_max_scaler_iris(): X = iris.data scaler = MinMaxScaler() From 114616d9f6ce9eba7c1aacd3d4a254f868010e25 Mon Sep 17 00:00:00 2001 From: Isaack Mungui <41724425+isaack-mungui@users.noreply.github.com> Date: Tue, 23 Mar 2021 20:09:05 +0300 Subject: [PATCH 276/478] TST Replace assert_warns in covariance/tests (#19757) --- sklearn/covariance/tests/test_covariance.py | 23 +++++++++++++++---- .../tests/test_robust_covariance.py | 8 +++---- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/sklearn/covariance/tests/test_covariance.py b/sklearn/covariance/tests/test_covariance.py index bcf163e8182d8..2557299cd395d 100644 --- a/sklearn/covariance/tests/test_covariance.py +++ b/sklearn/covariance/tests/test_covariance.py @@ -10,7 +10,6 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_warns from sklearn import datasets from sklearn.covariance import empirical_covariance, EmpiricalCovariance, \ @@ -57,7 +56,12 @@ def test_covariance(): # Create X with 1 sample and 5 features X_1sample = np.arange(5).reshape(1, 5) cov = EmpiricalCovariance() - assert_warns(UserWarning, cov.fit, X_1sample) + warn_msg = ( + "Only one sample available. You may want to reshape your data array" + ) + with pytest.warns(UserWarning, match=warn_msg): + cov.fit(X_1sample) + assert_array_almost_equal(cov.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)) @@ -175,7 +179,13 @@ def test_ledoit_wolf(): # warning should be raised when using only 1 sample X_1sample = np.arange(5).reshape(1, 5) lw = LedoitWolf() - assert_warns(UserWarning, lw.fit, X_1sample) + + warn_msg = ( + "Only one sample available. 
You may want to reshape your data array" + ) + with pytest.warns(UserWarning, match=warn_msg): + lw.fit(X_1sample) + assert_array_almost_equal(lw.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)) @@ -294,7 +304,12 @@ def test_oas(): # warning should be raised when using only 1 sample X_1sample = np.arange(5).reshape(1, 5) oa = OAS() - assert_warns(UserWarning, oa.fit, X_1sample) + warn_msg = ( + "Only one sample available. You may want to reshape your data array" + ) + with pytest.warns(UserWarning, match=warn_msg): + oa.fit(X_1sample) + assert_array_almost_equal(oa.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)) diff --git a/sklearn/covariance/tests/test_robust_covariance.py b/sklearn/covariance/tests/test_robust_covariance.py index 55100702bd365..01f32563710aa 100644 --- a/sklearn/covariance/tests/test_robust_covariance.py +++ b/sklearn/covariance/tests/test_robust_covariance.py @@ -7,10 +7,10 @@ import itertools import numpy as np +import pytest from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_raise_message -from sklearn.utils._testing import assert_warns_message from sklearn import datasets from sklearn.covariance import empirical_covariance, MinCovDet @@ -163,6 +163,6 @@ def test_mcd_increasing_det_warning(): [5.2, 3.5, 1.5, 0.2]] mcd = MinCovDet(random_state=1) - assert_warns_message(RuntimeWarning, - "Determinant has increased", - mcd.fit, X) + warn_msg = "Determinant has increased" + with pytest.warns(RuntimeWarning, match=warn_msg): + mcd.fit(X) From 4dfdfb4e1bb3719628753a4ece995a1b2fa5312a Mon Sep 17 00:00:00 2001 From: waijean Date: Thu, 25 Mar 2021 19:39:16 +0000 Subject: [PATCH 277/478] DOC Fix typo in Truncated SVD documentation (#19765) --- doc/modules/decomposition.rst | 54 +++++++++++++++++------------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst index 7e8e79d9d8bdd..e971d784c63d6 100644 --- a/doc/modules/decomposition.rst +++ b/doc/modules/decomposition.rst @@ -310,7 +310,7 @@ produces a low-rank approximation :math:`X`: .. math:: X \approx X_k = U_k \Sigma_k V_k^\top -After this operation, :math:`U_k \Sigma_k^\top` +After this operation, :math:`U_k \Sigma_k` is the transformed training set with :math:`k` features (called ``n_components`` in the API). @@ -872,34 +872,34 @@ The graphical model of LDA is a three-level generative model: .. image:: ../images/lda_model_graph.png :align: center -Note on notations presented in the graphical model above, which can be found in +Note on notations presented in the graphical model above, which can be found in Hoffman et al. (2013): * The corpus is a collection of :math:`D` documents. * A document is a sequence of :math:`N` words. - * There are :math:`K` topics in the corpus. - * The boxes represent repeated sampling. - -In the graphical model, each node is a random variable and has a role in the -generative process. A shaded node indicates an observed variable and an unshaded -node indicates a hidden (latent) variable. In this case, words in the corpus are -the only data that we observe. The latent variables determine the random mixture -of topics in the corpus and the distribution of words in the documents. -The goal of LDA is to use the observed words to infer the hidden topic -structure. 
- -When modeling text corpora, the model assumes the following generative process -for a corpus with :math:`D` documents and :math:`K` topics, with :math:`K` + * There are :math:`K` topics in the corpus. + * The boxes represent repeated sampling. + +In the graphical model, each node is a random variable and has a role in the +generative process. A shaded node indicates an observed variable and an unshaded +node indicates a hidden (latent) variable. In this case, words in the corpus are +the only data that we observe. The latent variables determine the random mixture +of topics in the corpus and the distribution of words in the documents. +The goal of LDA is to use the observed words to infer the hidden topic +structure. + +When modeling text corpora, the model assumes the following generative process +for a corpus with :math:`D` documents and :math:`K` topics, with :math:`K` corresponding to :attr:`n_components` in the API: - 1. For each topic :math:`k \in K`, draw :math:`\beta_k \sim - \mathrm{Dirichlet}(\eta)`. This provides a distribution over the words, - i.e. the probability of a word appearing in topic :math:`k`. - :math:`\eta` corresponds to :attr:`topic_word_prior`. + 1. For each topic :math:`k \in K`, draw :math:`\beta_k \sim + \mathrm{Dirichlet}(\eta)`. This provides a distribution over the words, + i.e. the probability of a word appearing in topic :math:`k`. + :math:`\eta` corresponds to :attr:`topic_word_prior`. - 2. For each document :math:`d \in D`, draw the topic proportions - :math:`\theta_d \sim \mathrm{Dirichlet}(\alpha)`. :math:`\alpha` - corresponds to :attr:`doc_topic_prior`. + 2. For each document :math:`d \in D`, draw the topic proportions + :math:`\theta_d \sim \mathrm{Dirichlet}(\alpha)`. :math:`\alpha` + corresponds to :attr:`doc_topic_prior`. 3. For each word :math:`i` in document :math:`d`: @@ -916,8 +916,8 @@ For parameter estimation, the posterior distribution is: Since the posterior is intractable, variational Bayesian method uses a simpler distribution :math:`q(z,\theta,\beta | \lambda, \phi, \gamma)` -to approximate it, and those variational parameters :math:`\lambda`, -:math:`\phi`, :math:`\gamma` are optimized to maximize the Evidence +to approximate it, and those variational parameters :math:`\lambda`, +:math:`\phi`, :math:`\gamma` are optimized to maximize the Evidence Lower Bound (ELBO): .. math:: @@ -928,10 +928,10 @@ Maximizing ELBO is equivalent to minimizing the Kullback-Leibler(KL) divergence between :math:`q(z,\theta,\beta)` and the true posterior :math:`p(z, \theta, \beta |w, \alpha, \eta)`. -:class:`LatentDirichletAllocation` implements the online variational Bayes +:class:`LatentDirichletAllocation` implements the online variational Bayes algorithm and supports both online and batch update methods. -While the batch method updates variational variables after each full pass through -the data, the online method updates variational variables from mini-batch data +While the batch method updates variational variables after each full pass through +the data, the online method updates variational variables from mini-batch data points. .. note:: From 80f923e00d6949b2385612b024981ac78a79e45a Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Mon, 29 Mar 2021 10:12:54 -0400 Subject: [PATCH 278/478] ENH num_features for a 1d collection of dicts is undefined (#19740) Co-authored-by: Christian Lorentzen --- sklearn/utils/tests/test_validation.py | 7 +++++-- sklearn/utils/validation.py | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index f3db51e694b52..66f7d9ae77687 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1345,9 +1345,10 @@ def test_num_features(constructor_name): [1, 2, 3], ["a", "b", "c"], [False, True, False], - [1.0, 3.4, 4.0] + [1.0, 3.4, 4.0], + [{"a": 1}, {"b": 2}, {"c": 3}], ], - ids=["int", "str", "bool", "float"] + ids=["int", "str", "bool", "float", "dict"] ) @pytest.mark.parametrize("constructor_name", [ "list", "tuple", "array", "series" @@ -1368,6 +1369,8 @@ def test_num_features_errors_1d_containers(X, constructor_name): message += " with shape (3,)" elif isinstance(X[0], str): message += " where the samples are of type str" + elif isinstance(X[0], dict): + message += " where the samples are of type dict" with pytest.raises(TypeError, match=re.escape(message)): _num_features(X) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index d0f410dd7f5d8..ce0fc0ead7e6d 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -226,8 +226,8 @@ def _num_features(X): first_sample = X[0] - # Do not consider an array-like of strings to be a 2D array - if isinstance(first_sample, (str, bytes)): + # Do not consider an array-like of strings or dicts to be a 2D array + if isinstance(first_sample, (str, bytes, dict)): message += (f" where the samples are of type " f"{type(first_sample).__qualname__}") raise TypeError(message) From 54ff7b7c4f745166258a529c33fec6a5ead0a432 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 30 Mar 2021 18:19:56 +0200 Subject: [PATCH 279/478] Test and doc for n_features_in_ for sklearn.calibration (#19555) Co-authored-by: Thomas J. Fan --- sklearn/calibration.py | 48 ++++++++++++---------- sklearn/tests/test_calibration.py | 37 ++++++++++++----- sklearn/tests/test_common.py | 1 - sklearn/tests/test_docstring_parameters.py | 1 - sklearn/utils/estimator_checks.py | 7 +++- 5 files changed, 57 insertions(+), 37 deletions(-) diff --git a/sklearn/calibration.py b/sklearn/calibration.py index b60a415b4419b..c6289d1df2936 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -9,7 +9,6 @@ import warnings from inspect import signature -from contextlib import suppress from functools import partial from math import log @@ -33,7 +32,7 @@ from .utils.fixes import delayed from .utils.validation import check_is_fitted, check_consistent_length from .utils.validation import _check_sample_weight, _num_samples -from .pipeline import Pipeline +from .utils import _safe_indexing from .isotonic import IsotonicRegression from .svm import LinearSVC from .model_selection import check_cv, cross_val_predict @@ -141,6 +140,12 @@ class CalibratedClassifierCV(ClassifierMixin, classes_ : ndarray of shape (n_classes,) The class labels. + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying base_estimator exposes such an attribute when fit. + + .. versionadded:: 0.24 + calibrated_classifiers_ : list (len() equal to cv or 1 if `cv="prefit"` \ or `ensemble=False`) The list of classifier and calibrator pairs. 
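A small usage sketch of the ``n_features_in_`` attribute documented above, loosely mirroring the prefit case exercised by the new tests; the dataset shape and the ``LinearSVC`` base estimator are illustrative only::

    # With cv="prefit", n_features_in_ is mirrored from the fitted base
    # estimator when that estimator exposes the attribute.
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.datasets import make_classification
    from sklearn.svm import LinearSVC

    X, y = make_classification(n_samples=100, n_features=5, n_classes=2,
                               random_state=7)
    base = LinearSVC(C=1).fit(X, y)

    calib = CalibratedClassifierCV(base, cv="prefit").fit(X, y)
    print(calib.n_features_in_)  # 5, taken from the underlying LinearSVC
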
@@ -250,14 +255,8 @@ def fit(self, X, y, sample_weight=None): self.calibrated_classifiers_ = [] if self.cv == "prefit": - # `classes_` and `n_features_in_` should be consistent with that - # of base_estimator - if isinstance(self.base_estimator, Pipeline): - check_is_fitted(self.base_estimator[-1]) - else: - check_is_fitted(self.base_estimator) - with suppress(AttributeError): - self.n_features_in_ = base_estimator.n_features_in_ + # `classes_` should be consistent with that of base_estimator + check_is_fitted(self.base_estimator, attributes=["classes_"]) self.classes_ = self.base_estimator.classes_ pred_method = _get_prediction_method(base_estimator) @@ -270,10 +269,6 @@ def fit(self, X, y, sample_weight=None): ) self.calibrated_classifiers_.append(calibrated_classifier) else: - X, y = self._validate_data( - X, y, accept_sparse=['csc', 'csr', 'coo'], - force_all_finite=False, allow_nd=True - ) # Set `classes_` using all `y` label_encoder_ = LabelEncoder().fit(y) self.classes_ = label_encoder_.classes_ @@ -334,6 +329,9 @@ def fit(self, X, y, sample_weight=None): ) self.calibrated_classifiers_.append(calibrated_classifier) + first_clf = self.calibrated_classifiers_[0].base_estimator + if hasattr(first_clf, "n_features_in_"): + self.n_features_in_ = first_clf.n_features_in_ return self def predict_proba(self, X): @@ -352,7 +350,6 @@ def predict_proba(self, X): The predicted probas. """ check_is_fitted(self) - # Compute the arithmetic mean of the predictions of the calibrated # classifiers mean_proba = np.zeros((_num_samples(X), len(self.classes_))) @@ -431,19 +428,26 @@ def _fit_classifier_calibrator_pair(estimator, X, y, train, test, supports_sw, ------- calibrated_classifier : _CalibratedClassifier instance """ - if sample_weight is not None and supports_sw: - estimator.fit(X[train], y[train], - sample_weight=sample_weight[train]) + X_train, y_train = _safe_indexing(X, train), _safe_indexing(y, train) + X_test, y_test = _safe_indexing(X, test), _safe_indexing(y, test) + if supports_sw and sample_weight is not None: + sw_train = _safe_indexing(sample_weight, train) + sw_test = _safe_indexing(sample_weight, test) + else: + sw_train = None + sw_test = None + + if supports_sw: + estimator.fit(X_train, y_train, sample_weight=sw_train) else: - estimator.fit(X[train], y[train]) + estimator.fit(X_train, y_train) n_classes = len(classes) pred_method = _get_prediction_method(estimator) - predictions = _compute_predictions(pred_method, X[test], n_classes) + predictions = _compute_predictions(pred_method, X_test, n_classes) - sw = None if sample_weight is None else sample_weight[test] calibrated_classifier = _fit_calibrator( - estimator, predictions, y[test], classes, method, sample_weight=sw + estimator, predictions, y_test, classes, method, sample_weight=sw_test ) return calibrated_classifier diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py index 86a638c4a7679..53d620b41031c 100644 --- a/sklearn/tests/test_calibration.py +++ b/sklearn/tests/test_calibration.py @@ -512,19 +512,19 @@ def decision_function(self, X): @pytest.fixture -def text_data(): - text_data = [ +def dict_data(): + dict_data = [ {'state': 'NY', 'age': 'adult'}, {'state': 'TX', 'age': 'adult'}, {'state': 'VT', 'age': 'child'}, ] text_labels = [1, 0, 1] - return text_data, text_labels + return dict_data, text_labels @pytest.fixture -def text_data_pipeline(text_data): - X, y = text_data +def dict_data_pipeline(dict_data): + X, y = dict_data pipeline_prefit = Pipeline([ ('vectorizer', DictVectorizer()), 
('clf', RandomForestClassifier()) @@ -532,7 +532,7 @@ def text_data_pipeline(text_data): return pipeline_prefit.fit(X, y) -def test_calibration_pipeline(text_data, text_data_pipeline): +def test_calibration_dict_pipeline(dict_data, dict_data_pipeline): """Test that calibration works in prefit pipeline with transformer `X` is not array-like, sparse matrix or dataframe at the start. @@ -541,15 +541,17 @@ def test_calibration_pipeline(text_data, text_data_pipeline): Also test it can predict without running into validation errors. See https://github.com/scikit-learn/scikit-learn/issues/19637 """ - X, y = text_data - clf = text_data_pipeline + X, y = dict_data + clf = dict_data_pipeline calib_clf = CalibratedClassifierCV(clf, cv='prefit') calib_clf.fit(X, y) # Check attributes are obtained from fitted estimator assert_array_equal(calib_clf.classes_, clf.classes_) - msg = "'CalibratedClassifierCV' object has no attribute" - with pytest.raises(AttributeError, match=msg): - calib_clf.n_features_in_ + + # Neither the pipeline nor the calibration meta-estimator + # expose the n_features_in_ check on this kind of data. + assert not hasattr(clf, 'n_features_in_') + assert not hasattr(calib_clf, 'n_features_in_') # Ensure that no error is thrown with predict and predict_proba calib_clf.predict(X) @@ -578,6 +580,19 @@ def test_calibration_attributes(clf, cv): assert calib_clf.n_features_in_ == X.shape[1] +def test_calibration_inconsistent_prefit_n_features_in(): + # Check that `n_features_in_` from prefit base estimator + # is consistent with training set + X, y = make_classification(n_samples=10, n_features=5, + n_classes=2, random_state=7) + clf = LinearSVC(C=1).fit(X, y) + calib_clf = CalibratedClassifierCV(clf, cv='prefit') + + msg = "X has 3 features, but LinearSVC is expecting 5 features as input." 
+ with pytest.raises(ValueError, match=msg): + calib_clf.fit(X[:, :3], y) + + # FIXME: remove in 1.1 def test_calibrated_classifier_cv_deprecation(data): # Check that we raise the proper deprecation warning if accessing diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 05f45a51de63d..8ec4125547722 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -261,7 +261,6 @@ def test_search_cv(estimator, check, request): # # check_classifiers_train would need to be updated with the error message N_FEATURES_IN_AFTER_FIT_MODULES_TO_IGNORE = { - 'calibration', 'compose', 'feature_extraction', 'model_selection', diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 38f22bc667f5b..ee2fe055a4b43 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -176,7 +176,6 @@ def _construct_searchcv_instance(SearchCV): N_FEATURES_MODULES_TO_IGNORE = { - 'calibration', 'cluster', 'compose', 'covariance', diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 7561c64abe6a8..71f5b3b42de42 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -959,10 +959,13 @@ def check_dtype_object(name, estimator_orig): def check_complex_data(name, estimator_orig): + rng = np.random.RandomState(42) # check that estimators raise an exception on providing complex data - X = np.random.sample(10) + 1j * np.random.sample(10) + X = rng.uniform(size=10) + 1j * rng.uniform(size=10) X = X.reshape(-1, 1) - y = np.random.sample(10) + 1j * np.random.sample(10) + + # Something both valid for classification and regression + y = rng.randint(low=0, high=2, size=10) + 1j estimator = clone(estimator_orig) with raises(ValueError, match="Complex data not supported"): estimator.fit(X, y) From 57d3668f2a1fea69dafc2e68208576a56812cd45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Wed, 31 Mar 2021 09:20:22 +0200 Subject: [PATCH 280/478] MNT Avoid catastrophic cancellation in mean_variance_axis (#19766) --- doc/whats_new/v1.0.rst | 11 ++++- sklearn/utils/sparsefuncs_fast.pyx | 62 +++++++++++++++++-------- sklearn/utils/tests/test_sparsefuncs.py | 20 ++++++++ 3 files changed, 71 insertions(+), 22 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index be894774f5a27..2b108d2f0e197 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -288,12 +288,19 @@ Changelog :user:`Clifford Akai-Nettey`. :mod:`sklearn.calibration` -............................ +.......................... - |Fix| The predict and predict_proba methods of - :class:`calibration.CalibratedClassifierCV can now properly be used on + :class:`calibration.CalibratedClassifierCV` can now properly be used on prefitted pipelines. :pr:`19641` by :user:`Alek Lefebvre ` +:mod:`sklearn.utils` +.................... + + - |Fix| Fixed a bug in :func:`utils.sparsefuncs.mean_variance_axis` where the + precision of the computed variance was very poor when the real variance is + exactly zero. :pr:`19766` by :user:`Jérémie du Boisberranger `. 
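A hedged sketch of the zero-variance situation described in this entry; the weighted call assumes the ``weights`` support visible in the Cython helpers below, and the printed values are indicative only::

    # A feature that is exactly constant has true variance 0; the reported
    # variance should stay at (numerically) zero instead of being inflated
    # by the mean**2 correction term.
    import numpy as np
    import scipy.sparse as sp
    from sklearn.utils.sparsefuncs import mean_variance_axis

    rng = np.random.RandomState(0)
    X = sp.csr_matrix(np.full((1000, 3), 100.0))
    sample_weight = rng.rand(1000)

    means, variances = mean_variance_axis(X, axis=0, weights=sample_weight)
    print(means)      # close to [100. 100. 100.]
    print(variances)  # expected to be (numerically) zero after this fix
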
+ Code and Documentation Contributors ----------------------------------- diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index e89599918ec5e..4a84c03eff86b 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -124,23 +124,32 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, variances = np.zeros_like(means, dtype=dtype) cdef: - np.ndarray[floating, ndim=1] sum_weights = \ - np.full(fill_value=np.sum(weights), shape=n_features, dtype=dtype) - np.ndarray[floating, ndim=1] sum_weights_nan = \ - np.zeros(shape=n_features, dtype=dtype) - np.ndarray[floating, ndim=1] sum_weights_nz = \ - np.zeros(shape=n_features, dtype=dtype) + np.ndarray[floating, ndim=1] sum_weights = np.full( + fill_value=np.sum(weights), shape=n_features, dtype=dtype) + np.ndarray[floating, ndim=1] sum_weights_nz = np.zeros( + shape=n_features, dtype=dtype) + + np.ndarray[np.uint64_t, ndim=1] counts = np.full( + fill_value=weights.shape[0], shape=n_features, dtype=np.uint64) + np.ndarray[np.uint64_t, ndim=1] counts_nz = np.zeros( + shape=n_features, dtype=np.uint64) for row_ind in range(len(X_indptr) - 1): for i in range(X_indptr[row_ind], X_indptr[row_ind + 1]): col_ind = X_indices[i] if not isnan(X_data[i]): means[col_ind] += (X_data[i] * weights[row_ind]) + # sum of weights where X[:, col_ind] is non-zero + sum_weights_nz[col_ind] += weights[row_ind] + # number of non-zero elements of X[:, col_ind] + counts_nz[col_ind] += 1 else: - sum_weights_nan[col_ind] += weights[row_ind] + # sum of weights where X[:, col_ind] is not nan + sum_weights[col_ind] -= weights[row_ind] + # number of non nan elements of X[:, col_ind] + counts[col_ind] -= 1 for i in range(n_features): - sum_weights[i] -= sum_weights_nan[i] means[i] /= sum_weights[i] for row_ind in range(len(X_indptr) - 1): @@ -149,10 +158,12 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, if not isnan(X_data[i]): diff = X_data[i] - means[col_ind] variances[col_ind] += diff * diff * weights[row_ind] - sum_weights_nz[col_ind] += weights[row_ind] for i in range(n_features): - variances[i] += (sum_weights[i] - sum_weights_nz[i]) * means[i]**2 + if counts[i] != counts_nz[i]: + # only compute it when it's guaranteed to be non-zero to avoid + # catastrophic cancellation. 
+ variances[i] += (sum_weights[i] - sum_weights_nz[i]) * means[i]**2 variances[i] /= sum_weights[i] return means, variances, sum_weights @@ -228,23 +239,32 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, variances = np.zeros_like(means, dtype=dtype) cdef: - np.ndarray[floating, ndim=1] sum_weights = \ - np.full(fill_value=np.sum(weights), shape=n_features, dtype=dtype) - np.ndarray[floating, ndim=1] sum_weights_nan = \ - np.zeros(shape=n_features, dtype=dtype) - np.ndarray[floating, ndim=1] sum_weights_nz = \ - np.zeros(shape=n_features, dtype=dtype) + np.ndarray[floating, ndim=1] sum_weights = np.full( + fill_value=np.sum(weights), shape=n_features, dtype=dtype) + np.ndarray[floating, ndim=1] sum_weights_nz = np.zeros( + shape=n_features, dtype=dtype) + + np.ndarray[np.uint64_t, ndim=1] counts = np.full( + fill_value=weights.shape[0], shape=n_features, dtype=np.uint64) + np.ndarray[np.uint64_t, ndim=1] counts_nz = np.zeros( + shape=n_features, dtype=np.uint64) for col_ind in range(n_features): for i in range(X_indptr[col_ind], X_indptr[col_ind + 1]): row_ind = X_indices[i] if not isnan(X_data[i]): means[col_ind] += (X_data[i] * weights[row_ind]) + # sum of weights where X[:, col_ind] is non-zero + sum_weights_nz[col_ind] += weights[row_ind] + # number of non-zero elements of X[:, col_ind] + counts_nz[col_ind] += 1 else: - sum_weights_nan[col_ind] += weights[row_ind] + # sum of weights where X[:, col_ind] is not nan + sum_weights[col_ind] -= weights[row_ind] + # number of non nan elements of X[:, col_ind] + counts[col_ind] -= 1 for i in range(n_features): - sum_weights[i] -= sum_weights_nan[i] means[i] /= sum_weights[i] for col_ind in range(n_features): @@ -253,10 +273,12 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, if not isnan(X_data[i]): diff = X_data[i] - means[col_ind] variances[col_ind] += diff * diff * weights[row_ind] - sum_weights_nz[col_ind] += weights[row_ind] for i in range(n_features): - variances[i] += (sum_weights[i] - sum_weights_nz[i]) * means[i]**2 + if counts[i] != counts_nz[i]: + # only compute it when it's guaranteed to be non-zero to avoid + # catastrophic cancellation. + variances[i] += (sum_weights[i] - sum_weights_nz[i]) * means[i]**2 variances[i] /= sum_weights[i] return means, variances, sum_weights diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py index 8366aabd751ad..8b087145c3d36 100644 --- a/sklearn/utils/tests/test_sparsefuncs.py +++ b/sklearn/utils/tests/test_sparsefuncs.py @@ -53,6 +53,26 @@ def test_mean_variance_axis0(): assert_array_almost_equal(X_vars, np.var(X_test, axis=0)) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("sparse_constructor", [sp.csr_matrix, sp.csc_matrix]) +def test_mean_variance_axis0_precision(dtype, sparse_constructor): + # Check that there's no big loss of precision when the real variance is + # exactly 0. 
(#19766) + rng = np.random.RandomState(0) + X = np.full(fill_value=100., shape=(1000, 1), dtype=dtype) + # Add some missing records which should be ignored: + missing_indices = rng.choice(np.arange(X.shape[0]), 10, replace=False) + X[missing_indices, 0] = np.nan + X = sparse_constructor(X) + + # Random positive weights: + sample_weight = rng.rand(X.shape[0]).astype(dtype) + + _, var = mean_variance_axis(X, weights=sample_weight, axis=0) + + assert var < np.finfo(dtype).eps + + def test_mean_variance_axis1(): X, _ = make_classification(5, 4, random_state=0) # Sparsify the array a little bit From c9c89cfc85dd8dfefd7921c16c87327d03140a06 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 31 Mar 2021 10:45:38 -0400 Subject: [PATCH 281/478] ENH Adds support for drop + handle_unknown=ignore in the OneHotEncoder (#19041) Co-authored-by: Olivier Grisel --- doc/modules/preprocessing.rst | 28 ++++++- doc/whats_new/v1.0.rst | 7 ++ sklearn/preprocessing/_encoders.py | 40 +++++++--- sklearn/preprocessing/tests/test_encoders.py | 83 +++++++++++++++++++- 4 files changed, 140 insertions(+), 18 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index b87971ec4ae5a..cdde7479b1a4f 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -560,9 +560,7 @@ parameter allows the user to specify a category for each feature to be dropped. This is useful to avoid co-linearity in the input matrix in some classifiers. Such functionality is useful, for example, when using non-regularized regression (:class:`LinearRegression `), -since co-linearity would cause the covariance matrix to be non-invertible. -When this parameter is not None, ``handle_unknown`` must be set to -``error``:: +since co-linearity would cause the covariance matrix to be non-invertible:: >>> X = [['male', 'from US', 'uses Safari'], ... ['female', 'from Europe', 'uses Firefox']] @@ -591,6 +589,30 @@ In the transformed `X`, the first column is the encoding of the feature with categories "male"/"female", while the remaining 6 columns is the encoding of the 2 features with respectively 3 categories each. +When `handle_unknown='ignore'` and `drop` is not None, unknown categories will +be encoded as all zeros:: + + >>> drop_enc = preprocessing.OneHotEncoder(drop='first', + ... handle_unknown='ignore').fit(X) + >>> X_test = [['unknown', 'America', 'IE']] + >>> drop_enc.transform(X_test).toarray() + array([[0., 0., 0., 0., 0.]]) + +All the categories in `X_test` are unknown during transform and will be mapped +to all zeros. This means that unknown categories will have the same mapping as +the dropped category. :meth`OneHotEncoder.inverse_transform` will map all zeros +to the dropped category if a category is dropped and `None` if a category is +not dropped:: + + >>> drop_enc = preprocessing.OneHotEncoder(drop='if_binary', sparse=False, + ... handle_unknown='ignore').fit(X) + >>> X_test = [['unknown', 'America', 'IE']] + >>> X_trans = drop_enc.transform(X_test) + >>> X_trans + array([[0., 0., 0., 0., 0., 0., 0.]]) + >>> drop_enc.inverse_transform(X_trans) + array([['female', None, None]], dtype=object) + :class:`OneHotEncoder` supports categorical features with missing values by considering the missing values as an additional category:: diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 2b108d2f0e197..2aaecb6d9b438 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -79,6 +79,13 @@ Changelog :mod:`sklearn.cluster` ...................... 
+:mod:`sklearn.preprocessing` +............................ + +- |Feature| :class:`preprocessing.OneHotEncoder` now supports + `handle_unknown='ignore'` and dropping categories. :pr:`19041` by + `Thomas Fan`_. + - |Efficiency| The "k-means++" initialization of :class:`cluster.KMeans` and :class:`cluster.MiniBatchKMeans` is now faster, especially in multicore settings. :pr:`19002` by :user:`Jon Crall ` and diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 043f9fc40ef53..4344e010bba1a 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -2,6 +2,7 @@ # Joris Van den Bossche # License: BSD 3 clause +import warnings import numpy as np from scipy import sparse import numbers @@ -110,7 +111,8 @@ def _fit(self, X, handle_unknown='error', force_all_finite=True): raise ValueError(msg) self.categories_.append(cats) - def _transform(self, X, handle_unknown='error', force_all_finite=True): + def _transform(self, X, handle_unknown='error', force_all_finite=True, + warn_on_unknown=False): X_list, n_samples, n_features = self._check_X( X, force_all_finite=force_all_finite) @@ -125,6 +127,7 @@ def _transform(self, X, handle_unknown='error', force_all_finite=True): .format(len(self.categories_,), n_features) ) + columns_with_unknown = [] for i in range(n_features): Xi = X_list[i] diff, valid_mask = _check_unknown(Xi, self.categories_[i], @@ -136,6 +139,8 @@ def _transform(self, X, handle_unknown='error', force_all_finite=True): " during transform".format(diff, i)) raise ValueError(msg) else: + if warn_on_unknown: + columns_with_unknown.append(i) # Set the problematic rows to an acceptable value and # continue `The rows are marked `X_mask` and will be # removed later. @@ -153,6 +158,11 @@ def _transform(self, X, handle_unknown='error', force_all_finite=True): # already called above. X_int[:, i] = _encode(Xi, uniques=self.categories_[i], check_unknown=False) + if columns_with_unknown: + warnings.warn("Found unknown categories in columns " + f"{columns_with_unknown} during transform. These " + "unknown categories will be encoded as all zeros", + UserWarning) return X_int, X_mask @@ -327,14 +337,6 @@ def _validate_keywords(self): msg = ("handle_unknown should be either 'error' or 'ignore', " "got {0}.".format(self.handle_unknown)) raise ValueError(msg) - # If we have both dropped columns and ignored unknown - # values, there will be ambiguous cells. This creates difficulties - # in interpreting the model. - if self.drop is not None and self.handle_unknown != 'error': - raise ValueError( - "`handle_unknown` must be 'error' when the drop parameter is " - "specified, as both would create categories that are all " - "zero.") def _compute_drop_idx(self): if self.drop is None: @@ -459,8 +461,11 @@ def transform(self, X): """ check_is_fitted(self) # validation of X happens in _check_X called by _transform + warn_on_unknown = (self.handle_unknown == "ignore" + and self.drop is not None) X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown, - force_all_finite='allow-nan') + force_all_finite='allow-nan', + warn_on_unknown=warn_on_unknown) n_samples, n_features = X_int.shape @@ -509,8 +514,10 @@ def inverse_transform(self, X): """ Convert the data back to the original representation. - In case unknown categories are encountered (all zeros in the - one-hot encoding), ``None`` is used to represent this category. 
+ When unknown categories are encountered (all zeros in the + one-hot encoding), ``None`` is used to represent this category. If the + feature with the unknown category has a dropped caregory, the dropped + category will be its inverse. Parameters ---------- @@ -571,7 +578,14 @@ def inverse_transform(self, X): unknown = np.asarray(sub.sum(axis=1) == 0).flatten() # ignored unknown categories: we have a row of all zero if unknown.any(): - found_unknown[i] = unknown + # if categories were dropped then unknown categories will + # be mapped to the dropped category + if self.drop_idx_ is None or self.drop_idx_[i] is None: + found_unknown[i] = unknown + else: + X_tr[unknown, i] = self.categories_[i][ + self.drop_idx_[i] + ] else: dropped = np.asarray(sub.sum(axis=1) == 0).flatten() if dropped.any(): diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index b1eff0cad21e0..eb776c4c25267 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -775,8 +775,6 @@ def test_one_hot_encoder_drop_manual(missing_value): "X_fit, params, err_msg", [([["Male"], ["Female"]], {'drop': 'second'}, "Wrong input for parameter `drop`"), - ([["Male"], ["Female"]], {'drop': 'first', 'handle_unknown': 'ignore'}, - "`handle_unknown` must be 'error'"), ([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]], {'drop': np.asarray('b', dtype=object)}, "Wrong input for parameter `drop`"), @@ -914,6 +912,87 @@ def test_ohe_missing_value_support_pandas_categorical(pd_nan_type): assert np.isnan(ohe.categories_[0][-1]) +def test_ohe_drop_first_handle_unknown_ignore_warns(): + """Check drop='first' and handle_unknown='ignore' during transform.""" + X = [['a', 0], ['b', 2], ['b', 1]] + + ohe = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore') + X_trans = ohe.fit_transform(X) + + X_expected = np.array([ + [0, 0, 0], + [1, 0, 1], + [1, 1, 0], + ]) + assert_allclose(X_trans, X_expected) + + # Both categories are unknown + X_test = [['c', 3]] + X_expected = np.array([[0, 0, 0]]) + + warn_msg = (r"Found unknown categories in columns \[0, 1\] during " + "transform. These unknown categories will be encoded as all " + "zeros") + with pytest.warns(UserWarning, match=warn_msg): + X_trans = ohe.transform(X_test) + assert_allclose(X_trans, X_expected) + + # inverse_transform maps to None + X_inv = ohe.inverse_transform(X_expected) + assert_array_equal(X_inv, np.array([['a', 0]], dtype=object)) + + +def test_ohe_drop_if_binary_handle_unknown_ignore_warns(): + """Check drop='if_binary' and handle_unknown='ignore' during transform.""" + X = [['a', 0], ['b', 2], ['b', 1]] + + ohe = OneHotEncoder(drop='if_binary', sparse=False, + handle_unknown='ignore') + X_trans = ohe.fit_transform(X) + + X_expected = np.array([ + [0, 1, 0, 0], + [1, 0, 0, 1], + [1, 0, 1, 0], + ]) + assert_allclose(X_trans, X_expected) + + # Both categories are unknown + X_test = [['c', 3]] + X_expected = np.array([[0, 0, 0, 0]]) + + warn_msg = (r"Found unknown categories in columns \[0, 1\] during " + "transform. 
These unknown categories will be encoded as all " + "zeros") + with pytest.warns(UserWarning, match=warn_msg): + X_trans = ohe.transform(X_test) + assert_allclose(X_trans, X_expected) + + # inverse_transform maps to None + X_inv = ohe.inverse_transform(X_expected) + assert_array_equal(X_inv, np.array([['a', None]], dtype=object)) + + +def test_ohe_drop_first_explicit_categories(): + """Check drop='first' and handle_unknown='ignore' during fit with + categories passed in.""" + + X = [['a', 0], ['b', 2], ['b', 1]] + + ohe = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore', + categories=[['b', 'a'], [1, 2]]) + ohe.fit(X) + + X_test = [['c', 1]] + X_expected = np.array([[0, 0]]) + + warn_msg = (r"Found unknown categories in columns \[0\] during transform. " + r"These unknown categories will be encoded as all zeros") + with pytest.warns(UserWarning, match=warn_msg): + X_trans = ohe.transform(X_test) + assert_allclose(X_trans, X_expected) + + def test_ordinal_encoder_passthrough_missing_values_float_errors_dtype(): """Test ordinal encoder with nan passthrough fails when dtype=np.int32.""" From 108dd7b00095a1265e6f0c4db0a69d620f590400 Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Thu, 1 Apr 2021 16:15:23 +0100 Subject: [PATCH 282/478] TST Changes assert to pytest style in tests/test_naive_bayes.py (#19768) Co-authored-by: Thomas J. Fan Co-authored-by: Alihan Zihna --- sklearn/tests/test_naive_bayes.py | 125 ++++++++++++++++++++---------- 1 file changed, 82 insertions(+), 43 deletions(-) diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index dcd4b07712357..251ba6698ab0f 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -1,3 +1,4 @@ +import re import numpy as np import scipy.sparse @@ -11,10 +12,6 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_raises -from sklearn.utils._testing import assert_raise_message -from sklearn.utils._testing import assert_warns -from sklearn.utils._testing import assert_no_warnings from sklearn.utils._testing import ignore_warnings from sklearn.naive_bayes import GaussianNB, BernoulliNB @@ -118,7 +115,10 @@ def test_gnb_sample_weight(): def test_gnb_neg_priors(): """Test whether an error is raised in case of negative priors""" clf = GaussianNB(priors=np.array([-1., 2.])) - assert_raises(ValueError, clf.fit, X, y) + + msg = 'Priors must be non-negative' + with pytest.raises(ValueError, match=msg): + clf.fit(X, y) def test_gnb_priors(): @@ -146,13 +146,19 @@ def test_gnb_wrong_nb_priors(): """ Test whether an error is raised if the number of prior is different from the number of class""" clf = GaussianNB(priors=np.array([.25, .25, .25, .25])) - assert_raises(ValueError, clf.fit, X, y) + + msg = 'Number of priors must match number of classes' + with pytest.raises(ValueError, match=msg): + clf.fit(X, y) def test_gnb_prior_greater_one(): """Test if an error is raised if the sum of prior greater than one""" clf = GaussianNB(priors=np.array([2., 1.])) - assert_raises(ValueError, clf.fit, X, y) + + msg = 'The sum of the priors should be 1' + with pytest.raises(ValueError, match=msg): + clf.fit(X, y) def test_gnb_prior_large_bias(): @@ -339,9 +345,13 @@ def test_discretenb_provide_prior(DiscreteNaiveBayes): assert_array_almost_equal(prior, np.array([.5, .5])) # Inconsistent number of classes with prior - 
assert_raises(ValueError, clf.fit, [[0], [1], [2]], [0, 1, 2]) - assert_raises(ValueError, clf.partial_fit, [[0], [1]], [0, 1], - classes=[0, 1, 1]) + msg = 'Number of priors must match number of classes' + with pytest.raises(ValueError, match=msg): + clf.fit([[0], [1], [2]], [0, 1, 2]) + + msg = 'is not the same as on last call to partial_fit' + with pytest.raises(ValueError, match=msg): + clf.partial_fit([[0], [1]], [0, 1], classes=[0, 1, 1]) @pytest.mark.parametrize('DiscreteNaiveBayes', DISCRETE_NAIVE_BAYES_CLASSES) @@ -470,7 +480,10 @@ def test_mnnb(kind): # Check the ability to predict the learning set. clf = MultinomialNB() - assert_raises(ValueError, clf.fit, -X, y2) + + msg = 'Negative values in data passed to' + with pytest.raises(ValueError, match=msg): + clf.fit(-X, y2) y_pred = clf.fit(X, y2).predict(X) assert_array_equal(y_pred, y2) @@ -518,18 +531,18 @@ def test_mnb_prior_unobserved_targets(): clf = MultinomialNB() - assert_no_warnings( - clf.partial_fit, X, y, classes=[0, 1, 2] - ) + with pytest.warns(None) as record: + clf.partial_fit(X, y, classes=[0, 1, 2]) + assert len(record) == 0 assert clf.predict([[0, 1]]) == 0 assert clf.predict([[1, 0]]) == 1 assert clf.predict([[1, 1]]) == 0 # add a training example with previously unobserved class - assert_no_warnings( - clf.partial_fit, [[1, 1]], [2] - ) + with pytest.warns(None) as record: + clf.partial_fit([[1, 1]], [2]) + assert len(record) == 0 assert clf.predict([[0, 1]]) == 0 assert clf.predict([[1, 0]]) == 1 @@ -666,7 +679,10 @@ def test_cnb(): # Verify inputs are nonnegative. clf = ComplementNB(alpha=1.0) - assert_raises(ValueError, clf.fit, -X, Y) + + msg = re.escape('Negative values in data passed to ComplementNB (input X)') + with pytest.raises(ValueError, match=msg): + clf.fit(-X, Y) clf.fit(X, Y) @@ -700,9 +716,13 @@ def test_categoricalnb(): # Check error is raised for X with negative entries X = np.array([[0, -1]]) y = np.array([1]) - error_msg = "Negative values in data passed to CategoricalNB (input X)" - assert_raise_message(ValueError, error_msg, clf.predict, X) - assert_raise_message(ValueError, error_msg, clf.fit, X, y) + error_msg = re.escape( + "Negative values in data passed to CategoricalNB (input X)" + ) + with pytest.raises(ValueError, match=error_msg): + clf.predict(X) + with pytest.raises(ValueError, match=error_msg): + clf.fit(X, y) # Test alpha X3_test = np.array([[2, 5]]) @@ -794,52 +814,67 @@ def test_alpha(): X = np.array([[1, 0], [1, 1]]) y = np.array([0, 1]) nb = BernoulliNB(alpha=0.) - assert_warns(UserWarning, nb.partial_fit, X, y, classes=[0, 1]) - assert_warns(UserWarning, nb.fit, X, y) + msg = ( + "alpha too small will result in numeric errors," + " setting alpha = 1.0e-10" + ) + with pytest.warns(UserWarning, match=msg): + nb.partial_fit(X, y, classes=[0, 1]) + with pytest.warns(UserWarning, match=msg): + nb.fit(X, y) prob = np.array([[1, 0], [0, 1]]) assert_array_almost_equal(nb.predict_proba(X), prob) nb = MultinomialNB(alpha=0.) - assert_warns(UserWarning, nb.partial_fit, X, y, classes=[0, 1]) - assert_warns(UserWarning, nb.fit, X, y) + with pytest.warns(UserWarning, match=msg): + nb.partial_fit(X, y, classes=[0, 1]) + with pytest.warns(UserWarning, match=msg): + nb.fit(X, y) prob = np.array([[2. / 3, 1. / 3], [0, 1]]) assert_array_almost_equal(nb.predict_proba(X), prob) nb = CategoricalNB(alpha=0.) 
- assert_warns(UserWarning, nb.fit, X, y) + with pytest.warns(UserWarning, match=msg): + nb.fit(X, y) prob = np.array([[1., 0.], [0., 1.]]) assert_array_almost_equal(nb.predict_proba(X), prob) # Test sparse X X = scipy.sparse.csr_matrix(X) nb = BernoulliNB(alpha=0.) - assert_warns(UserWarning, nb.fit, X, y) + with pytest.warns(UserWarning, match=msg): + nb.fit(X, y) prob = np.array([[1, 0], [0, 1]]) assert_array_almost_equal(nb.predict_proba(X), prob) nb = MultinomialNB(alpha=0.) - assert_warns(UserWarning, nb.fit, X, y) + with pytest.warns(UserWarning, match=msg): + nb.fit(X, y) prob = np.array([[2. / 3, 1. / 3], [0, 1]]) assert_array_almost_equal(nb.predict_proba(X), prob) # Test for alpha < 0 X = np.array([[1, 0], [1, 1]]) y = np.array([0, 1]) - expected_msg = ('Smoothing parameter alpha = -1.0e-01. ' - 'alpha should be > 0.') + expected_msg = re.escape( + 'Smoothing parameter alpha = -1.0e-01. alpha should be > 0.' + ) b_nb = BernoulliNB(alpha=-0.1) m_nb = MultinomialNB(alpha=-0.1) c_nb = CategoricalNB(alpha=-0.1) - assert_raise_message(ValueError, expected_msg, b_nb.fit, X, y) - assert_raise_message(ValueError, expected_msg, m_nb.fit, X, y) - assert_raise_message(ValueError, expected_msg, c_nb.fit, X, y) + with pytest.raises(ValueError, match=expected_msg): + b_nb.fit(X, y) + with pytest.raises(ValueError, match=expected_msg): + m_nb.fit(X, y) + with pytest.raises(ValueError, match=expected_msg): + c_nb.fit(X, y) b_nb = BernoulliNB(alpha=-0.1) m_nb = MultinomialNB(alpha=-0.1) - assert_raise_message(ValueError, expected_msg, b_nb.partial_fit, - X, y, classes=[0, 1]) - assert_raise_message(ValueError, expected_msg, m_nb.partial_fit, - X, y, classes=[0, 1]) + with pytest.raises(ValueError, match=expected_msg): + b_nb.partial_fit(X, y, classes=[0, 1]) + with pytest.raises(ValueError, match=expected_msg): + m_nb.partial_fit(X, y, classes=[0, 1]) def test_alpha_vector(): @@ -862,10 +897,12 @@ def test_alpha_vector(): # Test alpha non-negative alpha = np.array([1., -0.1]) - expected_msg = ('Smoothing parameter alpha = -1.0e-01. ' - 'alpha should be > 0.') m_nb = MultinomialNB(alpha=alpha) - assert_raise_message(ValueError, expected_msg, m_nb.fit, X, y) + expected_msg = ( + 'Smoothing parameter alpha = -1.0e-01. alpha should be > 0.' 
+ ) + with pytest.raises(ValueError, match=expected_msg): + m_nb.fit(X, y) # Test that too small pseudo-counts are replaced ALPHA_MIN = 1e-10 @@ -879,9 +916,11 @@ def test_alpha_vector(): # Test correct dimensions alpha = np.array([1., 2., 3.]) m_nb = MultinomialNB(alpha=alpha) - expected_msg = ('alpha should be a scalar or a numpy array ' - 'with shape [n_features]') - assert_raise_message(ValueError, expected_msg, m_nb.fit, X, y) + expected_msg = re.escape( + 'alpha should be a scalar or a numpy array with shape [n_features]' + ) + with pytest.raises(ValueError, match=expected_msg): + m_nb.fit(X, y) def test_check_accuracy_on_digits(): From bc7cd3189bc817545791071515693445e1e271db Mon Sep 17 00:00:00 2001 From: Frederick Robinson Date: Fri, 2 Apr 2021 01:30:03 -0700 Subject: [PATCH 283/478] ENH more efficient _num_combinations calculation in PolynomialFeatures (#19734) --- doc/whats_new/v1.0.rst | 4 +++ sklearn/preprocessing/_polynomial.py | 35 +++++++++++++++---- .../preprocessing/tests/test_polynomial.py | 21 +++++++++++ 3 files changed, 54 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 2aaecb6d9b438..979ed9096aba1 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -200,6 +200,10 @@ Changelog :pr:`19426` by :user:`Alexandre Gramfort ` and :user:`Maria Telenczuk `. +- |Efficiency| The implementation of `fit` for `PolynomialFeatures` transformer + is now faster. This is especially noticeable on large sparse input. + :pr:`19734` by :user:`Fred Robinson `. + :mod:`sklearn.manifold` ....................... diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 3f4ccc2fa05d4..d1ec49d7539bf 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -8,6 +8,7 @@ import numpy as np from scipy import sparse from scipy.interpolate import BSpline +from scipy.special import comb from ..base import BaseEstimator, TransformerMixin from ..utils import check_array @@ -113,6 +114,29 @@ def _combinations(n_features, degree, interaction_only, include_bias): return chain.from_iterable(comb(range(n_features), i) for i in range(start, degree + 1)) + @staticmethod + def _num_combinations(n_features, degree, interaction_only, include_bias): + """Calculate number of terms in polynomial expansion + + This should be equivalent to counting the number of terms returned by + _combinations(...) but much faster. + """ + + if interaction_only: + combinations = sum( + [ + comb(n_features, i, exact=True) + for i in range(1, min(degree + 1, n_features + 1)) + ] + ) + else: + combinations = comb(n_features + degree, degree, exact=True) - 1 + + if include_bias: + combinations += 1 + + return combinations + @property def powers_(self): check_is_fitted(self) @@ -170,13 +194,12 @@ def fit(self, X, y=None): self : object Fitted transformer. 
""" - n_samples, n_features = self._validate_data( - X, accept_sparse=True).shape - combinations = self._combinations(n_features, self.degree, - self.interaction_only, - self.include_bias) + _, n_features = self._validate_data(X, accept_sparse=True).shape self.n_input_features_ = n_features - self.n_output_features_ = sum(1 for _ in combinations) + self.n_output_features_ = self._num_combinations( + n_features, self.degree, self.interaction_only, self.include_bias + ) + return self def transform(self, X): diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py index 5068a8c7d8bdd..59c3a59df8873 100644 --- a/sklearn/preprocessing/tests/test_polynomial.py +++ b/sklearn/preprocessing/tests/test_polynomial.py @@ -552,6 +552,27 @@ def test_polynomial_features_csr_X(deg, include_bias, interaction_only, dtype): assert_array_almost_equal(Xt_csr.A, Xt_dense) +@pytest.mark.parametrize("n_features", [1, 4, 5]) +@pytest.mark.parametrize("degree", range(1, 5)) +@pytest.mark.parametrize("interaction_only", [True, False]) +@pytest.mark.parametrize("include_bias", [True, False]) +def test_num_combinations(n_features, degree, interaction_only, include_bias): + """ + Test that n_output_features_ is calculated correctly. + """ + x = sparse.csr_matrix(([1], ([0], [n_features - 1]))) + est = PolynomialFeatures( + degree, interaction_only=interaction_only, include_bias=include_bias + ) + est.fit(x) + num_combos = est.n_output_features_ + + combos = PolynomialFeatures._combinations( + n_features, degree, interaction_only, include_bias + ) + assert num_combos == sum([1 for _ in combos]) + + @pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], [(2, True, False, np.float32), (2, True, False, np.float64), From a9ae69397e114d8b4df0f3f1cfb1f25525b43fc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=20Dupr=C3=A9=20la=20Tour?= Date: Fri, 2 Apr 2021 01:44:06 -0700 Subject: [PATCH 284/478] FIX Approximate nearest neighbors in TSNE example (#19809) --- .../approximate_nearest_neighbors.py | 64 ++++++++----------- 1 file changed, 28 insertions(+), 36 deletions(-) diff --git a/examples/neighbors/approximate_nearest_neighbors.py b/examples/neighbors/approximate_nearest_neighbors.py index b7f09d3127b98..78f5f184a0da7 100644 --- a/examples/neighbors/approximate_nearest_neighbors.py +++ b/examples/neighbors/approximate_nearest_neighbors.py @@ -8,11 +8,6 @@ replace KNeighborsTransformer and perform approximate nearest neighbors. These packages can be installed with `pip install annoy nmslib`. -Note: Currently `TSNE(metric='precomputed')` does not modify the precomputed -distances, and thus assumes that precomputed euclidean distances are squared. -In future versions, a parameter in TSNE will control the optional squaring of -precomputed distances (see #12401). 
- Note: In KNeighborsTransformer we use the definition which includes each training point as its own neighbor in the count of `n_neighbors`, and for compatibility reasons, one extra neighbor is computed when @@ -91,7 +86,6 @@ def fit(self, X): # see more metric in the manual # https://github.com/nmslib/nmslib/tree/master/manual space = { - 'sqeuclidean': 'l2', 'euclidean': 'l2', 'cosine': 'cosinesimil', 'l1': 'l1', @@ -115,9 +109,6 @@ def transform(self, X): indices, distances = zip(*results) indices, distances = np.vstack(indices), np.vstack(distances) - if self.metric == 'sqeuclidean': - distances **= 2 - indptr = np.arange(0, n_samples_transform * n_neighbors + 1, n_neighbors) kneighbors_graph = csr_matrix((distances.ravel(), indices.ravel(), @@ -139,8 +130,7 @@ def __init__(self, n_neighbors=5, metric='euclidean', n_trees=10, def fit(self, X): self.n_samples_fit_ = X.shape[0] - metric = self.metric if self.metric != 'sqeuclidean' else 'euclidean' - self.annoy_ = annoy.AnnoyIndex(X.shape[1], metric=metric) + self.annoy_ = annoy.AnnoyIndex(X.shape[1], metric=self.metric) for i, x in enumerate(X): self.annoy_.add_item(i, x.tolist()) self.annoy_.build(self.n_trees) @@ -177,9 +167,6 @@ def _transform(self, X): x.tolist(), n_neighbors, self.search_k, include_distances=True) - if self.metric == 'sqeuclidean': - distances **= 2 - indptr = np.arange(0, n_samples_transform * n_neighbors + 1, n_neighbors) kneighbors_graph = csr_matrix((distances.ravel(), indices.ravel(), @@ -209,7 +196,7 @@ def test_transformers(): def load_mnist(n_samples): """Load MNIST, shuffle the data, and return only n_samples.""" - mnist = fetch_openml("mnist_784") + mnist = fetch_openml("mnist_784", as_frame=False) X, y = shuffle(mnist.data, mnist.target, random_state=2) return X[:n_samples] / 255, y[:n_samples] @@ -222,34 +209,39 @@ def run_benchmark(): n_iter = 500 perplexity = 30 + metric = "euclidean" # TSNE requires a certain number of neighbors which depends on the # perplexity parameter. # Add one since we include each sample as its own neighbor. n_neighbors = int(3. 
* perplexity + 1) + 1 + tsne_params = dict(perplexity=perplexity, method="barnes_hut", + random_state=42, n_iter=n_iter, + square_distances=True) + transformers = [ - ('AnnoyTransformer', AnnoyTransformer(n_neighbors=n_neighbors, - metric='sqeuclidean')), - ('NMSlibTransformer', NMSlibTransformer(n_neighbors=n_neighbors, - metric='sqeuclidean')), - ('KNeighborsTransformer', KNeighborsTransformer( - n_neighbors=n_neighbors, mode='distance', metric='sqeuclidean')), - ('TSNE with AnnoyTransformer', make_pipeline( - AnnoyTransformer(n_neighbors=n_neighbors, metric='sqeuclidean'), - TSNE(metric='precomputed', perplexity=perplexity, - method="barnes_hut", random_state=42, n_iter=n_iter), )), - ('TSNE with NMSlibTransformer', make_pipeline( - NMSlibTransformer(n_neighbors=n_neighbors, metric='sqeuclidean'), - TSNE(metric='precomputed', perplexity=perplexity, - method="barnes_hut", random_state=42, n_iter=n_iter), )), - ('TSNE with KNeighborsTransformer', make_pipeline( - KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance', - metric='sqeuclidean'), - TSNE(metric='precomputed', perplexity=perplexity, - method="barnes_hut", random_state=42, n_iter=n_iter), )), + ('AnnoyTransformer', + AnnoyTransformer(n_neighbors=n_neighbors, metric=metric)), + ('NMSlibTransformer', + NMSlibTransformer(n_neighbors=n_neighbors, metric=metric)), + ('KNeighborsTransformer', + KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance', + metric=metric)), + ('TSNE with AnnoyTransformer', + make_pipeline( + AnnoyTransformer(n_neighbors=n_neighbors, metric=metric), + TSNE(metric='precomputed', **tsne_params))), + ('TSNE with NMSlibTransformer', + make_pipeline( + NMSlibTransformer(n_neighbors=n_neighbors, metric=metric), + TSNE(metric='precomputed', **tsne_params))), + ('TSNE with KNeighborsTransformer', + make_pipeline( + KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance', + metric=metric), + TSNE(metric='precomputed', **tsne_params))), ('TSNE with internal NearestNeighbors', - TSNE(metric='sqeuclidean', perplexity=perplexity, method="barnes_hut", - random_state=42, n_iter=n_iter)), + TSNE(metric=metric, **tsne_params)), ] # init the plot From 309f135c3284d7db6e23ca81a87948c7066a3949 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 2 Apr 2021 15:40:20 +0100 Subject: [PATCH 285/478] MNT Remove HistGradientBoosting from experimental (#19799) --- asv_benchmarks/benchmarks/ensemble.py | 1 - benchmarks/bench_hist_gradient_boosting.py | 2 - .../bench_hist_gradient_boosting_adult.py | 1 - ...hist_gradient_boosting_categorical_only.py | 1 - ...bench_hist_gradient_boosting_higgsboson.py | 2 - .../bench_hist_gradient_boosting_threading.py | 2 - doc/conf.py | 1 - doc/developers/maintainer.rst | 29 ++++++++++--- doc/modules/ensemble.rst | 16 +------ doc/whats_new/v0.21.rst | 5 +++ doc/whats_new/v1.0.rst | 5 +++ .../plot_gradient_boosting_categorical.py | 1 - .../ensemble/plot_monotonic_constraints.py | 1 - examples/ensemble/plot_stack_predictors.py | 1 - .../inspection/plot_partial_dependence.py | 1 - ...plot_poisson_regression_non_normal_loss.py | 1 - .../plot_release_highlights_0_22_0.py | 1 - .../plot_release_highlights_0_23_0.py | 2 - sklearn/ensemble/__init__.py | 13 ++---- .../gradient_boosting.py | 26 ----------- .../tests/test_compare_lightgbm.py | 2 - .../tests/test_gradient_boosting.py | 2 - .../tests/test_monotonic_contraints.py | 1 - .../tests/test_warm_start.py | 2 - .../enable_hist_gradient_boosting.py | 43 ++++++------------- .../test_enable_hist_gradient_boosting.py | 41 
+++--------------- .../tests/test_from_model.py | 1 - .../tests/test_sequential.py | 1 - .../tests/test_partial_dependence.py | 1 - sklearn/model_selection/tests/test_search.py | 1 - sklearn/tests/test_pipeline.py | 1 - 31 files changed, 59 insertions(+), 149 deletions(-) diff --git a/asv_benchmarks/benchmarks/ensemble.py b/asv_benchmarks/benchmarks/ensemble.py index c46ac07c84475..8977eb0d10f20 100644 --- a/asv_benchmarks/benchmarks/ensemble.py +++ b/asv_benchmarks/benchmarks/ensemble.py @@ -1,4 +1,3 @@ -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier) diff --git a/benchmarks/bench_hist_gradient_boosting.py b/benchmarks/bench_hist_gradient_boosting.py index 82eb64faeb462..533861b1b63e4 100644 --- a/benchmarks/bench_hist_gradient_boosting.py +++ b/benchmarks/bench_hist_gradient_boosting.py @@ -4,8 +4,6 @@ import matplotlib.pyplot as plt import numpy as np from sklearn.model_selection import train_test_split -# To use this experimental feature, we need to explicitly ask for it: -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.datasets import make_classification diff --git a/benchmarks/bench_hist_gradient_boosting_adult.py b/benchmarks/bench_hist_gradient_boosting_adult.py index 5b47fcb3a6678..49109cfc049bb 100644 --- a/benchmarks/bench_hist_gradient_boosting_adult.py +++ b/benchmarks/bench_hist_gradient_boosting_adult.py @@ -4,7 +4,6 @@ from sklearn.model_selection import train_test_split from sklearn.datasets import fetch_openml from sklearn.metrics import accuracy_score, roc_auc_score -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.utils import ( get_equivalent_estimator) diff --git a/benchmarks/bench_hist_gradient_boosting_categorical_only.py b/benchmarks/bench_hist_gradient_boosting_categorical_only.py index 6c69b32eff26f..d3d7a871b41d2 100644 --- a/benchmarks/bench_hist_gradient_boosting_categorical_only.py +++ b/benchmarks/bench_hist_gradient_boosting_categorical_only.py @@ -3,7 +3,6 @@ from sklearn.preprocessing import KBinsDiscretizer from sklearn.datasets import make_classification -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.utils import ( get_equivalent_estimator) diff --git a/benchmarks/bench_hist_gradient_boosting_higgsboson.py b/benchmarks/bench_hist_gradient_boosting_higgsboson.py index 2c74bb8818343..4e795a18ae2ce 100644 --- a/benchmarks/bench_hist_gradient_boosting_higgsboson.py +++ b/benchmarks/bench_hist_gradient_boosting_higgsboson.py @@ -9,8 +9,6 @@ from joblib import Memory from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, roc_auc_score -# To use this experimental feature, we need to explicitly ask for it: -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.utils import ( get_equivalent_estimator) diff --git a/benchmarks/bench_hist_gradient_boosting_threading.py b/benchmarks/bench_hist_gradient_boosting_threading.py index 61803fb5cb9cc..6ab5de294dced 100644 --- 
a/benchmarks/bench_hist_gradient_boosting_threading.py +++ b/benchmarks/bench_hist_gradient_boosting_threading.py @@ -7,8 +7,6 @@ from threadpoolctl import threadpool_limits import sklearn from sklearn.model_selection import train_test_split -# To use this experimental feature, we need to explicitly ask for it: -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.datasets import make_classification diff --git a/doc/conf.py b/doc/conf.py index 6768aab208a99..ba6b0595a7d44 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -358,7 +358,6 @@ def __call__(self, directory): # enable experimental module so that experimental estimators can be # discovered properly by sphinx -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.experimental import enable_iterative_imputer # noqa from sklearn.experimental import enable_halving_search_cv # noqa diff --git a/doc/developers/maintainer.rst b/doc/developers/maintainer.rst index e4115e87025c7..8fd439c984660 100644 --- a/doc/developers/maintainer.rst +++ b/doc/developers/maintainer.rst @@ -363,10 +363,17 @@ deprecation cycle. To create an experimental module, you can just copy and modify the content of `enable_hist_gradient_boosting.py -`_, +`__, or `enable_iterative_imputer.py -`_. +`_. + +.. note:: + + These are permalink as in 0.24, where these estimators are still + experimental. They might be stable at the time of reading - hence the + permalink. See below for instructions on the transition from experimental + to stable. Note that the public import path must be to a public subpackage (like ``sklearn/ensemble`` or ``sklearn/impute``), not just a ``.py`` module. @@ -379,14 +386,15 @@ in the future when the features aren't experimental anymore. To avoid type checker (e.g. mypy) errors a direct import of experimental estimators should be done in the parent module, protected by the ``if typing.TYPE_CHECKING`` check. See `sklearn/ensemble/__init__.py -`_, +`_, or `sklearn/impute/__init__.py -`_ +`_ for an example. Please also write basic tests following those in `test_enable_hist_gradient_boosting.py -`_. +`__. + Make sure every user-facing code you write explicitly mentions that the feature is experimental, and add a ``# noqa`` comment to avoid pep8-related warnings:: @@ -402,3 +410,14 @@ sklearn.experimental import *`` **does not work**. Note that some experimental classes / functions are not included in the :mod:`sklearn.experimental` module: ``sklearn.datasets.fetch_openml``. + +Once the feature become stable, remove all `enable_my_experimental_feature` +in the scikit-learn code (even feature highlights etc.) and make the +`enable_my_experimental_feature` a no-op that just raises a warning: +`enable_hist_gradient_boosting.py +`__. +The file should stay there indefinitely as we don't want to break users code: +we just incentivize them to remove that import with the warning. + +Also update the tests accordingly: `test_enable_hist_gradient_boosting.py +`__. diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index c891b4d275b9a..329215406c39c 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -467,7 +467,7 @@ trees. .. 
note:: - Scikit-learn 0.21 introduces two new experimental implementations of + Scikit-learn 0.21 introduces two new implementations of gradient boosting trees, namely :class:`HistGradientBoostingClassifier` and :class:`HistGradientBoostingRegressor`, inspired by `LightGBM `__ (See [LightGBM]_). @@ -898,7 +898,7 @@ based on permutation of the features. Histogram-Based Gradient Boosting ================================= -Scikit-learn 0.21 introduced two new experimental implementations of +Scikit-learn 0.21 introduced two new implementations of gradient boosting trees, namely :class:`HistGradientBoostingClassifier` and :class:`HistGradientBoostingRegressor`, inspired by `LightGBM `__ (See [LightGBM]_). @@ -920,15 +920,6 @@ estimators is slightly different, and some of the features from :class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor` are not yet supported, for instance some loss functions. -These estimators are still **experimental**: their predictions -and their API might change without any deprecation cycle. To use them, you -need to explicitly import ``enable_hist_gradient_boosting``:: - - >>> # explicitly require this experimental feature - >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa - >>> # now you can import normally from ensemble - >>> from sklearn.ensemble import HistGradientBoostingClassifier - .. topic:: Examples: * :ref:`sphx_glr_auto_examples_inspection_plot_partial_dependence.py` @@ -941,7 +932,6 @@ Most of the parameters are unchanged from One exception is the ``max_iter`` parameter that replaces ``n_estimators``, and controls the number of iterations of the boosting process:: - >>> from sklearn.experimental import enable_hist_gradient_boosting >>> from sklearn.ensemble import HistGradientBoostingClassifier >>> from sklearn.datasets import make_hastie_10_2 @@ -992,7 +982,6 @@ with missing values should go to the left or right child, based on the potential gain. When predicting, samples with missing values are assigned to the left or right child consequently:: - >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa >>> from sklearn.ensemble import HistGradientBoostingClassifier >>> import numpy as np @@ -1146,7 +1135,6 @@ You can specify a monotonic constraint on each feature using the constraint, while -1 and 1 indicate a negative and positive constraint, respectively:: - >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa >>> from sklearn.ensemble import HistGradientBoostingRegressor ... # positive, negative, and no constraint on the 3 features diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index cf9886a6636af..8012fd02b4733 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -420,6 +420,11 @@ Support for Python 3.4 and below has been officially dropped. >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa >>> # now you can import normally from sklearn.ensemble >>> from sklearn.ensemble import HistGradientBoostingClassifier + + .. note:: + Update: since version 1.0, these estimators are not experimental + anymore and you don't need to use `from sklearn.experimental import + enable_hist_gradient_boosting`. :pr:`12807` by :user:`Nicolas Hug`. diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 979ed9096aba1..f75c29586efca 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -137,6 +137,11 @@ Changelog target. Additional private refactoring was performed. :pr:`19162` by :user:`Guillaume Lemaitre `. 
+- |Enhancement| :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and + :class:`~sklearn.ensemble.HistGradientBoostingRegressor` are no longer + experimental. They are now considered stable and are subject to the same + deprecation cycles as all other estimators. :pr:`19799` by `Nicolas Hug`_. + :mod:`sklearn.feature_extraction` ................................. diff --git a/examples/ensemble/plot_gradient_boosting_categorical.py b/examples/ensemble/plot_gradient_boosting_categorical.py index 820a508f4de3c..876a1ca21ec4c 100644 --- a/examples/ensemble/plot_gradient_boosting_categorical.py +++ b/examples/ensemble/plot_gradient_boosting_categorical.py @@ -45,7 +45,6 @@ # As a baseline, we create an estimator where the categorical features are # dropped: -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.pipeline import make_pipeline from sklearn.compose import make_column_transformer diff --git a/examples/ensemble/plot_monotonic_constraints.py b/examples/ensemble/plot_monotonic_constraints.py index 8b3f69f1d542e..c173ef35cf311 100644 --- a/examples/ensemble/plot_monotonic_constraints.py +++ b/examples/ensemble/plot_monotonic_constraints.py @@ -18,7 +18,6 @@ This example was inspired by the `XGBoost documentation `_. """ -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.inspection import plot_partial_dependence import numpy as np diff --git a/examples/ensemble/plot_stack_predictors.py b/examples/ensemble/plot_stack_predictors.py index c07068b060c57..afa48c62d8d0b 100644 --- a/examples/ensemble/plot_stack_predictors.py +++ b/examples/ensemble/plot_stack_predictors.py @@ -160,7 +160,6 @@ def load_ames_housing(): rf_pipeline # %% -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor gbdt_pipeline = make_pipeline( diff --git a/examples/inspection/plot_partial_dependence.py b/examples/inspection/plot_partial_dependence.py index 927857d845f9e..ac8d20ec9f155 100644 --- a/examples/inspection/plot_partial_dependence.py +++ b/examples/inspection/plot_partial_dependence.py @@ -134,7 +134,6 @@ # Let's now fit a :class:`~sklearn.ensemble.HistGradientBoostingRegressor` and # compute the partial dependence on the same features. -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor print("Training HistGradientBoostingRegressor...") diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 9541be1f62b24..7ebda543b4059 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -258,7 +258,6 @@ def score_estimator(estimator, df_test): # least-squares loss. Here we only fit trees with the Poisson loss to keep this # example concise. 
-from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.preprocessing import OrdinalEncoder diff --git a/examples/release_highlights/plot_release_highlights_0_22_0.py b/examples/release_highlights/plot_release_highlights_0_22_0.py index d9efc9a520af1..cc0cfe674c61d 100644 --- a/examples/release_highlights/plot_release_highlights_0_22_0.py +++ b/examples/release_highlights/plot_release_highlights_0_22_0.py @@ -131,7 +131,6 @@ # support for missing values (NaNs). This means that there is no need for # imputing data when training or predicting. -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier X = np.array([0, 1, 2, np.nan]).reshape(-1, 1) diff --git a/examples/release_highlights/plot_release_highlights_0_23_0.py b/examples/release_highlights/plot_release_highlights_0_23_0.py index a34c23b4912be..364cd7958003e 100644 --- a/examples/release_highlights/plot_release_highlights_0_23_0.py +++ b/examples/release_highlights/plot_release_highlights_0_23_0.py @@ -36,7 +36,6 @@ import numpy as np from sklearn.model_selection import train_test_split from sklearn.linear_model import PoissonRegressor -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor n_samples, n_features = 1000, 20 @@ -124,7 +123,6 @@ from matplotlib import pyplot as plt from sklearn.model_selection import train_test_split from sklearn.inspection import plot_partial_dependence -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor n_samples = 500 diff --git a/sklearn/ensemble/__init__.py b/sklearn/ensemble/__init__.py index ae86349ad9af0..0a78a774cca36 100644 --- a/sklearn/ensemble/__init__.py +++ b/sklearn/ensemble/__init__.py @@ -2,8 +2,6 @@ The :mod:`sklearn.ensemble` module includes ensemble-based methods for classification, regression and anomaly detection. """ -import typing - from ._base import BaseEnsemble from ._forest import RandomForestClassifier from ._forest import RandomForestRegressor @@ -21,13 +19,9 @@ from ._voting import VotingRegressor from ._stacking import StackingClassifier from ._stacking import StackingRegressor - -if typing.TYPE_CHECKING: - # Avoid errors in type checkers (e.g. mypy) for experimental estimators. - # TODO: remove this check once the estimator is no longer experimental. 
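
To make the net effect of this patch concrete, a minimal usage sketch (illustrative, not part of the diff): from 1.0 on the histogram-based estimators are imported directly from ``sklearn.ensemble``, and the old experimental import is kept only as a no-op that emits a warning (see the rewritten ``enable_hist_gradient_boosting.py`` further down):

    # No longer needed:
    #   from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.datasets import load_iris
    from sklearn.ensemble import HistGradientBoostingClassifier

    X, y = load_iris(return_X_y=True)
    clf = HistGradientBoostingClassifier(max_iter=100).fit(X, y)
    print(clf.score(X, y))   # close to 1.0 on the training data
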
- from ._hist_gradient_boosting.gradient_boosting import ( # noqa - HistGradientBoostingRegressor, HistGradientBoostingClassifier - ) +from ._hist_gradient_boosting.gradient_boosting import ( + HistGradientBoostingRegressor, HistGradientBoostingClassifier +) __all__ = ["BaseEnsemble", "RandomForestClassifier", "RandomForestRegressor", @@ -37,4 +31,5 @@ "GradientBoostingRegressor", "AdaBoostClassifier", "AdaBoostRegressor", "VotingClassifier", "VotingRegressor", "StackingClassifier", "StackingRegressor", + 'HistGradientBoostingClassifier', 'HistGradientBoostingRegressor', ] diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index c35f79bd79251..d3b62a5df784a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -887,17 +887,6 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): This implementation is inspired by `LightGBM `_. - .. note:: - - This estimator is still **experimental** for now: the predictions - and the API might change without any deprecation cycle. To use it, - you need to explicitly import ``enable_hist_gradient_boosting``:: - - >>> # explicitly require this experimental feature - >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa - >>> # now you can import normally from ensemble - >>> from sklearn.ensemble import HistGradientBoostingRegressor - Read more in the :ref:`User Guide `. .. versionadded:: 0.21 @@ -1040,8 +1029,6 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): Examples -------- - >>> # To use this experimental feature, we need to explicitly ask for it: - >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa >>> from sklearn.ensemble import HistGradientBoostingRegressor >>> from sklearn.datasets import load_diabetes >>> X, y = load_diabetes(return_X_y=True) @@ -1156,17 +1143,6 @@ class HistGradientBoostingClassifier(ClassifierMixin, This implementation is inspired by `LightGBM `_. - .. note:: - - This estimator is still **experimental** for now: the predictions - and the API might change without any deprecation cycle. To use it, - you need to explicitly import ``enable_hist_gradient_boosting``:: - - >>> # explicitly require this experimental feature - >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa - >>> # now you can import normally from ensemble - >>> from sklearn.ensemble import HistGradientBoostingClassifier - Read more in the :ref:`User Guide `. .. 
versionadded:: 0.21 @@ -1304,8 +1280,6 @@ class HistGradientBoostingClassifier(ClassifierMixin, Examples -------- - >>> # To use this experimental feature, we need to explicitly ask for it: - >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa >>> from sklearn.ensemble import HistGradientBoostingClassifier >>> from sklearn.datasets import load_iris >>> X, y = load_iris(return_X_y=True) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py index 4a6c4dbbb32c7..f34dffab2671c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py @@ -4,8 +4,6 @@ import numpy as np import pytest -# To use this experimental feature, we need to explicitly ask for it: -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 265b4cf20f8f3..b2322f29f85d1 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -13,8 +13,6 @@ from sklearn.exceptions import NotFittedError from sklearn.compose import make_column_transformer -# To use this experimental feature, we need to explicitly ask for it: -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py index 29fc95d4bb070..725f9f6537865 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py @@ -10,7 +10,6 @@ compute_node_value ) from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py index 2417de4f6cc63..044a6237bc54d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py @@ -7,8 +7,6 @@ from sklearn.base import clone from sklearn.datasets import make_classification, make_regression -# To use this experimental feature, we need to explicitly ask for it: -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.metrics import check_scoring diff --git a/sklearn/experimental/enable_hist_gradient_boosting.py b/sklearn/experimental/enable_hist_gradient_boosting.py index d7ceefbd58a2f..f0416ac013e96 100644 --- 
a/sklearn/experimental/enable_hist_gradient_boosting.py +++ b/sklearn/experimental/enable_hist_gradient_boosting.py @@ -1,36 +1,21 @@ -"""Enables histogram-based gradient boosting estimators. +"""This is now a no-op and can be safely removed from your code. -The API and results of these estimators might change without any deprecation -cycle. - -Importing this file dynamically sets the +It used to enable the use of :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and -:class:`~sklearn.ensemble.HistGradientBoostingRegressor` as attributes of the -ensemble module:: - - >>> # explicitly require this experimental feature - >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa - >>> # now you can import normally from ensemble - >>> from sklearn.ensemble import HistGradientBoostingClassifier - >>> from sklearn.ensemble import HistGradientBoostingRegressor - - -The ``# noqa`` comment comment can be removed: it just tells linters like -flake8 to ignore the import, which appears as unused. +:class:`~sklearn.ensemble.HistGradientBoostingRegressor` when they were still +:term:`experimental`, but these estimators are now stable and can be imported +normally from `sklearn.ensemble`. """ +# Don't remove this file, we don't want to break users code just because the +# feature isn't experimental anymore. -from ..ensemble._hist_gradient_boosting.gradient_boosting import ( - HistGradientBoostingClassifier, - HistGradientBoostingRegressor -) -from .. import ensemble +import warnings -# use settattr to avoid mypy errors when monkeypatching -setattr(ensemble, "HistGradientBoostingClassifier", - HistGradientBoostingClassifier) -setattr(ensemble, "HistGradientBoostingRegressor", - HistGradientBoostingRegressor) -ensemble.__all__ += ['HistGradientBoostingClassifier', - 'HistGradientBoostingRegressor'] +warnings.warn( + "Since version 1.0, " + "it is not needed to import enable_hist_gradient_boosting anymore. " + "HistGradientBoostingClassifier and HistGradientBoostingRegressor are now " + "stable and can be normally imported from sklearn.ensemble." +) diff --git a/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py b/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py index 06c0976d95a1f..8ea365fed6e59 100644 --- a/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py +++ b/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py @@ -5,41 +5,10 @@ from sklearn.utils._testing import assert_run_python_script -def test_imports_strategies(): - # Make sure different import strategies work or fail as expected. - - # Since Python caches the imported modules, we need to run a child process - # for every test case. Else, the tests would not be independent - # (manually removing the imports from the cache (sys.modules) is not - # recommended and can lead to many complications). 
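As an aside to the new module docstring above, a minimal sketch of the behaviour it describes (assuming scikit-learn >= 1.0; the load_iris fit is only an illustrative choice, not part of the patch):

import warnings

# The estimators now import directly from sklearn.ensemble:
from sklearn.datasets import load_iris
from sklearn.ensemble import HistGradientBoostingClassifier

X, y = load_iris(return_X_y=True)
clf = HistGradientBoostingClassifier().fit(X, y)
print(clf.score(X, y))

# The legacy experimental import is kept as a no-op that only emits a warning
# (visible in a fresh interpreter, since Python caches modules after first import):
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
if caught:
    print(caught[0].message)

The test rewritten below in this patch (test_import_raises_warning) checks the same warning with pytest.warns inside a subprocess started by assert_run_python_script.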
- - good_import = """ - from sklearn.experimental import enable_hist_gradient_boosting - from sklearn.ensemble import GradientBoostingClassifier - from sklearn.ensemble import GradientBoostingRegressor - """ - assert_run_python_script(textwrap.dedent(good_import)) - - good_import_with_ensemble_first = """ - import sklearn.ensemble - from sklearn.experimental import enable_hist_gradient_boosting - from sklearn.ensemble import GradientBoostingClassifier - from sklearn.ensemble import GradientBoostingRegressor - """ - assert_run_python_script(textwrap.dedent(good_import_with_ensemble_first)) - - bad_imports = """ +def test_import_raises_warning(): + code = """ import pytest - - with pytest.raises(ImportError): - from sklearn.ensemble import HistGradientBoostingClassifier - - with pytest.raises(ImportError): - from sklearn.ensemble._hist_gradient_boosting import ( - HistGradientBoostingClassifier) - - import sklearn.experimental - with pytest.raises(ImportError): - from sklearn.ensemble import HistGradientBoostingClassifier + with pytest.warns(UserWarning, match="it is not needed to import"): + from sklearn.experimental import enable_hist_gradient_boosting # noqa """ - assert_run_python_script(textwrap.dedent(bad_imports)) + assert_run_python_script(textwrap.dedent(code)) diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index 37b5c105e1daa..17488b397b0c8 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -10,7 +10,6 @@ from sklearn.linear_model import LogisticRegression, SGDClassifier, Lasso from sklearn.svm import LinearSVC from sklearn.feature_selection import SelectFromModel -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import (RandomForestClassifier, HistGradientBoostingClassifier) from sklearn.linear_model import PassiveAggressiveClassifier diff --git a/sklearn/feature_selection/tests/test_sequential.py b/sklearn/feature_selection/tests/test_sequential.py index 2ca22517ef956..163f7acba6ce1 100644 --- a/sklearn/feature_selection/tests/test_sequential.py +++ b/sklearn/feature_selection/tests/test_sequential.py @@ -8,7 +8,6 @@ from sklearn.feature_selection import SequentialFeatureSelector from sklearn.datasets import make_regression from sklearn.linear_model import LinearRegression -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 51dd6e53e4304..f79b2aca3beae 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -15,7 +15,6 @@ from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble import RandomForestRegressor -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.linear_model import LinearRegression diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 25c4ce8cc22f7..b74e250e94192 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -69,7 +69,6 @@ from sklearn.impute import SimpleImputer from 
sklearn.pipeline import Pipeline from sklearn.linear_model import Ridge, SGDClassifier, LinearRegression -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.model_selection.tests.common import OneTimeSplitter diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 85d2f7b6e07ca..93f19cdb8a93f 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -36,7 +36,6 @@ from sklearn.datasets import load_iris from sklearn.preprocessing import StandardScaler from sklearn.feature_extraction.text import CountVectorizer -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.impute import SimpleImputer From 26e688d31e86461b978ca5cf7d23c279ac3f7299 Mon Sep 17 00:00:00 2001 From: Luca Bittarello <15511539+lbittarello@users.noreply.github.com> Date: Fri, 2 Apr 2021 22:08:16 +0200 Subject: [PATCH 286/478] ENH Record output of transformers in ColumnTransformer (#18393) Co-authored-by: Nicolas Hug Co-authored-by: Joel Nothman <78827+jnothman@users.noreply.github.com> --- doc/whats_new/v1.0.rst | 7 ++ sklearn/compose/_column_transformer.py | 29 ++++++ .../compose/tests/test_column_transformer.py | 88 ++++++++++++++++++- 3 files changed, 123 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index f75c29586efca..9eb49b0139a6b 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -109,6 +109,13 @@ Changelog - |Enhancement| :func:`datasets.fetch_kddcup99` raises a better message when the cached file is invalid. :pr:`19669` `Thomas Fan`_. +:mod:`sklearn.compose` +...................... + +- |Enhancement| :class:`compose.ColumnTransformer` now records the output + of each transformer in `output_indices_`. :pr:`18393` by + :user:`Luca Bittarello `. + :mod:`sklearn.decomposition` ............................ diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index c0444fe2d6cda..da4a2dd93507c 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -134,6 +134,12 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): sparse matrix or a dense numpy array, which depends on the output of the individual transformers and the `sparse_threshold` keyword. + output_indices_ : dict + A dictionary from each transformer name to a slice, where the slice + corresponds to indices in the transformed output. This is useful to + inspect which transformer is responsible for which transformed + feature(s). + Notes ----- The order of the columns in the transformed feature matrix follows the @@ -408,6 +414,28 @@ def _validate_output(self, result): "The output of the '{0}' transformer should be 2D (scipy " "matrix, array, or pandas DataFrame).".format(name)) + def _record_output_indices(self, Xs): + """ + Record which transformer produced which column. + """ + idx = 0 + self.output_indices_ = {} + + for transformer_idx, (name, _, _, _) in enumerate( + self._iter(fitted=True, replace_strings=True) + ): + n_columns = Xs[transformer_idx].shape[1] + self.output_indices_[name] = slice(idx, idx + n_columns) + idx += n_columns + + # `_iter` only generates transformers that have a non empty + # selection. 
Here we set empty slices for transformers that + # generate no output, which are safe for indexing + all_names = [t[0] for t in self.transformers] + ['remainder'] + for name in all_names: + if name not in self.output_indices_: + self.output_indices_[name] = slice(0, 0) + def _log_message(self, name, idx, total): if not self.verbose: return None @@ -518,6 +546,7 @@ def fit_transform(self, X, y=None): self._update_fitted_transformers(transformers) self._validate_output(Xs) + self._record_output_indices(Xs) return self._hstack(list(Xs)) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index ae2e25b68210f..f7c1874d4a1b7 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -225,7 +225,7 @@ def test_column_transformer_dataframe(): assert len(both.transformers_) == 1 assert both.transformers_[-1][0] != 'remainder' - # ensure pandas object is passes through + # ensure pandas object is passed through class TransAssert(BaseEstimator): @@ -310,6 +310,92 @@ def test_column_transformer_empty_columns(pandas, column_selection, assert isinstance(ct.transformers_[0][1], TransRaise) +def test_column_transformer_output_indices(): + # Checks for the output_indices_ attribute + X_array = np.arange(6).reshape(3, 2) + + ct = ColumnTransformer([('trans1', Trans(), [0]), + ('trans2', Trans(), [1])]) + X_trans = ct.fit_transform(X_array) + assert ct.output_indices_ == {'trans1': slice(0, 1), + 'trans2': slice(1, 2), + 'remainder': slice(0, 0)} + assert_array_equal(X_trans[:, [0]], + X_trans[:, ct.output_indices_['trans1']]) + assert_array_equal(X_trans[:, [1]], + X_trans[:, ct.output_indices_['trans2']]) + + # test with transformer_weights and multiple columns + ct = ColumnTransformer([('trans', Trans(), [0, 1])], + transformer_weights={'trans': .1}) + X_trans = ct.fit_transform(X_array) + assert ct.output_indices_ == {'trans': slice(0, 2), + 'remainder': slice(0, 0)} + assert_array_equal(X_trans[:, [0, 1]], + X_trans[:, ct.output_indices_['trans']]) + assert_array_equal(X_trans[:, []], + X_trans[:, ct.output_indices_['remainder']]) + + # test case that ensures that the attribute does also work when + # a given transformer doesn't have any columns to work on + ct = ColumnTransformer([('trans1', Trans(), [0, 1]), + ('trans2', TransRaise(), [])]) + X_trans = ct.fit_transform(X_array) + assert ct.output_indices_ == {'trans1': slice(0, 2), + 'trans2': slice(0, 0), + 'remainder': slice(0, 0)} + assert_array_equal(X_trans[:, [0, 1]], + X_trans[:, ct.output_indices_['trans1']]) + assert_array_equal(X_trans[:, []], + X_trans[:, ct.output_indices_['trans2']]) + assert_array_equal(X_trans[:, []], + X_trans[:, ct.output_indices_['remainder']]) + + ct = ColumnTransformer([('trans', TransRaise(), [])], + remainder='passthrough') + X_trans = ct.fit_transform(X_array) + assert ct.output_indices_ == {'trans': slice(0, 0), + 'remainder': slice(0, 2)} + assert_array_equal(X_trans[:, []], + X_trans[:, ct.output_indices_['trans']]) + assert_array_equal(X_trans[:, [0, 1]], + X_trans[:, ct.output_indices_['remainder']]) + + +def test_column_transformer_output_indices_df(): + # Checks for the output_indices_ attribute with data frames + pd = pytest.importorskip('pandas') + + X_df = pd.DataFrame(np.arange(6).reshape(3, 2), + columns=['first', 'second']) + + ct = ColumnTransformer([('trans1', Trans(), ['first']), + ('trans2', Trans(), ['second'])]) + X_trans = ct.fit_transform(X_df) + assert ct.output_indices_ 
== {'trans1': slice(0, 1), + 'trans2': slice(1, 2), + 'remainder': slice(0, 0)} + assert_array_equal(X_trans[:, [0]], + X_trans[:, ct.output_indices_['trans1']]) + assert_array_equal(X_trans[:, [1]], + X_trans[:, ct.output_indices_['trans2']]) + assert_array_equal(X_trans[:, []], + X_trans[:, ct.output_indices_['remainder']]) + + ct = ColumnTransformer([('trans1', Trans(), [0]), + ('trans2', Trans(), [1])]) + X_trans = ct.fit_transform(X_df) + assert ct.output_indices_ == {'trans1': slice(0, 1), + 'trans2': slice(1, 2), + 'remainder': slice(0, 0)} + assert_array_equal(X_trans[:, [0]], + X_trans[:, ct.output_indices_['trans1']]) + assert_array_equal(X_trans[:, [1]], + X_trans[:, ct.output_indices_['trans2']]) + assert_array_equal(X_trans[:, []], + X_trans[:, ct.output_indices_['remainder']]) + + def test_column_transformer_sparse_array(): X_sparse = sparse.eye(3, 2).tocsr() From f0576399d9cfb41c1f3cd4a0a2332578b1c0b573 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 2 Apr 2021 18:00:45 -0400 Subject: [PATCH 287/478] DOC Adds version added to output_indices_ in ColumnTransformer (#19815) --- sklearn/compose/_column_transformer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index da4a2dd93507c..5006663331a40 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -140,6 +140,8 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): inspect which transformer is responsible for which transformed feature(s). + .. versionadded:: 1.0 + Notes ----- The order of the columns in the transformed feature matrix follows the From 26b6f60cd40f682570a80a02eb6484c69de88354 Mon Sep 17 00:00:00 2001 From: Christopher Yeh Date: Sat, 3 Apr 2021 20:22:51 -0600 Subject: [PATCH 288/478] DOC Use the canonical Wikipedia link (#19819) --- sklearn/neighbors/_unsupervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index 822a30f503bd2..a6af48d9ed341 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -108,7 +108,7 @@ class NearestNeighbors(KNeighborsMixin, See :ref:`Nearest Neighbors ` in the online documentation for a discussion of the choice of ``algorithm`` and ``leaf_size``. - https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm + https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm """ @_deprecate_positional_args From f47926999d35686ff2190c3940c82d7cc7f3e691 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Sun, 4 Apr 2021 21:53:44 +0200 Subject: [PATCH 289/478] DOC Fix order of whatsnew entries (#19822) --- doc/whats_new/v1.0.rst | 57 ++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 33 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 9eb49b0139a6b..1dd809a94240c 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -76,15 +76,15 @@ Changelog - For :class:`tree.ExtraTreeRegressor`, `criterion="mse"` is deprecated, use `"squared_error"` instead which is now the default. -:mod:`sklearn.cluster` -...................... +:mod:`sklearn.calibration` +.......................... -:mod:`sklearn.preprocessing` -............................ +- |Fix| The predict and predict_proba methods of + :class:`calibration.CalibratedClassifierCV` can now properly be used on + prefitted pipelines. 
:pr:`19641` by :user:`Alek Lefebvre `. -- |Feature| :class:`preprocessing.OneHotEncoder` now supports - `handle_unknown='ignore'` and dropping categories. :pr:`19041` by - `Thomas Fan`_. +:mod:`sklearn.cluster` +...................... - |Efficiency| The "k-means++" initialization of :class:`cluster.KMeans` and :class:`cluster.MiniBatchKMeans` is now faster, especially in multicore @@ -98,6 +98,13 @@ Changelog - |API| :class:`cluster.Birch` attributes, `fit_` and `partial_fit_`, are deprecated and will be removed in 1.2. :pr:`19297` by `Thomas Fan`_. +:mod:`sklearn.compose` +...................... + +- |Enhancement| :class:`compose.ColumnTransformer` now records the output + of each transformer in `output_indices_`. :pr:`18393` by + :user:`Luca Bittarello `. + :mod:`sklearn.datasets` ....................... @@ -109,13 +116,6 @@ Changelog - |Enhancement| :func:`datasets.fetch_kddcup99` raises a better message when the cached file is invalid. :pr:`19669` `Thomas Fan`_. -:mod:`sklearn.compose` -...................... - -- |Enhancement| :class:`compose.ColumnTransformer` now records the output - of each transformer in `output_indices_`. :pr:`18393` by - :user:`Luca Bittarello `. - :mod:`sklearn.decomposition` ............................ @@ -169,7 +169,7 @@ Changelog - |Feature| The new :class:`linear_model.SGDOneClassSVM` provides an SGD implementation of the linear One-Class SVM. Combined with kernel approximation techniques, this implementation approximates the solution of - a kernelized One Class SVM while benefitting from a linear + a kernelized One Class SVM while benefitting from a linear complexity in the number of samples. :pr:`10027` by :user:`Albert Thomas `. @@ -188,12 +188,6 @@ Changelog not corresponding to their objective. :pr:`19172` by :user:`Mathurin Massias ` -:mod:`sklearn.preprocessing` -............................ - -- |Feature| :class:`preprocessing.OrdinalEncoder` supports passing through - missing values by default. :pr:`19069` by `Thomas Fan`_. - - |API|: The parameter ``normalize`` of :class:`linear_model.LinearRegression` is deprecated and will be removed in 1.2. Motivation for this deprecation: ``normalize`` parameter did not take any @@ -284,6 +278,9 @@ Changelog splines via the ``extrapolation`` argument. :pr:`19483` by :user:`Malte Londschien `. +- |Feature| :class:`preprocessing.OrdinalEncoder` supports passing through + missing values by default. :pr:`19069` by `Thomas Fan`_. + - |Fix| :func:`preprocessing.scale`, :class:`preprocessing.StandardScaler` and similar scalers detect near-constant features to avoid scaling them to very large values. This problem happens in particular when using a scaler on @@ -294,6 +291,10 @@ Changelog - |Fix| :meth:`preprocessing.StandardScaler.inverse_transform` now correctly handles integer dtypes. :pr:`19356` by :user:`makoeppel`. +- |Feature| :class:`preprocessing.OneHotEncoder` now supports + `handle_unknown='ignore'` and dropping categories. :pr:`19041` by + `Thomas Fan`_. + :mod:`sklearn.tree` ................... @@ -304,22 +305,12 @@ Changelog :mod:`sklearn.utils` .................... -- |Enhancement| Deprecated the default value of the `random_state=0` in +- |Enhancement| Deprecated the default value of the `random_state=0` in :func:`~sklearn.utils.extmath.randomized_svd`. Starting in 1.2, the default value of `random_state` will be set to `None`. - :pr:`19459` by :user:`Cindy Bezuidenhout ` and + :pr:`19459` by :user:`Cindy Bezuidenhout ` and :user:`Clifford Akai-Nettey`. 
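To make the `output_indices_` entry above concrete, a minimal sketch (the transformers and toy data are illustrative assumptions, not part of the patch): after fitting, each transformer name maps to a slice into the columns of the transformed output, and transformers that produce no columns get an empty slice.

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler

X = np.arange(12, dtype=float).reshape(4, 3)
ct = ColumnTransformer([
    ("scale", StandardScaler(), [0, 1]),   # produces 2 output columns
    ("minmax", MinMaxScaler(), [2]),       # produces 1 output column
])
Xt = ct.fit_transform(X)

print(ct.output_indices_)
# {'scale': slice(0, 2, None), 'minmax': slice(2, 3, None), 'remainder': slice(0, 0, None)}

# The recorded slice recovers the columns a given transformer produced:
print(Xt[:, ct.output_indices_["minmax"]])

This mirrors the checks added in test_column_transformer_output_indices earlier in this series, where a transformer with an empty column selection is recorded as slice(0, 0).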
-:mod:`sklearn.calibration` -.......................... - -- |Fix| The predict and predict_proba methods of - :class:`calibration.CalibratedClassifierCV` can now properly be used on - prefitted pipelines. :pr:`19641` by :user:`Alek Lefebvre ` - -:mod:`sklearn.utils` -.................... - - |Fix| Fixed a bug in :func:`utils.sparsefuncs.mean_variance_axis` where the precision of the computed variance was very poor when the real variance is exactly zero. :pr:`19766` by :user:`Jérémie du Boisberranger `. From 141123270a39c52a60e98017ca52795215dc2ce1 Mon Sep 17 00:00:00 2001 From: Flynn Date: Mon, 5 Apr 2021 01:24:05 -0400 Subject: [PATCH 290/478] API Adds predict_params for Pipeline proba delegates (#19790) --- doc/whats_new/v1.0.rst | 8 ++++++++ sklearn/pipeline.py | 18 ++++++++++++++---- sklearn/tests/test_pipeline.py | 20 ++++++++++++++++---- 3 files changed, 38 insertions(+), 8 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 1dd809a94240c..4ccb4dd14b6a4 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -266,6 +266,14 @@ Changelog Use ``var_`` instead. :pr:`18842` by :user:`Hong Shao Yang `. +:mod:`sklearn.pipeline` +....................... + +- |API| The `predict_proba` and `predict_log_proba` methods of the + :class:`Pipeline` class now support passing prediction kwargs to + the final estimator. + :pr:`19790` by :user:`Christopher Flynn `. + :mod:`sklearn.preprocessing` ............................ diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index f466b735c4fa6..1c9a62d02b7d0 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -456,7 +456,7 @@ def fit_predict(self, X, y=None, **fit_params): return y_pred @if_delegate_has_method(delegate='_final_estimator') - def predict_proba(self, X): + def predict_proba(self, X, **predict_proba_params): """Apply transforms, and predict_proba of the final estimator Parameters @@ -465,6 +465,10 @@ def predict_proba(self, X): Data to predict on. Must fulfill input requirements of first step of the pipeline. + **predict_proba_params : dict of string -> object + Parameters to the ``predict_proba`` called at the end of all + transformations in the pipeline. + Returns ------- y_proba : array-like of shape (n_samples, n_classes) @@ -472,7 +476,7 @@ def predict_proba(self, X): Xt = X for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt) - return self.steps[-1][-1].predict_proba(Xt) + return self.steps[-1][-1].predict_proba(Xt, **predict_proba_params) @if_delegate_has_method(delegate='_final_estimator') def decision_function(self, X): @@ -513,7 +517,7 @@ def score_samples(self, X): return self.steps[-1][-1].score_samples(Xt) @if_delegate_has_method(delegate='_final_estimator') - def predict_log_proba(self, X): + def predict_log_proba(self, X, **predict_log_proba_params): """Apply transforms, and predict_log_proba of the final estimator Parameters @@ -522,6 +526,10 @@ def predict_log_proba(self, X): Data to predict on. Must fulfill input requirements of first step of the pipeline. + **predict_log_proba_params : dict of string -> object + Parameters to the ``predict_log_proba`` called at the end of all + transformations in the pipeline. 
+ Returns ------- y_score : array-like of shape (n_samples, n_classes) @@ -529,7 +537,9 @@ def predict_log_proba(self, X): Xt = X for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt) - return self.steps[-1][-1].predict_log_proba(Xt) + return self.steps[-1][-1].predict_log_proba( + Xt, **predict_log_proba_params + ) @property def transform(self): diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 93f19cdb8a93f..2ed5e37444bfc 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -159,6 +159,14 @@ def predict(self, X, got_attribute=False): self.got_attribute = got_attribute return self + def predict_proba(self, X, got_attribute=False): + self.got_attribute = got_attribute + return self + + def predict_log_proba(self, X, got_attribute=False): + self.got_attribute = got_attribute + return self + def test_pipeline_init(): # Test the various init parameters of the pipeline. @@ -448,12 +456,16 @@ def test_fit_predict_with_intermediate_fit_params(): assert 'should_succeed' not in pipe.named_steps['transf'].fit_params -def test_predict_with_predict_params(): - # tests that Pipeline passes predict_params to the final estimator - # when predict is invoked +@pytest.mark.parametrize("method_name", [ + "predict", "predict_proba", "predict_log_proba" +]) +def test_predict_methods_with_predict_params(method_name): + # tests that Pipeline passes predict_* to the final estimator + # when predict_* is invoked pipe = Pipeline([('transf', Transf()), ('clf', DummyEstimatorParams())]) pipe.fit(None, None) - pipe.predict(X=None, got_attribute=True) + method = getattr(pipe, method_name) + method(X=None, got_attribute=True) assert pipe.named_steps['clf'].got_attribute From c957eb37b5988e6e2a4692c1356e8689294404c5 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 5 Apr 2021 18:28:44 +0200 Subject: [PATCH 291/478] FIX Ignore zero sample weights in precision recall curve (#18328) Co-authored-by: Alonso Silva Allende --- doc/whats_new/v1.0.rst | 12 ++- sklearn/metrics/_ranking.py | 19 ++-- sklearn/metrics/tests/test_ranking.py | 121 ++++++++++++++------------ sklearn/utils/validation.py | 3 +- 4 files changed, 89 insertions(+), 66 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 4ccb4dd14b6a4..ce683958d913f 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -241,6 +241,12 @@ Changelog are integral. :pr:`9843` by :user:`Jon Crall `. +- |Fix| Samples with zero `sample_weight` values do not affect the results + from :func:`metrics.det_curve`, :func:`metrics.precision_recall_curve` + and :func:`metrics.roc_curve`. + :pr:`18328` by :user:`Albert Villanova del Moral ` and + :user:`Alonso Silva Allende `. + :mod:`sklearn.model_selection` .............................. @@ -319,9 +325,9 @@ Changelog :pr:`19459` by :user:`Cindy Bezuidenhout ` and :user:`Clifford Akai-Nettey`. - - |Fix| Fixed a bug in :func:`utils.sparsefuncs.mean_variance_axis` where the - precision of the computed variance was very poor when the real variance is - exactly zero. :pr:`19766` by :user:`Jérémie du Boisberranger `. +- |Fix| Fixed a bug in :func:`utils.sparsefuncs.mean_variance_axis` where the + precision of the computed variance was very poor when the real variance is + exactly zero. :pr:`19766` by :user:`Jérémie du Boisberranger `. 
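To make the zero-weight entry above concrete, a minimal sketch (the toy labels and scores are illustrative assumptions): with the fix, giving a sample a weight of exactly zero yields the same curve as dropping that sample altogether.

import numpy as np
from sklearn.metrics import roc_curve

y_true = np.array([0, 0, 1, 1, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8, 0.9])
weights = np.array([1.0, 1.0, 1.0, 1.0, 0.0])  # the last sample should not count

fpr_w, tpr_w, thr_w = roc_curve(y_true, y_score, sample_weight=weights)
fpr_d, tpr_d, thr_d = roc_curve(y_true[:-1], y_score[:-1])  # same data, sample dropped

assert np.allclose(fpr_w, fpr_d)
assert np.allclose(tpr_w, tpr_d)
assert np.allclose(thr_w, thr_d)

The same holds for precision_recall_curve and det_curve, since all three go through _binary_clf_curve, where the patch filters out zero-weighted samples before computing the cumulative counts.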
Code and Documentation Contributors ----------------------------------- diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index 0364fbba52f63..f1627e84fbcfe 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -27,6 +27,7 @@ from ..utils import assert_all_finite from ..utils import check_consistent_length +from ..utils.validation import _check_sample_weight from ..utils import column_or_1d, check_array from ..utils.multiclass import type_of_target from ..utils.extmath import stable_cumsum @@ -291,14 +292,14 @@ def det_curve(y_true, y_score, pos_label=None, sample_weight=None): >>> thresholds array([0.35, 0.4 , 0.8 ]) """ - if len(np.unique(y_true)) != 2: - raise ValueError("Only one class present in y_true. Detection error " - "tradeoff curve is not defined in that case.") - fps, tps, thresholds = _binary_clf_curve( y_true, y_score, pos_label=pos_label, sample_weight=sample_weight ) + if len(np.unique(y_true)) != 2: + raise ValueError("Only one class present in y_true. Detection error " + "tradeoff curve is not defined in that case.") + fns = tps[-1] - tps p_count = tps[-1] n_count = fps[-1] @@ -696,8 +697,14 @@ def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): assert_all_finite(y_true) assert_all_finite(y_score) + # Filter out zero-weighted samples, as they should not impact the result if sample_weight is not None: sample_weight = column_or_1d(sample_weight) + sample_weight = _check_sample_weight(sample_weight, y_true) + nonzero_weight_mask = sample_weight != 0 + y_true = y_true[nonzero_weight_mask] + y_score = y_score[nonzero_weight_mask] + sample_weight = sample_weight[nonzero_weight_mask] pos_label = _check_pos_label_consistency(pos_label, y_true) @@ -759,7 +766,9 @@ def precision_recall_curve(y_true, probas_pred, *, pos_label=None, pos_label should be explicitly given. probas_pred : ndarray of shape (n_samples,) - Estimated probabilities or output of a decision function. + Target scores, can either be probability estimates of the positive + class, or non-thresholded measure of decisions (as returned by + `decision_function` on some classifiers). pos_label : int or str, default=None The label of the positive class. diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index fd32e2cc0b860..c37ff34feddec 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -41,6 +41,13 @@ ############################################################################### # Utilities for testing +CURVE_FUNCS = [ + det_curve, + precision_recall_curve, + roc_curve, +] + + def make_prediction(dataset=None, binary=False): """Make some classification predictions on a toy dataset using a SVC @@ -73,16 +80,16 @@ def make_prediction(dataset=None, binary=False): # run classifier, get class probabilities and label predictions clf = svm.SVC(kernel='linear', probability=True, random_state=0) - probas_pred = clf.fit(X[:half], y[:half]).predict_proba(X[half:]) + y_score = clf.fit(X[:half], y[:half]).predict_proba(X[half:]) if binary: # only interested in probabilities of the positive case # XXX: do we really want a special API for the binary case? 
- probas_pred = probas_pred[:, 1] + y_score = y_score[:, 1] y_pred = clf.predict(X[half:]) y_true = y[half:] - return y_true, y_pred, probas_pred + return y_true, y_pred, y_score ############################################################################### @@ -183,14 +190,14 @@ def _partial_roc(y_true, y_predict, max_fpr): @pytest.mark.parametrize('drop', [True, False]) def test_roc_curve(drop): # Test Area under Receiver Operating Characteristic (ROC) curve - y_true, _, probas_pred = make_prediction(binary=True) - expected_auc = _auc(y_true, probas_pred) + y_true, _, y_score = make_prediction(binary=True) + expected_auc = _auc(y_true, y_score) - fpr, tpr, thresholds = roc_curve(y_true, probas_pred, + fpr, tpr, thresholds = roc_curve(y_true, y_score, drop_intermediate=drop) roc_auc = auc(fpr, tpr) assert_array_almost_equal(roc_auc, expected_auc, decimal=2) - assert_almost_equal(roc_auc, roc_auc_score(y_true, probas_pred)) + assert_almost_equal(roc_auc, roc_auc_score(y_true, y_score)) assert fpr.shape == tpr.shape assert fpr.shape == thresholds.shape @@ -211,13 +218,13 @@ def test_roc_curve_end_points(): def test_roc_returns_consistency(): # Test whether the returned threshold matches up with tpr # make small toy dataset - y_true, _, probas_pred = make_prediction(binary=True) - fpr, tpr, thresholds = roc_curve(y_true, probas_pred) + y_true, _, y_score = make_prediction(binary=True) + fpr, tpr, thresholds = roc_curve(y_true, y_score) # use the given thresholds to determine the tpr tpr_correct = [] for t in thresholds: - tp = np.sum((probas_pred >= t) & y_true) + tp = np.sum((y_score >= t) & y_true) p = np.sum(y_true) tpr_correct.append(1.0 * tp / p) @@ -229,17 +236,17 @@ def test_roc_returns_consistency(): def test_roc_curve_multi(): # roc_curve not applicable for multi-class problems - y_true, _, probas_pred = make_prediction(binary=False) + y_true, _, y_score = make_prediction(binary=False) with pytest.raises(ValueError): - roc_curve(y_true, probas_pred) + roc_curve(y_true, y_score) def test_roc_curve_confidence(): # roc_curve for confidence scores - y_true, _, probas_pred = make_prediction(binary=True) + y_true, _, y_score = make_prediction(binary=True) - fpr, tpr, thresholds = roc_curve(y_true, probas_pred - 0.5) + fpr, tpr, thresholds = roc_curve(y_true, y_score - 0.5) roc_auc = auc(fpr, tpr) assert_array_almost_equal(roc_auc, 0.90, decimal=2) assert fpr.shape == tpr.shape @@ -248,7 +255,7 @@ def test_roc_curve_confidence(): def test_roc_curve_hard(): # roc_curve for hard decisions - y_true, pred, probas_pred = make_prediction(binary=True) + y_true, pred, y_score = make_prediction(binary=True) # always predict one trivial_pred = np.ones(y_true.shape) @@ -668,23 +675,17 @@ def test_auc_score_non_binary_class(): roc_auc_score(y_true, y_pred) -def test_binary_clf_curve_multiclass_error(): +@pytest.mark.parametrize("curve_func", CURVE_FUNCS) +def test_binary_clf_curve_multiclass_error(curve_func): rng = check_random_state(404) y_true = rng.randint(0, 3, size=10) y_pred = rng.rand(10) msg = "multiclass format is not supported" - with pytest.raises(ValueError, match=msg): - precision_recall_curve(y_true, y_pred) - - with pytest.raises(ValueError, match=msg): - roc_curve(y_true, y_pred) + curve_func(y_true, y_pred) -@pytest.mark.parametrize("curve_func", [ - precision_recall_curve, - roc_curve, -]) +@pytest.mark.parametrize("curve_func", CURVE_FUNCS) def test_binary_clf_curve_implicit_pos_label(curve_func): # Check that using string class labels raises an informative # error for any 
supported string dtype: @@ -693,10 +694,10 @@ def test_binary_clf_curve_implicit_pos_label(curve_func): "value in {0, 1} or {-1, 1} or pass pos_label " "explicitly.") with pytest.raises(ValueError, match=msg): - roc_curve(np.array(["a", "b"], dtype=' Date: Mon, 5 Apr 2021 16:19:41 -0600 Subject: [PATCH 292/478] MNT Improve Nearest Neighbor documentation + code consistency (#19793) --- sklearn/neighbors/_base.py | 59 +++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index eb14e8ef0a900..9a222762ec615 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -444,20 +444,19 @@ def _fit(self, X, y=None): self.n_samples_fit_ = X.data.shape[0] return self - if self.effective_metric_ == 'precomputed': + if self.metric == 'precomputed': X = _check_precomputed(X) + # Precomputed matrix X must be squared + if X.shape[0] != X.shape[1]: + raise ValueError("Precomputed matrix must be square." + " Input is a {}x{} matrix." + .format(X.shape[0], X.shape[1])) self.n_features_in_ = X.shape[1] n_samples = X.shape[0] if n_samples == 0: raise ValueError("n_samples must be greater than 0") - # Precomputed matrix X must be squared - if self.metric == 'precomputed' and X.shape[0] != X.shape[1]: - raise ValueError("Precomputed matrix must be a square matrix." - " Input is a {}x{} matrix." - .format(X.shape[0], X.shape[1])) - if issparse(X): if self.algorithm not in ('auto', 'brute'): warnings.warn("cannot use tree with sparse input: " @@ -514,14 +513,12 @@ def _fit(self, X, y=None): if self.n_neighbors <= 0: raise ValueError( "Expected n_neighbors > 0. Got %d" % - self.n_neighbors - ) - else: - if not isinstance(self.n_neighbors, numbers.Integral): - raise TypeError( - "n_neighbors does not take %s value, " - "enter integer value" % - type(self.n_neighbors)) + self.n_neighbors) + elif not isinstance(self.n_neighbors, numbers.Integral): + raise TypeError( + "n_neighbors does not take %s value, " + "enter integer value" % + type(self.n_neighbors)) return self @@ -654,18 +651,16 @@ class from an array representing our data set and ask who's elif n_neighbors <= 0: raise ValueError( "Expected n_neighbors > 0. Got %d" % - n_neighbors - ) - else: - if not isinstance(n_neighbors, numbers.Integral): - raise TypeError( - "n_neighbors does not take %s value, " - "enter integer value" % - type(n_neighbors)) + n_neighbors) + elif not isinstance(n_neighbors, numbers.Integral): + raise TypeError( + "n_neighbors does not take %s value, " + "enter integer value" % + type(n_neighbors)) if X is not None: query_is_train = False - if self.effective_metric_ == 'precomputed': + if self.metric == 'precomputed': X = _check_precomputed(X) else: X = self._validate_data(X, accept_sparse='csr', reset=False) @@ -687,7 +682,7 @@ class from an array representing our data set and ask who's n_jobs = effective_n_jobs(self.n_jobs) chunked_results = None if (self._fit_method == 'brute' and - self.effective_metric_ == 'precomputed' and issparse(X)): + self.metric == 'precomputed' and issparse(X)): results = _kneighbors_from_graph( X, n_neighbors=n_neighbors, return_distance=return_distance) @@ -793,8 +788,8 @@ def kneighbors_graph(self, X=None, n_neighbors=None, Returns ------- A : sparse-matrix of shape (n_queries, n_samples_fit) - `n_samples_fit` is the number of samples in the fitted data - `A[i, j]` is assigned the weight of edge that connects `i` to `j`. + `n_samples_fit` is the number of samples in the fitted data. 
+ `A[i, j]` gives the weight of the edge connecting `i` to `j`. The matrix is of CSR format. Examples @@ -980,7 +975,7 @@ class from an array representing our data set and ask who's if X is not None: query_is_train = False - if self.effective_metric_ == 'precomputed': + if self.metric == 'precomputed': X = _check_precomputed(X) else: X = self._validate_data(X, accept_sparse='csr', reset=False) @@ -992,7 +987,7 @@ class from an array representing our data set and ask who's radius = self.radius if (self._fit_method == 'brute' and - self.effective_metric_ == 'precomputed' and issparse(X)): + self.metric == 'precomputed' and issparse(X)): results = _radius_neighbors_from_graph( X, radius=radius, return_distance=return_distance) @@ -1116,9 +1111,9 @@ def radius_neighbors_graph(self, X=None, radius=None, mode='connectivity', Returns ------- A : sparse-matrix of shape (n_queries, n_samples_fit) - `n_samples_fit` is the number of samples in the fitted data - `A[i, j]` is assigned the weight of edge that connects `i` to `j`. - The matrix if of format CSR. + `n_samples_fit` is the number of samples in the fitted data. + `A[i, j]` gives the weight of the edge connecting `i` to `j`. + The matrix is of CSR format. Examples -------- From 2b505bf019fc18393a3791a953360cc94679c5ec Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 6 Apr 2021 15:39:57 +0200 Subject: [PATCH 293/478] Add APHP to the Consortium sponsors (#19823) --- doc/about.rst | 6 ++++++ doc/images/logo_APHP.png | Bin 0 -> 16452 bytes doc/images/logo_APHP_text.png | Bin 0 -> 30396 bytes doc/templates/index.html | 1 + 4 files changed, 7 insertions(+) create mode 100644 doc/images/logo_APHP.png create mode 100644 doc/images/logo_APHP_text.png diff --git a/doc/about.rst b/doc/about.rst index fdfe8241b8aec..6b389d47d791b 100644 --- a/doc/about.rst +++ b/doc/about.rst @@ -160,6 +160,10 @@ Grisel, Guillaume Lemaitre, Jérémie du Boisberranger and Chiara Marmo. :width: 70pt :target: https://www.dataiku.com/ +.. |aphp| image:: images/logo_APHP_text.png + :width: 150pt + :target: https://aphp.fr/ + .. |inria| image:: images/inria-logo.jpg :width: 100pt :target: https://www.inria.fr @@ -185,6 +189,8 @@ Grisel, Guillaume Lemaitre, Jérémie du Boisberranger and Chiara Marmo. +---------+----------+ | |dataiku| | +---------+----------+ + | |aphp| | + +---------+----------+ | | +---------+----------+ | |inria| | diff --git a/doc/images/logo_APHP.png b/doc/images/logo_APHP.png new file mode 100644 index 0000000000000000000000000000000000000000..99813a042b1d4ee2a0d60e4bcf35863a689acb37 GIT binary patch literal 16452 zcmeHtWmud|((d5y?ry;bhr!+52?U$L-Q5Z9P6+M<0s(@<5Fki!4-Nr>1PLB2hva>C z_ucdD`M$H)bZ*R|r=J)NHF*qFQd9r{fT5@$qxJMR?dg+=jQI4u z#N1vE0FaRT=@@uunR`>axH((dIzXsBd|e>a5FcAB0KjLdHrLvVT*yE2rzOD)ST_UA zEupp0tdpesPz5_@uqu7o|$qKyL1l|$HvoXoM>%p`5%JKw<9KO;he<$5e0M7}u(6BW1MUZbp? 
zY=rH!U_FvZOp04eeO2x3!9!}-IPsrwrdlhPkqC-SdjLK==VPt-nU`_xZJLUEcHXub zKG8dR@>V?Pcsgssl!#ZLpX!)=_wKcLU{EHjX~p@~NuKVQ&|Qv6g~;>Kw~Ae=JKbNP zl0$u>?C07ySw5!6>R&2ItG!dtAQcq%Dhqi`?1Qo`#@>TRcc4{p^pvxy&t*_5bD*B| z_pu2x4ByyW*TJmG5484TD$`^)vpq=zt9S4HJNBOmyhx<$`eLD0W_EEX{{4Iz?R812 z_xQYiUi;ahZUnk?&Wv*+EzNP~oMt;r(A|&Qf-}+~vv%`3mHOrH#uoI~)$DVctudJ7 z**<}OV;Pt20Gh3}nBGvKy9MRRS%2&sSKeYjP$O_K^bnUN(STF6yN<_% z239Qt&n5=V2xAmzqNMDM_O_Lvau4YSRYmzyu`FeIO4U_m`EZL_ZsQDS_kJi0nL2JP zWtn<@7<}_O?(EIEOLorP2vDx(6{VWm=CwB4uOqJ;g^@1A?vd#ddC!T-CWHB?23RS(zkIj%Fjb^Np!z7MCiB96TW)ZX{HgRA!I_@hBWk|6yXh9#9F(YTL~TED_+g+ zz-$%svKNQMCE7sxSDI8ATr_WJbhmmM=^`5rIV3DMX&og49!d!cY@NOgh5u}!`O#PN zW0i>RE1#7D1Rk{!u$nN1Q$p>Ynn?}nJxZbqLNH9>)I+VMPF9ahkSg!nFVa8t;6Sk*6dP81&i zNH{{(PLxWJOtFQ}!uakYH~srX+9{tQR=Tn67Gr|B_OIMd0-JlkJ8H)|-H3%1dc|~1 z=MP`Cs;(aM0^jH1w?1lCs4CInyQzyy&Q&GBsvo;ntJmBq&)DEhm{-)vu;1kFZ4N~&l(t(GlNz;LQ&jd91m#E;aeNrBXG`c#lhxp0qb9yFc?=P4_|8#f%+y>i4`g-bF`$6Z$btk)Z`G zye2{p=6!R898R9d!6pFFXp48Cpqr?r+)@r8Gj~0}W9Jez^X2_ug=Ec&5%p5_mlwhNECXstFxC75D8b-+?b55zm)# zn#$Y^+(BeJ#Bn;o2B@2v>k<7dk9~m-6z82K3sC%d3HSAxglv=yX>#T6lBMcdvX^(B zq<9{Ah~eoBY@t8NUUBl`<_L+X##h-dNTtd%E7(>t85Us;0hFh^uKAiR&?y)x*n2$M zM6;QfLbGB5$vazN3QCl<^6RvYxld&!{1puyStd7+RT}?sjsJTCyefC$MJ~%t;L9lf z3a!D-fjX~U$^;7%bSD$3k@U@Pk=?7|YPv1UDylxhDwU*6KafBb(x7T7WYmEKFA_Yd z?<5)xzSZ6OJH#VYV6#jEs$$p-U5+Mht_%N*{_pq^uY~&y7L(l4nat+o#95+t@iCMF zCm9~lw>cCgr(kzJPriVgwoeWbb`{VVsuQuGZ2Pd+T^lB-5>Sq&h%-p0yT?eWw2j7%C+M$|DqJYZQ<_|UNU$2e zO&n8-Vo)e@YD0s)>s5!d|FJW`8@uxm-jZ+9);kUKI@JY{Ju|QquIszCm{CY#j9qnY z7(Q8NYH+g4>EvUqMtH@eO?nz*`dGS&_BZ}))MQvAv-7DG5*1?nmAsf~rq6KWvf+RO znV-forA4W7N!1vxum~=Sb?5X-Uf24>TmA)pcGxeE2*1DuhNpMb<%>d5u3PuIio}gH zr}PV7;uf;v9YOFU(imQwmGRGkPDzQ;7Z^_?cGWx%X)=9lj#RLb%iHc@5{L@K!kJ(5 ze7Ot3eg^gJBIn3{!*i=$FWXll>IP_};ZaE=*ezol!5)SK!y`wNC;CnPOqVW2;?$cO zs_B#1BN0wvui`As#L;^f0+sUfeb%hU2Pe;6ipV$%v-C#hds{~D&TO(-=6DO=IH@xM zm)VzAaKx*A*|J2Gv=7Z2nGDkJO#*w+KD>o-gnGFWx8W$fa0hlmWAtK%E6;KeHxivw zUN*6C5F@i4T@Wd;A|mZPBSYri=#N~P|L`Sk4_gjqGCIl-hhoD!ufFZA@RA7=?6aLv z;>jY)(9auVK`cLKVV(G@HykjqeB0CfmLiM=*xTZgGvet?jUpTH>g7(qN}BQXqP@;H zV~4q@eJ6eO(D9zzRM>@yp8TCptwNCcOPxyW_(dGpR#*GT1L_^VlDIv**j}MD#%LIE zU6HHp_--$93Ch+8_weo1QlS-mqvCrTb2${GB6M|)?>|O zX&C0s?fwTzFJYpgpxAf&>{`wFsF{f`x8073!Tv8YimJQXm}N9%YtTN0mY3v3pP9gi zDFoo5Ovd9WTHj$r1-=+tChL;>yijhcQ)uF^WqN_bRs7f-pUlun3VNMG!%2qT2Etb} z%|R1`jE zKFQA;&y9+vDka+<^g&8i(=bTIhM2^_W#JOlU_U_!Ndt|DLMxL*Wf_+^Rfj;NPby>^ zdXa_^`8_LEFrDKWHKXy1Ze#08SRcKxaZ5>npU#x_6|$%cwZcHL%J62rli^6#L3=!W zroJZbig=V{R*|h{5WY)c=`yBWzb6}ygIXd?gA~h>D6U)J#(nvAGlGI{Q*xqQHG=Bm zVDOAY6%FwoeYGc-3>*6j4k9&r4D2gpm`>bUl^>ON=w@%`=tT;cbQz9Uc&-qE18J1D zQXR{Zh(d||I$Xq5Q!h8@!(!$?58LyN01K)%>)_q{j8Kpz+|V5mK=JR9BusRp@F?0> zDGf*}WsIodh6Jc|hX$cY{6r`#5<;HjG30DpB*5_5XjG}MKy~`LWHmSlv2HMcIvi8I z=M~i`4hUfck+CSKh_a&H6EQiC@a55Ws^>~XoqeB*Y(*FNq?DS;fX_8nmZjm)a@ZZ4 zP67Dj{I1k9pfO(yx)rY8>e06mHS+w;NiRoX;-j8mpcZEA55hBb4VnBl{>!$mpPw+44PKQuUQ8? zR3q5;B@{WHvJTgx6taTKIn2)YZ?*{u%c=TVwnzpOK*f{H=qN&aLtBtg_9~_q^9Rm%&!}1T&&Q>CuL@TG__GCe4b3ni~J#-M(v0Vp*nJsP8Q50ld;nqibrlh@prT3&a=Luc?kP_BH`+2u zc}OirvB9+-qG+@6ir=0q_Q`s#(`i|@ z)|2+H0+LCdFBy9WdKINjhF;u-R=Q&QfgDQ)+{dc-FV%b8_%S3J z6A^5s@ZarOpoTliP)wd@dVHB@zGhb}mc=>E{E$&8b?ZxhGw?B%!t8bg*vN7HX}Rh8 z&U9U`XH$XvZpRh88)lE6HmHtQTb-F|Ie9eg4jX7wo6&Z8kIIXf#62 zWsJ4gyEyHS_YiiBVvKqOXYpYHjwjis#L9-t#W&7jHWa^yFd1JAs{{6v4i7%hR1&u` z0y~XG6FQx(D(M&yPC{v((MSNBhm>KHfyIu6)B+Lj2eopqN(y7sb+bHiVF*teOnE>R z6a8(D96N_`NOKyiz17R3fl`>jrFlil{v%n9<$ecv;sjA5P{Gnt5#0_@2PI60RiGF! 
   [... remaining base85 image data for doc/images/logo_APHP.png omitted ...]

literal 0
HcmV?d00001

diff --git a/doc/images/logo_APHP_text.png b/doc/images/logo_APHP_text.png
new file mode 100644
index 0000000000000000000000000000000000000000..1194b92f88ad47e3831a5f5f635bd2501d3166ef
GIT binary patch
literal 30396
   [... base85 image data for doc/images/logo_APHP_text.png omitted ...]
za(x$w+ZkX8=@XBlK(?#{WcWeF0Jdr!Xw^BQY`Lo^txUy?Dxik7Dtz@=dCC|zI&0|^ zB$Tw9@)MXiEevjxh!66o_3OY{a8aDXC<{$llr6f2sIj{*=^*241l^5BD<$e5Y3Je+ zF{h2bi22D1?i+9WTm1Q9ENkmM?D+ieffchVZ$}k@7g=@=r16&)2KIC}Q%d(cY~uPf zd1u3~F}8wOus-yt?HM~7!tCPIQByBYil{MCpYIsenot9T*^20-X5<#kJdJc4`O+CcSO?X(pv6S{xvZ}!QA*$)I#kz#%?0`-Xw8H!>RX|q9pIhz z>RP-P+3xNMvLvs))69P_kwj1=f%ZDyFc$!}%*f)AqQRU)Lpm0Eb7Rb~W=O53~b{IjbXb_$@GQL z>wf8}(={`D4~Fx?T|TA%U5@7Y2*5DhU*IGTo2#TU9%y58pZ22Im_FhhFZn5A9MS*z zO1PXweea!ONvU87@b~9$0KyO2WXOnfG>$l#F1|lXF;iAjAPuZf>ic2OZ;B{t;6PVb zn9*43>ooiY{Jc&=v9lu8q}bN38a*X-EQz$5<V-h{luTocK#lVEjG9Uft!l&yxAP8>#Y68X|G!0osCjBd(qj~b z0l@>s2EI&ySd)F(%6OEMj(RGHbJiwi4n6N8h${ZHVuU2H>LcUdLeE&L{U>FE zSV~E6z)(#8-~F`|w{?n1_1%pMF#17hSGmUQ-6Wf zcJR{P;Dd~9SxPap>diJ=W087||MhQB&z3qEjH7!$;DS0fVxS-WLJ-l5m4oXHnGqf| z9hVE0s@;*JmOkyz|M@)r8$|zxAVd!YX zQ-$(>bP9^GKXkudD`;P77VpSgVgmqRCDn{Uja0-`Wy0Y#5WO)P6_6tnTA_=L%>81= zu%a4a4kL9VmDX9Ly=6ehUtx*pp@PNp_eG_^Y%ioxVN}_Vd0(K57=11E0`GB3XdJ1T zC}Ca){4;zg9m=I^v~(5w-MWM137$;jW`}r$E>!;9 zwd#;*MQcMfUCu_{#WWh~?K~bdUa1^Sq)}k8(ybwkMl2t+8U^HmROf$HcXR204- zj5s*#1ZI_~h4rB!FlWoO+nysm4N?b54NyDK`mPdO<=*0uC9i^o^}$td#AxO7zo7&0 zLN!LTxDZE4sMgNdd7;QQILS zbkIMeb)k!$reK-g)k zL^4V;wu7)A;mdbWj!)^9z96I&#k!K!W0OdEdwCT{bM&0dy=lyj{6qwTU|Zps5#L=f z41QL{?I|wAyW=Yzb|2~zhMb*ynYrsLb_-8G()PUNPNifFZS0AWG`NF>oEB8uuFw<- zn4v>3v(q$Mv#f%-(5V+>ei$G5pdc{Znj13FTcBp}8rA;I6SBrPwGw(JcL7DdOLr7hdZaya#|@Wp1DtS9_!{O{7&n|CV_cG>c;;%_ z^1wLQn!qFMaM~u;+DcM==^Q*syu0QFcY~WpJQNfWb+7cAyaT^$J+F8X@sQH=-jhZd zoj$ZrhVI{Omq{7=oSXCN89RQI1&Y)CSL#HumE_$uOxLkAp&qMiN#-w+!-bvds>6wW-Gpn|lhig$8| zu+9;xgk4RhwdihA-K~d!%=*rTP#AdT`@J^kl+(FSpmEI@64Slrcow3LyHh-LG<0MR+eF;kc2d>6>RUVY3&=;*je-BqUi#;-VytzuSXcCkpzHi)G zq92WkX2vIeI2057DZ@VdmQRC(U;ba1z7LIKC5kdstZ`7*^4Z5~QI1W%tK<1enF=i* zq}pYME#sQgG=k;HUjUDqPSrhS!Vj*QQK)?KV2SF2XTd!3Jpzi&KI_ORREM#y>bv+< z69@ct%xiTZ6LveyCoF99d|KG?rKO{)eTk?$OIN7gX06A3u&67}PUlDT=Q zOSvxWGsT=0iSbrq%N9}ZH1j9Mx|M};T%(^1*OH?31)5G(iZf;P>ti$oOtjk~y(33) zr2bDIu<^#slK_L~@3iyAS|L+-3xR3Z0JmTx6r(f^0siwhbYXKPC+l&@Pj;;sDB2Be zs>Uvb)`suu-un=pAb@ga;{4blX#BA0Vus}mLg-T~6&HF}$>i#~H?u>BAVZ7#?SY5^0ncC7S54B@hhd=UO@-DMa~+mE3~+QRDc{CaJ1}Af}v#TM0q_ z0UtxfnghAXK}pX{te+#sH&bovh7(1z!^p|&eBj#|UV7hR^_kp85AhRWkVBntv-l6R zq89X?+z8<#cO$*tb_BPpIBi0D-OQDH2@O^Ou~~ROy|Z%8wkn8n+r;kMnhmy!BbIJW zi43b(Cyy3&vC6Qfpi)invFPWm5N4RAnN_++4C&(`D_UM_@RbJcPH;~T;x`W2sZ!YJ zGN)pzl#3Wu$(xCvpHp#iw*eDS}Q82V2UbP#(0V ztRp-c*_PvejLw8TlV46&wwh5{HGWUWiK!tut0Q?W+)od#0K^%a_@YB;O@rr}Aa!`W znr?njASvCgM^pPl!Ro7n!_T(}jCNr)>i6XJm@ot$-sNp<+JF-+y^uBby-es7H0MC* zU{s&#+j+!*lcqYPHEYk~P;8-qPBEm7vLRc>mS0(Zq9J^C{j4gYnGFomc?M9@Wr%8Gt^KS~9;Ex&TJHMyK^vL+zMWzQ7{hd>it64l4m?s}~b4uGmjFfNbSZra-1VeA38rP>fID zpyLN%fp19->ay9YzkCwG1fNyQI7O{*d9X zuH%vfhTf>xWy^(y2BK9UFZxER9lcmqV0XTkWM2d@iW$`i?HGFQD~fN+Q!lwQfd})? 
zPx6m>T+#qXTNn@}{ABqx^W&k}?yF<#xgRw=A1cAE1#uAnJ>^>xsvOGB|LpT}pj}1U zRb_D3!q!Qj>fUiZxJYI7vKB`#h)jg0ca>m1D)GyL-*IPbQ7;}qnRg+RtH?*$pAD#W~BFr1w_maQSaK2^1rioPiF~zLpqVEdfeUw04Gw|1*dX zsyw$1atEiw{P^gEDIXKGvLkv?tF;!r{Cf=x@_f`JUBhkYysiQN;9h0!TOUcs64)fv zz9D;KWiN{9Eb6FP@J_IfrQJzr5YA7o#QXKPemG=`+!llOIEvq^RL@T zxA|(2^{|Gf8muUe3Ttt`Wcmj^NfSt7<8j7Zf{$5yxYGxE5_zf+u^33@$XCN zo^?p5VQwXn_$TeB>j32DE}4j%y-BQ$?#-|bg~qha8oir~)buP*m53R7Ba!dM?HX%V zP={t2@o2sy-KYL;S0joN567tX5qx3Qiohg-I+2%ED_frFJQ@|W%!IpshlD{RH+xN( z$$u!9GS*v6wA}1l1|*qif0dmv&Z5ZS9h!#TakSWblr&Th-i8h52j}(JVp4>SyY9l~@-9T}x{K-=QnR(UvA~qye^OML z`NKI?2yY)_$?DQVW=Gq~>c8d>f1ND7WN>;fj2%@7(K%)aPvfR=K!Rq~WS0b^X7mRH z0M^YeK26o@3CU|0;Ay1%=Ivj;RGp@CK zG(sb6x4)I9+)m~;AS25RAz-JVqry#*gEoiKvulFrqx8dr6}jIZV}WcQzK(LEtf>6K zB+spPkncDDYMfQELu2jCDa~cGhYX8p4#;wClm^BfNv=(hja^ylqWzU&Q-5&H+->v- z&1~W`{G(3LC_A6HTXC=Dwe=YXEomFH0 zPd0XhJ>RyjBP!cfcI24tBdsW^ezr9ao5i-qdR8ROi`y@_6hcj^`!5L^&D>}r8|Rd~ z{dQ+9&dWZ`qw=?WF2+xt+h@P16_BK*Bb^+}Q49lL1OKDHg*2^wXV zTTr!Tt>yz7-$o{ZL9rPSt9UI|i_j~Y@=p;D)dmxa-+z9`@xPTh^SY>c`zl%7Xn7t|=*pNinoo9=$LnY3cBB5L&m_LX~O`_Sp$6FQ^#v-c!vBn^Aro5xEG zf6&#mfDDfYY`tJi46L0zTQTPr+s50}BkqINx{qa<21b36{O)vP_OF~%j%&l~puew5qWa^|lvi$zuWoR!@0 zVjz*?)_E7g7e3cD3hVq!%PKw50U(=0P=IL(a^(!(&Z{fNn13Nm_rwoEbWA;;M&Z!a z;JnA2_$y8`iS%u2dW`mX+05-sVM#s`HEkZu&O8pWJWnmuvle7D7wr+ z6Oi!%rOBh6}NeV2#I2&5u?aU>DuD)OuQ_fhZPKP!AyOdOoVb!Qz13TYIkY?9g_N2%H3tRahs`*SeFdXIBQdgL4O{}+)IDXZNcds;OgZ{Y&uPmH@S$FF!K#+y<03j(6CgFTi4)~ z54u4Axp`XL{!lX{ClT_Ahc4$4_SgsdE>lyu$C5DZ`JE_ZOh z^p6C=y+DD1#BG;Y6c!jESODQI<=05;7Jvu=F+cWC>Zr{J_fSX3s@cFx3vcA4WVj9b zXlUJGjraIlFHA{9!ZD&p@b`={-O%uOOhGM8-1v8I0yt5zm%Z&N6Hh2NL)h8=_h4ZsYmfj&Eus_@$@#zMe4$Wu)IrYWBHch zlF^%0a&Aqh3lb2@_bry=ancx%7MqN;5%aKXf?`i%1LO)UeE+Mv=k#fw$WrpIe8R@a z!2Np>A!?XA1Iuw5+^6YE-s7sI2U} z>Iz4eIHIdI`5K>Km=-#))Gv(4h{6k%&w!Y-=R2y)qPP@1Ubyvw5HJUx{voeJ?OzR{ zh8LfOnIdc)h(~kbK3gWYk|*%gFa4>E=Ct^9lwA^|7*>`bm#2W=9jBPZ<2-1;N#4b2 zEj@sKt@Op+xPjR{vO=-YJ^P)rEEAW1xPSH~myFqmYQuG)*TZdgtCLU-ieIFLD9&lr zR?>Kdyi~YWNFs}M42BN7j%KCd{76g(>|N?DIrc>%*6mE4i>t(W1&nT_jY_PE*KIsT z7btaHMo3ardC|{+6tx&G6@FX$u8~j&GDMkNgY7hU7n}#5aDJ<|aoVi?M3T;T%;uQO z*4O%`toN0*vAB${d}*ujf!Ix+SM-I$+czk9*nZA98OVQ^L&P?htxFGLEcmB1O;F8G zj*hd|!;KG~gNE8e~&MF5JeAc7aorIV0koOI#v)nEavaI#b^ zWSKP_1kFKi>lsa%pR+*_FX2X&5{him2nIri=>CQ$q(M>Z@oe$+2Y+n` zv=5px7#;CMO+%3M5?X<_xE{uixaa`PXhV3}a8FZ9Ov23hkK(Ok?w8hIoREnkfl|LG zqr~oleyhKc0FNo6WxSyAG{o1lrMO*p1zElR*;`L;)lP~^;WPP)l(jC{Vac0X16#-K z7+uWVpIz?H@jScgRCMUAVe>MZw2B2_hFKQk!CO}2JfZ^R?_k*-@bbL`slkQ}Y1W%H zBv>jaq<#68vi7NLcogwrGukr^;}WV1U5DC2*G?V+b;FVhcv*K`hVU9Iuj`}S#I@>X znr|sN#qbkvWac0^8zOomA4sv!#6hoioB#;-&taAr1!~7CHIJdl8xnpCBVO+~6nQC% zM>x!9z?0OQeY3+Q!wq)1a317@8|sZ181sv`Tfq&{1DT971XQ1IF^2sCpCOn{GYs6M z?*@IgUP4Gpiiv%v^(nXH!}>$y;&}hstqT7Irlq=Qyxm)5j6#Unaif68=vr9q&u}*b zD(oKY#6(VhsuGIwMVSB)9uzNx@wdyEtTleuCtjP_eh*(OJNrE27@Hom?Up&PRpL?f z$M`4@s_%^>MBGjY>!WDP*ueVhY|NALc|>JiMYDmdUVE6^Xr|rQYrh$)Y|v8xLx`1c)B1$ z*Gz{Kp0DVQ8VXBQ>EbaZRGM?2d|a>If5xL{c7I7??N zC%xG4$0zNg?7%!AAXoJgo+)#IvOj5zmCOASIv8hvDAx@tXc9e&toL1W@BZ z`;`X$aemlz+z~~=m%*ZNw)YblPf5E7%s1q1L?18-2-u70my@4I>|l?bJiG=jo3t+& z5LRslEs${L6b|}kFuBFRlqu=P{7P4LmUOrIjC9#eCQPw!Zjh!s>{R?)`eAb5CRJ19zQDQ`F-2o(_blfv{3Jq_J#YmU~m zy!b}2V==cERMHKDZ7Z)gGhxq{VvpLn^||KwAmCovzCwhhBlut zf~ani8JlbKo$c$+A&jIl ze^&l9VA05`bm015gnOp^X zLPZ@eY|UoZ?w-UVLEv}Jy1#xRSxFJ5#YeA8)GCQ^$>-R5MS@r2;dk6w35N6GHuuDe z7D;f%?u7;Z#>a)kKPjZ;UZ|UEmGf*`|i-w7wJ`CB{Vo`W&5geD)4BuU9K&Tbf zsQi8H&#H`=rlGI>p152KR{@`{9sL`yZl0f@>x~q;NuNdT&Xs7GZL)=QGSL?BcDOIb zTm;1a_GLYO4Ym>+eZh@QtWj(h4Tj(1^V!r>O8BA9{8-C}$m026%VB^0aehpn3JHaX 
zG?DwwpG_qzXz{{SFat_wZ8%S8;r2-F9@Imx22UaHMzU>1(?wvGQoLY4c}eRB1HrPq1St^uqhl4)IH{7%@eBkczi|WJBC$0qpdT^i(+v)!4<#xgn1q(IEP&+D5pNBR zKc+1Y_hcvcm>Jmoofl0a?1n6PCe(x-tZV1FP1082{gGr1DG+IhWK`MRye%75cv2Kj zcTadz#7RHh50G!kdMojVhN91ipYD(O6|AILB#yL_nZ@_7hy)LZBG>-xEhAjwQ~Huw z%v!kc7>+TG>G+$*6|E_fFtXG&RT$lQO6lS`!U#IYaUrfLUR(vpx{O7>bA-w-b)-3Q z*D9}zFk^mRGM9UEROA4chQH>%l@=(4b5oFAEJlL(Hq?(!q1zNj4OUc*lQW<{I|S-G zNPCw!?JeWJALR@}j@1UjSxl6n1Hbjnk$hE;Iuy_+ckw93TjcWrRXAE}7?b(C3I)th zC(U-XXs_KeMvo5_(nn?d#ZpyZuA?1KF{+gCh0Go6iAS%w0As*1&%jTiM)07HQ3=K? z2_0qZz*08$7TUmfOuQCTCZR#Hp@13es1PradM-%cm;SRBoOY}zw8QMvN77-u-t=#V zTDq$#YN@EscxM0N~lHMAKxaNN`iF! z^+;`NQCBIO=DIZMb;ekYkkFhJ=k(|FR1G&(`+GT}<#Axj2LXcT!=;z6;1+6634n?Z zj=iEA+D>Yi{+A97z(9@cH7vSx`>)mFr(i+XgYtT9yu@3R-E$4DGOxSH~*(;-HUPf|%J( zrtUc&^HFr(H!@wt5gN9=lSKEGmpq&~pCa_GrA{8`0N_O9x2xh%;tdCxfz>2l{r1lu z9Yv4yy1|GPbQsZmWY7LLKMG>TtfmXz#!P8Umpv=CswdIer#}2kYQJtAzTgVdqsoVZ zwkdhm%B4{E()#6-9sp+4b4Fv&O?&%9E*+8;+8*`{zHN>8{Yty;?nsXp8e0VHEHB)@ zovUprzM)Ul45ocCA8&G@lBUvn(w8;!9&myM*8p{iYtAA)27IA*+1k*T4~`|yTz%LR z3fniGUM4Ym6M_GBW`n4Pu;ob`M*aeoJn$r^hV5BDc{h$M&u~BEA~ng$2O9T3fBE(> zWl<)wLFNPzy6JlKKI>Z8!T*+Os^eL?5Xv)+XZlD<-;Yv{6ieWO_}RAfk4|+CBlIE# z^N1WFZyCSca6|n_d4h*jkXJR|x$(s+9qt3Mb0mn`y1kvLeJKh-KcJ>4% z)=!33)x8Zpr5YG-@HH0apI9|GiI(*49;F zox7`8^Q=jiW=0*Zwq;TNIeE`n>;uL01#eam4h%u$AspX9w5k3(bH~^3iLvtN?CNU{ z1)mE$as(#>ZNYR@`V)NE$0rS>V zRH3~f5i+01|3My`>G^xywGySP&&AJv)_pJNIu9Fry=YeDZA&~6OJ{V}@`)yD|J-r* z35;ui;GtRq(o7n&_DV8H!kj0sW4OF&_vfE7CWGHZ5Cm9?ppTD+q+il)W&L@U#cv>( zjH|+BDX82LPl^|cv+_2;6e|W5*)X6pzQb03MEmvIn+Wr~GG)7a6uT7wRy#4wGO-&= z=5W_m({mo1qqN{%B=r`XQ`0z*oGkW~v-_6QYj$Z(m9+dgcc+EYIIfL`R%2di9EGU) zG3i8BDs*5 z;-}|J@t)3^ox6le0XHpm3x_h1q}@_LJd!Ib=M-H^yJlf2??SsTO)VnViWZTza=P#8 z%hS6Gn#vwUx?oL0nB5u|Nz%teHRaq;nEKCiN=PaV>6cX4VKHbGG$RBQah&w$!;|_E z5f!P*T)EXk4(xOLh8ZO_S1k)W{*LOWh4$w%a?RNh)xY2IE+FmF;meIoXWUx$Z@tA~ z(aq-F;O3^MmyVZ$c|fsT)^*0=JLYXvGq?MTa$W|oM{x+7 z7sW55^l!-dCA!c(yLIYUdX6y~e1%o6C@9+4=d#BnL;WUd*U~VP`jf}AL>HtM6+y($ zSLG2`#muDWKH#5l+G6jC2PG)AZP^c%Ldt3lJ#55FPLiCsllfh(i;_*)itMo$0@#Ug z*=6+8^0}Hebnv|bwS|UQ3SQUeq-6xXnu4g9UOFc(Grxh}yDP?d7@^@Oki?@maX;7R z>2|n9gZ}~WuaN`QUuUL^jpO{?VWaqtpjmh*PTSKZ#XIsGX0QJq`y+sT*jzk$%Ex*l zu|sl6*8t)YhHMKmmkC@glHeNx$&ffJ#|$#CkGU%uIJ7SMIlCVDB%}@?z20UdB`Cl1 zT={vRHo{WgsSXgzj5uPMeSEFY#VrJ#)kgQ%$TvH(te{3h4v~vm+Z;pv@x9c{(8F(cQ3okshHBtTp%Q8`YZygv+73Yq}ea@e7=s@3JSD zX&S&>JvGTKpK&x{>*Q^VVQ25R3?z9re6>+SOcfTY-t%xKmbV!kcxJ)_s85`~848=@ zIp(ZG7pxbbMj3vPzWIju7SaojcTxJW>?B_4>wLaGM)<^!T zIY&yEO@6cNo~zDB=W(&TyWdlT>T|5UMkh`(K1xwxrhf9?k^+oR%*m#yJ>NghroBbr zBUcR)zm`(ppdZBS7KLsk&80op(e(Z8-4zIH_Wg(xuz^2o%P5L75LGKBlYBdo}rnSFY|ORV&(@;HXC8dig*;JsVr#v5d9 zDh4p@tDLxDp%b#+QpZAkyYM|?^{wyVqg2w%2WJeCOfcK0tuTT`uwqB_1qj7%D&GYR zNZUKf;o;7-=Yk+SFtb7hT#g?M;ELhDlBNB;8gji^TsF`sj*ulrk+#LaXiP`TEra%4 zvvg?#<2@P*b_mP$WQq58qnxDoNu+b82WvAg%@aEr02KTB`?-r8oCSwciR@uYvH-gm z$pIC?Vfd#HIY2;gr-J_&~tYTlh*;UE)R$n zVK6jv^_BQl9>ez)hxj9NaqN^Oj^9k8#}y@!ci?S`Rj!ZH3q&_wsjXe1EfI1&y8fud zsLaaS*K)DQrEkRj(4Xdn8$3|3!%Qmi?bNm`lUAzauU&eBwQrx$0y381x+@KO$WlX^_BIS*L6+S} zjGgb!hhn0EAj{6BC27GJjZADVlGOkUe>1&ad?6+~s2TUh-Z-RgWyecjca~?|6luu; zXmD!LPRS0)fy2#BKH@1>j0ZRI#CH8^()QX}4`%WB!t(+QJ(tML9rRvk?{l*SBa1~Z zrUmXax7(%#(i|utzFbur&6BHK^_`E9z50S?X_(0?gs| z1~db?J3-zr$ZCvJ#1wlyx!>T`gsehSwcL3g=6mJ&49h3c;n$pDUvPVr7q@ILQ7q?3KQF7KO_zlsb8Ylvz1W{B z#}&)1ene%S-r579IdsOs=CQFvYT4In{DbAPDZV%qvDU)h2d4Z&)}q-9JMfVRkl;HD zGFJPI>Q?H$nBBVM6+yb`BbD6{ZS2omX>;upl2%meG zi`IDa_}r0unljIJNX!+{^(kr`dWKK7q<K5U%9wI6tdI>UN&@<=N&(H@bE`7Pze0=Nk!+=nKy;D zSjAw|FQV=!(AgTgpSdE{iDjuUN%ehQQDXC19D zQRiLI+wh!YOBtdDmX}16!G4m#DUc8;_Fm^K+rZXb>X3{;8#_z?`DiJ|TL$dXAT_~o 
zN#BzCeOQSNQw@jTjtA)N=Idc}l8WNh=(FjgG3VxVwimbtO?PHgUO_R3WffkA-XDGx z*%rypUD-HHsB)dfTP6kQ)dXpM1A>#z)9fEZbGi6iFUOyQ=Ff@Mq^v7gbyQd%y2oP? z5zw{Jm?jQ$;#%=xBBJteo73F|EKBDY6;P=^`#!l4V>=xZeEG8FAQ47{IcnaN6%kD8 zkG`t1ZqRv0uf;8PhNqug$kob4=56$=QvlX?D6*-Hf2QK5WRXqD9T)PR_C24o?)(6L zWw`#AQ~szd&RVM9WkS=KcE8P16u4IGf0&MmCC3dnJ(lIJIlWAy7)W4>7!F{M2ZL$0 z3+zM*63dSPv2+w<3yGr`T#Y}s-<PraW;sN5a{%P0n3k+TZ%F zPvx)uyoYsm>Ia@!;Or`MKDFZBY=5UyfTU41wGBD{xKCYQ#pf#=NIWN}nUvz5JJj-D zV@B3%EgEmP5YbV>+?U=#j>yA%F-Sdi;VbOXNBfKeA`2`u2 z8}TX{(Gm^S87I6Wov>>2z5IWNbXoSV0hux+7i^w)JNIsD68~5&PJ;0>iY7~^w=E%3 z$b)9!vp}8o#}uVm1R*U^LDaE6i9d&a&8G89;KJ=r&&jpzG>GsfhaG_g$BEF<|B>TB zegIgEH`oWu;}hva9PJ?gyGXFG#%L$$jDnj_#xAhql0$Xc15H^gL>ap%KF&%YSwkg=~6NKx>;SL``W2)!m>`P0F$CnEL4 zvESIcIMRfi%74)P0+C`p8^w-Vs^X!vXE7opOLlO6khr%Y<}kvx^4qv_8l%X+9)SPb z1P~7N;V35kQ2WH>A?6&b@7#=zKq_dI?2&s?YlaOaT_y>;>CB)*I;d+t3@>CP2Y z+Df4cVTsxHg;Ix6{`bNTK%>w9+klhP@97!5r&9BfKVJ008ixArXv z{ElI>JnO@0r#UfitV2$B03QC|_H8s@fnO#iP;$`jcTaO(hw7mDA`4dSao+z}c~>MW zPbK~GUz5#}yHqUhs>rXq62&4VImNxG!Pex?LO$$Y>;h3?_)jhr^)ec&KFB}vsAOvX z%{n$Y;wJF#R8`^CKxLuFrw>^QreX04^d3!*fx<#Ldj+9l^;kCq9`6;mEGyB8u;rqp zZRa}5|9x#GFMl<<^r6ka!H}ap3)K&8fYyaN;sVfa6*QD(AYhq&;uCW?6oM@iLjN}B!(GwP(H}3BmZC9#+Yv@bP|~VhE~R@+68l* zSZ*2T5}Tn)o*n<6ldV{V@f9NkuRHP#e}~$@2^qFPUVDXa29Ez`L6e7UafIO^|HUpQ gpXlF^B@qZ>OSVV8w(!Tbf6ZxMz0_8zRk8~IA9JQ}%m4rY literal 0 HcmV?d00001 diff --git a/doc/templates/index.html b/doc/templates/index.html index d333530ef8376..c098fc05948af 100644 --- a/doc/templates/index.html +++ b/doc/templates/index.html @@ -245,6 +245,7 @@

Who uses scikit-learn?

+ From 877c6e6db42006445ccf0695c0dde3294ff4dd4a Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 6 Apr 2021 10:37:13 -0400 Subject: [PATCH 294/478] DOC Fixes style for versionadded (#19817) --- doc/themes/scikit-learn-modern/static/css/theme.css | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css index 5fa26391886e0..ed7a86a20fa3b 100644 --- a/doc/themes/scikit-learn-modern/static/css/theme.css +++ b/doc/themes/scikit-learn-modern/static/css/theme.css @@ -787,7 +787,7 @@ div.admonition p.admonition-title + p, div.deprecated p { } div.admonition, div.deprecated, -div.versionchanged, div.versionadded{ +div.versionchanged { margin-top: 0.5rem; padding: 0.5rem; border-radius: 0.5rem; @@ -795,6 +795,10 @@ div.versionchanged, div.versionadded{ border: 1px solid #ddd; } +div.versionadded { + margin: 1rem 0; +} + div.admonition { background-color: #eee; } From 9cfacf1540a991461b91617c779c69753a1ee4c0 Mon Sep 17 00:00:00 2001 From: Maria Telenczuk Date: Tue, 6 Apr 2021 21:33:28 +0200 Subject: [PATCH 295/478] DEP Deprecate 'normalize' in ridge models (#17772) Co-authored-by: Olivier Grisel Co-authored-by: Alexandre Gramfort --- doc/whats_new/v1.0.rst | 12 +++- examples/linear_model/plot_huber_vs_ridge.py | 4 +- sklearn/linear_model/_base.py | 65 ++++++++++++------- sklearn/linear_model/_glm/tests/test_glm.py | 2 + sklearn/linear_model/_ridge.py | 61 ++++++++++++----- sklearn/linear_model/tests/test_base.py | 28 -------- sklearn/linear_model/tests/test_common.py | 59 +++++++++++++++++ .../tests/test_coordinate_descent.py | 23 +++++-- sklearn/linear_model/tests/test_ridge.py | 13 ++-- 9 files changed, 186 insertions(+), 81 deletions(-) create mode 100644 sklearn/linear_model/tests/test_common.py diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index ce683958d913f..96bb2ddfa8f7d 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -193,12 +193,18 @@ Changelog Motivation for this deprecation: ``normalize`` parameter did not take any effect if ``fit_intercept`` was set to False and therefore was deemed confusing. - The behavior of the deprecated LinearRegression(normalize=True) can be + The behavior of the deprecated LinearModel(normalize=True) can be reproduced with :class:`~sklearn.pipeline.Pipeline` with - :class:`~sklearn.preprocessing.StandardScaler`as follows: - make_pipeline(StandardScaler(with_mean=False), LinearRegression()). + :class:`~sklearn.preprocessing.LinearModel` (where LinearModel is + LinearRegression, Ridge, RidgeClassifier, RidgeCV or RidgeClassifierCV) as + follows: + make_pipeline(StandardScaler(with_mean=False), LinearModel()). + LinearRegression was deprecated in: :pr:`17743` by :user:`Maria Telenczuk ` and :user:`Alexandre Gramfort `. + Ridge, RidgeClassifier, RidgeCV or RidgeClassifierCV were deprecated in: + :pr:`17772` by :user:`Maria Telenczuk ` and + :user:`Alexandre Gramfort `. 
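A minimal sketch of the recipe above (an editorial example, not part of the patch), assuming a
generic dense regression problem; ``alpha`` is rescaled by ``n_samples`` and ``sample_weight`` is
routed to every pipeline step, as spelled out in the deprecation message introduced by this patch::

    import numpy as np
    from sklearn.linear_model import Ridge
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    rng = np.random.RandomState(0)
    X, y = rng.randn(100, 3), rng.randn(100)
    sample_weight = rng.rand(100)

    # Stand-in for the deprecated Ridge(alpha=0.5, normalize=True);
    # the alpha value 0.5 is an arbitrary illustration.
    model = make_pipeline(
        StandardScaler(with_mean=False),
        Ridge(alpha=0.5 * X.shape[0]),  # original_alpha * n_samples
    )
    # Route sample_weight to each step of the pipeline.
    kwargs = {name + "__sample_weight": sample_weight
              for name, _ in model.steps}
    model.fit(X, y, **kwargs)

For ``Lasso`` the message instead recommends ``original_alpha * np.sqrt(n_samples)``, and for
``RidgeCV``/``RidgeClassifierCV`` the whole ``alphas`` grid is rescaled by ``n_samples``.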
- |Fix|: `sample_weight` are now fully taken into account in linear models when `normalize=True` for both feature centering and feature diff --git a/examples/linear_model/plot_huber_vs_ridge.py b/examples/linear_model/plot_huber_vs_ridge.py index 63abffe6be4ba..e5f71cc861d88 100644 --- a/examples/linear_model/plot_huber_vs_ridge.py +++ b/examples/linear_model/plot_huber_vs_ridge.py @@ -43,7 +43,7 @@ colors = ['r-', 'b-', 'y-', 'm-'] x = np.linspace(X.min(), X.max(), 7) -epsilon_values = [1.35, 1.5, 1.75, 1.9] +epsilon_values = [1, 1.5, 1.75, 1.9] for k, epsilon in enumerate(epsilon_values): huber = HuberRegressor(alpha=0.0, epsilon=epsilon) huber.fit(X, y) @@ -51,7 +51,7 @@ plt.plot(x, coef_, colors[k], label="huber loss, %s" % epsilon) # Fit a ridge regressor to compare it to huber regressor. -ridge = Ridge(alpha=0.0, random_state=0, normalize=True) +ridge = Ridge(alpha=0.0, random_state=0) ridge.fit(X, y) coef_ridge = ridge.coef_ coef_ = ridge.coef_ * x + ridge.intercept_ diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 1842620dfa105..c80c2db622921 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -101,38 +101,59 @@ def _deprecate_normalize(normalize, default, estimator_name): else: _normalize = normalize + pipeline_msg = ( + "If you wish to scale the data, use Pipeline with a StandardScaler " + "in a preprocessing stage. To reproduce the previous behavior:\n\n" + "from sklearn.pipeline import make_pipeline\n\n" + "model = make_pipeline(StandardScaler(with_mean=False), " + f"{estimator_name}())\n\n" + "If you wish to pass a sample_weight parameter, you need to pass it " + "as a fit parameter to each step of the pipeline as follows:\n\n" + "kwargs = {s[0] + '__sample_weight': sample_weight for s " + "in model.steps}\n" + "model.fit(X, y, **kwargs)\n\n" + ) + + if estimator_name == 'Ridge' or estimator_name == 'RidgeClassifier': + alpha_msg = 'Set parameter alpha to: original_alpha * n_samples. ' + elif 'Lasso' in estimator_name: + alpha_msg = ( + 'Set parameter alpha to: original_alpha * np.sqrt(n_samples). ' + ) + elif 'ElasticNet' in estimator_name: + alpha_msg = ( + 'Set parameter alpha to original_alpha * np.sqrt(n_samples) if ' + 'l1_ratio is 1, and to original_alpha * n_samples if l1_ratio is ' + '0. For other values of l1_ratio, no analytic formula is ' + 'available.' + ) + elif estimator_name == 'RidgeCV' or estimator_name == 'RidgeClassifierCV': + alpha_msg = 'Set parameter alphas to: original_alphas * n_samples. ' + else: + alpha_msg = "" + if default and normalize == 'deprecated': warnings.warn( "The default of 'normalize' will be set to False in version 1.2 " - "and deprecated in version 1.4. 
\nPass normalize=False and use " - "Pipeline with a StandardScaler in a preprocessing stage if you " - "wish to reproduce the previous behavior:\n" - "model = make_pipeline(StandardScaler(with_mean=False), \n" - f"{estimator_name}(normalize=False))\n" - "If you wish to use additional parameters in " - "the fit() you can include them as follows:\n" - "kwargs = {model.steps[-1][0] + " - "'__': }\n" - "model.fit(X, y, **kwargs)", FutureWarning + "and deprecated in version 1.4.\n" + + pipeline_msg + alpha_msg, + FutureWarning ) elif normalize != 'deprecated' and normalize and not default: warnings.warn( "'normalize' was deprecated in version 1.0 and will be " - "removed in 1.2 \nIf you still wish to normalize use " - "Pipeline with a StandardScaler in a preprocessing stage if you " - "wish to reproduce the previous behavior:\n" - "model = make_pipeline(StandardScaler(with_mean=False), " - f"{estimator_name}()). \nIf you wish to use additional " - "parameters in the fit() you can include them as follows: " - "kwargs = {model.steps[-1][0] + " - "'__': }\n" - "model.fit(X, y, **kwargs)", FutureWarning + "removed in 1.2.\n" + + pipeline_msg + alpha_msg, FutureWarning ) elif not normalize and not default: warnings.warn( - "'normalize' was deprecated in version 1.0 and will be" - " removed in 1.2 Don't set 'normalize' parameter" - " and leave it to its default value", FutureWarning + "'normalize' was deprecated in version 1.0 and will be " + "removed in 1.2. " + "Please leave the normalize parameter to its default value to " + "silence this warning. The default behavior of this estimator " + "is to not do any normalization. If normalization is needed " + "please use sklearn.preprocessing.StandardScaler instead.", + FutureWarning ) return _normalize diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index d6fc4e14b12fa..89d388a424492 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -294,6 +294,8 @@ def test_warm_start(fit_intercept): assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4) +# FIXME: 'normalize' to be removed in 1.2 in LinearRegression +@pytest.mark.filterwarnings("ignore:'normalize' was deprecated") @pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) @pytest.mark.parametrize('fit_intercept', [True, False]) @pytest.mark.parametrize('sample_weight', [None, True]) diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 2d360c6edbc58..343bc6a170c9b 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -17,7 +17,8 @@ from scipy import sparse from scipy.sparse import linalg as sp_linalg -from ._base import LinearClassifierMixin, LinearModel, _rescale_data +from ._base import LinearClassifierMixin, LinearModel +from ._base import _deprecate_normalize, _rescale_data from ._sag import sag_solver from ..base import RegressorMixin, MultiOutputMixin, is_classifier from ..utils.extmath import safe_sparse_dot @@ -521,9 +522,9 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', class _BaseRidge(LinearModel, metaclass=ABCMeta): @abstractmethod @_deprecate_positional_args - def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, - copy_X=True, max_iter=None, tol=1e-3, solver="auto", - random_state=None): + def __init__(self, alpha=1.0, *, fit_intercept=True, + normalize='deprecated', copy_X=True, max_iter=None, tol=1e-3, + solver="auto", random_state=None): self.alpha = 
alpha self.fit_intercept = fit_intercept self.normalize = normalize @@ -535,7 +536,11 @@ def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, def fit(self, X, y, sample_weight=None): - # all other solvers work at both float precision levels + self._normalize = _deprecate_normalize( + self.normalize, default=False, + estimator_name=self.__class__.__name__ + ) + _dtype = [np.float64, np.float32] _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) @@ -570,7 +575,7 @@ def fit(self, X, y, sample_weight=None): # when X is sparse we only remove offset from y X, y, X_offset, y_offset, X_scale = self._preprocess_data( - X, y, self.fit_intercept, self.normalize, self.copy_X, + X, y, self.fit_intercept, self._normalize, self.copy_X, sample_weight=sample_weight, return_mean=True) if solver == 'sag' and sparse.issparse(X) and self.fit_intercept: @@ -640,6 +645,10 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. + .. deprecated:: 1.0 + ``normalize`` was deprecated in version 1.0 and + will be removed in 1.2. + copy_X : bool, default=True If True, X will be copied; else, it may be overwritten. @@ -731,9 +740,9 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): Ridge() """ @_deprecate_positional_args - def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, - copy_X=True, max_iter=None, tol=1e-3, solver="auto", - random_state=None): + def __init__(self, alpha=1.0, *, fit_intercept=True, + normalize='deprecated', copy_X=True, max_iter=None, tol=1e-3, + solver="auto", random_state=None): super().__init__( alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X, @@ -794,6 +803,10 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. + .. deprecated:: 1.0 + ``normalize`` was deprecated in version 1.0 and + will be removed in 1.2. + copy_X : bool, default=True If True, X will be copied; else, it may be overwritten. @@ -889,9 +902,10 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): 0.9595... 
""" @_deprecate_positional_args - def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, - copy_X=True, max_iter=None, tol=1e-3, class_weight=None, - solver="auto", random_state=None): + def __init__(self, alpha=1.0, *, fit_intercept=True, + normalize='deprecated', copy_X=True, max_iter=None, + tol=1e-3, class_weight=None, solver="auto", + random_state=None): super().__init__( alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X, max_iter=max_iter, tol=tol, solver=solver, @@ -1115,7 +1129,7 @@ class _RidgeGCV(LinearModel): """ @_deprecate_positional_args def __init__(self, alphas=(0.1, 1.0, 10.0), *, - fit_intercept=True, normalize=False, + fit_intercept=True, normalize='deprecated', scoring=None, copy_X=True, gcv_mode=None, store_cv_values=False, is_clf=False, alpha_per_target=False): @@ -1451,6 +1465,11 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ + _normalize = _deprecate_normalize( + self.normalize, default=False, + estimator_name=self.__class__.__name__ + ) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=[np.float64], multi_output=True, y_numeric=True) @@ -1470,7 +1489,7 @@ def fit(self, X, y, sample_weight=None): "negative or null value instead.".format(self.alphas)) X, y, X_offset, y_offset, X_scale = LinearModel._preprocess_data( - X, y, self.fit_intercept, self.normalize, self.copy_X, + X, y, self.fit_intercept, _normalize, self.copy_X, sample_weight=sample_weight) gcv_mode = _check_gcv_mode(X, self.gcv_mode) @@ -1584,7 +1603,7 @@ def fit(self, X, y, sample_weight=None): class _BaseRidgeCV(LinearModel): @_deprecate_positional_args def __init__(self, alphas=(0.1, 1.0, 10.0), *, - fit_intercept=True, normalize=False, scoring=None, + fit_intercept=True, normalize='deprecated', scoring=None, cv=None, gcv_mode=None, store_cv_values=False, alpha_per_target=False): self.alphas = np.asarray(alphas) @@ -1699,6 +1718,10 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. + .. deprecated:: 1.0 + ``normalize`` was deprecated in version 1.0 and will be removed in + 1.2. + scoring : string, callable, default=None A string (see model evaluation documentation) or a scorer callable object / function with signature @@ -1828,6 +1851,10 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. + .. deprecated:: 1.0 + ``normalize`` was deprecated in version 1.0 and + will be removed in 1.2. 
+ scoring : string, callable, default=None A string (see model evaluation documentation) or a scorer callable object / function with signature @@ -1911,8 +1938,8 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): """ @_deprecate_positional_args def __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, - normalize=False, scoring=None, cv=None, class_weight=None, - store_cv_values=False): + normalize='deprecated', scoring=None, cv=None, + class_weight=None, store_cv_values=False): super().__init__( alphas=alphas, fit_intercept=fit_intercept, normalize=normalize, scoring=scoring, cv=cv, store_cv_values=store_cv_values) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index bf7a2696fcda2..9fb35b389e33f 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -159,7 +159,6 @@ def test_error_on_wrong_normalize(): error_msg = "Leave 'normalize' to its default" with pytest.raises(ValueError, match=error_msg): _deprecate_normalize(normalize, default, 'estimator') - ValueError @pytest.mark.parametrize('normalize', [True, False, 'deprecated']) @@ -222,33 +221,6 @@ def test_linear_regression_sparse(random_state=0): assert_array_almost_equal(ols.predict(X) - y.ravel(), 0) -@pytest.mark.parametrize( - 'normalize, n_warnings, warning', - [(True, 1, FutureWarning), - (False, 1, FutureWarning), - ("deprecated", 0, None)] -) -# FIXME remove test in 1.4 -def test_linear_regression_normalize_deprecation( - normalize, n_warnings, warning -): - # check that we issue a FutureWarning when normalize was set in - # LinearRegression - rng = check_random_state(0) - n_samples = 200 - n_features = 2 - X = rng.randn(n_samples, n_features) - X[X < 0.1] = 0.0 - y = rng.rand(n_samples) - - model = LinearRegression(normalize=normalize) - with pytest.warns(warning) as record: - model.fit(X, y) - assert len(record) == n_warnings - if n_warnings: - assert "'normalize' was deprecated" in str(record[0].message) - - # FIXME: 'normalize' to be removed in 1.2 in LinearRegression @pytest.mark.filterwarnings("ignore:'normalize' was deprecated") @pytest.mark.parametrize('normalize', [True, False]) diff --git a/sklearn/linear_model/tests/test_common.py b/sklearn/linear_model/tests/test_common.py new file mode 100644 index 0000000000000..96a996d18dac7 --- /dev/null +++ b/sklearn/linear_model/tests/test_common.py @@ -0,0 +1,59 @@ +# Author: Maria Telenczuk +# +# License: BSD 3 clause + +import pytest + +import numpy as np + +from sklearn.base import is_classifier +from sklearn.linear_model import LinearRegression +from sklearn.linear_model import Ridge +from sklearn.linear_model import RidgeCV +from sklearn.linear_model import RidgeClassifier +from sklearn.linear_model import RidgeClassifierCV + +from sklearn.utils import check_random_state + + +@pytest.mark.parametrize( + 'normalize, n_warnings, warning_category', + [(True, 1, FutureWarning), + (False, 1, FutureWarning), + ("deprecated", 0, None)] +) +@pytest.mark.parametrize( + "estimator", + [LinearRegression, Ridge, RidgeCV, RidgeClassifier, RidgeClassifierCV] +) +# FIXME remove test in 1.2 +def test_linear_model_normalize_deprecation_message( + estimator, + normalize, n_warnings, warning_category +): + # check that we issue a FutureWarning when normalize was set in + # linear model + rng = check_random_state(0) + n_samples = 200 + n_features = 2 + X = rng.randn(n_samples, n_features) + X[X < 0.1] = 0.0 + y = rng.rand(n_samples) + if is_classifier(estimator): + y = 
np.sign(y) + + model = estimator(normalize=normalize) + with pytest.warns(warning_category) as record: + model.fit(X, y) + # Filter record in case other unrelated warnings are raised + unwanted = [r for r in record if r.category != warning_category] + if len(unwanted): + msg = "unexpected warnings:\n" + for w in unwanted: + msg += str(w) + msg += "\n" + raise AssertionError(msg) + wanted = [r for r in record if r.category == warning_category] + if warning_category is not None: + assert "'normalize' was deprecated" in str(wanted[0].message) + assert len(wanted) == n_warnings diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index d63211d6050bc..8a269f28ebd62 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -17,11 +17,12 @@ from sklearn.preprocessing import StandardScaler from sklearn.exceptions import ConvergenceWarning from sklearn.utils._testing import assert_allclose -from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_almost_equal -from sklearn.utils._testing import ignore_warnings +from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal +from sklearn.utils._testing import ignore_warnings from sklearn.utils._testing import _convert_container + from sklearn.utils._testing import TempMemmap from sklearn.utils.fixes import parse_version from sklearn.utils.sparsefuncs import mean_variance_axis @@ -48,6 +49,7 @@ OrthogonalMatchingPursuit, Ridge, RidgeClassifier, + RidgeClassifierCV, RidgeCV, ) @@ -303,9 +305,13 @@ def _scale_alpha_inplace(estimator, n_samples): normalize set to True to when it is evoked in a Pipeline with normalize set to False and with a StandardScaler. 
""" - if 'alpha' not in estimator.get_params(): + if (('alpha' not in estimator.get_params()) and + ('alphas' not in estimator.get_params())): return + if isinstance(estimator, (RidgeCV, RidgeClassifierCV)): + alphas = estimator.alphas * n_samples + return estimator.set_params(alphas=alphas) if isinstance(estimator, (Lasso, LassoLars, MultiTaskLasso)): alpha = estimator.alpha * np.sqrt(n_samples) if isinstance(estimator, (Ridge, RidgeClassifier)): @@ -342,7 +348,9 @@ def _scale_alpha_inplace(estimator, n_samples): (MultiTaskLasso, {"tol": 1e-16, "alpha": 0.1}), (Lars, {}), (LinearRegression, {}), - (LassoLarsIC, {})] + (LassoLarsIC, {}), + (RidgeCV, {"alphas": [0.1, 0.4]}), + (RidgeClassifierCV, {"alphas": [0.1, 0.4]})] ) def test_model_pipeline_same_as_normalize_true(LinearModel, params): # Test that linear models (LinearModel) set with normalize set to True are @@ -404,6 +412,8 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): (ElasticNet, {"tol": 1e-16, 'l1_ratio': 0, "alpha": 0.1}), (Ridge, {"solver": 'sparse_cg', 'tol': 1e-12, "alpha": 0.1}), (LinearRegression, {}), + (RidgeCV, {"alphas": [0.1, 0.4]}), + (RidgeClassifierCV, {"alphas": [0.1, 0.4]}) ] ) @pytest.mark.parametrize( @@ -494,7 +504,8 @@ def test_linear_model_sample_weights_normalize_in_pipeline( (ElasticNet, {"tol": 1e-16, 'l1_ratio': 0, "alpha": 0.01}), (Ridge, {"solver": 'sparse_cg', 'tol': 1e-12, "alpha": 0.1}), (LinearRegression, {}), - (RidgeCV, {})] + (RidgeCV, {}), + (RidgeClassifierCV, {})] ) def test_model_pipeline_same_dense_and_sparse(LinearModel, params): # Test that linear model preceeded by StandardScaler in the pipeline and @@ -1421,6 +1432,8 @@ def test_enet_sample_weight_does_not_overwrite_sample_weight(check_input): assert_array_equal(sample_weight, sample_weight_1_25) +# FIXME: 'normalize' to be removed in 1.2 +@pytest.mark.filterwarnings("ignore:'normalize' was deprecated") @pytest.mark.parametrize("ridge_alpha", [1e-1, 1., 1e6]) @pytest.mark.parametrize("normalize", [True, False]) def test_enet_ridge_consistency(normalize, ridge_alpha): diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index 01839fe0ba457..b812788239b14 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -409,6 +409,8 @@ def _make_sparse_offset_regression( return X, y +# FIXME: 'normalize' to be removed in 1.2 +@pytest.mark.filterwarnings("ignore:'normalize' was deprecated") @pytest.mark.parametrize( 'solver, sparse_X', ((solver, sparse_X) for @@ -452,6 +454,8 @@ def test_solver_consistency( ridge.intercept_, svd_ridge.intercept_, atol=1e-3, rtol=1e-3) +# FIXME: 'normalize' to be removed in 1.2 +@pytest.mark.filterwarnings("ignore:'normalize' was deprecated") @pytest.mark.parametrize('gcv_mode', ['svd', 'eigen']) @pytest.mark.parametrize('X_constructor', [np.asarray, sp.csr_matrix]) @pytest.mark.parametrize('X_shape', [(11, 8), (11, 20)]) @@ -504,12 +508,10 @@ def test_ridge_loo_cv_asym_scoring(): alphas = [1e-3, .1, 1., 10., 1e3] loo_ridge = RidgeCV(cv=n_samples, fit_intercept=True, - alphas=alphas, scoring=scoring, - normalize=True) + alphas=alphas, scoring=scoring) gcv_ridge = RidgeCV(fit_intercept=True, - alphas=alphas, scoring=scoring, - normalize=True) + alphas=alphas, scoring=scoring) loo_ridge.fit(X, y) gcv_ridge.fit(X, y) @@ -658,6 +660,7 @@ def func(x, y): return ret +# FIXME: 'normalize' to be removed in 1.2 def _test_ridge_cv_normalize(filter_): ridge_cv = RidgeCV(normalize=True, cv=3) 
ridge_cv.fit(filter_(10. * X_diabetes), y_diabetes) @@ -871,6 +874,8 @@ def check_dense_sparse(test_func): assert_array_almost_equal(ret_dense, ret_sparse, decimal=3) +# FIXME: 'normalize' to be removed in 1.2 +@pytest.mark.filterwarnings("ignore:'normalize' was deprecated") @pytest.mark.parametrize( 'test_func', (_test_ridge_loo, _test_ridge_cv, _test_ridge_cv_normalize, From 3d7fbda709230f9f733978f8608c64820162baa3 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Wed, 7 Apr 2021 21:13:17 +0200 Subject: [PATCH 296/478] CI Add a check for milestones. (#19833) --- .github/workflows/check-milestone.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .github/workflows/check-milestone.yml diff --git a/.github/workflows/check-milestone.yml b/.github/workflows/check-milestone.yml new file mode 100644 index 0000000000000..6b71a62dd5740 --- /dev/null +++ b/.github/workflows/check-milestone.yml @@ -0,0 +1,21 @@ +name: Check Milestone (when failing needs Triage intervention) +# This check makes sure that the milestone is properly set. +# To bypass this check, label the PR with "Long Term". +on: + pull_request: + types: [opened, edited, labeled, unlabeled, synchronize] + +jobs: + check: + runs-on: ubuntu-latest + if: ${{ contains(github.event.pull_request.labels.*.name, 'Long Term') == 0 }} + steps: + - name: Check the milestone + run: | + set -xe + if [ ${{ github.event.pull_request.milestone.title }} == "" ] + then + echo "No milestone has been set." + exit 1 + fi + From 36c635b77f9744b627248f96f15f3e73e97d3571 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Wed, 7 Apr 2021 23:47:41 +0200 Subject: [PATCH 297/478] CI Fix string comparison in milestone workflow (#19840) --- .github/workflows/check-milestone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-milestone.yml b/.github/workflows/check-milestone.yml index 6b71a62dd5740..8ed3ac4ef0b8d 100644 --- a/.github/workflows/check-milestone.yml +++ b/.github/workflows/check-milestone.yml @@ -13,7 +13,7 @@ jobs: - name: Check the milestone run: | set -xe - if [ ${{ github.event.pull_request.milestone.title }} == "" ] + if [ "${{ github.event.pull_request.milestone.title }}" == "" ] then echo "No milestone has been set." exit 1 From 4b53fc3f67fa6d7966bd51db7c9d754cd187d48f Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 8 Apr 2021 03:34:33 -0400 Subject: [PATCH 298/478] CI Removes check milestone (#19843) --- .github/workflows/check-milestone.yml | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 .github/workflows/check-milestone.yml diff --git a/.github/workflows/check-milestone.yml b/.github/workflows/check-milestone.yml deleted file mode 100644 index 8ed3ac4ef0b8d..0000000000000 --- a/.github/workflows/check-milestone.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: Check Milestone (when failing needs Triage intervention) -# This check makes sure that the milestone is properly set. -# To bypass this check, label the PR with "Long Term". -on: - pull_request: - types: [opened, edited, labeled, unlabeled, synchronize] - -jobs: - check: - runs-on: ubuntu-latest - if: ${{ contains(github.event.pull_request.labels.*.name, 'Long Term') == 0 }} - steps: - - name: Check the milestone - run: | - set -xe - if [ "${{ github.event.pull_request.milestone.title }}" == "" ] - then - echo "No milestone has been set." - exit 1 - fi - From 246795f214ec31874aa1d1e89c90c7007ab60642 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Thu, 8 Apr 2021 10:01:49 -0400 Subject: [PATCH 299/478] TST Fixes test_partial_fit_oneclass (#19814) --- sklearn/linear_model/tests/test_sgd.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index f943592c02005..8465631828613 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -1499,12 +1499,11 @@ def test_partial_fit_oneclass(klass): assert clf.coef_.shape == (X.shape[1], ) assert clf.offset_.shape == (1,) assert clf.predict([[0, 0]]).shape == (1, ) - id1 = id(clf.coef_.data) + previous_coefs = clf.coef_ clf.partial_fit(X[third:]) - id2 = id(clf.coef_.data) # check that coef_ haven't been re-allocated - assert id1 == id2 + assert clf.coef_ is previous_coefs # raises ValueError if number of features does not match previous data with pytest.raises(ValueError): From dff37c4a33ecca991ab72590211384bac260d5c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Sad=C5=82ocha?= Date: Thu, 8 Apr 2021 17:13:49 +0200 Subject: [PATCH 300/478] DOC Fix incorrect 0-1 scaling in the RBM example (#19363) --- examples/neural_networks/plot_rbm_logistic_classification.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/neural_networks/plot_rbm_logistic_classification.py b/examples/neural_networks/plot_rbm_logistic_classification.py index 6994d3850f2f5..f7ad3513499ca 100644 --- a/examples/neural_networks/plot_rbm_logistic_classification.py +++ b/examples/neural_networks/plot_rbm_logistic_classification.py @@ -37,6 +37,7 @@ from sklearn.model_selection import train_test_split from sklearn.neural_network import BernoulliRBM from sklearn.pipeline import Pipeline +from sklearn.preprocessing import minmax_scale from sklearn.base import clone @@ -79,7 +80,7 @@ def shift(x, w): X, y = datasets.load_digits(return_X_y=True) X = np.asarray(X, 'float32') X, Y = nudge_dataset(X, y) -X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001) # 0-1 scaling +X = minmax_scale(X, feature_range=(0, 1)) # 0-1 scaling X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.2, random_state=0) From 1ce17151bcd9bafadd94524ce3acd52c4b665696 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Thu, 8 Apr 2021 17:31:18 +0200 Subject: [PATCH 301/478] TST Add a test for meta-estimators with non tabular data (#19755) Co-authored-by: Olivier Grisel --- sklearn/tests/test_metaestimators.py | 119 ++++++++++++++++++++++++++- 1 file changed, 117 insertions(+), 2 deletions(-) diff --git a/sklearn/tests/test_metaestimators.py b/sklearn/tests/test_metaestimators.py index 2caa01d71c444..ad716c3e4cd2f 100644 --- a/sklearn/tests/test_metaestimators.py +++ b/sklearn/tests/test_metaestimators.py @@ -1,19 +1,26 @@ """Common tests for metaestimators""" import functools +from inspect import signature import numpy as np import pytest from sklearn.base import BaseEstimator +from sklearn.base import is_regressor from sklearn.datasets import make_classification - +from sklearn.utils import all_estimators +from sklearn.utils.estimator_checks import _enforce_estimator_tags_x +from sklearn.utils.estimator_checks import _enforce_estimator_tags_y from sklearn.utils.validation import check_is_fitted -from sklearn.pipeline import Pipeline +from sklearn.utils._testing import set_random_state +from sklearn.pipeline import Pipeline, make_pipeline from sklearn.model_selection import GridSearchCV, 
RandomizedSearchCV +from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_selection import RFE, RFECV from sklearn.ensemble import BaggingClassifier from sklearn.exceptions import NotFittedError from sklearn.semi_supervised import SelfTrainingClassifier +from sklearn.linear_model import Ridge, LogisticRegression class DelegatorData: @@ -151,3 +158,111 @@ def score(self, X, y, *args, **kwargs): assert not hasattr(delegator, method), ( "%s has method %r when its delegate does not" % (delegator_data.name, method)) + + +def _generate_meta_estimator_instances_with_pipeline(): + """Generate instances of meta-estimators fed with a pipeline + + Are considered meta-estimators all estimators accepting one of "estimator", + "base_estimator" or "estimators". + """ + for _, Estimator in sorted(all_estimators()): + sig = set(signature(Estimator).parameters) + + if "estimator" in sig or "base_estimator" in sig: + if is_regressor(Estimator): + estimator = make_pipeline(TfidfVectorizer(), Ridge()) + param_grid = {"ridge__alpha": [0.1, 1.0]} + else: + estimator = make_pipeline(TfidfVectorizer(), + LogisticRegression()) + param_grid = {"logisticregression__C": [0.1, 1.0]} + + if "param_grid" in sig or "param_distributions" in sig: + # SearchCV estimators + extra_params = {"n_iter": 2} if "n_iter" in sig else {} + yield Estimator(estimator, param_grid, **extra_params) + else: + yield Estimator(estimator) + + elif "estimators" in sig: + # stacking, voting + if is_regressor(Estimator): + estimator = [ + ("est1", make_pipeline(TfidfVectorizer(), + Ridge(alpha=0.1))), + ("est2", make_pipeline(TfidfVectorizer(), + Ridge(alpha=1))), + ] + else: + estimator = [ + ("est1", make_pipeline(TfidfVectorizer(), + LogisticRegression(C=0.1))), + ("est2", make_pipeline(TfidfVectorizer(), + LogisticRegression(C=1))), + ] + yield Estimator(estimator) + + else: + continue + + +# TODO: remove data validation for the following estimators +# They should be able to work on any data and delegate data validation to +# their inner estimator(s). +DATA_VALIDATION_META_ESTIMATORS_TO_IGNORE = [ + "AdaBoostClassifier", + "AdaBoostRegressor", + "BaggingClassifier", + "BaggingRegressor", + "ClassifierChain", + "IterativeImputer", + "MultiOutputClassifier", + "MultiOutputRegressor", + "OneVsOneClassifier", + "OutputCodeClassifier", + "RANSACRegressor", + "RFE", + "RFECV", + "RegressorChain", + "SelfTrainingClassifier", + "SequentialFeatureSelector" # not applicable (2D data mandatory) +] + +DATA_VALIDATION_META_ESTIMATORS = [ + est for est in _generate_meta_estimator_instances_with_pipeline() if + est.__class__.__name__ not in DATA_VALIDATION_META_ESTIMATORS_TO_IGNORE +] + + +def _get_meta_estimator_id(estimator): + return estimator.__class__.__name__ + + +@pytest.mark.parametrize( + "estimator", DATA_VALIDATION_META_ESTIMATORS, ids=_get_meta_estimator_id +) +def test_meta_estimators_delegate_data_validation(estimator): + # Check that meta-estimators delegate data validation to the inner + # estimator(s). 
+ rng = np.random.RandomState(0) + set_random_state(estimator) + + n_samples = 30 + X = rng.choice(np.array(["aa", "bb", "cc"], dtype=object), size=n_samples) + + if is_regressor(estimator): + y = rng.normal(size=n_samples) + else: + y = rng.randint(3, size=n_samples) + + X = _enforce_estimator_tags_x(estimator, X) + y = _enforce_estimator_tags_y(estimator, y) + + # Calling fit should not raise any data validation exception since X is a + # valid input datastructure for the first step of the pipeline passed as + # base estimator to the meta estimator. + estimator.fit(X, y) + + # n_features_in_ should not be defined since data is not tabular data. + assert not hasattr(estimator, "n_features_in_") From ee524f455dbf0285f7b121a08f1e9613a518abcf Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 8 Apr 2021 20:27:37 +0200 Subject: [PATCH 302/478] ENH Improve the creation of KDTree and BallTree on their worst-case time complexity (#19473) Co-authored-by: jiefangxuanyan <505745416@qq.com> Co-authored-by: "Thomas J. Fan" --- doc/whats_new/v1.0.rst | 10 ++ sklearn/neighbors/_binary_tree.pxi | 69 +------------- sklearn/neighbors/_partition_nodes.pxd | 9 ++ sklearn/neighbors/_partition_nodes.pyx | 122 +++++++++++++++++++++++++ sklearn/neighbors/setup.py | 6 ++ 5 files changed, 149 insertions(+), 67 deletions(-) create mode 100644 sklearn/neighbors/_partition_nodes.pxd create mode 100644 sklearn/neighbors/_partition_nodes.pyx diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 96bb2ddfa8f7d..ce7da3139d140 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -278,6 +278,16 @@ Changelog Use ``var_`` instead. :pr:`18842` by :user:`Hong Shao Yang `. + +:mod:`sklearn.neighbors` +.......................... + +- |Enhancement| The creation of :class:`neighbors.KDTree` and + :class:`neighbors.BallTree` has been improved for their worst-cases time + complexity from :math:`\mathcal{O}(n^2)` to :math:`\mathcal{O}(n)`. + :pr:`19473` by :user:`jiefangxuanyan ` and + :user:`Julien Jerphanion `. + :mod:`sklearn.pipeline` ....................... diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index 1acff082c7d76..cabad951c4975 100755 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -159,6 +159,8 @@ from ._typedefs import DTYPE, ITYPE from ._dist_metrics cimport (DistanceMetric, euclidean_dist, euclidean_rdist, euclidean_dist_to_rdist, euclidean_rdist_to_dist) +from ._partition_nodes cimport partition_node_indices + cdef extern from "numpy/arrayobject.h": void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) @@ -776,73 +778,6 @@ cdef ITYPE_t find_node_split_dim(DTYPE_t* data, return j_max -cdef int partition_node_indices(DTYPE_t* data, - ITYPE_t* node_indices, - ITYPE_t split_dim, - ITYPE_t split_index, - ITYPE_t n_features, - ITYPE_t n_points) except -1: - """Partition points in the node into two equal-sized groups. - - Upon return, the values in node_indices will be rearranged such that - (assuming numpy-style indexing): - - data[node_indices[0:split_index], split_dim] - <= data[node_indices[split_index], split_dim] - - and - - data[node_indices[split_index], split_dim] - <= data[node_indices[split_index:n_points], split_dim] - - The algorithm is essentially a partial in-place quicksort around a - set pivot. - - Parameters - ---------- - data : double pointer - Pointer to a 2D array of the training data, of shape [N, n_features]. - N must be greater than any of the values in node_indices. 
- node_indices : int pointer - Pointer to a 1D array of length n_points. This lists the indices of - each of the points within the current node. This will be modified - in-place. - split_dim : int - the dimension on which to split. This will usually be computed via - the routine ``find_node_split_dim`` - split_index : int - the index within node_indices around which to split the points. - - Returns - ------- - status : int - integer exit status. On return, the contents of node_indices are - modified as noted above. - """ - cdef ITYPE_t left, right, midindex, i - cdef DTYPE_t d1, d2 - left = 0 - right = n_points - 1 - - while True: - midindex = left - for i in range(left, right): - d1 = data[node_indices[i] * n_features + split_dim] - d2 = data[node_indices[right] * n_features + split_dim] - if d1 < d2: - swap(node_indices, i, midindex) - midindex += 1 - swap(node_indices, midindex, right) - if midindex == split_index: - break - elif midindex < split_index: - left = midindex + 1 - else: - right = midindex - 1 - - return 0 - - ###################################################################### # NodeHeap : min-heap used to keep track of nodes during # breadth-first query diff --git a/sklearn/neighbors/_partition_nodes.pxd b/sklearn/neighbors/_partition_nodes.pxd new file mode 100644 index 0000000000000..522e826632824 --- /dev/null +++ b/sklearn/neighbors/_partition_nodes.pxd @@ -0,0 +1,9 @@ +from ._typedefs cimport DTYPE_t, ITYPE_t + +cdef int partition_node_indices( + DTYPE_t *data, + ITYPE_t *node_indices, + ITYPE_t split_dim, + ITYPE_t split_index, + ITYPE_t n_features, + ITYPE_t n_points) except -1 diff --git a/sklearn/neighbors/_partition_nodes.pyx b/sklearn/neighbors/_partition_nodes.pyx new file mode 100644 index 0000000000000..508e9560ae8c2 --- /dev/null +++ b/sklearn/neighbors/_partition_nodes.pyx @@ -0,0 +1,122 @@ +# distutils : language = c++ + +# BinaryTrees rely on partial sorts to partition their nodes during their +# initialisation. +# +# The C++ std library exposes nth_element, an efficient partial sort for this +# situation which has a linear time complexity as well as the best performances. +# +# To use std::algorithm::nth_element, a few fixture are defined using Cython: +# - partition_node_indices, a Cython function used in BinaryTrees, that calls +# - partition_node_indices_inner, a C++ function that wraps nth_element and uses +# - an IndexComparator to state how to compare KDTrees' indices +# +# IndexComparator has been defined so that partial sorts are stable with +# respect to the nodes initial indices. +# +# See for reference: +# - https://en.cppreference.com/w/cpp/algorithm/nth_element. +# - https://github.com/scikit-learn/scikit-learn/pull/11103 +# - https://github.com/scikit-learn/scikit-learn/pull/19473 + +cdef extern from *: + """ + #include + + template + class IndexComparator { + private: + const D *data; + I split_dim, n_features; + public: + IndexComparator(const D *data, const I &split_dim, const I &n_features): + data(data), split_dim(split_dim), n_features(n_features) {} + + bool operator()(const I &a, const I &b) const { + D a_value = data[a * n_features + split_dim]; + D b_value = data[b * n_features + split_dim]; + return a_value == b_value ? 
a < b : a_value < b_value; + } + }; + + template + void partition_node_indices_inner( + const D *data, + I *node_indices, + const I &split_dim, + const I &split_index, + const I &n_features, + const I &n_points) { + IndexComparator index_comparator(data, split_dim, n_features); + std::nth_element( + node_indices, + node_indices + split_index, + node_indices + n_points, + index_comparator); + } + """ + void partition_node_indices_inner[D, I]( + D *data, + I *node_indices, + I split_dim, + I split_index, + I n_features, + I n_points) except + + + +cdef int partition_node_indices( + DTYPE_t *data, + ITYPE_t *node_indices, + ITYPE_t split_dim, + ITYPE_t split_index, + ITYPE_t n_features, + ITYPE_t n_points) except -1: + """Partition points in the node into two equal-sized groups. + + Upon return, the values in node_indices will be rearranged such that + (assuming numpy-style indexing): + + data[node_indices[0:split_index], split_dim] + <= data[node_indices[split_index], split_dim] + + and + + data[node_indices[split_index], split_dim] + <= data[node_indices[split_index:n_points], split_dim] + + The algorithm is essentially a partial in-place quicksort around a + set pivot. + + Parameters + ---------- + data : double pointer + Pointer to a 2D array of the training data, of shape [N, n_features]. + N must be greater than any of the values in node_indices. + node_indices : int pointer + Pointer to a 1D array of length n_points. This lists the indices of + each of the points within the current node. This will be modified + in-place. + split_dim : int + the dimension on which to split. This will usually be computed via + the routine ``find_node_split_dim``. + split_index : int + the index within node_indices around which to split the points. + n_features: int + the number of features (i.e columns) in the 2D array pointed by data. + n_points : int + the length of node_indices. This is also the number of points in + the original dataset. + Returns + ------- + status : int + integer exit status. On return, the contents of node_indices are + modified as noted above. 
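For intuition, the invariant documented above can be reproduced in plain
NumPy, with np.argpartition standing in for std::nth_element on a single
split dimension (an illustrative sketch only, not part of the Cython code;
the array shapes and the split point are arbitrary):

import numpy as np

rng = np.random.RandomState(0)
data = rng.random_sample((30, 2))
node_indices = np.arange(30)
split_dim, split_index = 1, 15

# argpartition places the split_index-th value in its sorted position;
# everything before it is <= the pivot, everything after is >= the pivot.
order = np.argpartition(data[node_indices, split_dim], split_index)
node_indices = node_indices[order]
pivot = data[node_indices[split_index], split_dim]
assert np.all(data[node_indices[:split_index], split_dim] <= pivot)
assert np.all(data[node_indices[split_index:], split_dim] >= pivot)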
+ """ + partition_node_indices_inner( + data, + node_indices, + split_dim, + split_index, + n_features, + n_points) + return 0 diff --git a/sklearn/neighbors/setup.py b/sklearn/neighbors/setup.py index 9264044678193..996b855d2d45a 100644 --- a/sklearn/neighbors/setup.py +++ b/sklearn/neighbors/setup.py @@ -20,6 +20,12 @@ def configuration(parent_package='', top_path=None): include_dirs=[numpy.get_include()], libraries=libraries) + config.add_extension('_partition_nodes', + sources=['_partition_nodes.pyx'], + include_dirs=[numpy.get_include()], + language="c++", + libraries=libraries) + config.add_extension('_dist_metrics', sources=['_dist_metrics.pyx'], include_dirs=[numpy.get_include(), From 132627e28b5be807b1e4b7d58bedf42b529d7800 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 8 Apr 2021 23:21:20 +0200 Subject: [PATCH 303/478] FIX Let ColumnTransformer.get_feature_names handle transformers with non-string feature names (#18459) Co-authored-by: Alonso Silva Allende Co-authored-by: Roman Yurchak --- doc/whats_new/v1.0.rst | 5 ++++ sklearn/compose/_column_transformer.py | 2 +- .../compose/tests/test_column_transformer.py | 28 ++++++++++++------- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index ce7da3139d140..602d4b1246878 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -105,6 +105,11 @@ Changelog of each transformer in `output_indices_`. :pr:`18393` by :user:`Luca Bittarello `. +- |FIX| :meth:`compose.ColumnTransformer.get_feature_names` supports + non-string feature names returned by any of its transformers. + :pr:`18459` by :user:`Albert Villanova del Moral ` and + :user:`Alonso Silva Allende `. + :mod:`sklearn.datasets` ....................... diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 5006663331a40..2f2da882652c0 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -376,7 +376,7 @@ def get_feature_names(self): raise AttributeError("Transformer %s (type %s) does not " "provide get_feature_names." 
% (str(name), type(trans).__name__)) - feature_names.extend([name + "__" + f for f in + feature_names.extend([f"{name}__{f}" for f in trans.get_feature_names()]) return feature_names diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index f7c1874d4a1b7..549292ab51445 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -748,7 +748,7 @@ def test_column_transformer_cloning(): assert hasattr(ct.transformers_[0][1], 'mean_') -def test_column_transformer_get_feature_names(): +def test_column_transformer_get_feature_names_raises(): X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T ct = ColumnTransformer([('trans', Trans(), [0, 1])]) # raise correct error when not fitted @@ -756,23 +756,30 @@ def test_column_transformer_get_feature_names(): ct.get_feature_names() # raise correct error when no feature names are available ct.fit(X_array) - assert_raise_message(AttributeError, - "Transformer trans (type Trans) does not provide " - "get_feature_names", ct.get_feature_names) + msg = r"Transformer trans \(type Trans\) does not provide " \ + r"get_feature_names" + with pytest.raises(AttributeError, match=msg): + ct.get_feature_names() - # working example - X = np.array([[{'a': 1, 'b': 2}, {'a': 3, 'b': 4}], - [{'c': 5}, {'c': 6}]], dtype=object).T + +@pytest.mark.parametrize("X, keys", [ + (np.array([[{'a': 1, 'b': 2}, {'a': 3, 'b': 4}], + [{'c': 5}, {'c': 6}]], dtype=object).T, ('a', 'b', 'c')), + (np.array([[{1: 1, 2: 2}, {1: 3, 2: 4}], + [{3: 5}, {3: 6}]], dtype=object).T, ('1', '2', '3')), +]) +def test_column_transformer_get_feature_names(X, keys): ct = ColumnTransformer( [('col' + str(i), DictVectorizer(), i) for i in range(2)]) ct.fit(X) - assert ct.get_feature_names() == ['col0__a', 'col0__b', 'col1__c'] + assert ct.get_feature_names() == [f'col0__{key}' for key in keys[:2]] + \ + [f'col1__{keys[2]}'] # drop transformer ct = ColumnTransformer( [('col0', DictVectorizer(), 0), ('col1', 'drop', 1)]) ct.fit(X) - assert ct.get_feature_names() == ['col0__a', 'col0__b'] + assert ct.get_feature_names() == [f'col0__{key}' for key in keys[:2]] # passthrough transformer ct = ColumnTransformer([('trans', 'passthrough', [0, 1])]) @@ -782,7 +789,8 @@ def test_column_transformer_get_feature_names(): ct = ColumnTransformer([('trans', DictVectorizer(), 0)], remainder='passthrough') ct.fit(X) - assert ct.get_feature_names() == ['trans__a', 'trans__b', 'x1'] + assert ct.get_feature_names() == [f'trans__{key}' for key in keys[:2]] + \ + ['x1'] ct = ColumnTransformer([('trans', 'passthrough', [1])], remainder='passthrough') From 1f91b873e420fcfb5f1d84b821d27ab54bd76144 Mon Sep 17 00:00:00 2001 From: qdeffense Date: Fri, 9 Apr 2021 00:17:49 +0200 Subject: [PATCH 304/478] TST Remove redundant max iter in sklearn/linear_model/tests (#14622) Co-authored-by: Thomas J. 
Fan --- sklearn/linear_model/tests/test_coordinate_descent.py | 10 +++++----- sklearn/linear_model/tests/test_huber.py | 5 ++--- sklearn/linear_model/tests/test_logistic.py | 10 +++++----- sklearn/linear_model/tests/test_passive_aggressive.py | 6 +++--- .../tests/test_sparse_coordinate_descent.py | 4 ++-- 5 files changed, 17 insertions(+), 18 deletions(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index 8a269f28ebd62..830cf32139b08 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -668,11 +668,11 @@ def test_lasso_positive_constraint(): X = [[-1], [0], [1]] y = [1, 0, -1] # just a straight line with negative slope - lasso = Lasso(alpha=0.1, max_iter=1000, positive=True) + lasso = Lasso(alpha=0.1, positive=True) lasso.fit(X, y) assert min(lasso.coef_) >= 0 - lasso = Lasso(alpha=0.1, max_iter=1000, precompute=True, positive=True) + lasso = Lasso(alpha=0.1, precompute=True, positive=True) lasso.fit(X, y) assert min(lasso.coef_) >= 0 @@ -681,7 +681,7 @@ def test_enet_positive_constraint(): X = [[-1], [0], [1]] y = [1, 0, -1] # just a straight line with negative slope - enet = ElasticNet(alpha=0.1, max_iter=1000, positive=True) + enet = ElasticNet(alpha=0.1, positive=True) enet.fit(X, y) assert min(enet.coef_) >= 0 @@ -1255,7 +1255,7 @@ def test_convergence_warnings(): # check that the model converges w/o warnings with pytest.warns(None) as record: - MultiTaskElasticNet(max_iter=1000).fit(X, y) + MultiTaskElasticNet().fit(X, y) assert not record.list @@ -1269,7 +1269,7 @@ def test_sparse_input_convergence_warning(): # check that the model converges w/o warnings with pytest.warns(None) as record: - Lasso(max_iter=1000).fit(sparse.csr_matrix(X, dtype=np.float32), y) + Lasso().fit(sparse.csr_matrix(X, dtype=np.float32), y) assert not record.list diff --git a/sklearn/linear_model/tests/test_huber.py b/sklearn/linear_model/tests/test_huber.py index 156cd4b57dbc8..7aa69e68f5136 100644 --- a/sklearn/linear_model/tests/test_huber.py +++ b/sklearn/linear_model/tests/test_huber.py @@ -128,7 +128,7 @@ def test_huber_sparse(): def test_huber_scaling_invariant(): # Test that outliers filtering is scaling independent. X, y = make_regression_with_outliers() - huber = HuberRegressor(fit_intercept=False, alpha=0.0, max_iter=100) + huber = HuberRegressor(fit_intercept=False, alpha=0.0) huber.fit(X, y) n_outliers_mask_1 = huber.outliers_ assert not np.all(n_outliers_mask_1) @@ -149,8 +149,7 @@ def test_huber_and_sgd_same_results(): # Fit once to find out the scale parameter. 
Scale down X and y by scale # so that the scale parameter is optimized to 1.0 - huber = HuberRegressor(fit_intercept=False, alpha=0.0, max_iter=100, - epsilon=1.35) + huber = HuberRegressor(fit_intercept=False, alpha=0.0, epsilon=1.35) huber.fit(X, y) X_scale = X / huber.scale_ y_scale = y / huber.scale_ diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index bdc9a4a24914b..5ec4a434f857a 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -1234,14 +1234,14 @@ def test_n_iter(solver): n_classes = 1 if solver == 'liblinear' else np.unique(y).shape[0] clf = LogisticRegression(tol=1e-2, multi_class='ovr', solver=solver, C=1., - random_state=42, max_iter=100) + random_state=42) clf.fit(X, y) assert clf.n_iter_.shape == (n_classes,) n_classes = np.unique(y).shape[0] clf = LogisticRegressionCV(tol=1e-2, multi_class='ovr', solver=solver, Cs=n_Cs, cv=n_cv_fold, - random_state=42, max_iter=100) + random_state=42) clf.fit(X, y) assert clf.n_iter_.shape == (n_classes, n_cv_fold, n_Cs) clf.fit(X, y_bin) @@ -1254,13 +1254,13 @@ def test_n_iter(solver): clf = LogisticRegression(tol=1e-2, multi_class='multinomial', solver=solver, C=1., - random_state=42, max_iter=100) + random_state=42) clf.fit(X, y) assert clf.n_iter_.shape == (n_classes,) clf = LogisticRegressionCV(tol=1e-2, multi_class='multinomial', solver=solver, Cs=n_Cs, cv=n_cv_fold, - random_state=42, max_iter=100) + random_state=42) clf.fit(X, y) assert clf.n_iter_.shape == (n_classes, n_cv_fold, n_Cs) clf.fit(X, y_bin) @@ -1280,7 +1280,7 @@ def test_warm_start(solver, warm_start, fit_intercept, multi_class): clf = LogisticRegression(tol=1e-4, multi_class=multi_class, warm_start=warm_start, solver=solver, - random_state=42, max_iter=100, + random_state=42, fit_intercept=fit_intercept) with ignore_warnings(category=ConvergenceWarning): clf.fit(X, y) diff --git a/sklearn/linear_model/tests/test_passive_aggressive.py b/sklearn/linear_model/tests/test_passive_aggressive.py index f2403773277a7..d0d099eeacc8d 100644 --- a/sklearn/linear_model/tests/test_passive_aggressive.py +++ b/sklearn/linear_model/tests/test_passive_aggressive.py @@ -165,16 +165,16 @@ def test_equal_class_weight(): X2 = [[1, 0], [1, 0], [0, 1], [0, 1]] y2 = [0, 0, 1, 1] clf = PassiveAggressiveClassifier( - C=0.1, max_iter=1000, tol=None, class_weight=None) + C=0.1, tol=None, class_weight=None) clf.fit(X2, y2) # Already balanced, so "balanced" weights should have no effect clf_balanced = PassiveAggressiveClassifier( - C=0.1, max_iter=1000, tol=None, class_weight="balanced") + C=0.1, tol=None, class_weight="balanced") clf_balanced.fit(X2, y2) clf_weighted = PassiveAggressiveClassifier( - C=0.1, max_iter=1000, tol=None, class_weight={0: 0.5, 1: 0.5}) + C=0.1, tol=None, class_weight={0: 0.5, 1: 0.5}) clf_weighted.fit(X2, y2) # should be similar up to some epsilon due to learning rate schedule diff --git a/sklearn/linear_model/tests/test_sparse_coordinate_descent.py b/sklearn/linear_model/tests/test_sparse_coordinate_descent.py index 23b57a699a655..c4364cc31a80d 100644 --- a/sklearn/linear_model/tests/test_sparse_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_sparse_coordinate_descent.py @@ -63,7 +63,7 @@ def test_enet_toy_list_input(): assert_array_almost_equal(pred, [2, 3, 4]) assert_almost_equal(clf.dual_gap_, 0) - clf = ElasticNet(alpha=0.5, l1_ratio=0.3, max_iter=1000) + clf = ElasticNet(alpha=0.5, l1_ratio=0.3) clf.fit(X, Y) pred = clf.predict(T) 
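Stepping back to the ColumnTransformer.get_feature_names fix earlier in this
series (PATCH 303), a minimal sketch that mirrors its new test, assuming a
build that includes the fix (the transformer names and integer keys are
arbitrary):

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer

# DictVectorizer reports its feature names as the (integer) dict keys.
X = np.array([[{1: 1, 2: 2}, {1: 3, 2: 4}],
              [{3: 5}, {3: 6}]], dtype=object).T
ct = ColumnTransformer([("col0", DictVectorizer(), 0),
                        ("col1", DictVectorizer(), 1)]).fit(X)
print(ct.get_feature_names())  # ['col0__1', 'col0__2', 'col1__3']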
assert_array_almost_equal(clf.coef_, [0.50819], decimal=3) @@ -102,7 +102,7 @@ def test_enet_toy_explicit_sparse_input(): assert_array_almost_equal(pred, [2, 3, 4]) assert_almost_equal(clf.dual_gap_, 0) - clf = ElasticNet(alpha=0.5, l1_ratio=0.3, max_iter=1000) + clf = ElasticNet(alpha=0.5, l1_ratio=0.3) clf.fit(X, Y) pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [0.50819], decimal=3) From d88ffabb6fe3152902c213133eb2bdd0a3c9ab86 Mon Sep 17 00:00:00 2001 From: Shooter23 <44271378+Shooter23@users.noreply.github.com> Date: Thu, 8 Apr 2021 20:54:51 -0400 Subject: [PATCH 305/478] DOC Update attribute docstrings in _multilayer_perceptron.py (#19595) Co-authored-by: Thomas J. Fan --- sklearn/neural_network/_multilayer_perceptron.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index 52c94a7129b9f..04822360791e7 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -884,7 +884,7 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): layer i + 1. n_iter_ : int - The number of iterations the solver has ran. + The number of iterations the solver has run. n_layers_ : int Number of layers. @@ -1292,10 +1292,13 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): The minimum loss reached by the solver throughout fitting. loss_curve_ : list of shape (`n_iter_`,) + Loss value evaluated at the end of each training step. The ith element in the list represents the loss at the ith iteration. t_ : int The number of training samples seen by the solver during fitting. + Mathematically equals `n_iters * X.shape[0]`, it means + `time_step` and it is used by optimizer's learning rate scheduler. coefs_ : list of shape (n_layers - 1,) The ith element in the list represents the weight matrix corresponding @@ -1306,7 +1309,7 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): layer i + 1. n_iter_ : int - The number of iterations the solver has ran. + The number of iterations the solver has run. n_layers_ : int Number of layers. @@ -1317,13 +1320,6 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): out_activation_ : str Name of the output activation function. - loss_curve_ : list of shape (n_iters,) - Loss value evaluated at the end of each training step. - - t_ : int - Mathematically equals `n_iters * X.shape[0]`, it means - `time_step` and it is used by optimizer's learning rate scheduler. - Examples -------- >>> from sklearn.neural_network import MLPRegressor From a80b99ca04a6e8df9fb838bb195432654b592263 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 9 Apr 2021 10:31:53 +0200 Subject: [PATCH 306/478] DOC Fix versionchanged/versionadded in OneHotEncoder (#16562) Co-authored-by: Thomas J. Fan Co-authored-by: Guillaume Lemaitre --- sklearn/preprocessing/_encoders.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 4344e010bba1a..d3f557d2993cb 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -193,8 +193,6 @@ class OneHotEncoder(_BaseEncoder): Read more in the :ref:`User Guide `. - .. 
versionchanged:: 0.20 - Parameters ---------- categories : 'auto' or a list of array-like, default='auto' @@ -230,8 +228,11 @@ class OneHotEncoder(_BaseEncoder): - array : ``drop[i]`` is the category in feature ``X[:, i]`` that should be dropped. + .. versionadded:: 0.21 + The parameter `drop` was added in 0.21. + .. versionchanged:: 0.23 - Added option 'if_binary'. + The option `drop='if_binary'` was added in 0.23. sparse : bool, default=True Will return sparse matrix if set True else will return an array. From 734ae1f2dfb320ea824478860dda1f4aa5736d05 Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Fri, 9 Apr 2021 09:37:15 +0100 Subject: [PATCH 307/478] DOC add explicit message regarding shuffling in default CV (#19776) Co-authored-by: Alihan Zihna --- sklearn/ensemble/_stacking.py | 4 ++++ sklearn/feature_selection/_sequential.py | 3 ++- sklearn/model_selection/_search.py | 6 ++++-- .../_search_successive_halving.py | 6 ++++-- sklearn/model_selection/_validation.py | 18 ++++++++++++------ 5 files changed, 26 insertions(+), 11 deletions(-) diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 303015cc9f751..09a460f7519d5 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -297,6 +297,8 @@ class StackingClassifier(ClassifierMixin, _BaseStacking): either binary or multiclass, :class:`~sklearn.model_selection.StratifiedKFold` is used. In all other cases, :class:`~sklearn.model_selection.KFold` is used. + These splitters are instantiated with `shuffle=False` so the splits + will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. @@ -570,6 +572,8 @@ class StackingRegressor(RegressorMixin, _BaseStacking): either binary or multiclass, :class:`~sklearn.model_selection.StratifiedKFold` is used. In all other cases, :class:`~sklearn.model_selection.KFold` is used. + These splitters are instantiated with `shuffle=False` so the splits + will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. diff --git a/sklearn/feature_selection/_sequential.py b/sklearn/feature_selection/_sequential.py index 7ee6b043a0df1..8e831b53e4983 100644 --- a/sklearn/feature_selection/_sequential.py +++ b/sklearn/feature_selection/_sequential.py @@ -60,7 +60,8 @@ class SequentialFeatureSelector(SelectorMixin, MetaEstimatorMixin, For integer/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index abe3b87488d8c..ebd085c08e68f 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -1065,7 +1065,8 @@ class GridSearchCV(BaseSearchCV): For integer/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. 
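A minimal sketch of the behaviour this added sentence documents (assuming
scikit-learn is importable; the estimator and grid are arbitrary): with a
classifier, an integer cv resolves to an unshuffled StratifiedKFold, so an
explicit splitter with default settings reproduces exactly the same folds.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold

X, y = make_classification(n_samples=100, random_state=0)
grid = {"C": [0.1, 1.0]}
est = LogisticRegression(max_iter=1000)

# Both searches see identical folds because the splits are not shuffled.
search_int = GridSearchCV(est, grid, cv=5).fit(X, y)
search_skf = GridSearchCV(est, grid, cv=StratifiedKFold(n_splits=5)).fit(X, y)
assert search_int.best_score_ == search_skf.best_score_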
@@ -1407,7 +1408,8 @@ class RandomizedSearchCV(BaseSearchCV): For integer/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index b522ce7fbda41..f4396920c1677 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -425,7 +425,8 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): For integer/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. @@ -712,7 +713,8 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): For integer/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 5f5338512a0f2..9765303a30b8d 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -94,7 +94,8 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, For int/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. + other cases, :class:`.Fold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. @@ -364,7 +365,8 @@ def cross_val_score(estimator, X, y=None, *, groups=None, scoring=None, For int/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. @@ -766,7 +768,8 @@ def cross_val_predict(estimator, X, y=None, *, groups=None, cv=None, For int/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. 
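For completeness, a short usage sketch of the halving searches whose cv
documentation is updated in the same way (illustrative only; the experimental
enable import matches the one used in the tests later in this series, and the
parameter grid is arbitrary):

from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=400, random_state=0)
# An integer cv here is likewise an unshuffled (Stratified)KFold.
search = HalvingGridSearchCV(LinearSVC(dual=False), {"C": [0.1, 1.0, 10.0]},
                             cv=5, factor=2, random_state=0).fit(X, y)
print(search.best_params_)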
@@ -1115,7 +1118,8 @@ def permutation_test_score(estimator, X, y, *, groups=None, cv=None, For int/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. @@ -1279,7 +1283,8 @@ def learning_curve(estimator, X, y, *, groups=None, For int/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. @@ -1579,7 +1584,8 @@ def validation_curve(estimator, X, y, *, param_name, param_range, groups=None, For int/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. From b8903dacee48a82512619a8a6ed0bf706c1ab909 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 9 Apr 2021 05:32:41 -0400 Subject: [PATCH 308/478] ENH Adds final_estimator in html repr for Stacking* (#19564) --- doc/whats_new/v1.0.rst | 4 ++++ sklearn/ensemble/_stacking.py | 11 ++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 602d4b1246878..1245193d76d89 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -154,6 +154,10 @@ Changelog experimental. They are now considered stable and are subject to the same deprecation cycles as all other estimators. :pr:`19799` by `Nicolas Hug`_. +- |Enhancement| Improve the HTML rendering of the + :class:`ensemble.StackingClassifier` and :class:`ensemble.StackingRegressor`. + :pr:`19564` by `Thomas Fan`_. + :mod:`sklearn.feature_extraction` ................................. 
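As a usage-level sketch of the rendering entry above (illustrative; the exact
markup is an implementation detail, but the label added by this patch should
appear in the generated HTML):

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import estimator_html_repr

clf = StackingClassifier(
    estimators=[("tree", DecisionTreeClassifier())],
    final_estimator=LogisticRegression(),
)
# With this change the final estimator gets its own labelled block.
html = estimator_html_repr(clf)
assert "final_estimator" in html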
diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 09a460f7519d5..3522b381389d3 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -250,9 +250,14 @@ def _sk_visual_block_(self, final_estimator): names, estimators = zip(*self.estimators) parallel = _VisualBlock('parallel', estimators, names=names, dash_wrapped=False) - serial = _VisualBlock('serial', (parallel, final_estimator), - dash_wrapped=False) - return _VisualBlock('serial', [serial]) + + # final estimator is wrapped in a parallel block to show the label: + # 'final_estimator' in the html repr + final_block = _VisualBlock('parallel', [final_estimator], + names=['final_estimator'], + dash_wrapped=False) + return _VisualBlock('serial', (parallel, final_block), + dash_wrapped=False) class StackingClassifier(ClassifierMixin, _BaseStacking): From a44653fb3438c80955e647c9d634c231de28a8c4 Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Fri, 9 Apr 2021 10:40:37 +0100 Subject: [PATCH 309/478] TST Changes assert to pytest style in test_config.py and test_kernel_approximation.py (#19845) --- sklearn/tests/test_config.py | 12 ++++++++---- sklearn/tests/test_kernel_approximation.py | 19 +++++++++++++++---- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/sklearn/tests/test_config.py b/sklearn/tests/test_config.py index eec349861258c..22ec862ef24a3 100644 --- a/sklearn/tests/test_config.py +++ b/sklearn/tests/test_config.py @@ -1,5 +1,5 @@ +import pytest from sklearn import get_config, set_config, config_context -from sklearn.utils._testing import assert_raises def test_config_context(): @@ -43,9 +43,12 @@ def test_config_context(): 'display': 'text'} # No positional arguments - assert_raises(TypeError, config_context, True) + with pytest.raises(TypeError): + config_context(True) + # No unknown arguments - assert_raises(TypeError, config_context(do_something_else=True).__enter__) + with pytest.raises(TypeError): + config_context(do_something_else=True).__enter__() def test_config_context_exception(): @@ -71,4 +74,5 @@ def test_set_config(): assert get_config()['assume_finite'] is False # No unknown arguments - assert_raises(TypeError, set_config, do_something_else=True) + with pytest.raises(TypeError): + set_config(do_something_else=True) diff --git a/sklearn/tests/test_kernel_approximation.py b/sklearn/tests/test_kernel_approximation.py index 0cee04f9f2d0a..cfd9c9671fc4d 100644 --- a/sklearn/tests/test_kernel_approximation.py +++ b/sklearn/tests/test_kernel_approximation.py @@ -1,9 +1,11 @@ +import re + import numpy as np from scipy.sparse import csr_matrix import pytest from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_array_almost_equal, assert_raises +from sklearn.utils._testing import assert_array_almost_equal from sklearn.metrics.pairwise import kernel_metrics from sklearn.kernel_approximation import RBFSampler @@ -90,11 +92,18 @@ def test_additive_chi2_sampler(): # test error is raised on negative input Y_neg = Y.copy() Y_neg[0, 0] = -1 - assert_raises(ValueError, transform.transform, Y_neg) + msg = 'Negative values in data passed to' + with pytest.raises(ValueError, match=msg): + transform.transform(Y_neg) # test error on invalid sample_steps transform = AdditiveChi2Sampler(sample_steps=4) - assert_raises(ValueError, transform.fit, X) + msg = re.escape( + "If sample_steps is not in [1, 2, 3]," + " you need to provide sample_interval" + ) + with pytest.raises(ValueError, match=msg): + transform.fit(X) # test that the 
sample interval is set correctly sample_steps_available = [1, 2, 3] @@ -154,7 +163,9 @@ def test_skewed_chi2_sampler(): # test error is raised on when inputs contains values smaller than -c Y_neg = Y.copy() Y_neg[0, 0] = -c * 2. - assert_raises(ValueError, transform.transform, Y_neg) + msg = 'X may not contain entries smaller than -skewedness' + with pytest.raises(ValueError, match=msg): + transform.transform(Y_neg) def test_additive_chi2_sampler_exceptions(): From 02e2a113e6cc63854f08349e054d4a3b3e045cb4 Mon Sep 17 00:00:00 2001 From: "Abdulelah S. Al Mesfer" <28743265+abdulelahsm@users.noreply.github.com> Date: Fri, 9 Apr 2021 13:04:54 +0300 Subject: [PATCH 310/478] TST replace assert_raises by pytest.raises in test_least_angle, test_omp, test_test_theil_sen (#19406) Co-authored-by: Olivier Grisel Co-authored-by: Olivier Grisel Co-authored-by: Guillaume Lemaitre Co-authored-by: Chiara Marmo --- .../linear_model/tests/test_least_angle.py | 10 +++---- sklearn/linear_model/tests/test_omp.py | 26 +++++++++---------- sklearn/linear_model/tests/test_theil_sen.py | 15 +++++++---- 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index a8b0e939c080d..4321c39b45e92 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -3,12 +3,10 @@ import numpy as np import pytest from scipy import linalg - from sklearn.base import clone from sklearn.model_selection import train_test_split from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_raises from sklearn.utils._testing import ignore_warnings from sklearn.utils._testing import TempMemmap from sklearn.utils.fixes import np_version, parse_version @@ -96,8 +94,8 @@ def test_lars_path_gram_equivalent(method, return_path): def test_x_none_gram_none_raises_value_error(): # Test that lars_path with no X and Gram raises exception Xy = np.dot(X.T, y) - assert_raises(ValueError, linear_model.lars_path, None, y, Gram=None, - Xy=Xy) + with pytest.raises(ValueError): + linear_model.lars_path(None, y, Gram=None, Xy=Xy) def test_all_precomputed(): @@ -489,7 +487,9 @@ def test_lasso_lars_ic(): # test error on unknown IC lars_broken = linear_model.LassoLarsIC('') - assert_raises(ValueError, lars_broken.fit, X, y) + + with pytest.raises(ValueError): + lars_broken.fit(X, y) def test_lars_path_readonly_data(): diff --git a/sklearn/linear_model/tests/test_omp.py b/sklearn/linear_model/tests/test_omp.py index 3cbda003f0148..1d2eb6a239786 100644 --- a/sklearn/linear_model/tests/test_omp.py +++ b/sklearn/linear_model/tests/test_omp.py @@ -4,7 +4,6 @@ import numpy as np import pytest -from sklearn.utils._testing import assert_raises from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import ignore_warnings @@ -33,16 +32,16 @@ def test_correct_shapes(): assert (orthogonal_mp(X, y[:, 0], n_nonzero_coefs=5).shape == - (n_features,)) + (n_features,)) assert (orthogonal_mp(X, y, n_nonzero_coefs=5).shape == - (n_features, 3)) + (n_features, 3)) def test_correct_shapes_gram(): assert (orthogonal_mp_gram(G, Xy[:, 0], n_nonzero_coefs=5).shape == - (n_features,)) + (n_features,)) assert (orthogonal_mp_gram(G, Xy, n_nonzero_coefs=5).shape == - (n_features, 3)) + (n_features, 3)) def test_n_nonzero_coefs(): @@ -88,15 +87,14 @@ def 
test_unreachable_accuracy(): n_nonzero_coefs=n_features)) -def test_bad_input(): - assert_raises(ValueError, orthogonal_mp, X, y, tol=-1) - assert_raises(ValueError, orthogonal_mp, X, y, n_nonzero_coefs=-1) - assert_raises(ValueError, orthogonal_mp, X, y, - n_nonzero_coefs=n_features + 1) - assert_raises(ValueError, orthogonal_mp_gram, G, Xy, tol=-1) - assert_raises(ValueError, orthogonal_mp_gram, G, Xy, n_nonzero_coefs=-1) - assert_raises(ValueError, orthogonal_mp_gram, G, Xy, - n_nonzero_coefs=n_features + 1) +@pytest.mark.parametrize("positional_params", [(X, y), (G, Xy)]) +@pytest.mark.parametrize( + "keyword_params", + [{"tol": -1}, {"n_nonzero_coefs": -1}, {"n_nonzero_coefs": n_features + 1}] +) +def test_bad_input(positional_params, keyword_params): + with pytest.raises(ValueError): + orthogonal_mp(*positional_params, **keyword_params) def test_perfect_signal_recovery(): diff --git a/sklearn/linear_model/tests/test_theil_sen.py b/sklearn/linear_model/tests/test_theil_sen.py index c670fc3979b80..125c89599af83 100644 --- a/sklearn/linear_model/tests/test_theil_sen.py +++ b/sklearn/linear_model/tests/test_theil_sen.py @@ -17,7 +17,7 @@ from sklearn.linear_model import LinearRegression, TheilSenRegressor from sklearn.linear_model._theil_sen import _spatial_median, _breakdown_point from sklearn.linear_model._theil_sen import _modified_weiszfeld_step -from sklearn.utils._testing import assert_almost_equal, assert_raises +from sklearn.utils._testing import assert_almost_equal @contextmanager @@ -209,19 +209,23 @@ def test_calc_breakdown_point(): def test_checksubparams_negative_subpopulation(): X, y, w, c = gen_toy_problem_1d() theil_sen = TheilSenRegressor(max_subpopulation=-1, random_state=0) - assert_raises(ValueError, theil_sen.fit, X, y) + + with pytest.raises(ValueError): + theil_sen.fit(X, y) def test_checksubparams_too_few_subsamples(): X, y, w, c = gen_toy_problem_1d() theil_sen = TheilSenRegressor(n_subsamples=1, random_state=0) - assert_raises(ValueError, theil_sen.fit, X, y) + with pytest.raises(ValueError): + theil_sen.fit(X, y) def test_checksubparams_too_many_subsamples(): X, y, w, c = gen_toy_problem_1d() theil_sen = TheilSenRegressor(n_subsamples=101, random_state=0) - assert_raises(ValueError, theil_sen.fit, X, y) + with pytest.raises(ValueError): + theil_sen.fit(X, y) def test_checksubparams_n_subsamples_if_less_samples_than_features(): @@ -230,7 +234,8 @@ def test_checksubparams_n_subsamples_if_less_samples_than_features(): X = random_state.normal(size=(n_samples, n_features)) y = random_state.normal(size=n_samples) theil_sen = TheilSenRegressor(n_subsamples=9, random_state=0) - assert_raises(ValueError, theil_sen.fit, X, y) + with pytest.raises(ValueError): + theil_sen.fit(X, y) def test_subpopulation(): From 80e985b5da06a835eecd9130abeed79a31e63200 Mon Sep 17 00:00:00 2001 From: LSturtew <56136443+LSturtew@users.noreply.github.com> Date: Fri, 9 Apr 2021 13:54:30 +0200 Subject: [PATCH 311/478] TST Changes assert to pytest style in test_random_projection.py (#19846) --- sklearn/tests/test_random_projection.py | 92 ++++++++++++------------- 1 file changed, 44 insertions(+), 48 deletions(-) diff --git a/sklearn/tests/test_random_projection.py b/sklearn/tests/test_random_projection.py index d01f318c3f1b1..79d2af5776859 100644 --- a/sklearn/tests/test_random_projection.py +++ b/sklearn/tests/test_random_projection.py @@ -14,12 +14,9 @@ from sklearn.random_projection import SparseRandomProjection from sklearn.random_projection import GaussianRandomProjection -from 
sklearn.utils._testing import assert_raises -from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_warns from sklearn.exceptions import DataDimensionalityWarning all_sparse_random_matrix: List[Any] = [_sparse_random_matrix] @@ -59,19 +56,21 @@ def densify(matrix): ############################################################################### # test on JL lemma ############################################################################### -def test_invalid_jl_domain(): - assert_raises(ValueError, johnson_lindenstrauss_min_dim, 100, eps=1.1) - assert_raises(ValueError, johnson_lindenstrauss_min_dim, 100, eps=0.0) - assert_raises(ValueError, johnson_lindenstrauss_min_dim, 100, eps=-0.1) - assert_raises(ValueError, johnson_lindenstrauss_min_dim, 0, eps=0.5) +@pytest.mark.parametrize("n_samples, eps", [ + (100, 1.1), + (100, 0.0), + (100, -0.1), + (0, 0.5) +]) +def test_invalid_jl_domain(n_samples, eps): + with pytest.raises(ValueError): + johnson_lindenstrauss_min_dim(n_samples, eps=eps) -def test_input_size_jl_min_dim(): - assert_raises(ValueError, johnson_lindenstrauss_min_dim, - 3 * [100], eps=2 * [0.9]) - assert_raises(ValueError, johnson_lindenstrauss_min_dim, 3 * [100], - eps=2 * [0.9]) +def test_input_size_jl_min_dim(): + with pytest.raises(ValueError): + johnson_lindenstrauss_min_dim(3 * [100], eps=2 * [0.9]) johnson_lindenstrauss_min_dim(np.random.randint(1, 10, size=(10, 10)), eps=np.full((10, 10), 0.5)) @@ -81,18 +80,17 @@ def test_input_size_jl_min_dim(): # tests random matrix generation ############################################################################### def check_input_size_random_matrix(random_matrix): - assert_raises(ValueError, random_matrix, 0, 0) - assert_raises(ValueError, random_matrix, -1, 1) - assert_raises(ValueError, random_matrix, 1, -1) - assert_raises(ValueError, random_matrix, 1, 0) - assert_raises(ValueError, random_matrix, -1, 0) + inputs = [(0, 0), (-1, 1), (1, -1), (1, 0), (-1, 0)] + for n_components, n_features in inputs: + with pytest.raises(ValueError): + random_matrix(n_components, n_features) def check_size_generated(random_matrix): - assert random_matrix(1, 5).shape == (1, 5) - assert random_matrix(5, 1).shape == (5, 1) - assert random_matrix(5, 5).shape == (5, 5) - assert random_matrix(1, 1).shape == (1, 1) + inputs = [(1, 5), (5, 1), (5, 5), (1, 1)] + for n_components, n_features in inputs: + assert random_matrix(n_components, n_features).shape == ( + n_components, n_features) def check_zero_mean_and_unit_norm(random_matrix): @@ -109,8 +107,8 @@ def check_input_with_sparse_random_matrix(random_matrix): n_components, n_features = 5, 10 for density in [-1., 0.0, 1.1]: - assert_raises(ValueError, - random_matrix, n_components, n_features, density=density) + with pytest.raises(ValueError): + random_matrix(n_components, n_features, density=density) @pytest.mark.parametrize("random_matrix", all_random_matrix) @@ -153,9 +151,9 @@ def test_sparse_random_matrix(): s = 1 / density A = _sparse_random_matrix(n_components, - n_features, - density=density, - random_state=0) + n_features, + density=density, + random_state=0) A = densify(A) # Check possible values @@ -196,31 +194,27 @@ def test_sparse_random_matrix(): ############################################################################### # tests on random projection transformer 
############################################################################### -def test_sparse_random_projection_transformer_invalid_density(): - for RandomProjection in all_SparseRandomProjection: - assert_raises(ValueError, - RandomProjection(density=1.1).fit, data) - assert_raises(ValueError, - RandomProjection(density=0).fit, data) - - assert_raises(ValueError, - RandomProjection(density=-0.1).fit, data) +@pytest.mark.parametrize("density", [1.1, 0, -0.1]) +def test_sparse_random_projection_transformer_invalid_density(density): + for RandomProjection in all_SparseRandomProjection: + with pytest.raises(ValueError): + RandomProjection(density=density).fit(data) -def test_random_projection_transformer_invalid_input(): +@pytest.mark.parametrize("n_components, fit_data", [ + ('auto', [[0, 1, 2]]), (-10, data)] +) +def test_random_projection_transformer_invalid_input(n_components, fit_data): for RandomProjection in all_RandomProjection: - assert_raises(ValueError, - RandomProjection(n_components='auto').fit, [[0, 1, 2]]) - - assert_raises(ValueError, - RandomProjection(n_components=-10).fit, data) + with pytest.raises(ValueError): + RandomProjection(n_components=n_components).fit(fit_data) def test_try_to_transform_before_fit(): for RandomProjection in all_RandomProjection: - assert_raises(ValueError, - RandomProjection(n_components='auto').transform, data) + with pytest.raises(ValueError): + RandomProjection(n_components='auto').transform(data) def test_too_many_samples_to_find_a_safe_embedding(): @@ -232,7 +226,8 @@ def test_too_many_samples_to_find_a_safe_embedding(): 'eps=0.100000 and n_samples=1000 lead to a target dimension' ' of 5920 which is larger than the original space with' ' n_features=100') - assert_raise_message(ValueError, expected_msg, rp.fit, data) + with pytest.raises(ValueError, match=expected_msg): + rp.fit(data) def test_random_projection_embedding_quality(): @@ -318,7 +313,8 @@ def test_correct_RandomProjection_dimensions_embedding(): assert_array_equal(projected_1, projected_3) # Try to transform with an input X of size different from fitted. - assert_raises(ValueError, rp.transform, data[:, 1:5]) + with pytest.raises(ValueError): + rp.transform(data[:, 1:5]) # it is also possible to fix the number of components and the density # level @@ -337,8 +333,8 @@ def test_warning_n_components_greater_than_n_features(): data, _ = make_sparse_random_data(5, n_features, int(n_features / 4)) for RandomProjection in all_RandomProjection: - assert_warns(DataDimensionalityWarning, - RandomProjection(n_components=n_features + 1).fit, data) + with pytest.warns(DataDimensionalityWarning): + RandomProjection(n_components=n_features + 1).fit(data) def test_works_with_sparse_data(): From da3c2d2a19ade5ca69adb6952ecace811ed122ff Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 9 Apr 2021 12:34:13 -0400 Subject: [PATCH 312/478] FIX MultiOutputRegressor correctly ducktypes fitted estimators (#19308) Co-authored-by: Olivier Grisel --- doc/whats_new/v0.24.rst | 7 +++++++ sklearn/multioutput.py | 2 +- sklearn/tests/test_multioutput.py | 18 ++++++++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 68ea8ba0f7a72..2cfe6970dd7b1 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -48,6 +48,13 @@ Changelog `'use_encoded_value'` strategies. :pr:`19234` by `Guillaume Lemaitre `. +:mod:`sklearn.multioutput` +.......................... 
+ +- |Fix| :class:`multioutput.MultiOutputRegressor` now works with estimators + that dynamically define `predict` during fitting, such as + :class:`ensemble.StackingRegressor`. :pr:`19308` by `Thomas Fan`_. + :mod:`sklearn.semi_supervised` .............................. diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 9987c01b13187..4cb01c524d59d 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -198,7 +198,7 @@ def predict(self, X): Note: Separate models are generated for each predictor. """ check_is_fitted(self) - if not hasattr(self.estimator, "predict"): + if not hasattr(self.estimators_[0], "predict"): raise ValueError("The base estimator should implement" " a predict method") diff --git a/sklearn/tests/test_multioutput.py b/sklearn/tests/test_multioutput.py index 87e5218e08e22..c20db084aa664 100644 --- a/sklearn/tests/test_multioutput.py +++ b/sklearn/tests/test_multioutput.py @@ -10,6 +10,7 @@ from sklearn import datasets from sklearn.base import clone from sklearn.datasets import make_classification +from sklearn.datasets import load_linnerud from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier from sklearn.exceptions import NotFittedError from sklearn.linear_model import Lasso @@ -30,6 +31,7 @@ from sklearn.dummy import DummyRegressor, DummyClassifier from sklearn.pipeline import make_pipeline from sklearn.impute import SimpleImputer +from sklearn.ensemble import StackingRegressor def test_multi_target_regression(): @@ -658,3 +660,19 @@ def test_classifier_chain_tuple_invalid_order(): with pytest.raises(ValueError, match='invalid order'): chain.fit(X, y) + + +def test_multioutputregressor_ducktypes_fitted_estimator(): + """Test that MultiOutputRegressor checks the fitted estimator for + predict. Non-regression test for #16549.""" + X, y = load_linnerud(return_X_y=True) + stacker = StackingRegressor( + estimators=[("sgd", SGDRegressor(random_state=1))], + final_estimator=Ridge(), + cv=2 + ) + + reg = MultiOutputRegressor(estimator=stacker).fit(X, y) + + # Does not raise + reg.predict(X) From 7d728d357e55253f30408ce68cafcc82d888393c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 9 Apr 2021 19:36:36 +0200 Subject: [PATCH 313/478] FIX missing space in import in svm/_base.py (#19852) --- sklearn/svm/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index 67808278cc59a..62710ec5157fb 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -6,7 +6,7 @@ # mypy error: error: Module 'sklearn.svm' has no attribute '_libsvm' # (and same for other imports) from . import _libsvm as libsvm # type: ignore -from .import _liblinear as liblinear # type: ignore +from . import _liblinear as liblinear # type: ignore from . 
import _libsvm_sparse as libsvm_sparse # type: ignore from ..base import BaseEstimator, ClassifierMixin from ..preprocessing import LabelEncoder From 3ff1267a7b74259dd0f0fdaf7da88b02e727e7c1 Mon Sep 17 00:00:00 2001 From: Oras Phongpanangam Date: Fri, 9 Apr 2021 11:19:04 -0700 Subject: [PATCH 314/478] FIX allows TransformedTargetRegressor to take nD target (#18898) Co-authored-by: Guillaume Lemaitre --- doc/whats_new/v1.0.rst | 4 ++++ sklearn/compose/_target.py | 2 +- sklearn/compose/tests/test_target.py | 21 +++++++++++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 1245193d76d89..b438ee16139f3 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -110,6 +110,10 @@ Changelog :pr:`18459` by :user:`Albert Villanova del Moral ` and :user:`Alonso Silva Allende `. +- |Fix| :class:`compose.TransformedTargetRegressor` now takes nD targets with + an adequate transformer. + :pr:`18898` by :user:`Oras Phongpanagnam `. + :mod:`sklearn.datasets` ....................... diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index 1d6695a808d81..1a80046c66376 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -176,7 +176,7 @@ def fit(self, X, y, **fit_params): self : object """ y = check_array(y, accept_sparse=False, force_all_finite=True, - ensure_2d=False, dtype='numeric') + ensure_2d=False, dtype='numeric', allow_nd=True) # store the number of dimension of the target to predict an array of # similar shape at predict diff --git a/sklearn/compose/tests/test_target.py b/sklearn/compose/tests/test_target.py index dc5d8d95743ef..1f3d6bc08e711 100644 --- a/sklearn/compose/tests/test_target.py +++ b/sklearn/compose/tests/test_target.py @@ -197,6 +197,27 @@ def test_transform_target_regressor_2d_transformer_multioutput(): assert_allclose(regr.regressor_.coef_, lr.coef_) +def test_transform_target_regressor_3d_target(): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/18866 + # Check with a 3D target with a transformer that reshapes the target + X = friedman[0] + y = np.tile(friedman[1].reshape(-1, 1, 1), [1, 3, 2]) + + def flatten_data(data): + return data.reshape(data.shape[0], -1) + + def unflatten_data(data): + return data.reshape(data.shape[0], -1, 2) + + transformer = FunctionTransformer(func=flatten_data, + inverse_func=unflatten_data) + regr = TransformedTargetRegressor(regressor=LinearRegression(), + transformer=transformer) + y_pred = regr.fit(X, y).predict(X) + assert y.shape == y_pred.shape + + def test_transform_target_regressor_multi_to_single(): X = friedman[0] y = np.transpose([friedman[1], (friedman[1] ** 2 + 1)]) From b1d686d07559fb83040cb085b752d86ebbb9b3ba Mon Sep 17 00:00:00 2001 From: Ana Pessoa <34238053+analuizaypessoa@users.noreply.github.com> Date: Sun, 11 Apr 2021 18:13:13 -0300 Subject: [PATCH 315/478] DOC Fixed typo in clustering.rst (#19863) --- doc/modules/clustering.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 17ae9eb2651c6..7f9fe2a7bd12e 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -202,7 +202,7 @@ As a result, the computation is often done several times, with different initializations of the centroids. One method to help address this issue is the k-means++ initialization scheme, which has been implemented in scikit-learn (use the ``init='k-means++'`` parameter). 
This initializes the centroids to be -(generally) distant from each other, leading to provably better results than +(generally) distant from each other, leading to probably better results than random initialization, as shown in the reference. K-means++ can also be called independently to select seeds for other From f1018c6af15711855e0e626a1c1d2a387ed8dbbb Mon Sep 17 00:00:00 2001 From: xiaoyuchai <39104103+xiaoyuchai@users.noreply.github.com> Date: Sun, 11 Apr 2021 23:54:31 -0700 Subject: [PATCH 316/478] FIX BaseSuccessiveHalving class groups support (#19847) Co-authored-by: Shawn Co-authored-by: Nicolas Hug --- doc/whats_new/v1.0.rst | 5 +++ .../_search_successive_halving.py | 2 +- .../tests/test_successive_halving.py | 36 +++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index b438ee16139f3..ba3f6d6d1110d 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -276,6 +276,11 @@ Changelog :pr:`18649` by `Leandro Hermida ` and `Rodion Martynov `. +- |Fix| The `fit` method of the successive halving parameter search + (:class:`model_selection.HalvingGridSearchCV`, and + :class:`model_selection.HalvingRandomSearchCV`) now correctly handles the + `groups` parameter. :pr:`19847` by :user:`Xiaoyu Chai `. + :mod:`sklearn.naive_bayes` .......................... diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index f4396920c1677..2f5c465d6cf41 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -210,7 +210,7 @@ def fit(self, X, y=None, groups=None, **fit_params): self._n_samples_orig = _num_samples(X) - super().fit(X, y=y, groups=None, **fit_params) + super().fit(X, y=y, groups=groups, **fit_params) # Set best_score_: BaseSearchCV does not set it, as refit is a callable self.best_score_ = ( diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 2c55f6aa6cd85..6660b35a934ba 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -7,9 +7,16 @@ from sklearn.datasets import make_classification from sklearn.dummy import DummyClassifier from sklearn.experimental import enable_halving_search_cv # noqa +from sklearn.model_selection import StratifiedKFold +from sklearn.model_selection import StratifiedShuffleSplit +from sklearn.model_selection import LeaveOneGroupOut +from sklearn.model_selection import LeavePGroupsOut +from sklearn.model_selection import GroupKFold +from sklearn.model_selection import GroupShuffleSplit from sklearn.model_selection import HalvingGridSearchCV from sklearn.model_selection import HalvingRandomSearchCV from sklearn.model_selection import KFold, ShuffleSplit +from sklearn.svm import LinearSVC from sklearn.model_selection._search_successive_halving import ( _SubsampleMetaSplitter, _top_k, _refit_callable) @@ -562,3 +569,32 @@ def set_params(self, **params): assert (cv_results_df['params'] == passed_params).all() assert (cv_results_df['n_resources'] == passed_n_samples).all() + + +@pytest.mark.parametrize('Est', (HalvingGridSearchCV, HalvingRandomSearchCV)) +def test_groups_support(Est): + # Check if ValueError (when groups is None) propagates to + # HalvingGridSearchCV and HalvingRandomSearchCV + # And also check if groups is correctly passed to the cv object + rng = 
np.random.RandomState(0) + + X, y = make_classification(n_samples=50, n_classes=2, random_state=0) + groups = rng.randint(0, 3, 50) + + clf = LinearSVC(random_state=0) + grid = {'C': [1]} + + group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), + GroupKFold(n_splits=3), GroupShuffleSplit(random_state=0)] + error_msg = "The 'groups' parameter should not be None." + for cv in group_cvs: + gs = Est(clf, grid, cv=cv) + with pytest.raises(ValueError, match=error_msg): + gs.fit(X, y) + gs.fit(X, y, groups=groups) + + non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit(random_state=0)] + for cv in non_group_cvs: + gs = Est(clf, grid, cv=cv) + # Should not raise an error + gs.fit(X, y) From 7b343ddd53e4efe97b6588b74a75f08c37d76f46 Mon Sep 17 00:00:00 2001 From: Christopher Yeh Date: Mon, 12 Apr 2021 05:51:22 -0600 Subject: [PATCH 317/478] CLN Improve doc/error consistency for GaussianProcessRegressor (#19687) Co-authored-by: Thomas J. Fan --- sklearn/gaussian_process/_gpr.py | 36 ++++++++++++++++---------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index b4ab0441efc71..4e8814dd69951 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -30,9 +30,9 @@ class GaussianProcessRegressor(MultiOutputMixin, GaussianProcessRegressor: * allows prediction without prior fitting (based on the GP prior) - * provides an additional method sample_y(X), which evaluates samples + * provides an additional method `sample_y(X)`, which evaluates samples drawn from the GPR (prior or posterior) at given inputs - * exposes a method log_marginal_likelihood(theta), which can be used + * exposes a method `log_marginal_likelihood(theta)`, which can be used externally for other ways of selecting hyperparameters, e.g., via Markov chain Monte Carlo. @@ -68,8 +68,8 @@ class GaussianProcessRegressor(MultiOutputMixin, must have the signature:: def optimizer(obj_func, initial_theta, bounds): - # * 'obj_func' is the objective function to be minimized, which - # takes the hyperparameters theta as parameter and an + # * 'obj_func': the objective function to be minimized, which + # takes the hyperparameters theta as a parameter and an # optional flag eval_gradient, which determines if the # gradient is returned additionally to the function value # * 'initial_theta': the initial value for theta, which can be @@ -80,7 +80,7 @@ def optimizer(obj_func, initial_theta, bounds): # the corresponding value of the target function. return theta_opt, func_min - Per default, the 'L-BGFS-B' algorithm from scipy.optimize.minimize + Per default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize is used. If None is passed, the kernel's parameters are kept fixed. Available internal optimizers are:: @@ -113,7 +113,7 @@ def optimizer(obj_func, initial_theta, bounds): random_state : int, RandomState instance or None, default=None Determines random number generation used to initialize the centers. Pass an int for reproducible results across multiple function calls. - See :term: `Glossary `. + See :term:`Glossary `. Attributes ---------- @@ -211,8 +211,8 @@ def fit(self, X, y): if self.alpha.shape[0] == 1: self.alpha = self.alpha[0] else: - raise ValueError("alpha must be a scalar or an array" - " with same number of entries as y.(%d != %d)" + raise ValueError("alpha must be a scalar or an array " + "with same number of entries as y. 
(%d != %d)" % (self.alpha.shape[0], y.shape[0])) self.X_train_ = np.copy(X) if self.copy_X_train else X @@ -283,9 +283,9 @@ def predict(self, X, return_std=False, return_cov=False): """Predict using the Gaussian process regression model We can also predict based on an unfitted model by using the GP prior. - In addition to the mean of the predictive distribution, also its - standard deviation (return_std=True) or covariance (return_cov=True). - Note that at most one of the two can be requested. + In addition to the mean of the predictive distribution, optionally also + returns its standard deviation (`return_std=True`) or covariance + (`return_cov=True`). Note that at most one of the two can be requested. Parameters ---------- @@ -302,7 +302,7 @@ def predict(self, X, return_std=False, return_cov=False): Returns ------- - y_mean : ndarray of shape (n_samples, [n_output_dims]) + y_mean : ndarray of shape (n_samples,) or (n_samples, n_targets) Mean of predictive distribution a query points. y_std : ndarray of shape (n_samples,), optional @@ -315,8 +315,7 @@ def predict(self, X, return_std=False, return_cov=False): """ if return_std and return_cov: raise RuntimeError( - "Not returning standard deviation of predictions when " - "returning full covariance.") + "At most one of return_std or return_cov can be requested.") if self.kernel is None or self.kernel.requires_vector_input: X = self._validate_data(X, ensure_2d=True, dtype="numeric", @@ -389,21 +388,22 @@ def sample_y(self, X, n_samples=1, random_state=0): Parameters ---------- - X : array-like of shape (n_samples, n_features) or list of object + X : array-like of shape (n_samples_X, n_features) or list of object Query points where the GP is evaluated. n_samples : int, default=1 - The number of samples drawn from the Gaussian process + Number of samples drawn from the Gaussian process per query point random_state : int, RandomState instance or None, default=0 Determines random number generation to randomly draw samples. Pass an int for reproducible results across multiple function calls. - See :term: `Glossary `. + See :term:`Glossary `. Returns ------- - y_samples : ndarray of shape (n_samples_X, [n_output_dims], n_samples) + y_samples : ndarray of shape (n_samples_X, n_samples), or \ + (n_samples_X, n_targets, n_samples) Values of n_samples samples drawn from Gaussian process and evaluated at query points. 
""" From e56d76a8da59f1d28f7887c8be4e55076da885b7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 12 Apr 2021 17:03:02 +0200 Subject: [PATCH 318/478] FIX Removes unecessary check in _BaseChain (#19865) --- sklearn/multioutput.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 4cb01c524d59d..9b64d28f41eb8 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -470,7 +470,6 @@ def fit(self, X, Y, **fit_params): X, Y = self._validate_data(X, Y, multi_output=True, accept_sparse=True) random_state = check_random_state(self.random_state) - check_array(X, accept_sparse=True) self.order_ = self.order if isinstance(self.order_, tuple): self.order_ = np.array(self.order_) From c09c654ed4d5833d73f557381f3d10f3d062e5d7 Mon Sep 17 00:00:00 2001 From: Vinicius Rios Fuck Date: Mon, 12 Apr 2021 16:59:41 -0300 Subject: [PATCH 319/478] DOC Fix typo in common_pitfalls.rst (#19867) --- doc/common_pitfalls.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/common_pitfalls.rst b/doc/common_pitfalls.rst index 6bc79fbc14c0d..ac5dccb3b5609 100644 --- a/doc/common_pitfalls.rst +++ b/doc/common_pitfalls.rst @@ -564,7 +564,7 @@ preformance by letting the estimator use a different RNG on each fold. This is done by passing a `RandomState` instance (or `None`) to the estimator initialization. -When we pass an integer, the estimator will use the same RNG on each fold: if +When we pass an integer, the estimator will use the same RNG on each fold: if the estimator performs well (or bad), as evaluated by CV, it might just be because we got lucky (or unlucky) with that specific seed. Passing instances leads to more robust CV results, and makes the comparison between various From c59a310e5eb4c3f72a00503a2643005551b9d3eb Mon Sep 17 00:00:00 2001 From: Vinicius Rios Fuck Date: Mon, 12 Apr 2021 23:18:20 -0300 Subject: [PATCH 320/478] DOC Fix typos plot_column_transformer_mixed_types.py (#19871) --- examples/compose/plot_column_transformer_mixed_types.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index a2937e041f186..401fe67b7f587 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -147,8 +147,8 @@ # %% # The resulting score is not exactly the same as the one from the previous -# pipeline becase the dtype-based selector treats the ``pclass`` columns as -# a numeric features instead of a categorical feature as previously: +# pipeline because the dtype-based selector treats the ``pclass`` column as +# a numeric feature instead of a categorical feature as previously: selector(dtype_exclude="category")(X_train) @@ -201,7 +201,7 @@ # %% # The best hyper-parameters have be used to re-fit a final model on the full # training set. We can evaluate that final model on held out test data that was -# not used for hyparameter tuning. +# not used for hyperparameter tuning. # print(("best logistic regression from grid search: %.3f" % grid_search.score(X_test, y_test))) From 926633c00f476f0fcbee9bac2dd275249feef444 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 13 Apr 2021 09:44:24 +0200 Subject: [PATCH 321/478] Update who may propose a new triage member. 
(#19870) --- doc/governance.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/governance.rst b/doc/governance.rst index 4ab968786cd79..1d971f16a566e 100644 --- a/doc/governance.rst +++ b/doc/governance.rst @@ -40,9 +40,10 @@ Similarly to what has been decided in the `python project any contributor may become a member of the scikit-learn triage team, after showing some continuity in participating to scikit-learn development (with pull requests and reviews). -Any core developer is welcome to propose a scikit-learn contributor to join the -triage team. Other core developers are then consulted: while it is expected -that most acceptances will be unanimous, a two-thirds majority is enough. +Any core developer or member of the triage team is welcome to propose a +scikit-learn contributor to join the triage team. Other core developers +are then consulted: while it is expected that most acceptances will be +unanimous, a two-thirds majority is enough. Every new triager will be announced in the mailing list. Triagers are welcome to participate in `monthly core developer meetings `_. From 767fd63c9ddddc46e288fdec2cca36a129529a8e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 13 Apr 2021 12:27:14 +0200 Subject: [PATCH 322/478] DOC make documentation consistent regarding types in _encoders.py (#19876) --- sklearn/preprocessing/_encoders.py | 33 +++++++++++++++++------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index d3f557d2993cb..65e86e512e381 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -406,7 +406,7 @@ def fit(self, X, y=None): Parameters ---------- - X : array-like, shape [n_samples, n_features] + X : array-like of shape (n_samples, n_features) The data to determine the categories of each feature. y : None @@ -431,7 +431,7 @@ def fit_transform(self, X, y=None): Parameters ---------- - X : array-like, shape [n_samples, n_features] + X : array-like of shape (n_samples, n_features) The data to encode. y : None @@ -440,8 +440,10 @@ def fit_transform(self, X, y=None): Returns ------- - X_out : sparse matrix if sparse=True else a 2-d array - Transformed input. + X_out : {ndarray, sparse matrix} of shape \ + (n_samples, n_encoded_features) + Transformed input. If `sparse=True`, a sparse matrix will be + returned. """ self._validate_keywords() return super().fit_transform(X, y) @@ -452,13 +454,15 @@ def transform(self, X): Parameters ---------- - X : array-like, shape [n_samples, n_features] + X : array-like of shape (n_samples, n_features) The data to encode. Returns ------- - X_out : sparse matrix if sparse=True else a 2-d array - Transformed input. + X_out : {ndarray, sparse matrix} of shape \ + (n_samples, n_encoded_features) + Transformed input. If `sparse=True`, a sparse matrix will be + returned. """ check_is_fitted(self) # validation of X happens in _check_X called by _transform @@ -522,12 +526,13 @@ def inverse_transform(self, X): Parameters ---------- - X : array-like or sparse matrix, shape [n_samples, n_encoded_features] + X : {array-like, sparse matrix} of shape \ + (n_samples, n_encoded_features) The transformed data. Returns ------- - X_tr : array-like, shape [n_samples, n_features] + X_tr : ndarray of shape (n_samples, n_features) Inverse transformed array. 
""" check_is_fitted(self) @@ -745,7 +750,7 @@ def fit(self, X, y=None): Parameters ---------- - X : array-like, shape [n_samples, n_features] + X : array-like of shape (n_samples, n_features) The data to determine the categories of each feature. y : None @@ -814,12 +819,12 @@ def transform(self, X): Parameters ---------- - X : array-like, shape [n_samples, n_features] + X : array-like of shape (n_samples, n_features) The data to encode. Returns ------- - X_out : sparse matrix or a 2-d array + X_out : ndarray of shape (n_samples, n_features) Transformed input. """ X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown, @@ -841,12 +846,12 @@ def inverse_transform(self, X): Parameters ---------- - X : array-like or sparse matrix, shape [n_samples, n_encoded_features] + X : {array-like, sparse matrix} of shape (n_samples, n_features) The transformed data. Returns ------- - X_tr : array-like, shape [n_samples, n_features] + X_tr : ndarray of shape (n_samples, n_features) Inverse transformed array. """ check_is_fitted(self) From 8a3939aa69a9faa45eefc4dfb37d5d3f39f425d3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 13 Apr 2021 13:59:30 +0200 Subject: [PATCH 323/478] FIX Error for sparse matrix in OrdinalEncoder.inverse_transform (#19879) --- doc/whats_new/v1.0.rst | 4 ++++ sklearn/preprocessing/_encoders.py | 4 ++-- sklearn/preprocessing/tests/test_encoders.py | 22 ++++++++++++++++++++ 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index ba3f6d6d1110d..23211cd3a95b1 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -343,6 +343,10 @@ Changelog `handle_unknown='ignore'` and dropping categories. :pr:`19041` by `Thomas Fan`_. +- |Fix| :meth:`preprocessing.OrdinalEncoder.inverse_transform` is not + supporting sparse matrix and raise the appropriate error message. + :pr:`19879` by :user:`Guillaume Lemaitre `. + :mod:`sklearn.tree` ................... diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 65e86e512e381..cd05dc89bb75d 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -846,7 +846,7 @@ def inverse_transform(self, X): Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) + X : array-like of shape (n_samples, n_encoded_features) The transformed data. Returns @@ -855,7 +855,7 @@ def inverse_transform(self, X): Inverse transformed array. """ check_is_fitted(self) - X = check_array(X, accept_sparse='csr', force_all_finite='allow-nan') + X = check_array(X, force_all_finite='allow-nan') n_samples, _ = X.shape n_features = len(self.categories_) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index eb776c4c25267..9f1e331f78fec 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -1110,3 +1110,25 @@ def test_ordinal_encoder_handle_missing_and_unknown( assert_allclose(X_trans, expected_X_trans) assert_allclose(oe.transform(X_test), [[-1.0]]) + + +def test_ordinal_encoder_sparse(): + """Check that we raise proper error with sparse input in OrdinalEncoder. 
+ Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/19878 + """ + X = np.array([[3, 2, 1], [0, 1, 1]]) + X_sparse = sparse.csr_matrix(X) + + encoder = OrdinalEncoder() + + err_msg = "A sparse matrix was passed, but dense data is required" + with pytest.raises(TypeError, match=err_msg): + encoder.fit(X_sparse) + with pytest.raises(TypeError, match=err_msg): + encoder.fit_transform(X_sparse) + + X_trans = encoder.fit_transform(X) + X_trans_sparse = sparse.csr_matrix(X_trans) + with pytest.raises(TypeError, match=err_msg): + encoder.inverse_transform(X_trans_sparse) From bbdd3bbbec6c28c03d2e7dbbf96039eaf3c64f97 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 13 Apr 2021 17:11:13 +0200 Subject: [PATCH 324/478] CI Add label on PRs modifying Cython code (#19850) Co-authored-by: "Thomas J. Fan" Co-authored-by: Nicolas Hug Co-authored-by: Olivier Grisel --- .github/labeler-file-extensions.yml | 8 ++++++++ .github/workflows/labeler-module.yml | 10 ++++++++++ 2 files changed, 18 insertions(+) create mode 100644 .github/labeler-file-extensions.yml diff --git a/.github/labeler-file-extensions.yml b/.github/labeler-file-extensions.yml new file mode 100644 index 0000000000000..63fcfcacfeb17 --- /dev/null +++ b/.github/labeler-file-extensions.yml @@ -0,0 +1,8 @@ +cython: +- sklearn/**/*.pyx +- sklearn/**/*.pxd +- sklearn/**/*.pxi +# Tempita templates +- sklearn/**/*.pyx.tp +- sklearn/**/*.pxd.tp +- sklearn/**/*.pxi.tp diff --git a/.github/workflows/labeler-module.yml b/.github/workflows/labeler-module.yml index 3a9ed8d364f79..eb1669443bb0d 100644 --- a/.github/workflows/labeler-module.yml +++ b/.github/workflows/labeler-module.yml @@ -12,3 +12,13 @@ jobs: repo-token: "${{ secrets.GITHUB_TOKEN }}" max-labels: "3" configuration-path: ".github/labeler-module.yml" + + triage_file_extensions: + runs-on: ubuntu-latest + steps: + - uses: thomasjpfan/labeler@v2.5.0 + continue-on-error: true + if: github.repository == 'scikit-learn/scikit-learn' + with: + repo-token: "${{ secrets.GITHUB_TOKEN }}" + configuration-path: ".github/labeler-file-extensions.yml" \ No newline at end of file From ab65c8b7d672164e43479f38a95788376229fed0 Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Tue, 13 Apr 2021 16:47:09 +0100 Subject: [PATCH 325/478] TST Changes assert to pytest style in tests/test_isotonic.py (#19864) Co-authored-by: Alihan Zihna --- sklearn/tests/test_isotonic.py | 69 +++++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 18 deletions(-) diff --git a/sklearn/tests/test_isotonic.py b/sklearn/tests/test_isotonic.py index af14f73cd1beb..a88c830256e73 100644 --- a/sklearn/tests/test_isotonic.py +++ b/sklearn/tests/test_isotonic.py @@ -9,10 +9,9 @@ IsotonicRegression, _make_unique) from sklearn.utils.validation import check_array -from sklearn.utils._testing import (assert_raises, assert_allclose, +from sklearn.utils._testing import (assert_allclose, assert_array_equal, - assert_array_almost_equal, - assert_warns_message, assert_no_warnings) + assert_array_almost_equal) from sklearn.utils import shuffle from scipy.special import expit @@ -37,7 +36,10 @@ def test_check_increasing_small_number_of_samples(): x = [0, 1, 2] y = [1, 1.1, 1.05] - is_increasing = assert_no_warnings(check_increasing, x, y) + with pytest.warns(None) as record: + is_increasing = check_increasing(x, y) + assert len(record) == 0 + assert is_increasing @@ -46,7 +48,10 @@ def test_check_increasing_up(): y = [0, 1.5, 2.77, 8.99, 8.99, 50] # Check that we got increasing=True and no 
warnings - is_increasing = assert_no_warnings(check_increasing, x, y) + with pytest.warns(None) as record: + is_increasing = check_increasing(x, y) + assert len(record) == 0 + assert is_increasing @@ -55,7 +60,10 @@ def test_check_increasing_up_extreme(): y = [0, 1, 2, 3, 4, 5] # Check that we got increasing=True and no warnings - is_increasing = assert_no_warnings(check_increasing, x, y) + with pytest.warns(None) as record: + is_increasing = check_increasing(x, y) + assert len(record) == 0 + assert is_increasing @@ -64,7 +72,10 @@ def test_check_increasing_down(): y = [0, -1.5, -2.77, -8.99, -8.99, -50] # Check that we got increasing=False and no warnings - is_increasing = assert_no_warnings(check_increasing, x, y) + with pytest.warns(None) as record: + is_increasing = check_increasing(x, y) + assert len(record) == 0 + assert not is_increasing @@ -73,7 +84,10 @@ def test_check_increasing_down_extreme(): y = [0, -1, -2, -3, -4, -5] # Check that we got increasing=False and no warnings - is_increasing = assert_no_warnings(check_increasing, x, y) + with pytest.warns(None) as record: + is_increasing = check_increasing(x, y) + assert len(record) == 0 + assert not is_increasing @@ -82,9 +96,9 @@ def test_check_ci_warn(): y = [0, -1, 2, -3, 4, -5] # Check that we got increasing=False and CI interval warning - is_increasing = assert_warns_message(UserWarning, "interval", - check_increasing, - x, y) + msg = "interval" + with pytest.warns(UserWarning, match=msg): + is_increasing = check_increasing(x, y) assert not is_increasing @@ -244,10 +258,21 @@ def test_isotonic_regression_auto_increasing(): def test_assert_raises_exceptions(): ir = IsotonicRegression() rng = np.random.RandomState(42) - assert_raises(ValueError, ir.fit, [0, 1, 2], [5, 7, 3], [0.1, 0.6]) - assert_raises(ValueError, ir.fit, [0, 1, 2], [5, 7]) - assert_raises(ValueError, ir.fit, rng.randn(3, 10), [0, 1, 2]) - assert_raises(ValueError, ir.transform, rng.randn(3, 10)) + + msg = "Found input variables with inconsistent numbers of samples" + with pytest.raises(ValueError, match=msg): + ir.fit([0, 1, 2], [5, 7, 3], [0.1, 0.6]) + + with pytest.raises(ValueError, match=msg): + ir.fit([0, 1, 2], [5, 7]) + + msg = 'X should be a 1d array' + with pytest.raises(ValueError, match=msg): + ir.fit(rng.randn(3, 10), [0, 1, 2]) + + msg = 'Isotonic regression input X should be a 1d array' + with pytest.raises(ValueError, match=msg): + ir.transform(rng.randn(3, 10)) def test_isotonic_sample_weight_parameter_default_value(): @@ -298,7 +323,9 @@ def test_isotonic_regression_oob_raise(): ir.fit(x, y) # Check that an exception is thrown - assert_raises(ValueError, ir.predict, [min(x) - 10, max(x) + 10]) + msg = 'A value in x_new is below the interpolation range' + with pytest.raises(ValueError, match=msg): + ir.predict([min(x) - 10, max(x) + 10]) def test_isotonic_regression_oob_clip(): @@ -340,7 +367,10 @@ def test_isotonic_regression_oob_bad(): ir = IsotonicRegression(increasing='auto', out_of_bounds="xyz") # Make sure that we throw an error for bad out_of_bounds value - assert_raises(ValueError, ir.fit, x, y) + msg = ("The argument ``out_of_bounds`` must be in 'nan', " + "'clip', 'raise'; got xyz") + with pytest.raises(ValueError, match=msg): + ir.fit(x, y) def test_isotonic_regression_oob_bad_after(): @@ -354,7 +384,10 @@ def test_isotonic_regression_oob_bad_after(): # Make sure that we throw an error for bad out_of_bounds value in transform ir.fit(x, y) ir.out_of_bounds = "xyz" - assert_raises(ValueError, ir.transform, x) + msg = ("The argument 
``out_of_bounds`` must be in 'nan', " + "'clip', 'raise'; got xyz") + with pytest.raises(ValueError, match=msg): + ir.transform(x) def test_isotonic_regression_pickle(): From 7fa2e6e2734b590d96e62d5932c648a9c1002f34 Mon Sep 17 00:00:00 2001 From: Christopher Yeh Date: Tue, 13 Apr 2021 15:00:39 -0600 Subject: [PATCH 326/478] DOC Clarify documentation for spectral clustering (#19795) --- sklearn/cluster/_spectral.py | 110 ++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 53 deletions(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index b86d5870025c3..e9a5d7a7b4302 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -191,7 +191,7 @@ def spectral_clustering(affinity, *, n_clusters=8, n_components=None, Number of clusters to extract. n_components : int, default=n_clusters - Number of eigen vectors to use for the spectral embedding + Number of eigenvectors to use for the spectral embedding eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'} The eigenvalue decomposition strategy to use. AMG requires pyamg @@ -201,15 +201,16 @@ def spectral_clustering(affinity, *, n_clusters=8, n_components=None, random_state : int, RandomState instance, default=None A pseudo random number generator used for the initialization of the - lobpcg eigen vectors decomposition when eigen_solver == 'amg' and by + lobpcg eigenvectors decomposition when eigen_solver == 'amg' and by the K-Means initialization. Use an int to make the randomness deterministic. See :term:`Glossary `. n_init : int, default=10 Number of time the k-means algorithm will be run with different - centroid seeds. The final results will be the best output of - n_init consecutive runs in terms of inertia. + centroid seeds. The final results will be the best output of n_init + consecutive runs in terms of inertia. Only used if + ``assign_labels='kmeans'``. eigen_tol : float, default=0.0 Stopping criterion for eigendecomposition of the Laplacian matrix @@ -217,7 +218,7 @@ def spectral_clustering(affinity, *, n_clusters=8, n_components=None, assign_labels : {'kmeans', 'discretize'}, default='kmeans' The strategy to use to assign labels in the embedding - space. There are two ways to assign labels after the laplacian + space. There are two ways to assign labels after the Laplacian embedding. k-means can be applied and is a popular choice. But it can also be sensitive to initialization. Discretization is another approach which is less sensitive to random initialization. See @@ -265,7 +266,7 @@ def spectral_clustering(affinity, *, n_clusters=8, n_components=None, random_state = check_random_state(random_state) n_components = n_clusters if n_components is None else n_components - # The first eigen vector is constant only for fully connected graphs + # The first eigenvector is constant only for fully connected graphs # and should be kept for spectral clustering (drop_first = False) # See spectral_embedding documentation. maps = spectral_embedding(affinity, n_components=n_components, @@ -288,24 +289,24 @@ class SpectralClustering(ClusterMixin, BaseEstimator): """Apply clustering to a projection of the normalized Laplacian. In practice Spectral Clustering is very useful when the structure of - the individual clusters is highly non-convex or more generally when + the individual clusters is highly non-convex, or more generally when a measure of the center and spread of the cluster is not a suitable - description of the complete cluster. 
For instance when clusters are + description of the complete cluster, such as when clusters are nested circles on the 2D plane. - If affinity is the adjacency matrix of a graph, this method can be - used to find normalized graph cuts. + If the affinity matrix is the adjacency matrix of a graph, this method + can be used to find normalized graph cuts. When calling ``fit``, an affinity matrix is constructed using either - kernel function such the Gaussian (aka RBF) kernel of the euclidean - distanced ``d(X, X)``:: + a kernel function such the Gaussian (aka RBF) kernel with Euclidean + distance ``d(X, X)``:: np.exp(-gamma * d(X,X) ** 2) or a k-nearest neighbors connectivity matrix. - Alternatively, using ``precomputed``, a user-provided affinity - matrix can be used. + Alternatively, a user-provided affinity matrix can be specified by + setting ``affinity='precomputed'``. Read more in the :ref:`User Guide `. @@ -321,19 +322,20 @@ class SpectralClustering(ClusterMixin, BaseEstimator): used. n_components : int, default=n_clusters - Number of eigen vectors to use for the spectral embedding + Number of eigenvectors to use for the spectral embedding random_state : int, RandomState instance, default=None A pseudo random number generator used for the initialization of the - lobpcg eigen vectors decomposition when ``eigen_solver='amg'`` and by + lobpcg eigenvectors decomposition when ``eigen_solver='amg'`` and by the K-Means initialization. Use an int to make the randomness deterministic. See :term:`Glossary `. n_init : int, default=10 Number of time the k-means algorithm will be run with different - centroid seeds. The final results will be the best output of - n_init consecutive runs in terms of inertia. + centroid seeds. The final results will be the best output of n_init + consecutive runs in terms of inertia. Only used if + ``assign_labels='kmeans'``. gamma : float, default=1.0 Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels. @@ -341,14 +343,15 @@ class SpectralClustering(ClusterMixin, BaseEstimator): affinity : str or callable, default='rbf' How to construct the affinity matrix. - - 'nearest_neighbors' : construct the affinity matrix by computing a + - 'nearest_neighbors': construct the affinity matrix by computing a graph of nearest neighbors. - - 'rbf' : construct the affinity matrix using a radial basis function + - 'rbf': construct the affinity matrix using a radial basis function (RBF) kernel. - - 'precomputed' : interpret ``X`` as a precomputed affinity matrix. - - 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph - of precomputed nearest neighbors, and constructs the affinity matrix - by selecting the ``n_neighbors`` nearest neighbors. + - 'precomputed': interpret ``X`` as a precomputed affinity matrix, + where larger values indicate greater similarity between instances. + - 'precomputed_nearest_neighbors': interpret ``X`` as a sparse graph + of precomputed distances, and construct a binary affinity matrix + from the ``n_neighbors`` nearest neighbors of each instance. - one of the kernels supported by :func:`~sklearn.metrics.pairwise_kernels`. @@ -365,11 +368,11 @@ class SpectralClustering(ClusterMixin, BaseEstimator): when ``eigen_solver='arpack'``. assign_labels : {'kmeans', 'discretize'}, default='kmeans' - The strategy to use to assign labels in the embedding - space. There are two ways to assign labels after the laplacian - embedding. k-means can be applied and is a popular choice. But it can - also be sensitive to initialization. 
Discretization is another approach - which is less sensitive to random initialization. + The strategy for assigning labels in the embedding space. There are two + ways to assign labels after the Laplacian embedding. k-means is a + popular choice, but it can be sensitive to initialization. + Discretization is another approach which is less sensitive to random + initialization. degree : float, default=3 Degree of the polynomial kernel. Ignored by other kernels. @@ -398,7 +401,7 @@ class SpectralClustering(ClusterMixin, BaseEstimator): Attributes ---------- affinity_matrix_ : array-like of shape (n_samples, n_samples) - Affinity matrix used for clustering. Available only if after calling + Affinity matrix used for clustering. Available only after calling ``fit``. labels_ : ndarray of shape (n_samples,) @@ -411,7 +414,7 @@ class SpectralClustering(ClusterMixin, BaseEstimator): >>> X = np.array([[1, 1], [2, 1], [1, 0], ... [4, 7], [3, 5], [3, 6]]) >>> clustering = SpectralClustering(n_clusters=2, - ... assign_labels="discretize", + ... assign_labels='discretize', ... random_state=0).fit(X) >>> clustering.labels_ array([1, 1, 1, 0, 0, 0]) @@ -421,19 +424,18 @@ class SpectralClustering(ClusterMixin, BaseEstimator): Notes ----- - If you have an affinity matrix, such as a distance matrix, - for which 0 means identical elements, and high values means - very dissimilar elements, it can be transformed in a - similarity matrix that is well suited for the algorithm by - applying the Gaussian (RBF, heat) kernel:: + A distance matrix for which 0 indicates identical elements and high values + indicate very dissimilar elements can be transformed into an affinity / + similarity matrix that is well-suited for the algorithm by + applying the Gaussian (aka RBF, heat) kernel:: np.exp(- dist_matrix ** 2 / (2. * delta ** 2)) - Where ``delta`` is a free parameter representing the width of the Gaussian + where ``delta`` is a free parameter representing the width of the Gaussian kernel. - Another alternative is to take a symmetric version of the k - nearest neighbors connectivity matrix of the points. + An alternative is to take a symmetric version of the k-nearest neighbors + connectivity matrix of the points. If the pyamg package is installed, it is used: this greatly speeds up computation. @@ -480,13 +482,14 @@ def fit(self, X, y=None): Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ - array-like of shape (n_samples, n_samples) - Training instances to cluster, or similarities / affinities between - instances if ``affinity='precomputed'``. If a sparse matrix is - provided in a format other than ``csr_matrix``, ``csc_matrix``, - or ``coo_matrix``, it will be converted into a sparse - ``csr_matrix``. + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) + Training instances to cluster, similarities / affinities between + instances if ``affinity='precomputed'``, or distances between + instances if ``affinity='precomputed_nearest_neighbors``. If a + sparse matrix is provided in a format other than ``csr_matrix``, + ``csc_matrix``, or ``coo_matrix``, it will be converted into a + sparse ``csr_matrix``. y : Ignored Not used, present here for API consistency by convention. 
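# Illustrative sketch (not part of the diff above): one way to use the
# precomputed-affinity path documented in ``fit``. The Gaussian-kernel
# transform mirrors the Notes section of the docstring; ``delta`` is an
# arbitrary illustrative width, not a library default.
import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.metrics import pairwise_distances

rng = np.random.RandomState(0)
X = rng.rand(20, 2)

delta = 1.0  # free parameter: width of the Gaussian (RBF) kernel
dist_matrix = pairwise_distances(X)          # distances, 0 means identical
affinity = np.exp(-dist_matrix ** 2 / (2.0 * delta ** 2))  # similarities

labels = SpectralClustering(
    n_clusters=2, affinity="precomputed", random_state=0
).fit_predict(affinity)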
@@ -549,13 +552,14 @@ def fit_predict(self, X, y=None): Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ - array-like of shape (n_samples, n_samples) - Training instances to cluster, or similarities / affinities between - instances if ``affinity='precomputed'``. If a sparse matrix is - provided in a format other than ``csr_matrix``, ``csc_matrix``, - or ``coo_matrix``, it will be converted into a sparse - ``csr_matrix``. + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) + Training instances to cluster, similarities / affinities between + instances if ``affinity='precomputed'``, or distances between + instances if ``affinity='precomputed_nearest_neighbors``. If a + sparse matrix is provided in a format other than ``csr_matrix``, + ``csc_matrix``, or ``coo_matrix``, it will be converted into a + sparse ``csr_matrix``. y : Ignored Not used, present here for API consistency by convention. From 872052b9ab471cb336c448cf4e0aa968b49f9199 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 14 Apr 2021 08:30:25 +0100 Subject: [PATCH 327/478] FIX convert cv_results_ values to numpy array in SuccessiveHalving (#19211) Co-authored-by: Thomas J. Fan --- doc/whats_new/v0.24.rst | 5 +++++ sklearn/model_selection/_search.py | 4 ++++ sklearn/model_selection/tests/test_successive_halving.py | 6 ++++++ 3 files changed, 15 insertions(+) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 2cfe6970dd7b1..09f3d9bdecd3e 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -40,6 +40,11 @@ Changelog :class:`model_selection.GridSearchCV` now correctly shows the score for single metrics and verbose > 2. :pr:`19659` by `Thomas Fan`_. +- |Fix| Some values in the `cv_results_` attribute of + :class:`model_selection.HalvingRandomSearchCV` and + :class:`model_selection.HalvingGridSearchCV` were not properly converted to + numpy arrays. :pr:`19211` by `Nicolas Hug`_. + :mod:`sklearn.preprocessing` ............................ 
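# Illustrative sketch (not part of the diff): the behaviour targeted by the
# changelog entry above. After the fix, the extra successive-halving keys in
# ``cv_results_`` are numpy arrays like every other entry, so they can be
# consumed uniformly (for instance, placed into a pandas DataFrame).
import numpy as np
from sklearn.datasets import make_classification
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=100, random_state=0)
search = HalvingGridSearchCV(
    LinearSVC(random_state=0), {"C": [0.1, 1]}, random_state=0
).fit(X, y)

assert isinstance(search.cv_results_["iter"], np.ndarray)
assert isinstance(search.cv_results_["n_resources"], np.ndarray)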
diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index ebd085c08e68f..6e837a2f97b24 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -897,6 +897,10 @@ def _format_results(self, candidate_params, n_splits, out, out = _aggregate_score_dicts(out) results = dict(more_results or {}) + for key, val in results.items(): + # each value is a list (as per evaluate_candidate's convention) + # we convert it to an array for consistency with the other keys + results[key] = np.asarray(val) def _store(key_name, array, weights=None, splits=False, rank=False): """A small helper to store the scores/times to the cv_results_""" diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 6660b35a934ba..3abd7956938d1 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -445,6 +445,12 @@ def scorer(est, X, y): sh.set_params(n_candidates=2 * 30, min_resources='exhaust') sh.fit(X, y) + + # non-regression check for + # https://github.com/scikit-learn/scikit-learn/issues/19203 + assert isinstance(sh.cv_results_['iter'], np.ndarray) + assert isinstance(sh.cv_results_['n_resources'], np.ndarray) + cv_results_df = pd.DataFrame(sh.cv_results_) # just make sure we don't have ties From 684b7d1955e76c0621ca2e399df90e83e525a6f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Wed, 14 Apr 2021 13:34:22 +0200 Subject: [PATCH 328/478] FIX detect near constant feature in StandardScaler and linear models (#19788) Co-authored-by: Olivier Grisel --- doc/whats_new/v1.0.rst | 3 +- sklearn/linear_model/_base.py | 6 +- sklearn/preprocessing/_data.py | 19 ++++- sklearn/preprocessing/tests/test_data.py | 63 +++++++++++++++-- sklearn/utils/extmath.py | 42 +++++++++-- sklearn/utils/sparsefuncs_fast.pyx | 90 ++++++++++++++---------- 6 files changed, 166 insertions(+), 57 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 23211cd3a95b1..516af4b349c00 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -334,7 +334,8 @@ Changelog very large values. This problem happens in particular when using a scaler on sparse data with a constant column with sample weights, in which case centering is typically disabled. :pr:`19527` by :user:`Oliver Grisel - ` and :user:`Maria Telenczuk `. + ` and :user:`Maria Telenczuk ` and :pr:`19788` by + :user:`Jérémie du Boisberranger `. - |Fix| :meth:`preprocessing.StandardScaler.inverse_transform` now correctly handles integer dtypes. :pr:`19356` by :user:`makoeppel`. 
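# Illustrative sketch (not part of the diff): the scaling behaviour the
# changelog entries above describe. A column whose weighted variance is
# indistinguishable from zero is treated as constant, so its scale stays 1
# instead of becoming a tiny noisy value that would blow the feature up.
import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[1.0, 100.0],
              [2.0, 100.0],
              [3.0, 100.0]])
sample_weight = np.array([1.0, 2.0, 1.0])

# with_mean=False mimics the sparse-data setting where centering is disabled.
scaler = StandardScaler(with_mean=False).fit(X, sample_weight=sample_weight)

print(scaler.var_)    # second entry is ~0 (constant column)
print(scaler.scale_)  # second entry is exactly 1.0, so it is left unscaled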
diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index c80c2db622921..5783e4740a08c 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -28,6 +28,7 @@ from ..base import (BaseEstimator, ClassifierMixin, RegressorMixin, MultiOutputMixin) +from ..preprocessing._data import _is_constant_feature from ..utils import check_array from ..utils.validation import FLOAT_DTYPES from ..utils.validation import _deprecate_positional_args @@ -39,7 +40,6 @@ from ..utils._seq_dataset import ArrayDataset32, CSRDataset32 from ..utils._seq_dataset import ArrayDataset64, CSRDataset64 from ..utils.validation import check_is_fitted, _check_sample_weight - from ..utils.fixes import delayed # TODO: bayesian_ridge_regression and bayesian_regression_ard @@ -271,8 +271,8 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, X_var = X_var.astype(X.dtype, copy=False) # Detect constant features on the computed variance, before taking # the np.sqrt. Otherwise constant features cannot be detected with - # sample_weights. - constant_mask = X_var < 10 * np.finfo(X.dtype).eps + # sample weights. + constant_mask = _is_constant_feature(X_var, X_offset, X.shape[0]) X_var *= X.shape[0] X_scale = np.sqrt(X_var, out=X_var) X_scale[constant_mask] = 1. diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 6191fb2fd8bcd..80cb132174328 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -57,6 +57,22 @@ ] +def _is_constant_feature(var, mean, n_samples): + """Detect if a feature is indistinguishable from a constant feature. + + The detection is based on its computed variance and on the theoretical + error bounds of the '2 pass algorithm' for variance computation. + + See "Algorithms for computing the sample variance: analysis and + recommendations", by Chan, Golub, and LeVeque. + """ + # In scikit-learn, variance is always computed using float64 accumulators. + eps = np.finfo(np.float64).eps + + upper_bound = n_samples * eps * var + (n_samples * mean * eps)**2 + return var <= upper_bound + + def _handle_zeros_in_scale(scale, copy=True, constant_mask=None): """Set scales of near constant features to 1. @@ -863,7 +879,8 @@ def partial_fit(self, X, y=None, sample_weight=None): if self.with_std: # Extract the list of near constant features on the raw variances, # before taking the square root. - constant_mask = self.var_ < 10 * np.finfo(X.dtype).eps + constant_mask = _is_constant_feature( + self.var_, self.mean_, self.n_samples_seen_) self.scale_ = _handle_zeros_in_scale( np.sqrt(self.var_), copy=False, constant_mask=constant_mask) else: diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 5557562283850..45d967d5f39a2 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -224,13 +224,6 @@ def test_standard_scaler_dtype(add_sample_weight, sparse_constructor): @pytest.mark.parametrize("constant", [0, 1., 100.]) def test_standard_scaler_constant_features( scaler, add_sample_weight, sparse_constructor, dtype, constant): - if (isinstance(scaler, StandardScaler) - and constant > 1 - and sparse_constructor is not np.asarray - and add_sample_weight): - # https://github.com/scikit-learn/scikit-learn/issues/19546 - pytest.xfail("Computation of weighted variance is numerically unstable" - " for sparse data. 
See: #19546.") if isinstance(scaler, RobustScaler) and add_sample_weight: pytest.skip(f"{scaler.__class__.__name__} does not yet support" @@ -269,6 +262,62 @@ def test_standard_scaler_constant_features( assert_allclose(X_scaled_2, X_scaled_2) +@pytest.mark.parametrize("n_samples", [10, 100, 10_000]) +@pytest.mark.parametrize("average", [1e-10, 1, 1e10]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("array_constructor", + [np.asarray, sparse.csc_matrix, sparse.csr_matrix]) +def test_standard_scaler_near_constant_features(n_samples, array_constructor, + average, dtype): + # Check that when the variance is too small (var << mean**2) the feature + # is considered constant and not scaled. + + scale_min, scale_max = -30, 19 + scales = np.array([10**i for i in range(scale_min, scale_max + 1)], + dtype=dtype) + + n_features = scales.shape[0] + X = np.empty((n_samples, n_features), dtype=dtype) + # Make a dataset of known var = scales**2 and mean = average + X[:n_samples//2, :] = average + scales + X[n_samples//2:, :] = average - scales + X_array = array_constructor(X) + + scaler = StandardScaler(with_mean=False).fit(X_array) + + # StandardScaler uses float64 accumulators even if the data has a float32 + # dtype. + eps = np.finfo(np.float64).eps + + # if var < bound = N.eps.var + N².eps².mean², the feature is considered + # constant and the scale_ attribute is set to 1. + bounds = n_samples * eps * scales**2 + n_samples**2 * eps**2 * average**2 + within_bounds = scales**2 <= bounds + + # Check that scale_min is small enough to have some scales below the + # bound and therefore detected as constant: + assert np.any(within_bounds) + + # Check that such features are actually treated as constant by the scaler: + assert all(scaler.var_[within_bounds] <= bounds[within_bounds]) + assert_allclose(scaler.scale_[within_bounds], 1.) + + # Depending the on the dtype of X, some features might not actually be + # representable as non constant for small scales (even if above the + # precision bound of the float64 variance estimate). Such feature should + # be correctly detected as constants with 0 variance by StandardScaler. + representable_diff = X[0, :] - X[-1, :] != 0 + assert_allclose(scaler.var_[np.logical_not(representable_diff)], 0) + assert_allclose(scaler.scale_[np.logical_not(representable_diff)], 1) + + # The other features are scaled and scale_ is equal to sqrt(var_) assuming + # that scales are large enough for average + scale and average - scale to + # be distinct in X (depending on X's dtype). + common_mask = np.logical_and(scales**2 > bounds, representable_diff) + assert_allclose(scaler.scale_[common_mask], + np.sqrt(scaler.var_)[common_mask]) + + def test_scale_1d(): # 1-d inputs X_list = [1., 3., 5., 0.] diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 42a014dcd8ade..add8c5883a751 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -18,6 +18,7 @@ from . 
import check_random_state from ._logistic_sigmoid import _log_logistic_sigmoid +from .fixes import np_version, parse_version from .sparsefuncs_fast import csr_row_norms from .validation import check_array from .validation import _deprecate_positional_args @@ -767,10 +768,17 @@ def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count, # updated = the aggregated stats last_sum = last_mean * last_sample_count if sample_weight is not None: - new_sum = _safe_accumulator_op(np.nansum, X * sample_weight[:, None], - axis=0) - new_sample_count = np.sum(sample_weight[:, None] * (~np.isnan(X)), - axis=0) + if np_version >= parse_version("1.16.6"): + # equivalent to np.nansum(X * sample_weight, axis=0) + # safer because np.float64(X*W) != np.float64(X)*np.float64(W) + # dtype arg of np.matmul only exists since version 1.16 + new_sum = _safe_accumulator_op( + np.matmul, sample_weight, np.where(np.isnan(X), 0, X)) + else: + new_sum = _safe_accumulator_op( + np.nansum, X * sample_weight[:, None], axis=0) + new_sample_count = _safe_accumulator_op( + np.sum, sample_weight[:, None] * (~np.isnan(X)), axis=0) else: new_sum = _safe_accumulator_op(np.nansum, X, axis=0) new_sample_count = np.sum(~np.isnan(X), axis=0) @@ -784,10 +792,30 @@ def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count, else: T = new_sum / new_sample_count if sample_weight is not None: - new_unnormalized_variance = np.nansum(sample_weight[:, None] * - (X - T)**2, axis=0) + if np_version >= parse_version("1.16.6"): + # equivalent to np.nansum((X-T)**2 * sample_weight, axis=0) + # safer because np.float64(X*W) != np.float64(X)*np.float64(W) + # dtype arg of np.matmul only exists since version 1.16 + new_unnormalized_variance = _safe_accumulator_op( + np.matmul, sample_weight, + np.where(np.isnan(X), 0, (X - T)**2)) + correction = _safe_accumulator_op( + np.matmul, sample_weight, np.where(np.isnan(X), 0, X - T)) + else: + new_unnormalized_variance = _safe_accumulator_op( + np.nansum, (X - T)**2 * sample_weight[:, None], axis=0) + correction = _safe_accumulator_op( + np.nansum, (X - T) * sample_weight[:, None], axis=0) else: - new_unnormalized_variance = np.nansum((X - T)**2, axis=0) + new_unnormalized_variance = _safe_accumulator_op( + np.nansum, (X - T)**2, axis=0) + correction = _safe_accumulator_op(np.nansum, X - T, axis=0) + + # correction term of the corrected 2 pass algorithm. + # See "Algorithms for computing the sample variance: analysis + # and recommendations", by Chan, Golub, and LeVeque. + new_unnormalized_variance -= correction**2 / new_sample_count + last_unnormalized_variance = last_variance * last_sample_count with np.errstate(divide='ignore', invalid='ignore'): diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index 4a84c03eff86b..09677600cbbe4 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -57,6 +57,8 @@ def _csr_row_norms(np.ndarray[floating, ndim=1, mode="c"] X_data, def csr_mean_variance_axis0(X, weights=None, return_sum_weights=False): """Compute mean and variance along axis 0 on a CSR matrix + Uses a np.float64 accumulator. 
+ Parameters ---------- X : CSR sparse matrix, shape (n_samples, n_features) @@ -109,25 +111,18 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, np.npy_intp i unsigned long long row_ind integral col_ind - floating diff + np.float64_t diff # means[j] contains the mean of feature j - np.ndarray[floating, ndim=1] means + np.ndarray[np.float64_t, ndim=1] means = np.zeros(n_features) # variances[j] contains the variance of feature j - np.ndarray[floating, ndim=1] variances - - if floating is float: - dtype = np.float32 - else: - dtype = np.float64 + np.ndarray[np.float64_t, ndim=1] variances = np.zeros(n_features) - means = np.zeros(n_features, dtype=dtype) - variances = np.zeros_like(means, dtype=dtype) - - cdef: - np.ndarray[floating, ndim=1] sum_weights = np.full( - fill_value=np.sum(weights), shape=n_features, dtype=dtype) - np.ndarray[floating, ndim=1] sum_weights_nz = np.zeros( - shape=n_features, dtype=dtype) + np.ndarray[np.float64_t, ndim=1] sum_weights = np.full( + fill_value=np.sum(weights, dtype=np.float64), shape=n_features) + np.ndarray[np.float64_t, ndim=1] sum_weights_nz = np.zeros( + shape=n_features) + np.ndarray[np.float64_t, ndim=1] correction = np.zeros( + shape=n_features) np.ndarray[np.uint64_t, ndim=1] counts = np.full( fill_value=weights.shape[0], shape=n_features, dtype=np.uint64) @@ -138,7 +133,7 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, for i in range(X_indptr[row_ind], X_indptr[row_ind + 1]): col_ind = X_indices[i] if not isnan(X_data[i]): - means[col_ind] += (X_data[i] * weights[row_ind]) + means[col_ind] += (X_data[i]) * weights[row_ind] # sum of weights where X[:, col_ind] is non-zero sum_weights_nz[col_ind] += weights[row_ind] # number of non-zero elements of X[:, col_ind] @@ -157,21 +152,35 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, col_ind = X_indices[i] if not isnan(X_data[i]): diff = X_data[i] - means[col_ind] + # correction term of the corrected 2 pass algorithm. + # See "Algorithms for computing the sample variance: analysis + # and recommendations", by Chan, Golub, and LeVeque. + correction[col_ind] += diff * weights[row_ind] variances[col_ind] += diff * diff * weights[row_ind] for i in range(n_features): + if counts[i] != counts_nz[i]: + correction[i] -= (sum_weights[i] - sum_weights_nz[i]) * means[i] + correction[i] = correction[i]**2 / sum_weights[i] if counts[i] != counts_nz[i]: # only compute it when it's guaranteed to be non-zero to avoid # catastrophic cancellation. variances[i] += (sum_weights[i] - sum_weights_nz[i]) * means[i]**2 - variances[i] /= sum_weights[i] + variances[i] = (variances[i] - correction[i]) / sum_weights[i] - return means, variances, sum_weights + if floating is float: + return (np.array(means, dtype=np.float32), + np.array(variances, dtype=np.float32), + np.array(sum_weights, dtype=np.float32)) + else: + return means, variances, sum_weights def csc_mean_variance_axis0(X, weights=None, return_sum_weights=False): """Compute mean and variance along axis 0 on a CSC matrix + Uses a np.float64 accumulator. 
+ Parameters ---------- X : CSC sparse matrix, shape (n_samples, n_features) @@ -224,25 +233,18 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, np.npy_intp i unsigned long long col_ind integral row_ind - floating diff + np.float64_t diff # means[j] contains the mean of feature j - np.ndarray[floating, ndim=1] means + np.ndarray[np.float64_t, ndim=1] means = np.zeros(n_features) # variances[j] contains the variance of feature j - np.ndarray[floating, ndim=1] variances - - if floating is float: - dtype = np.float32 - else: - dtype = np.float64 + np.ndarray[np.float64_t, ndim=1] variances = np.zeros(n_features) - means = np.zeros(n_features, dtype=dtype) - variances = np.zeros_like(means, dtype=dtype) - - cdef: - np.ndarray[floating, ndim=1] sum_weights = np.full( - fill_value=np.sum(weights), shape=n_features, dtype=dtype) - np.ndarray[floating, ndim=1] sum_weights_nz = np.zeros( - shape=n_features, dtype=dtype) + np.ndarray[np.float64_t, ndim=1] sum_weights = np.full( + fill_value=np.sum(weights, dtype=np.float64), shape=n_features) + np.ndarray[np.float64_t, ndim=1] sum_weights_nz = np.zeros( + shape=n_features) + np.ndarray[np.float64_t, ndim=1] correction = np.zeros( + shape=n_features) np.ndarray[np.uint64_t, ndim=1] counts = np.full( fill_value=weights.shape[0], shape=n_features, dtype=np.uint64) @@ -253,7 +255,7 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, for i in range(X_indptr[col_ind], X_indptr[col_ind + 1]): row_ind = X_indices[i] if not isnan(X_data[i]): - means[col_ind] += (X_data[i] * weights[row_ind]) + means[col_ind] += (X_data[i]) * weights[row_ind] # sum of weights where X[:, col_ind] is non-zero sum_weights_nz[col_ind] += weights[row_ind] # number of non-zero elements of X[:, col_ind] @@ -272,16 +274,28 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, row_ind = X_indices[i] if not isnan(X_data[i]): diff = X_data[i] - means[col_ind] + # correction term of the corrected 2 pass algorithm. + # See "Algorithms for computing the sample variance: analysis + # and recommendations", by Chan, Golub, and LeVeque. + correction[col_ind] += diff * weights[row_ind] variances[col_ind] += diff * diff * weights[row_ind] for i in range(n_features): + if counts[i] != counts_nz[i]: + correction[i] -= (sum_weights[i] - sum_weights_nz[i]) * means[i] + correction[i] = correction[i]**2 / sum_weights[i] if counts[i] != counts_nz[i]: # only compute it when it's guaranteed to be non-zero to avoid # catastrophic cancellation. variances[i] += (sum_weights[i] - sum_weights_nz[i]) * means[i]**2 - variances[i] /= sum_weights[i] + variances[i] = (variances[i] - correction[i]) / sum_weights[i] - return means, variances, sum_weights + if floating is float: + return (np.array(means, dtype=np.float32), + np.array(variances, dtype=np.float32), + np.array(sum_weights, dtype=np.float32)) + else: + return means, variances, sum_weights def incr_mean_variance_axis0(X, last_mean, last_var, last_n, weights=None): From 138da7ea911274f34d28849337c2768d7e3a7a96 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 14 Apr 2021 17:21:57 +0200 Subject: [PATCH 329/478] MNT Use const memory views in DistanceMetric subclasses (#19883) Co-authored-by: Olivier Grisel Co-authored-by: Thomas J. 
Fan --- doc/whats_new/v1.0.rst | 6 ++ sklearn/cluster/_hierarchical_fast.pyx | 2 +- sklearn/cluster/tests/test_hierarchical.py | 44 +++++++++++- sklearn/neighbors/_dist_metrics.pxd | 10 +-- sklearn/neighbors/_dist_metrics.pyx | 71 ++++++++++---------- sklearn/neighbors/tests/test_dist_metrics.py | 35 +++++++--- 6 files changed, 117 insertions(+), 51 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 516af4b349c00..6c75ab511e21d 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -98,6 +98,9 @@ Changelog - |API| :class:`cluster.Birch` attributes, `fit_` and `partial_fit_`, are deprecated and will be removed in 1.2. :pr:`19297` by `Thomas Fan`_. +- |FIX| :class:`cluster.AgglomerativeClustering` now supports readonly + memory-mapped datasets. :pr:`19883` by `Julien Jerphanion `. + :mod:`sklearn.compose` ...................... @@ -306,6 +309,9 @@ Changelog :pr:`19473` by :user:`jiefangxuanyan ` and :user:`Julien Jerphanion `. +- |FIX| :class:`neighbors.DistanceMetric` subclasses now support readonly + memory-mapped datasets. :pr:`19883` by `Julien Jerphanion `. + :mod:`sklearn.pipeline` ....................... diff --git a/sklearn/cluster/_hierarchical_fast.pyx b/sklearn/cluster/_hierarchical_fast.pyx index ec8c96410c25c..2a58757ce327d 100644 --- a/sklearn/cluster/_hierarchical_fast.pyx +++ b/sklearn/cluster/_hierarchical_fast.pyx @@ -455,7 +455,7 @@ def single_linkage_label(L): @cython.boundscheck(False) @cython.nonecheck(False) def mst_linkage_core( - DTYPE_t [:, ::1] raw_data, + const DTYPE_t [:, ::1] raw_data, DistanceMetric dist_metric): """ Compute the necessary elements of a minimum spanning diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 1f835a52f0105..513dbf8e9218e 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -5,6 +5,7 @@ # Authors: Vincent Michel, 2010, Gael Varoquaux 2012, # Matteo Visconti di Oleggio Castello 2014 # License: BSD 3 clause +import itertools from tempfile import mkdtemp import shutil import pytest @@ -15,7 +16,11 @@ from scipy.cluster import hierarchy from sklearn.metrics.cluster import adjusted_rand_score -from sklearn.utils._testing import assert_almost_equal +from sklearn.neighbors.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS +from sklearn.utils._testing import ( + assert_almost_equal, + create_memmap_backed_data +) from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import ignore_warnings @@ -28,8 +33,12 @@ from sklearn.metrics.pairwise import PAIRED_DISTANCES, cosine_distances,\ manhattan_distances, pairwise_distances from sklearn.metrics.cluster import normalized_mutual_info_score -from sklearn.neighbors import kneighbors_graph -from sklearn.cluster._hierarchical_fast import average_merge, max_merge +from sklearn.neighbors import kneighbors_graph, DistanceMetric +from sklearn.cluster._hierarchical_fast import ( + average_merge, + max_merge, + mst_linkage_core +) from sklearn.utils._fast_dict import IntFloatDict from sklearn.utils._testing import assert_array_equal from sklearn.datasets import make_moons, make_circles @@ -264,6 +273,16 @@ def test_agglomerative_clustering(): assert_array_equal(clustering.labels_, clustering2.labels_) +def test_agglomerative_clustering_memory_mapped(): + """AgglomerativeClustering must work on mem-mapped dataset. + + Non-regression test for issue #19875. 
+ """ + rng = np.random.RandomState(0) + Xmm = create_memmap_backed_data(rng.randn(50, 100)) + AgglomerativeClustering(affinity="euclidean", linkage="single").fit(Xmm) + + def test_ward_agglomeration(): # Check that we obtain the correct solution in a simplistic case rng = np.random.RandomState(0) @@ -375,6 +394,25 @@ def test_vector_scikit_single_vs_scipy_single(seed): assess_same_labelling(cut, cut_scipy) +@pytest.mark.parametrize('metric', METRICS_DEFAULT_PARAMS) +def test_mst_linkage_core_memory_mapped(metric): + """The MST-LINKAGE-CORE algorithm must work on mem-mapped dataset. + + Non-regression test for issue #19875. + """ + rng = np.random.RandomState(seed=1) + X = rng.normal(size=(20, 4)) + Xmm = create_memmap_backed_data(X) + argdict = METRICS_DEFAULT_PARAMS[metric] + keys = argdict.keys() + for vals in itertools.product(*argdict.values()): + kwargs = dict(zip(keys, vals)) + distance_metric = DistanceMetric.get_metric(metric, **kwargs) + mst = mst_linkage_core(X, distance_metric) + mst_mm = mst_linkage_core(Xmm, distance_metric) + np.testing.assert_equal(mst, mst_mm) + + def test_identical_points(): # Ensure identical points are handled correctly when using mst with # a sparse connectivity matrix diff --git a/sklearn/neighbors/_dist_metrics.pxd b/sklearn/neighbors/_dist_metrics.pxd index 89c63cc46905f..856d5bb2dde5b 100644 --- a/sklearn/neighbors/_dist_metrics.pxd +++ b/sklearn/neighbors/_dist_metrics.pxd @@ -15,7 +15,7 @@ from ._typedefs import DTYPE, ITYPE # # We use these for the default (euclidean) case so that they can be # inlined. This leads to faster computation for the most common case -cdef inline DTYPE_t euclidean_dist(DTYPE_t* x1, DTYPE_t* x2, +cdef inline DTYPE_t euclidean_dist(const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef DTYPE_t tmp, d=0 cdef np.intp_t j @@ -25,7 +25,7 @@ cdef inline DTYPE_t euclidean_dist(DTYPE_t* x1, DTYPE_t* x2, return sqrt(d) -cdef inline DTYPE_t euclidean_rdist(DTYPE_t* x1, DTYPE_t* x2, +cdef inline DTYPE_t euclidean_rdist(const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef DTYPE_t tmp, d=0 cdef np.intp_t j @@ -35,11 +35,11 @@ cdef inline DTYPE_t euclidean_rdist(DTYPE_t* x1, DTYPE_t* x2, return d -cdef inline DTYPE_t euclidean_dist_to_rdist(DTYPE_t dist) nogil except -1: +cdef inline DTYPE_t euclidean_dist_to_rdist(const DTYPE_t dist) nogil except -1: return dist * dist -cdef inline DTYPE_t euclidean_rdist_to_dist(DTYPE_t dist) nogil except -1: +cdef inline DTYPE_t euclidean_rdist_to_dist(const DTYPE_t dist) nogil except -1: return sqrt(dist) @@ -61,7 +61,7 @@ cdef class DistanceMetric: cdef object func cdef object kwargs - cdef DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1 cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, diff --git a/sklearn/neighbors/_dist_metrics.pyx b/sklearn/neighbors/_dist_metrics.pyx index 4cc41d7136586..398591bcdf49f 100755 --- a/sklearn/neighbors/_dist_metrics.pyx +++ b/sklearn/neighbors/_dist_metrics.pyx @@ -300,7 +300,7 @@ cdef class DistanceMetric: """ return - cdef DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: """Compute the distance between vectors x1 and x2 @@ -308,7 +308,7 @@ cdef class DistanceMetric: """ return -999 - cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: """Compute 
the reduced distance between vectors x1 and x2. @@ -321,7 +321,7 @@ cdef class DistanceMetric: """ return self.dist(x1, x2, size) - cdef int pdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1: + cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1: """compute the pairwise distances between points in X""" cdef ITYPE_t i1, i2 for i1 in range(X.shape[0]): @@ -330,7 +330,7 @@ cdef class DistanceMetric: D[i2, i1] = D[i1, i2] return 0 - cdef int cdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] Y, + cdef int cdist(self, const DTYPE_t[:, ::1] X, const DTYPE_t[:, ::1] Y, DTYPE_t[:, ::1] D) except -1: """compute the cross-pairwise distances between arrays X and Y""" cdef ITYPE_t i1, i2 @@ -423,11 +423,11 @@ cdef class EuclideanDistance(DistanceMetric): def __init__(self): self.p = 2 - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: return euclidean_dist(x1, x2, size) - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: return euclidean_rdist(x1, x2, size) @@ -463,7 +463,7 @@ cdef class SEuclideanDistance(DistanceMetric): if X.shape[1] != self.size: raise ValueError('SEuclidean dist: size of V does not match') - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef DTYPE_t tmp, d=0 cdef np.intp_t j @@ -472,7 +472,7 @@ cdef class SEuclideanDistance(DistanceMetric): d += tmp * tmp / self.vec_ptr[j] return d - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: return sqrt(self.rdist(x1, x2, size)) @@ -501,7 +501,7 @@ cdef class ManhattanDistance(DistanceMetric): def __init__(self): self.p = 1 - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef DTYPE_t d = 0 cdef np.intp_t j @@ -534,7 +534,7 @@ cdef class ChebyshevDistance(DistanceMetric): def __init__(self): self.p = INF - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef DTYPE_t d = 0 cdef np.intp_t j @@ -565,7 +565,7 @@ cdef class MinkowskiDistance(DistanceMetric): "For p=inf, use ChebyshevDistance.") self.p = p - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef DTYPE_t d=0 cdef np.intp_t j @@ -573,7 +573,7 @@ cdef class MinkowskiDistance(DistanceMetric): d += pow(fabs(x1[j] - x2[j]), self.p) return d - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: return pow(self.rdist(x1, x2, size), 1. 
/ self.p) @@ -625,7 +625,7 @@ cdef class WMinkowskiDistance(DistanceMetric): raise ValueError('WMinkowskiDistance dist: ' 'size of w does not match') - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef DTYPE_t d=0 cdef np.intp_t j @@ -633,7 +633,7 @@ cdef class WMinkowskiDistance(DistanceMetric): d += pow(self.vec_ptr[j] * fabs(x1[j] - x2[j]), self.p) return d - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: return pow(self.rdist(x1, x2, size), 1. / self.p) @@ -690,7 +690,7 @@ cdef class MahalanobisDistance(DistanceMetric): if X.shape[1] != self.size: raise ValueError('Mahalanobis dist: size of V does not match') - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef DTYPE_t tmp, d = 0 cdef np.intp_t i, j @@ -706,7 +706,7 @@ cdef class MahalanobisDistance(DistanceMetric): d += tmp * self.vec_ptr[i] return d - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: return sqrt(self.rdist(x1, x2, size)) @@ -735,7 +735,7 @@ cdef class HammingDistance(DistanceMetric): .. math:: D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i} """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef int n_unequal = 0 cdef np.intp_t j @@ -757,7 +757,7 @@ cdef class CanberraDistance(DistanceMetric): .. math:: D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|} """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef DTYPE_t denom, d = 0 cdef np.intp_t j @@ -780,7 +780,7 @@ cdef class BrayCurtisDistance(DistanceMetric): .. math:: D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)} """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef DTYPE_t num = 0, denom = 0 cdef np.intp_t j @@ -806,7 +806,7 @@ cdef class JaccardDistance(DistanceMetric): .. math:: D(x, y) = \frac{N_{TF} + N_{FT}}{N_{TT} + N_{TF} + N_{FT}} """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef int tf1, tf2, n_eq = 0, nnz = 0 cdef np.intp_t j @@ -836,7 +836,7 @@ cdef class MatchingDistance(DistanceMetric): .. math:: D(x, y) = \frac{N_{TF} + N_{FT}}{N} """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef int tf1, tf2, n_neq = 0 cdef np.intp_t j @@ -860,7 +860,7 @@ cdef class DiceDistance(DistanceMetric): .. math:: D(x, y) = \frac{N_{TF} + N_{FT}}{2 * N_{TT} + N_{TF} + N_{FT}} """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef int tf1, tf2, n_neq = 0, ntt = 0 cdef np.intp_t j @@ -885,7 +885,7 @@ cdef class KulsinskiDistance(DistanceMetric): .. 
math:: D(x, y) = 1 - \frac{N_{TT}}{N + N_{TF} + N_{FT}} """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef int tf1, tf2, ntt = 0, n_neq = 0 cdef np.intp_t j @@ -910,7 +910,7 @@ cdef class RogersTanimotoDistance(DistanceMetric): .. math:: D(x, y) = \frac{2 (N_{TF} + N_{FT})}{N + N_{TF} + N_{FT}} """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef int tf1, tf2, n_neq = 0 cdef np.intp_t j @@ -934,7 +934,7 @@ cdef class RussellRaoDistance(DistanceMetric): .. math:: D(x, y) = \frac{N - N_{TT}}{N} """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef int tf1, tf2, ntt = 0 cdef np.intp_t j @@ -958,7 +958,7 @@ cdef class SokalMichenerDistance(DistanceMetric): .. math:: D(x, y) = \frac{2 (N_{TF} + N_{FT})}{N + N_{TF} + N_{FT}} """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef int tf1, tf2, n_neq = 0 cdef np.intp_t j @@ -982,7 +982,7 @@ cdef class SokalSneathDistance(DistanceMetric): .. math:: D(x, y) = \frac{N_{TF} + N_{FT}}{N_{TT} / 2 + N_{TF} + N_{FT}} """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef int tf1, tf2, ntt = 0, n_neq = 0 cdef np.intp_t j @@ -1016,13 +1016,13 @@ cdef class HaversineDistance(DistanceMetric): raise ValueError("Haversine distance only valid " "in 2 dimensions") - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef DTYPE_t sin_0 = sin(0.5 * (x1[0] - x2[0])) cdef DTYPE_t sin_1 = sin(0.5 * (x1[1] - x2[1])) return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1) - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: return 2 * asin(sqrt(self.rdist(x1, x2, size))) @@ -1047,7 +1047,8 @@ cdef class HaversineDistance(DistanceMetric): # [This is not a true metric, so we will leave it out.] # #cdef class YuleDistance(DistanceMetric): -# cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, ITYPE_t size): +# cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, +# ITYPE_t size): # cdef int tf1, tf2, ntf = 0, nft = 0, ntt = 0, nff = 0 # cdef np.intp_t j # for j in range(size): @@ -1066,7 +1067,8 @@ cdef class HaversineDistance(DistanceMetric): # [This is not a true metric, so we will leave it out.] # #cdef class CosineDistance(DistanceMetric): -# cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, ITYPE_t size): +# cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, +# ITYPE_t size): # cdef DTYPE_t d = 0, norm1 = 0, norm2 = 0 # cdef np.intp_t j # for j in range(size): @@ -1082,7 +1084,8 @@ cdef class HaversineDistance(DistanceMetric): # [This is not a true metric, so we will leave it out.] 
# #cdef class CorrelationDistance(DistanceMetric): -# cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, ITYPE_t size): +# cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, +# ITYPE_t size): # cdef DTYPE_t mu1 = 0, mu2 = 0, x1nrm = 0, x2nrm = 0, x1Tx2 = 0 # cdef DTYPE_t tmp1, tmp2 # @@ -1125,11 +1128,11 @@ cdef class PyFuncDistance(DistanceMetric): # allowed in cython >= 0.26 since it is a redundant GIL acquisition. The # only way to be back compatible is to inherit `dist` from the base class # without GIL and called an inline `_dist` which acquire GIL. - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: return self._dist(x1, x2, size) - cdef inline DTYPE_t _dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t _dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) except -1 with gil: cdef np.ndarray x1arr cdef np.ndarray x2arr diff --git a/sklearn/neighbors/tests/test_dist_metrics.py b/sklearn/neighbors/tests/test_dist_metrics.py index 05e0f4294ebb6..07705e93c3390 100644 --- a/sklearn/neighbors/tests/test_dist_metrics.py +++ b/sklearn/neighbors/tests/test_dist_metrics.py @@ -10,6 +10,7 @@ from sklearn.neighbors import DistanceMetric from sklearn.neighbors import BallTree from sklearn.utils import check_random_state +from sklearn.utils._testing import create_memmap_backed_data from sklearn.utils.fixes import sp_version, parse_version @@ -24,10 +25,15 @@ def dist_func(x1, x2, p): X1 = rng.random_sample((n1, d)).astype('float64', copy=False) X2 = rng.random_sample((n2, d)).astype('float64', copy=False) +[X1_mmap, X2_mmap] = create_memmap_backed_data([X1, X2]) + # make boolean arrays: ones and zeros X1_bool = X1.round(0) X2_bool = X2.round(0) +[X1_bool_mmap, X2_bool_mmap] = create_memmap_backed_data([X1_bool, X2_bool]) + + V = rng.random_sample((d, d)) VI = np.dot(V, V.T) @@ -47,14 +53,18 @@ def dist_func(x1, x2, p): 'canberra': {}, 'braycurtis': {}} - @pytest.mark.parametrize('metric', METRICS_DEFAULT_PARAMS) -def test_cdist(metric): +@pytest.mark.parametrize('X1, X2', [(X1, X2), (X1_mmap, X2_mmap)]) +def test_cdist(metric, X1, X2): argdict = METRICS_DEFAULT_PARAMS[metric] keys = argdict.keys() for vals in itertools.product(*argdict.values()): kwargs = dict(zip(keys, vals)) - if metric == "wminkowski": + if metric == "mahalanobis": + # See: https://github.com/scipy/scipy/issues/13861 + pytest.xfail("scipy#13861: cdist with 'mahalanobis' fails on" + "memmap data") + elif metric == "wminkowski": if sp_version >= parse_version("1.8.0"): pytest.skip("wminkowski will be removed in SciPy 1.8.0") @@ -71,7 +81,9 @@ def test_cdist(metric): @pytest.mark.parametrize('metric', BOOL_METRICS) -def test_cdist_bool_metric(metric): +@pytest.mark.parametrize('X1_bool, X2_bool', [(X1_bool, X2_bool), + (X1_bool_mmap, X2_bool_mmap)]) +def test_cdist_bool_metric(metric, X1_bool, X2_bool): D_true = cdist(X1_bool, X2_bool, metric) check_cdist_bool(metric, D_true) @@ -89,12 +101,17 @@ def check_cdist_bool(metric, D_true): @pytest.mark.parametrize('metric', METRICS_DEFAULT_PARAMS) -def test_pdist(metric): +@pytest.mark.parametrize('X1, X2', [(X1, X2), (X1_mmap, X2_mmap)]) +def test_pdist(metric, X1, X2): argdict = METRICS_DEFAULT_PARAMS[metric] keys = argdict.keys() for vals in itertools.product(*argdict.values()): kwargs = dict(zip(keys, vals)) - if metric == "wminkowski": + if metric == "mahalanobis": + # See: 
https://github.com/scipy/scipy/issues/13861 + pytest.xfail("scipy#13861: pdist with 'mahalanobis' fails on" + "memmap data") + elif metric == "wminkowski": if sp_version >= parse_version("1.8.0"): pytest.skip("wminkowski will be removed in SciPy 1.8.0") @@ -111,7 +128,8 @@ def test_pdist(metric): @pytest.mark.parametrize('metric', BOOL_METRICS) -def test_pdist_bool_metrics(metric): +@pytest.mark.parametrize('X1_bool', [X1_bool, X1_bool_mmap]) +def test_pdist_bool_metrics(metric, X1_bool): D_true = cdist(X1_bool, X1_bool, metric) check_pdist_bool(metric, D_true) @@ -143,7 +161,8 @@ def test_pickle(metric): @pytest.mark.parametrize('metric', BOOL_METRICS) -def test_pickle_bool_metrics(metric): +@pytest.mark.parametrize('X1_bool', [X1_bool, X1_bool_mmap]) +def test_pickle_bool_metrics(metric, X1_bool): dm = DistanceMetric.get_metric(metric) D1 = dm.pairwise(X1_bool) dm2 = pickle.loads(pickle.dumps(dm)) From 8c4589b23c6481f978d4cfab511f25b77a805f13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Thu, 15 Apr 2021 08:32:22 +0200 Subject: [PATCH 330/478] ENH Scalable MiniBatchKMeans plus cln / fixes / refactoring (#17622) --- doc/whats_new/v1.0.rst | 22 +- ...{_k_means_fast.pxd => _k_means_common.pxd} | 0 ...{_k_means_fast.pyx => _k_means_common.pyx} | 118 +--- sklearn/cluster/_k_means_elkan.pyx | 14 +- sklearn/cluster/_k_means_lloyd.pyx | 10 +- sklearn/cluster/_k_means_minibatch.pyx | 228 ++++++ sklearn/cluster/_kmeans.py | 655 +++++++++--------- sklearn/cluster/setup.py | 9 +- sklearn/cluster/tests/test_k_means.py | 259 ++++--- 9 files changed, 736 insertions(+), 579 deletions(-) rename sklearn/cluster/{_k_means_fast.pxd => _k_means_common.pxd} (100%) rename sklearn/cluster/{_k_means_fast.pyx => _k_means_common.pyx} (67%) create mode 100644 sklearn/cluster/_k_means_minibatch.pyx diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 6c75ab511e21d..5975177f7a0c8 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -95,12 +95,30 @@ Changelog in multicore settings. :pr:`19052` by :user:`Yusuke Nagasaka `. -- |API| :class:`cluster.Birch` attributes, `fit_` and `partial_fit_`, are - deprecated and will be removed in 1.2. :pr:`19297` by `Thomas Fan`_. +- |Efficiency| :class:`cluster.MiniBatchKMeans` is now faster in multicore + settings. :pr:`17622` by :user:`Jérémie du Boisberranger `. + +- |Fix| Fixed a bug in :class:`cluster.MiniBatchKMeans` where the sample + weights were partially ignored when the input is sparse. :pr:`17622` by + :user:`Jérémie du Boisberranger `. +- |Fix| Improved convergence detection based on center change in + :class:`cluster.MiniBatchKMeans` which was almost never achievable. + :pr:`17622` by :user:`Jérémie du Boisberranger `. + - |FIX| :class:`cluster.AgglomerativeClustering` now supports readonly memory-mapped datasets. :pr:`19883` by `Julien Jerphanion `. +- |API| :class:`cluster.Birch` attributes, `fit_` and `partial_fit_`, are + deprecated and will be removed in 1.2. :pr:`19297` by `Thomas Fan`_. + +- |API| the default value for the `batch_size` parameter of + :class:`MiniBatchKMeans` was changed from 100 to 1024 due to efficiency + reasons. The `n_iter_` attribute of :class:`MiniBatchKMeans` now reports the + number of started epochs and the `n_steps_` attribute reports the number of + mini batches processed. :pr:`17622` + by :user:`Jérémie du Boisberranger `. + :mod:`sklearn.compose` ...................... 
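A minimal sketch of the user-facing effect of the reworked iteration counters described in the changelog entries above, assuming this branch is installed; the dataset and parameter values below are illustrative and not taken from the patch:

    import numpy as np
    from sklearn.cluster import MiniBatchKMeans

    rng = np.random.RandomState(0)
    X = rng.normal(size=(10000, 5))

    km = MiniBatchKMeans(n_clusters=8, batch_size=1024, random_state=0).fit(X)

    # n_steps_ counts the mini-batches actually processed, while n_iter_ now
    # counts started epochs over the full dataset, i.e. roughly
    # ceil(n_steps_ * batch_size / n_samples) as computed at the end of the
    # reworked fit loop.
    print(km.n_steps_, km.n_iter_)
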
diff --git a/sklearn/cluster/_k_means_fast.pxd b/sklearn/cluster/_k_means_common.pxd similarity index 100% rename from sklearn/cluster/_k_means_fast.pxd rename to sklearn/cluster/_k_means_common.pxd diff --git a/sklearn/cluster/_k_means_fast.pyx b/sklearn/cluster/_k_means_common.pyx similarity index 67% rename from sklearn/cluster/_k_means_fast.pyx rename to sklearn/cluster/_k_means_common.pyx index 21bf7dd9bf65a..373be241dd013 100644 --- a/sklearn/cluster/_k_means_fast.pyx +++ b/sklearn/cluster/_k_means_common.pyx @@ -14,8 +14,8 @@ import numpy as np cimport numpy as np -cimport cython from cython cimport floating +from cython.parallel cimport prange from libc.math cimport sqrt from ..utils.extmath import row_norms @@ -23,9 +23,6 @@ from ..utils.extmath import row_norms np.import_array() -ctypedef np.float64_t DOUBLE -ctypedef np.int32_t INT - # Number of samples per data chunk defined as a global constant. CHUNK_SIZE = 256 @@ -103,7 +100,8 @@ cpdef floating _inertia_dense( np.ndarray[floating, ndim=2, mode='c'] X, # IN floating[::1] sample_weight, # IN floating[:, ::1] centers, # IN - int[::1] labels): # IN + int[::1] labels, # IN + int n_threads): """Compute inertia for dense input data Sum of squared distance between each sample and its assigned center. @@ -116,7 +114,8 @@ cpdef floating _inertia_dense( floating sq_dist = 0.0 floating inertia = 0.0 - for i in range(n_samples): + for i in prange(n_samples, nogil=True, num_threads=n_threads, + schedule='static'): j = labels[i] sq_dist = _euclidean_dense_dense(&X[i, 0], ¢ers[j, 0], n_features, True) @@ -129,7 +128,8 @@ cpdef floating _inertia_sparse( X, # IN floating[::1] sample_weight, # IN floating[:, ::1] centers, # IN - int[::1] labels): # IN + int[::1] labels, # IN + int n_threads): """Compute inertia for sparse input data Sum of squared distance between each sample and its assigned center. @@ -148,7 +148,8 @@ cpdef floating _inertia_sparse( floating[::1] centers_squared_norms = row_norms(centers, squared=True) - for i in range(n_samples): + for i in prange(n_samples, nogil=True, num_threads=n_threads, + schedule='static'): j = labels[i] sq_dist = _euclidean_sparse_dense( X_data[X_indptr[i]: X_indptr[i + 1]], @@ -286,104 +287,3 @@ cdef void _center_shift( for j in range(n_clusters): center_shift[j] = _euclidean_dense_dense( ¢ers_new[j, 0], ¢ers_old[j, 0], n_features, False) - - -def _mini_batch_update_csr(X, np.ndarray[floating, ndim=1] sample_weight, - np.ndarray[floating, ndim=1] x_squared_norms, - np.ndarray[floating, ndim=2] centers, - np.ndarray[floating, ndim=1] weight_sums, - np.ndarray[INT, ndim=1] nearest_center, - np.ndarray[floating, ndim=1] old_center, - int compute_squared_diff): - """Incremental update of the centers for sparse MiniBatchKMeans. - - Parameters - ---------- - - X : CSR matrix, dtype float - The complete (pre allocated) training set as a CSR matrix. - - centers : array, shape (n_clusters, n_features) - The cluster centers - - counts : array, shape (n_clusters,) - The vector in which we keep track of the numbers of elements in a - cluster - - Returns - ------- - inertia : float - The inertia of the batch prior to centers update, i.e. the sum - of squared distances to the closest center for each sample. This - is the objective function being minimized by the k-means algorithm. - - squared_diff : float - The sum of squared update (squared norm of the centers position - change). If compute_squared_diff is 0, this computation is skipped and - 0.0 is returned instead. 
- - Both squared diff and inertia are commonly used to monitor the convergence - of the algorithm. - """ - cdef: - np.ndarray[floating, ndim=1] X_data = X.data - np.ndarray[int, ndim=1] X_indices = X.indices - np.ndarray[int, ndim=1] X_indptr = X.indptr - unsigned int n_samples = X.shape[0] - unsigned int n_clusters = centers.shape[0] - unsigned int n_features = centers.shape[1] - - unsigned int sample_idx, center_idx, feature_idx - unsigned int k - DOUBLE old_weight_sum, new_weight_sum - DOUBLE center_diff - DOUBLE squared_diff = 0.0 - - # move centers to the mean of both old and newly assigned samples - for center_idx in range(n_clusters): - old_weight_sum = weight_sums[center_idx] - new_weight_sum = old_weight_sum - - # count the number of samples assigned to this center - for sample_idx in range(n_samples): - if nearest_center[sample_idx] == center_idx: - new_weight_sum += sample_weight[sample_idx] - - if new_weight_sum == old_weight_sum: - # no new sample: leave this center as it stands - continue - - # rescale the old center to reflect it previous accumulated weight - # with regards to the new data that will be incrementally contributed - if compute_squared_diff: - old_center[:] = centers[center_idx] - centers[center_idx] *= old_weight_sum - - # iterate of over samples assigned to this cluster to move the center - # location by inplace summation - for sample_idx in range(n_samples): - if nearest_center[sample_idx] != center_idx: - continue - - # inplace sum with new samples that are members of this cluster - # and update of the incremental squared difference update of the - # center position - for k in range(X_indptr[sample_idx], X_indptr[sample_idx + 1]): - centers[center_idx, X_indices[k]] += X_data[k] - - # inplace rescale center with updated count - if new_weight_sum > old_weight_sum: - # update the count statistics for this center - weight_sums[center_idx] = new_weight_sum - - # re-scale the updated center with the total new counts - centers[center_idx] /= new_weight_sum - - # update the incremental computation of the squared total - # centers position change - if compute_squared_diff: - for feature_idx in range(n_features): - squared_diff += (old_center[feature_idx] - - centers[center_idx, feature_idx]) ** 2 - - return squared_diff diff --git a/sklearn/cluster/_k_means_elkan.pyx b/sklearn/cluster/_k_means_elkan.pyx index 1010e581f5e7f..84464d78fe244 100644 --- a/sklearn/cluster/_k_means_elkan.pyx +++ b/sklearn/cluster/_k_means_elkan.pyx @@ -18,13 +18,13 @@ from libc.stdlib cimport calloc, free from libc.string cimport memset, memcpy from ..utils.extmath import row_norms -from ._k_means_fast import CHUNK_SIZE -from ._k_means_fast cimport _relocate_empty_clusters_dense -from ._k_means_fast cimport _relocate_empty_clusters_sparse -from ._k_means_fast cimport _euclidean_dense_dense -from ._k_means_fast cimport _euclidean_sparse_dense -from ._k_means_fast cimport _average_centers -from ._k_means_fast cimport _center_shift +from ._k_means_common import CHUNK_SIZE +from ._k_means_common cimport _relocate_empty_clusters_dense +from ._k_means_common cimport _relocate_empty_clusters_sparse +from ._k_means_common cimport _euclidean_dense_dense +from ._k_means_common cimport _euclidean_sparse_dense +from ._k_means_common cimport _average_centers +from ._k_means_common cimport _center_shift np.import_array() diff --git a/sklearn/cluster/_k_means_lloyd.pyx b/sklearn/cluster/_k_means_lloyd.pyx index 255f4f470a648..7cb7d2abb728e 100644 --- a/sklearn/cluster/_k_means_lloyd.pyx +++ 
b/sklearn/cluster/_k_means_lloyd.pyx @@ -11,16 +11,16 @@ cimport numpy as np from cython cimport floating from cython.parallel import prange, parallel from libc.stdlib cimport malloc, calloc, free -from libc.string cimport memset, memcpy +from libc.string cimport memset from libc.float cimport DBL_MAX, FLT_MAX from ..utils.extmath import row_norms from ..utils._cython_blas cimport _gemm from ..utils._cython_blas cimport RowMajor, Trans, NoTrans -from ._k_means_fast import CHUNK_SIZE -from ._k_means_fast cimport _relocate_empty_clusters_dense -from ._k_means_fast cimport _relocate_empty_clusters_sparse -from ._k_means_fast cimport _average_centers, _center_shift +from ._k_means_common import CHUNK_SIZE +from ._k_means_common cimport _relocate_empty_clusters_dense +from ._k_means_common cimport _relocate_empty_clusters_sparse +from ._k_means_common cimport _average_centers, _center_shift np.import_array() diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx new file mode 100644 index 0000000000000..ab5aee35ea075 --- /dev/null +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -0,0 +1,228 @@ +# cython: profile=True, boundscheck=False, wraparound=False, cdivision=True + +# TODO: We still need to use ndarrays instead of typed memoryviews when using +# fused types and when the array may be read-only (for instance when it's +# provided by the user). This will be fixed in cython >= 0.3. + +cimport numpy as np +from cython cimport floating +from cython.parallel cimport parallel, prange +from libc.stdlib cimport malloc, free + + +np.import_array() + + +def _minibatch_update_dense( + np.ndarray[floating, ndim=2, mode="c"] X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + int[::1] labels, # IN + int n_threads): + """Update of the centers for dense MiniBatchKMeans. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features), dtype=floating + The observations to cluster. + + sample_weight : ndarray of shape (n_samples,), dtype=floating + The weights for each observation in X. + + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. + + centers_new : ndarray of shape (n_clusters, n_features), dtype=floating + Centers after previous iteration, placeholder for the new centers + computed during this iteration. + + weight_sums : ndarray of shape (n_clusters,), dtype=floating + Current sums of the accumulated weights for each center. + + labels : ndarray of shape (n_samples,), dtype=int + labels assignment. + + n_threads : int + The number of threads to be used by openmp. 
+ """ + cdef: + int n_samples = X.shape[0] + int n_clusters = centers_old.shape[0] + int cluster_idx + + int *indices + + with nogil, parallel(num_threads=n_threads): + indices = malloc(n_samples * sizeof(int)) + + for cluster_idx in prange(n_clusters, schedule="static"): + update_center_dense(cluster_idx, &X[0, 0], sample_weight, + centers_old, centers_new, weight_sums, labels, + indices) + + free(indices) + + +cdef void update_center_dense( + int cluster_idx, + floating *X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + int[::1] labels, # IN + int *indices) nogil: # TMP + """Update of a single center for dense MinibatchKMeans""" + cdef: + int n_samples = sample_weight.shape[0] + int n_features = centers_old.shape[1] + floating alpha + int n_indices + int k, sample_idx, feature_idx + + floating wsum = 0 + + # indices = np.where(labels == cluster_idx)[0] + k = 0 + for sample_idx in range(n_samples): + if labels[sample_idx] == cluster_idx: + indices[k] = sample_idx + wsum += sample_weight[sample_idx] + k += 1 + n_indices = k + + if wsum > 0: + # Undo the previous count-based scaling for this cluster center + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] * weight_sums[cluster_idx] + + # Update cluster with new point members + for k in range(n_indices): + sample_idx = indices[k] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] += X[sample_idx * n_features + feature_idx] * sample_weight[sample_idx] + + # Update the count statistics for this center + weight_sums[cluster_idx] += wsum + + # Rescale to compute mean of all points (old and new) + alpha = 1 / weight_sums[cluster_idx] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] *= alpha + else: + # No sample was assigned to this cluster in this batch of data + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] + + +def _minibatch_update_sparse( + X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + int[::1] labels, # IN + int n_threads): + """Update of the centers for sparse MiniBatchKMeans. + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features), dtype=floating + The observations to cluster. Must be in CSR format. + + sample_weight : ndarray of shape (n_samples,), dtype=floating + The weights for each observation in X. + + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. + + centers_new : ndarray of shape (n_clusters, n_features), dtype=floating + Centers after previous iteration, placeholder for the new centers + computed during this iteration. + + weight_sums : ndarray of shape (n_clusters,), dtype=floating + Current sums of the accumulated weights for each center. + + labels : ndarray of shape (n_samples,), dtype=int + labels assignment. + + n_threads : int + The number of threads to be used by openmp. 
+ """ + cdef: + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + int n_samples = X.shape[0] + int n_clusters = centers_old.shape[0] + int cluster_idx + + int *indices + + with nogil, parallel(num_threads=n_threads): + indices = malloc(n_samples * sizeof(int)) + + for cluster_idx in prange(n_clusters, schedule="static"): + update_center_sparse(cluster_idx, X_data, X_indices, X_indptr, + sample_weight, centers_old, centers_new, + weight_sums, labels, indices) + + free(indices) + + +cdef void update_center_sparse( + int cluster_idx, + floating[::1] X_data, # IN + int[::1] X_indices, # IN + int[::1] X_indptr, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + int[::1] labels, # IN + int *indices) nogil: # TMP + """Update of a single center for sparse MinibatchKMeans""" + cdef: + int n_samples = sample_weight.shape[0] + int n_features = centers_old.shape[1] + floating alpha + int n_indices + int k, sample_idx, feature_idx + + floating wsum = 0 + + # indices = np.where(labels == cluster_idx)[0] + k = 0 + for sample_idx in range(n_samples): + if labels[sample_idx] == cluster_idx: + indices[k] = sample_idx + wsum += sample_weight[sample_idx] + k += 1 + n_indices = k + + if wsum > 0: + # Undo the previous count-based scaling for this cluster center: + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] * weight_sums[cluster_idx] + + # Update cluster with new point members + for k in range(n_indices): + sample_idx = indices[k] + for feature_idx in range(X_indptr[sample_idx], X_indptr[sample_idx + 1]): + centers_new[cluster_idx, X_indices[feature_idx]] += X_data[feature_idx] * sample_weight[sample_idx] + + # Update the count statistics for this center + weight_sums[cluster_idx] += wsum + + # Rescale to compute mean of all points (old and new) + alpha = 1 / weight_sums[cluster_idx] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] *= alpha + else: + # No sample was assigned to this cluster in this batch of data + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 17272858ae476..44c2837a8802a 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -26,16 +26,16 @@ from ..utils.sparsefuncs import mean_variance_axis from ..utils.validation import _deprecate_positional_args from ..utils import check_array -from ..utils import gen_batches from ..utils import check_random_state from ..utils import deprecated from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils._openmp_helpers import _openmp_effective_n_threads from ..exceptions import ConvergenceWarning -from ._k_means_fast import CHUNK_SIZE -from ._k_means_fast import _inertia_dense -from ._k_means_fast import _inertia_sparse -from ._k_means_fast import _mini_batch_update_csr +from ._k_means_common import CHUNK_SIZE +from ._k_means_common import _inertia_dense +from ._k_means_common import _inertia_sparse +from ._k_means_minibatch import _minibatch_update_dense +from ._k_means_minibatch import _minibatch_update_sparse from ._k_means_lloyd import lloyd_iter_chunked_dense from ._k_means_lloyd import lloyd_iter_chunked_sparse from ._k_means_elkan import init_bounds_dense @@ -488,7 +488,7 @@ def _kmeans_single_elkan(X, 
sample_weight, centers_init, max_iter=300, np.asarray(center_half_distances), kth=1, axis=0)[1] if verbose: - inertia = _inertia(X, sample_weight, centers, labels) + inertia = _inertia(X, sample_weight, centers, labels, n_threads) print(f"Iteration {i}, inertia {inertia}") centers, centers_new = centers_new, centers @@ -517,7 +517,7 @@ def _kmeans_single_elkan(X, sample_weight, centers_init, max_iter=300, upper_bounds, lower_bounds, labels, center_shift, n_threads, update_centers=False) - inertia = _inertia(X, sample_weight, centers, labels) + inertia = _inertia(X, sample_weight, centers, labels, n_threads) return labels, inertia, centers, i + 1 @@ -602,7 +602,8 @@ def _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300, weight_in_clusters, labels, center_shift, n_threads) if verbose: - inertia = _inertia(X, sample_weight, centers, labels) + inertia = _inertia(X, sample_weight, centers, labels, + n_threads) print(f"Iteration {i}, inertia {inertia}.") centers, centers_new = centers_new, centers @@ -630,13 +631,13 @@ def _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300, weight_in_clusters, labels, center_shift, n_threads, update_centers=False) - inertia = _inertia(X, sample_weight, centers, labels) + inertia = _inertia(X, sample_weight, centers, labels, n_threads) return labels, inertia, centers, i + 1 def _labels_inertia(X, sample_weight, x_squared_norms, centers, - n_threads=None): + n_threads=1): """E step of the K-means EM algorithm. Compute the labels and the inertia of the given samples and centers. @@ -657,7 +658,7 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, centers : ndarray of shape (n_clusters, n_features) The cluster centers. - n_threads : int, default=None + n_threads : int, default=1 The number of OpenMP threads to use for the computation. Parallelism is sample-wise on the main cython loop which assigns each sample to its closest center. @@ -673,8 +674,6 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, n_samples = X.shape[0] n_clusters = centers.shape[0] - n_threads = _openmp_effective_n_threads(n_threads) - labels = np.full(n_samples, -1, dtype=np.int32) weight_in_clusters = np.zeros(n_clusters, dtype=centers.dtype) center_shift = np.zeros_like(weight_in_clusters) @@ -690,7 +689,17 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, weight_in_clusters, labels, center_shift, n_threads, update_centers=False) - inertia = _inertia(X, sample_weight, centers, labels) + inertia = _inertia(X, sample_weight, centers, labels, n_threads) + + return labels, inertia + + +def _labels_inertia_threadpool_limit(X, sample_weight, x_squared_norms, + centers, n_threads=1): + """Same as _labels_inertia but in a threadpool_limits context.""" + with threadpool_limits(limits=1, user_api="blas"): + labels, inertia = _labels_inertia(X, sample_weight, x_squared_norms, + centers, n_threads) return labels, inertia @@ -806,7 +815,8 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): Labels of each point inertia_ : float - Sum of squared distances of samples to their closest cluster center. + Sum of squared distances of samples to their closest cluster center, + weighted by the sample weights if provided. n_iter_ : int Number of iterations run. @@ -1192,10 +1202,6 @@ def fit_transform(self, X, y=None, sample_weight=None): X_new : ndarray of shape (n_samples, n_clusters) X transformed in the new space. """ - # Currently, this just skips a copy of the data if it is not in - # np.array or CSR format already. 
- # XXX This skips _check_test_data, which may change the dtype; - # we should refactor the input validation. return self.fit(X, sample_weight=sample_weight)._transform(X) def transform(self, X): @@ -1251,8 +1257,9 @@ def predict(self, X, sample_weight=None): x_squared_norms = row_norms(X, squared=True) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - return _labels_inertia(X, sample_weight, x_squared_norms, - self.cluster_centers_, self._n_threads)[0] + return _labels_inertia_threadpool_limit( + X, sample_weight, x_squared_norms, self.cluster_centers_, + self._n_threads)[0] def score(self, X, y=None, sample_weight=None): """Opposite of the value of X on the K-means objective. @@ -1280,8 +1287,9 @@ def score(self, X, y=None, sample_weight=None): x_squared_norms = row_norms(X, squared=True) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - return -_labels_inertia(X, sample_weight, x_squared_norms, - self.cluster_centers_)[1] + return -_labels_inertia_threadpool_limit( + X, sample_weight, x_squared_norms, self.cluster_centers_, + self._n_threads)[1] def _more_tags(self): return { @@ -1292,50 +1300,42 @@ def _more_tags(self): } -def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, - old_center_buffer, compute_squared_diff, - distances, random_reassign=False, - random_state=None, reassignment_ratio=.01, - verbose=False): +def _mini_batch_step(X, x_squared_norms, sample_weight, centers, centers_new, + weight_sums, random_state, random_reassign=False, + reassignment_ratio=0.01, verbose=False, n_threads=1): """Incremental update of the centers for the Minibatch K-Means algorithm. Parameters ---------- - X : ndarray of shape (n_samples, n_features) - The original data array. - - sample_weight : array-like of shape (n_samples,) - The weights for each observation in X. + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The original data array. If sparse, must be in CSR format. x_squared_norms : ndarray of shape (n_samples,) Squared euclidean norm of each data point. - centers : ndarray of shape (k, n_features) - The cluster centers. This array is MODIFIED IN PLACE + sample_weight : ndarray of shape (n_samples,) + The weights for each observation in X. + + centers : ndarray of shape (n_clusters, n_features) + The cluster centers before the current iteration - old_center_buffer : int - Copy of old centers for monitoring convergence. + centers_new : ndarray of shape (n_clusters, n_features) + The cluster centers after the current iteration. Modified in-place. - compute_squared_diff : bool - If set to False, the squared diff computation is skipped. + weight_sums : ndarray of shape (n_clusters,) + The vector in which we keep track of the numbers of points in a + cluster. This array is modified in place. - distances : ndarray of shape (n_samples,), dtype=float, default=None - If not None, should be a pre-allocated array that will be used to store - the distances of each sample to its closest center. - May not be None when random_reassign is True. + random_state : RandomState instance + Determines random number generation for low count centers reassignment. + See :term:`Glossary `. - random_reassign : bool, default=False + random_reassign : boolean, default=False If True, centers with very low counts are randomly reassigned to observations. 
- random_state : int, RandomState instance or None, default=None - Determines random number generation for centroid initialization and to - pick new clusters amongst observations with uniform probability. Use - an int to make the randomness deterministic. - See :term:`Glossary `. - - reassignment_ratio : float, default=.01 + reassignment_ratio : float, default=0.01 Control the fraction of the maximum number of counts for a center to be reassigned. A higher value means that low count centers are more likely to be reassigned, which means that the @@ -1345,156 +1345,64 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, verbose : bool, default=False Controls the verbosity. + n_threads : int, default=1 + The number of OpenMP threads to use for the computation. + Returns ------- inertia : float Sum of squared distances of samples to their closest cluster center. - - squared_diff : ndarray of shape (n_clusters,) - Squared distances between previous and updated cluster centers. - + The inertia is computed after finding the labels and before updating + the centers. """ # Perform label assignment to nearest centers - nearest_center, inertia = _labels_inertia(X, sample_weight, - x_squared_norms, centers) + # For better efficiency, it's better to run _mini_batch_step in a + # threadpool_limit context than using _labels_inertia_threadpool_limit here + labels, inertia = _labels_inertia(X, sample_weight, + x_squared_norms, centers, + n_threads=n_threads) + + # Update centers according to the labels + if sp.issparse(X): + _minibatch_update_sparse(X, sample_weight, centers, centers_new, + weight_sums, labels, n_threads) + else: + _minibatch_update_dense(X, sample_weight, centers, centers_new, + weight_sums, labels, n_threads) + # Reassign clusters that have very low weight if random_reassign and reassignment_ratio > 0: - random_state = check_random_state(random_state) - # Reassign clusters that have very low weight to_reassign = weight_sums < reassignment_ratio * weight_sums.max() + # pick at most .5 * batch_size samples as new centers if to_reassign.sum() > .5 * X.shape[0]: indices_dont_reassign = \ np.argsort(weight_sums)[int(.5 * X.shape[0]):] to_reassign[indices_dont_reassign] = False n_reassigns = to_reassign.sum() + if n_reassigns: # Pick new clusters amongst observations with uniform probability new_centers = random_state.choice(X.shape[0], replace=False, size=n_reassigns) if verbose: - print("[MiniBatchKMeans] Reassigning %i cluster centers." - % n_reassigns) + print(f"[MiniBatchKMeans] Reassigning {n_reassigns} " + f"cluster centers.") - if sp.issparse(X) and not sp.issparse(centers): + if sp.issparse(X): assign_rows_csr( X, new_centers.astype(np.intp, copy=False), np.where(to_reassign)[0].astype(np.intp, copy=False), - centers) + centers_new) else: - centers[to_reassign] = X[new_centers] + centers_new[to_reassign] = X[new_centers] + # reset counts of reassigned centers, but don't reset them too small # to avoid instant reassignment. This is a pretty dirty hack as it # also modifies the learning rates. 
weight_sums[to_reassign] = np.min(weight_sums[~to_reassign]) - # implementation for the sparse CSR representation completely written in - # cython - if sp.issparse(X): - return inertia, _mini_batch_update_csr( - X, sample_weight, x_squared_norms, centers, weight_sums, - nearest_center, old_center_buffer, compute_squared_diff) - - # dense variant in mostly numpy (not as memory efficient though) - k = centers.shape[0] - squared_diff = 0.0 - for center_idx in range(k): - # find points from minibatch that are assigned to this center - center_mask = nearest_center == center_idx - wsum = sample_weight[center_mask].sum() - - if wsum > 0: - if compute_squared_diff: - old_center_buffer[:] = centers[center_idx] - - # inplace remove previous count scaling - centers[center_idx] *= weight_sums[center_idx] - - # inplace sum with new points members of this cluster - centers[center_idx] += \ - np.sum(X[center_mask] * - sample_weight[center_mask, np.newaxis], axis=0) - - # update the count statistics for this center - weight_sums[center_idx] += wsum - - # inplace rescale to compute mean of all points (old and new) - # Note: numpy >= 1.10 does not support '/=' for the following - # expression for a mixture of int and float (see numpy issue #6464) - centers[center_idx] = centers[center_idx] / weight_sums[center_idx] - - # update the squared diff if necessary - if compute_squared_diff: - diff = centers[center_idx].ravel() - old_center_buffer.ravel() - squared_diff += np.dot(diff, diff) - - return inertia, squared_diff - - -def _mini_batch_convergence(model, iteration_idx, n_iter, tol, - n_samples, centers_squared_diff, batch_inertia, - context, verbose=0): - """Helper function to encapsulate the early stopping logic.""" - # Normalize inertia to be able to compare values when - # batch_size changes - batch_inertia /= model.batch_size - centers_squared_diff /= model.batch_size - - # Compute an Exponentially Weighted Average of the squared - # diff to monitor the convergence while discarding - # minibatch-local stochastic variability: - # https://en.wikipedia.org/wiki/Moving_average - ewa_diff = context.get('ewa_diff') - ewa_inertia = context.get('ewa_inertia') - if ewa_diff is None: - ewa_diff = centers_squared_diff - ewa_inertia = batch_inertia - else: - alpha = float(model.batch_size) * 2.0 / (n_samples + 1) - alpha = 1.0 if alpha > 1.0 else alpha - ewa_diff = ewa_diff * (1 - alpha) + centers_squared_diff * alpha - ewa_inertia = ewa_inertia * (1 - alpha) + batch_inertia * alpha - - # Log progress to be able to monitor convergence - if verbose: - progress_msg = ( - 'Minibatch iteration %d/%d:' - ' mean batch inertia: %f, ewa inertia: %f ' % ( - iteration_idx + 1, n_iter, batch_inertia, - ewa_inertia)) - print(progress_msg) - - # Early stopping based on absolute tolerance on squared change of - # centers position (using EWA smoothing) - if tol > 0.0 and ewa_diff <= tol: - if verbose: - print('Converged (small centers change) at iteration %d/%d' - % (iteration_idx + 1, n_iter)) - return True - - # Early stopping heuristic due to lack of improvement on smoothed inertia - ewa_inertia_min = context.get('ewa_inertia_min') - no_improvement = context.get('no_improvement', 0) - if ewa_inertia_min is None or ewa_inertia < ewa_inertia_min: - no_improvement = 0 - ewa_inertia_min = ewa_inertia - else: - no_improvement += 1 - - if (model.max_no_improvement is not None - and no_improvement >= model.max_no_improvement): - if verbose: - print('Converged (lack of improvement in inertia)' - ' at iteration %d/%d' - % 
(iteration_idx + 1, n_iter)) - return True - - # update the convergence context to maintain state across successive calls: - context['ewa_diff'] = ewa_diff - context['ewa_inertia'] = ewa_inertia - context['ewa_inertia_min'] = ewa_inertia_min - context['no_improvement'] = no_improvement - return False + return inertia class MiniBatchKMeans(KMeans): @@ -1531,8 +1439,13 @@ class MiniBatchKMeans(KMeans): Maximum number of iterations over the complete dataset before stopping independently of any early stopping criterion heuristics. - batch_size : int, default=100 + batch_size : int, default=1024 Size of the mini batches. + For faster compuations, you can set the ``batch_size`` greater than + 256 * number of cores to enable parallelism on all cores. + + .. versionchanged:: 1.0 + `batch_size` default changed from 100 to 1024. verbose : int, default=0 Verbosity mode. @@ -1570,7 +1483,8 @@ class MiniBatchKMeans(KMeans): only algorithm is initialized by running a batch KMeans on a random subset of the data. This needs to be larger than n_clusters. - If `None`, `init_size= 3 * batch_size`. + If `None`, the heuristic is `init_size = 3 * batch_size` if + `3 * batch_size < n_clusters`, else `init_size = 3 * n_clusters`. n_init : int, default=3 Number of random initializations that are tried. @@ -1578,11 +1492,12 @@ class MiniBatchKMeans(KMeans): best of the ``n_init`` initializations as measured by inertia. reassignment_ratio : float, default=0.01 - Control the fraction of the maximum number of counts for a - center to be reassigned. A higher value means that low count - centers are more easily reassigned, which means that the - model will take longer to converge, but should converge in a - better clustering. + Control the fraction of the maximum number of counts for a center to + be reassigned. A higher value means that low count centers are more + easily reassigned, which means that the model will take longer to + converge, but should converge in a better clustering. However, too high + a value may cause convergence issues, especially with a small batch + size. Attributes ---------- @@ -1590,17 +1505,24 @@ class MiniBatchKMeans(KMeans): cluster_centers_ : ndarray of shape (n_clusters, n_features) Coordinates of cluster centers. - labels_ : int + labels_ : ndarray of shape (n_samples,) Labels of each point (if compute_labels is set to True). inertia_ : float The value of the inertia criterion associated with the chosen - partition (if compute_labels is set to True). The inertia is - defined as the sum of square distances of samples to their nearest - neighbor. + partition if compute_labels is set to True. If compute_labels is set to + False, it's an approximation of the inertia based on an exponentially + weighted average of the batch inertiae. + The inertia is defined as the sum of square distances of samples to + their cluster center, weighted by the sample weights if provided. n_iter_ : int - Number of batches processed. + Number of iterations over the full dataset. + + n_steps_ : int + Number of minibatches processed. + + .. versionadded:: 1.0 counts_ : ndarray of shape (n_clusters,) Weigth sum of each cluster. @@ -1651,14 +1573,14 @@ class MiniBatchKMeans(KMeans): ... batch_size=6, ... 
max_iter=10).fit(X) >>> kmeans.cluster_centers_ - array([[3.95918367, 2.40816327], - [1.12195122, 1.3902439 ]]) + array([[1.19..., 1.22...], + [4.03..., 2.46...]]) >>> kmeans.predict([[0, 0], [4, 4]]) - array([1, 0], dtype=int32) + array([0, 1], dtype=int32) """ @_deprecate_positional_args def __init__(self, n_clusters=8, *, init='k-means++', max_iter=100, - batch_size=100, verbose=0, compute_labels=True, + batch_size=1024, verbose=0, compute_labels=True, random_state=None, tol=0.0, max_no_improvement=10, init_size=None, n_init=3, reassignment_ratio=0.01): @@ -1703,6 +1625,7 @@ def _check_params(self, X): if self.batch_size <= 0: raise ValueError( f"batch_size should be > 0, got {self.batch_size} instead.") + self._batch_size = min(self.batch_size, X.shape[0]) # init_size if self.init_size is not None and self.init_size <= 0: @@ -1710,7 +1633,7 @@ def _check_params(self, X): f"init_size should be > 0, got {self.init_size} instead.") self._init_size = self.init_size if self._init_size is None: - self._init_size = 3 * self.batch_size + self._init_size = 3 * self._batch_size if self._init_size < self.n_clusters: self._init_size = 3 * self.n_clusters elif self._init_size < self.n_clusters: @@ -1728,6 +1651,80 @@ def _check_params(self, X): f"reassignment_ratio should be >= 0, got " f"{self.reassignment_ratio} instead.") + def _mini_batch_convergence(self, step, n_steps, n_samples, + centers_squared_diff, batch_inertia): + """Helper function to encapsulate the early stopping logic""" + # Normalize inertia to be able to compare values when + # batch_size changes + batch_inertia /= self._batch_size + + # count steps starting from 1 for user friendly verbose mode. + step = step + 1 + + # Ignore first iteration because it's inertia from initialization. + if step == 1: + if self.verbose: + print(f"Minibatch step {step}/{n_steps}: mean batch " + f"inertia: {batch_inertia}") + return False + + # Compute an Exponentially Weighted Average of the inertia to + # monitor the convergence while discarding minibatch-local stochastic + # variability: https://en.wikipedia.org/wiki/Moving_average + if self._ewa_inertia is None: + self._ewa_inertia = batch_inertia + else: + alpha = self._batch_size * 2.0 / (n_samples + 1) + alpha = min(alpha, 1) + self._ewa_inertia = ( + self._ewa_inertia * (1 - alpha) + batch_inertia * alpha) + + # Log progress to be able to monitor convergence + if self.verbose: + print(f"Minibatch step {step}/{n_steps}: mean batch inertia: " + f"{batch_inertia}, ewa inertia: {self._ewa_inertia}") + + # Early stopping based on absolute tolerance on squared change of + # centers position + if self._tol > 0.0 and centers_squared_diff <= self._tol: + if self.verbose: + print(f"Converged (small centers change) at step " + f"{step}/{n_steps}") + return True + + # Early stopping heuristic due to lack of improvement on smoothed + # inertia + if (self._ewa_inertia_min is None or + self._ewa_inertia < self._ewa_inertia_min): + self._no_improvement = 0 + self._ewa_inertia_min = self._ewa_inertia + else: + self._no_improvement += 1 + + if (self.max_no_improvement is not None + and self._no_improvement >= self.max_no_improvement): + if self.verbose: + print(f"Converged (lack of improvement in inertia) at step " + f"{step}/{n_steps}") + return True + + return False + + def _random_reassign(self): + """Check if a random reassignment needs to be done. + + Do random reassignments each time 10 * n_clusters samples have been + processed. + + If there are empty clusters we always want to reassign. 
+ """ + self._n_since_last_reassign += self._batch_size + if ((self._counts == 0).any() or + self._n_since_last_reassign >= (10 * self.n_clusters)): + self._n_since_last_reassign = 0 + return True + return False + def fit(self, X, y=None, sample_weight=None): """Compute the centroids on X by chunking it into mini-batches. @@ -1737,13 +1734,15 @@ def fit(self, X, y=None, sample_weight=None): Training instances to cluster. It must be noted that the data will be converted to C ordering, which will cause a memory copy if the given data is not C-contiguous. + If a sparse matrix is passed, a copy will be made if it's not in + CSR format. y : Ignored Not used, present here for API consistency by convention. sample_weight : array-like of shape (n_samples,), default=None The weights for each observation in X. If None, all observations - are assigned equal weight (default: None). + are assigned equal weight. .. versionadded:: 0.20 @@ -1758,6 +1757,7 @@ def fit(self, X, y=None, sample_weight=None): self._check_params(X) random_state = check_random_state(self.random_state) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + n_samples, n_features = X.shape # Validate init array init = self.init @@ -1765,182 +1765,146 @@ def fit(self, X, y=None, sample_weight=None): init = check_array(init, dtype=X.dtype, copy=True, order='C') self._validate_center_shape(X, init) - n_samples, n_features = X.shape - x_squared_norms = row_norms(X, squared=True) - - if self.tol > 0.0: - tol = _tolerance(X, self.tol) + self._check_mkl_vcomp(X, self._batch_size) - # using tol-based early stopping needs the allocation of a - # dedicated before which can be expensive for high dim data: - # hence we allocate it outside of the main loop - old_center_buffer = np.zeros(n_features, dtype=X.dtype) - else: - tol = 0.0 - # no need for the center buffer if tol-based early stopping is - # disabled - old_center_buffer = np.zeros(0, dtype=X.dtype) - - distances = np.zeros(self.batch_size, dtype=X.dtype) - n_batches = int(np.ceil(float(n_samples) / self.batch_size)) - n_iter = int(self.max_iter * n_batches) - - self._check_mkl_vcomp(X, self.batch_size) + # precompute squared norms of data points + x_squared_norms = row_norms(X, squared=True) + # Validation set for the init validation_indices = random_state.randint(0, n_samples, self._init_size) X_valid = X[validation_indices] sample_weight_valid = sample_weight[validation_indices] x_squared_norms_valid = x_squared_norms[validation_indices] - # perform several inits with random sub-sets + # perform several inits with random subsets best_inertia = None for init_idx in range(self._n_init): if self.verbose: - print("Init %d/%d with method: %s" - % (init_idx + 1, self._n_init, init)) - weight_sums = np.zeros(self.n_clusters, dtype=sample_weight.dtype) - - # TODO: once the `k_means` function works with sparse input we - # should refactor the following init to use it instead. + print(f"Init {init_idx + 1}/{self._n_init} with method {init}") # Initialize the centers using only a fraction of the data as we - # expect n_samples to be very large when using MiniBatchKMeans + # expect n_samples to be very large when using MiniBatchKMeans. 
cluster_centers = self._init_centroids( - X, x_squared_norms=x_squared_norms, - init=init, - random_state=random_state, - init_size=self._init_size) - - # Compute the label assignment on the init dataset - _mini_batch_step( - X_valid, sample_weight_valid, - x_squared_norms[validation_indices], cluster_centers, - weight_sums, old_center_buffer, False, distances=None, - verbose=self.verbose) - - # Keep only the best cluster centers across independent inits on - # the common validation set - _, inertia = _labels_inertia(X_valid, sample_weight_valid, - x_squared_norms_valid, - cluster_centers) + X, x_squared_norms=x_squared_norms, init=init, + random_state=random_state, init_size=self._init_size) + + # Compute inertia on a validation set. + _, inertia = _labels_inertia_threadpool_limit( + X_valid, sample_weight_valid, x_squared_norms_valid, + cluster_centers, n_threads=self._n_threads) + if self.verbose: - print("Inertia for init %d/%d: %f" - % (init_idx + 1, self._n_init, inertia)) + print(f"Inertia for init {init_idx + 1}/{self._n_init}: " + f"{inertia}") if best_inertia is None or inertia < best_inertia: - self.cluster_centers_ = cluster_centers - self._counts = weight_sums + init_centers = cluster_centers best_inertia = inertia - # Empty context to be used inplace by the convergence check routine - convergence_context = {} - - # Perform the iterative optimization until the final convergence - # criterion - for iteration_idx in range(n_iter): - # Sample a minibatch from the full dataset - minibatch_indices = random_state.randint( - 0, n_samples, self.batch_size) - - # Perform the actual update step on the minibatch data - batch_inertia, centers_squared_diff = _mini_batch_step( - X[minibatch_indices], sample_weight[minibatch_indices], - x_squared_norms[minibatch_indices], - self.cluster_centers_, self._counts, - old_center_buffer, tol > 0.0, distances=distances, - # Here we randomly choose whether to perform - # random reassignment: the choice is done as a function - # of the iteration index, and the minimum number of - # counts, in order to force this reassignment to happen - # every once in a while - random_reassign=((iteration_idx + 1) - % (10 + int(self._counts.min())) == 0), - random_state=random_state, - reassignment_ratio=self.reassignment_ratio, - verbose=self.verbose) - - # Monitor convergence and do early stopping if necessary - if _mini_batch_convergence( - self, iteration_idx, n_iter, tol, n_samples, - centers_squared_diff, batch_inertia, convergence_context, - verbose=self.verbose): - break + centers = init_centers + centers_new = np.empty_like(centers) + + # Initialize counts + self._counts = np.zeros(self.n_clusters, dtype=X.dtype) + + # Attributes to monitor the convergence + self._ewa_inertia = None + self._ewa_inertia_min = None + self._no_improvement = 0 + + # Initialize number of samples seen since last reassignment + self._n_since_last_reassign = 0 + + n_steps = (self.max_iter * n_samples) // self._batch_size + + with threadpool_limits(limits=1, user_api="blas"): + # Perform the iterative optimization until convergence + for i in range(n_steps): + # Sample a minibatch from the full dataset + minibatch_indices = random_state.randint(0, n_samples, + self._batch_size) + + # Perform the actual update step on the minibatch data + batch_inertia = _mini_batch_step( + X=X[minibatch_indices], + x_squared_norms=x_squared_norms[minibatch_indices], + sample_weight=sample_weight[minibatch_indices], + centers=centers, + centers_new=centers_new, + weight_sums=self._counts, + 
random_state=random_state, + random_reassign=self._random_reassign(), + reassignment_ratio=self.reassignment_ratio, + verbose=self.verbose, + n_threads=self._n_threads) + + if self._tol > 0.0: + centers_squared_diff = np.sum((centers_new - centers)**2) + else: + centers_squared_diff = 0 + + centers, centers_new = centers_new, centers + + # Monitor convergence and do early stopping if necessary + if self._mini_batch_convergence( + i, n_steps, n_samples, centers_squared_diff, + batch_inertia): + break + + self.cluster_centers_ = centers - self.n_iter_ = iteration_idx + 1 + self.n_steps_ = i + 1 + self.n_iter_ = int(np.ceil(((i + 1) * self._batch_size) / n_samples)) if self.compute_labels: - self.labels_, self.inertia_ = \ - self._labels_inertia_minibatch(X, sample_weight) + self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( + X, sample_weight, x_squared_norms, self.cluster_centers_, + n_threads=self._n_threads) + else: + self.inertia_ = self._ewa_inertia * n_samples return self - def _labels_inertia_minibatch(self, X, sample_weight): - """Compute labels and inertia using mini batches. - - This is slightly slower than doing everything at once but prevents - memory errors / segfaults. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Input data. - - sample_weight : array-like of shape (n_samples,) - The weights for each observation in X. - - Returns - ------- - labels : ndarray of shape (n_samples,) - Cluster labels for each point. - - inertia : float - Sum of squared distances of points to nearest cluster. - """ - if self.verbose: - print('Computing label assignment and total inertia') - sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - x_squared_norms = row_norms(X, squared=True) - slices = gen_batches(X.shape[0], self.batch_size) - results = [_labels_inertia(X[s], sample_weight[s], x_squared_norms[s], - self.cluster_centers_) for s in slices] - labels, inertia = zip(*results) - return np.hstack(labels), np.sum(inertia) - def partial_fit(self, X, y=None, sample_weight=None): """Update k means estimate on a single mini-batch X. Parameters ---------- - X : array-like of shape (n_samples, n_features) - Coordinates of the data points to cluster. It must be noted that - X will be copied if it is not C-contiguous. + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training instances to cluster. It must be noted that the data + will be converted to C ordering, which will cause a memory copy + if the given data is not C-contiguous. + If a sparse matrix is passed, a copy will be made if it's not in + CSR format. y : Ignored Not used, present here for API consistency by convention. sample_weight : array-like of shape (n_samples,), default=None The weights for each observation in X. If None, all observations - are assigned equal weight (default: None). + are assigned equal weight. 
Returns ------- self """ - is_first_call_to_partial_fit = not hasattr(self, 'cluster_centers_') + has_centers = hasattr(self, 'cluster_centers_') X = self._validate_data(X, accept_sparse='csr', dtype=[np.float64, np.float32], order='C', accept_large_sparse=False, - reset=is_first_call_to_partial_fit) + reset=not has_centers) self._random_state = getattr(self, "_random_state", check_random_state(self.random_state)) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + self.n_steps_ = getattr(self, "n_steps_", 0) + # precompute squared norms of data points x_squared_norms = row_norms(X, squared=True) - if is_first_call_to_partial_fit: - # this is the first call to partial_fit on this object + if not has_centers: + # this instance has not been fitted yet (fit or partial_fit) self._check_params(X) # Validate init array @@ -1953,34 +1917,34 @@ def partial_fit(self, X, y=None, sample_weight=None): # initialize the cluster centers self.cluster_centers_ = self._init_centroids( - X, x_squared_norms=x_squared_norms, - init=init, - random_state=self._random_state, - init_size=self._init_size) - - self._counts = np.zeros(self.n_clusters, - dtype=sample_weight.dtype) - random_reassign = False - distances = None - else: - # The lower the minimum count is, the more we do random - # reassignment, however, we don't want to do random - # reassignment too often, to allow for building up counts - random_reassign = self._random_state.randint( - 10 * (1 + self._counts.min())) == 0 - distances = np.zeros(X.shape[0], dtype=X.dtype) - - _mini_batch_step(X, sample_weight, x_squared_norms, - self.cluster_centers_, self._counts, - np.zeros(0, dtype=X.dtype), 0, - random_reassign=random_reassign, distances=distances, - random_state=self._random_state, - reassignment_ratio=self.reassignment_ratio, - verbose=self.verbose) + X, x_squared_norms=x_squared_norms, init=init, + random_state=self._random_state, init_size=self._init_size) + + # Initialize counts + self._counts = np.zeros(self.n_clusters, dtype=X.dtype) + + # Initialize number of samples seen since last reassignment + self._n_since_last_reassign = 0 + + with threadpool_limits(limits=1, user_api="blas"): + _mini_batch_step(X, + x_squared_norms=x_squared_norms, + sample_weight=sample_weight, + centers=self.cluster_centers_, + centers_new=self.cluster_centers_, + weight_sums=self._counts, + random_state=self._random_state, + random_reassign=self._random_reassign(), + reassignment_ratio=self.reassignment_ratio, + verbose=self.verbose, + n_threads=self._n_threads) if self.compute_labels: - self.labels_, self.inertia_ = _labels_inertia( - X, sample_weight, x_squared_norms, self.cluster_centers_) + self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( + X, sample_weight, x_squared_norms, self.cluster_centers_, + n_threads=self._n_threads) + + self.n_steps_ += 1 return self @@ -1998,7 +1962,7 @@ def predict(self, X, sample_weight=None): sample_weight : array-like of shape (n_samples,), default=None The weights for each observation in X. If None, all observations - are assigned equal weight (default: None). + are assigned equal weight. 
Returns ------- @@ -2008,7 +1972,14 @@ def predict(self, X, sample_weight=None): check_is_fitted(self) X = self._check_test_data(X) - return self._labels_inertia_minibatch(X, sample_weight)[0] + x_squared_norms = row_norms(X, squared=True) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + labels, _ = _labels_inertia_threadpool_limit( + X, sample_weight, x_squared_norms, self.cluster_centers_, + n_threads=self._n_threads) + + return labels def _more_tags(self): return { diff --git a/sklearn/cluster/setup.py b/sklearn/cluster/setup.py index 48ed25c5c0eaf..9a85541731e5f 100644 --- a/sklearn/cluster/setup.py +++ b/sklearn/cluster/setup.py @@ -25,8 +25,8 @@ def configuration(parent_package='', top_path=None): include_dirs=[numpy.get_include()], libraries=libraries) - config.add_extension('_k_means_fast', - sources=['_k_means_fast.pyx'], + config.add_extension('_k_means_common', + sources=['_k_means_common.pyx'], include_dirs=[numpy.get_include()], libraries=libraries) @@ -40,6 +40,11 @@ def configuration(parent_package='', top_path=None): include_dirs=[numpy.get_include()], libraries=libraries) + config.add_extension('_k_means_minibatch', + sources=['_k_means_minibatch.pyx'], + include_dirs=[numpy.get_include()], + libraries=libraries) + config.add_subpackage('tests') return config diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 341b00c5c137f..248b2e1ddd498 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -9,9 +9,7 @@ import pytest from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose -from sklearn.utils._testing import assert_almost_equal from sklearn.utils.fixes import _astype_copy_false from sklearn.base import clone from sklearn.exceptions import ConvergenceWarning @@ -24,12 +22,12 @@ from sklearn.cluster import MiniBatchKMeans from sklearn.cluster._kmeans import _labels_inertia from sklearn.cluster._kmeans import _mini_batch_step -from sklearn.cluster._k_means_fast import _relocate_empty_clusters_dense -from sklearn.cluster._k_means_fast import _relocate_empty_clusters_sparse -from sklearn.cluster._k_means_fast import _euclidean_dense_dense_wrapper -from sklearn.cluster._k_means_fast import _euclidean_sparse_dense_wrapper -from sklearn.cluster._k_means_fast import _inertia_dense -from sklearn.cluster._k_means_fast import _inertia_sparse +from sklearn.cluster._k_means_common import _relocate_empty_clusters_dense +from sklearn.cluster._k_means_common import _relocate_empty_clusters_sparse +from sklearn.cluster._k_means_common import _euclidean_dense_dense_wrapper +from sklearn.cluster._k_means_common import _euclidean_sparse_dense_wrapper +from sklearn.cluster._k_means_common import _inertia_dense +from sklearn.cluster._k_means_common import _inertia_sparse from sklearn.datasets import make_blobs from io import StringIO @@ -176,68 +174,58 @@ def test_kmeans_convergence(algorithm): def test_minibatch_update_consistency(): # Check that dense and sparse minibatch update give the same results rng = np.random.RandomState(42) - old_centers = centers + rng.normal(size=centers.shape) - new_centers = old_centers.copy() - new_centers_csr = old_centers.copy() + centers_old = centers + rng.normal(size=centers.shape) + centers_old_csr = centers_old.copy() - weight_sums = np.zeros(new_centers.shape[0], dtype=np.double) - weight_sums_csr = np.zeros(new_centers.shape[0], 
dtype=np.double) + centers_new = np.zeros_like(centers_old) + centers_new_csr = np.zeros_like(centers_old_csr) + + weight_sums = np.zeros(centers_old.shape[0], dtype=X.dtype) + weight_sums_csr = np.zeros(centers_old.shape[0], dtype=X.dtype) x_squared_norms = (X ** 2).sum(axis=1) x_squared_norms_csr = row_norms(X_csr, squared=True) - buffer = np.zeros(centers.shape[1], dtype=np.double) - buffer_csr = np.zeros(centers.shape[1], dtype=np.double) + sample_weight = np.ones(X.shape[0], dtype=X.dtype) # extract a small minibatch X_mb = X[:10] X_mb_csr = X_csr[:10] x_mb_squared_norms = x_squared_norms[:10] x_mb_squared_norms_csr = x_squared_norms_csr[:10] - - sample_weight_mb = np.ones(X_mb.shape[0], dtype=np.double) + sample_weight_mb = sample_weight[:10] # step 1: compute the dense minibatch update - old_inertia, incremental_diff = _mini_batch_step( - X_mb, sample_weight_mb, x_mb_squared_norms, new_centers, weight_sums, - buffer, 1, None, random_reassign=False) + old_inertia = _mini_batch_step( + X_mb, x_mb_squared_norms, sample_weight_mb, centers_old, centers_new, + weight_sums, np.random.RandomState(0), random_reassign=False) assert old_inertia > 0.0 # compute the new inertia on the same batch to check that it decreased labels, new_inertia = _labels_inertia( - X_mb, sample_weight_mb, x_mb_squared_norms, new_centers) + X_mb, sample_weight_mb, x_mb_squared_norms, centers_new) assert new_inertia > 0.0 assert new_inertia < old_inertia - # check that the incremental difference computation is matching the - # final observed value - effective_diff = np.sum((new_centers - old_centers) ** 2) - assert_almost_equal(incremental_diff, effective_diff) - # step 2: compute the sparse minibatch update - old_inertia_csr, incremental_diff_csr = _mini_batch_step( - X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr, - weight_sums_csr, buffer_csr, 1, None, random_reassign=False) + old_inertia_csr = _mini_batch_step( + X_mb_csr, x_mb_squared_norms_csr, sample_weight_mb, centers_old_csr, + centers_new_csr, weight_sums_csr, np.random.RandomState(0), + random_reassign=False) assert old_inertia_csr > 0.0 # compute the new inertia on the same batch to check that it decreased labels_csr, new_inertia_csr = _labels_inertia( - X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr) + X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, centers_new_csr) assert new_inertia_csr > 0.0 assert new_inertia_csr < old_inertia_csr - # check that the incremental difference computation is matching the - # final observed value - effective_diff = np.sum((new_centers_csr - old_centers) ** 2) - assert_almost_equal(incremental_diff_csr, effective_diff) - # step 3: check that sparse and dense updates lead to the same results assert_array_equal(labels, labels_csr) - assert_array_almost_equal(new_centers, new_centers_csr) - assert_almost_equal(incremental_diff, incremental_diff_csr) - assert_almost_equal(old_inertia, old_inertia_csr) - assert_almost_equal(new_inertia, new_inertia_csr) + assert_allclose(centers_new, centers_new_csr) + assert_allclose(old_inertia, old_inertia_csr) + assert_allclose(new_inertia, new_inertia_csr) def _check_fitted_model(km): @@ -250,7 +238,7 @@ def _check_fitted_model(km): assert np.unique(labels).shape[0] == n_clusters # check that the labels assignment are perfect (up to a permutation) - assert v_measure_score(true_labels, labels) == 1.0 + assert_allclose(v_measure_score(true_labels, labels), 1.0) assert km.inertia_ > 0.0 @@ -412,66 +400,54 @@ def test_minibatch_sensible_reassign(): 
assert km.cluster_centers_.any(axis=1).sum() > 10 -def test_minibatch_reassign(): - # Give a perfect initialization, but a large reassignment_ratio, - # as a result all the centers should be reassigned and the model - # should no longer be good - sample_weight = np.ones(X.shape[0], dtype=X.dtype) - for this_X in (X, X_csr): - mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100, - random_state=42) - mb_k_means.fit(this_X) - - score_before = mb_k_means.score(this_X) - try: - old_stdout = sys.stdout - sys.stdout = StringIO() - # Turn on verbosity to smoke test the display code - _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), - mb_k_means.cluster_centers_, - mb_k_means._counts, - np.zeros(X.shape[1], np.double), - False, distances=np.zeros(X.shape[0]), - random_reassign=True, random_state=42, - reassignment_ratio=1, verbose=True) - finally: - sys.stdout = old_stdout - assert score_before > mb_k_means.score(this_X) +@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) +def test_minibatch_reassign(data): + # Check the reassignment part of the minibatch step with very high or very + # low reassignment ratio. + perfect_centers = np.empty((n_clusters, n_features)) + for i in range(n_clusters): + perfect_centers[i] = X[true_labels == i].mean(axis=0) + + x_squared_norms = row_norms(data, squared=True) + sample_weight = np.ones(n_samples) + centers_new = np.empty_like(perfect_centers) + + # Give a perfect initialization, but a large reassignment_ratio, as a + # result many centers should be reassigned and the model should no longer + # be good + score_before = - _labels_inertia(data, sample_weight, x_squared_norms, + perfect_centers, 1)[1] + + _mini_batch_step(data, x_squared_norms, sample_weight, perfect_centers, + centers_new, np.zeros(n_clusters), + np.random.RandomState(0), random_reassign=True, + reassignment_ratio=1) + + score_after = - _labels_inertia(data, sample_weight, x_squared_norms, + centers_new, 1)[1] + + assert score_before > score_after # Give a perfect initialization, with a small reassignment_ratio, - # no center should be reassigned - for this_X in (X, X_csr): - mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100, - init=centers.copy(), - random_state=42, n_init=1) - mb_k_means.fit(this_X) - clusters_before = mb_k_means.cluster_centers_ - # Turn on verbosity to smoke test the display code - _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), - mb_k_means.cluster_centers_, - mb_k_means._counts, - np.zeros(X.shape[1], np.double), - False, distances=np.zeros(X.shape[0]), - random_reassign=True, random_state=42, - reassignment_ratio=1e-15) - assert_array_almost_equal(clusters_before, mb_k_means.cluster_centers_) + # no center should be reassigned. + _mini_batch_step(data, x_squared_norms, sample_weight, perfect_centers, + centers_new, np.zeros(n_clusters), + np.random.RandomState(0), random_reassign=True, + reassignment_ratio=1e-15) + + assert_allclose(centers_new, perfect_centers) def test_minibatch_with_many_reassignments(): # Test for the case that the number of clusters to reassign is bigger - # than the batch_size - n_samples = 550 - rnd = np.random.RandomState(42) - X = rnd.uniform(size=(n_samples, 10)) - # Check that the fit works if n_clusters is bigger than the batch_size. 
- # Run the test with 550 clusters and 550 samples, because it turned out - # that this values ensure that the number of clusters to reassign - # is always bigger than the batch_size - n_clusters = 550 - MiniBatchKMeans(n_clusters=n_clusters, - batch_size=100, + # than the batch_size. Run the test with 100 clusters and a batch_size of + # 10 because it turned out that these values ensure that the number of + # clusters to reassign is always bigger than the batch_size. + MiniBatchKMeans(n_clusters=100, + batch_size=10, init_size=n_samples, - random_state=42).fit(X) + random_state=42, + verbose=True).fit(X) def test_minibatch_kmeans_init_size(): @@ -491,6 +467,46 @@ def test_minibatch_kmeans_init_size(): assert km._init_size == n_samples +@pytest.mark.parametrize("tol, max_no_improvement", [(1e-4, None), (0, 10)]) +def test_minibatch_declared_convergence(capsys, tol, max_no_improvement): + # Check convergence detection based on ewa batch inertia or on + # small center change. + X, _, centers = make_blobs(centers=3, random_state=0, return_centers=True) + + km = MiniBatchKMeans(n_clusters=3, init=centers, batch_size=20, tol=tol, + random_state=0, max_iter=10, n_init=1, verbose=1, + max_no_improvement=max_no_improvement) + + km.fit(X) + assert 1 < km.n_iter_ < 10 + + captured = capsys.readouterr() + if max_no_improvement is None: + assert "Converged (small centers change)" in captured.out + if tol == 0: + assert "Converged (lack of improvement in inertia)" in captured.out + + +def test_minibatch_iter_steps(): + # Check consistency of n_iter_ and n_steps_ attributes. + batch_size = 30 + n_samples = X.shape[0] + km = MiniBatchKMeans(n_clusters=3, batch_size=batch_size, + random_state=0).fit(X) + + # n_iter_ is the number of started epochs + assert km.n_iter_ == np.ceil((km.n_steps_ * batch_size) / n_samples) + assert isinstance(km.n_iter_, int) + + # without stopping condition, max_iter should be reached + km = MiniBatchKMeans(n_clusters=3, batch_size=batch_size, random_state=0, + tol=0, max_no_improvement=None, max_iter=10).fit(X) + + assert km.n_iter_ == 10 + assert km.n_steps_ == (10 * n_samples) // batch_size + assert isinstance(km.n_steps_, int) + + def test_kmeans_copyx(): # Check that copy_x=False returns nearly equal X after de-centering. my_X = X.copy() @@ -584,6 +600,19 @@ def test_predict(Estimator, algorithm, init, dtype, array_constr): assert_allclose(v_measure_score(pred, np.arange(10)), 1) +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_dense_sparse(Estimator): + # Check that the results are the same for dense and sparse input. 
+ sample_weight = np.random.RandomState(0).random_sample((n_samples,)) + km_dense = Estimator(n_clusters=n_clusters, random_state=0, n_init=1) + km_dense.fit(X, sample_weight=sample_weight) + km_sparse = Estimator(n_clusters=n_clusters, random_state=0, n_init=1) + km_sparse.fit(X_csr, sample_weight=sample_weight) + + assert_array_equal(km_dense.labels_, km_sparse.labels_) + assert_allclose(km_dense.cluster_centers_, km_sparse.cluster_centers_) + + @pytest.mark.parametrize("init", ["random", "k-means++", centers], ids=["random", "k-means++", "ndarray"]) @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) @@ -801,17 +830,19 @@ def test_unit_weights_vs_no_weights(Estimator, data): assert_allclose(km_none.cluster_centers_, km_ones.cluster_centers_) -def test_scaled_weights(): - # scaling all sample weights by a common factor +@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_scaled_weights(Estimator, data): + # Check that scaling all sample weights by a common factor # shouldn't change the result - sample_weight = np.ones(n_samples) - for estimator in [KMeans(n_clusters=n_clusters, random_state=42), - MiniBatchKMeans(n_clusters=n_clusters, random_state=42)]: - est_1 = clone(estimator).fit(X) - est_2 = clone(estimator).fit(X, sample_weight=0.5*sample_weight) - assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_), 1.0) - assert_almost_equal(_sort_centers(est_1.cluster_centers_), - _sort_centers(est_2.cluster_centers_)) + sample_weight = np.random.RandomState(0).uniform(n_samples) + + km = Estimator(n_clusters=n_clusters, random_state=42, n_init=1) + km_orig = clone(km).fit(data, sample_weight=sample_weight) + km_scaled = clone(km).fit(data, sample_weight=0.5 * sample_weight) + + assert_array_equal(km_orig.labels_, km_scaled.labels_) + assert_allclose(km_orig.cluster_centers_, km_scaled.cluster_centers_) def test_kmeans_elkan_iter_attribute(): @@ -837,18 +868,19 @@ def test_kmeans_empty_cluster_relocated(array_constr): assert_allclose(km.cluster_centers_, [[-1], [1]]) -def test_result_of_kmeans_equal_in_diff_n_threads(): - # Check that KMeans gives the same results in parallel mode than in - # sequential mode. +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_result_equal_in_diff_n_threads(Estimator): + # Check that KMeans/MiniBatchKMeans give the same results in parallel mode + # than in sequential mode. rnd = np.random.RandomState(0) X = rnd.normal(size=(50, 10)) with threadpool_limits(limits=1, user_api="openmp"): - result_1 = KMeans( - n_clusters=3, random_state=0).fit(X).labels_ + result_1 = Estimator( + n_clusters=n_clusters, random_state=0).fit(X).labels_ with threadpool_limits(limits=2, user_api="openmp"): - result_2 = KMeans( - n_clusters=3, random_state=0).fit(X).labels_ + result_2 = Estimator( + n_clusters=n_clusters, random_state=0).fit(X).labels_ assert_array_equal(result_1, result_2) @@ -954,6 +986,7 @@ def test_euclidean_distance(dtype, squared): @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_inertia(dtype): + # Check that the _inertia_(dense/sparse) helpers produce correct results. 
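    # i.e. inertia == sum_i sample_weight[i] * ||X[i] - centers[labels[i]]||**2,
    # which the explicit NumPy computation below reproduces.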
rng = np.random.RandomState(0) X_sparse = sp.random(100, 10, density=0.5, format="csr", random_state=rng, dtype=dtype) @@ -965,8 +998,10 @@ def test_inertia(dtype): distances = ((X_dense - centers[labels])**2).sum(axis=1) expected = np.sum(distances * sample_weight) - inertia_dense = _inertia_dense(X_dense, sample_weight, centers, labels) - inertia_sparse = _inertia_sparse(X_sparse, sample_weight, centers, labels) + inertia_dense = _inertia_dense( + X_dense, sample_weight, centers, labels, n_threads=1) + inertia_sparse = _inertia_sparse( + X_sparse, sample_weight, centers, labels, n_threads=1) assert_allclose(inertia_dense, inertia_sparse, rtol=1e-6) assert_allclose(inertia_dense, expected, rtol=1e-6) From 2c5ea4e6b3add57588fb35293b7dd25506c5fe06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Thu, 15 Apr 2021 14:33:22 +0200 Subject: [PATCH 331/478] DictionaryLearning: Fix several issues in the dict update (#19198) Co-authored-by: Olivier Grisel --- doc/whats_new/v1.0.rst | 14 ++ sklearn/decomposition/_dict_learning.py | 157 ++++++++---------- .../decomposition/tests/test_dict_learning.py | 28 ++++ 3 files changed, 115 insertions(+), 84 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 5975177f7a0c8..0494e5f29bf39 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -159,11 +159,25 @@ Changelog - |Fix| Fixes incorrect multiple data-conversion warnings when clustering boolean data. :pr:`19046` by :user:`Surya Prakash `. +:mod:`sklearn.decomposition` +............................ + - |Fix| Fixed :func:`dict_learning`, used by :class:`DictionaryLearning`, to ensure determinism of the output. Achieved by flipping signs of the SVD output which is used to initialize the code. :pr:`18433` by :user:`Bruno Charron `. +- |Fix| Fixed a bug in :class:`MiniBatchDictionaryLearning`, + :class:`MiniBatchSparsePCA` and :func:`dict_learning_online` where the + update of the dictionary was incorrect. :pr:`19198` by + :user:`Jérémie du Boisberranger `. + +- |Fix| Fixed a bug in :class:`DictionaryLearning`, :class:`SparsePCA`, + :class:`MiniBatchDictionaryLearning`, :class:`MiniBatchSparsePCA`, + :func:`dict_learning` and :func:`dict_learning_online` where the restart of + unused atoms during the dictionary update was not working as expected. + :pr:`19198` by :user:`Jérémie du Boisberranger `. + :mod:`sklearn.ensemble` ....................... diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index e2ae9f8355a54..bd8a95e37dbaf 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -355,28 +355,32 @@ def sparse_encode(X, dictionary, *, gram=None, cov=None, return code -def _update_dict(dictionary, Y, code, verbose=False, return_r2=False, +def _update_dict(dictionary, Y, code, A=None, B=None, verbose=False, random_state=None, positive=False): """Update the dense dictionary factor in place. Parameters ---------- - dictionary : ndarray of shape (n_features, n_components) + dictionary : ndarray of shape (n_components, n_features) Value of the dictionary at the previous iteration. - Y : ndarray of shape (n_features, n_samples) + Y : ndarray of shape (n_samples, n_features) Data matrix. - code : ndarray of shape (n_components, n_samples) + code : ndarray of shape (n_samples, n_components) Sparse coding of the data against which to optimize the dictionary. 
+ A : ndarray of shape (n_components, n_components), default=None + Together with `B`, sufficient stats of the online model to update the + dictionary. + + B : ndarray of shape (n_features, n_components), default=None + Together with `A`, sufficient stats of the online model to update the + dictionary. + verbose: bool, default=False Degree of output the procedure will print. - return_r2 : bool, default=False - Whether to compute and return the residual sum of squares corresponding - to the computed solution. - random_state : int, RandomState instance or None, default=None Used for randomly initializing the dictionary. Pass an int for reproducible results across multiple function calls. @@ -386,54 +390,41 @@ def _update_dict(dictionary, Y, code, verbose=False, return_r2=False, Whether to enforce positivity when finding the dictionary. .. versionadded:: 0.20 - - Returns - ------- - dictionary : ndarray of shape (n_features, n_components) - Updated dictionary. """ - n_components = len(code) - n_features = Y.shape[0] + n_samples, n_components = code.shape random_state = check_random_state(random_state) - # Get BLAS functions - gemm, = linalg.get_blas_funcs(('gemm',), (dictionary, code, Y)) - ger, = linalg.get_blas_funcs(('ger',), (dictionary, code)) - nrm2, = linalg.get_blas_funcs(('nrm2',), (dictionary,)) - # Residuals, computed with BLAS for speed and efficiency - # R <- -1.0 * U * V^T + 1.0 * Y - # Outputs R as Fortran array for efficiency - R = gemm(-1.0, dictionary, code, 1.0, Y) + + if A is None: + A = code.T @ code + if B is None: + B = Y.T @ code + + n_unused = 0 + for k in range(n_components): - # R <- 1.0 * U_k * V_k^T + R - R = ger(1.0, dictionary[:, k], code[k, :], a=R, overwrite_a=True) - dictionary[:, k] = np.dot(R, code[k, :]) - if positive: - np.clip(dictionary[:, k], 0, None, out=dictionary[:, k]) - # Scale k'th atom - # (U_k * U_k) ** 0.5 - atom_norm = nrm2(dictionary[:, k]) - if atom_norm < 1e-10: - if verbose == 1: - sys.stdout.write("+") - sys.stdout.flush() - elif verbose: - print("Adding new random atom") - dictionary[:, k] = random_state.randn(n_features) - if positive: - np.clip(dictionary[:, k], 0, None, out=dictionary[:, k]) - # Setting corresponding coefs to 0 - code[k, :] = 0.0 - # (U_k * U_k) ** 0.5 - atom_norm = nrm2(dictionary[:, k]) - dictionary[:, k] /= atom_norm + if A[k, k] > 1e-6: + # 1e-6 is arbitrary but consistent with the spams implementation + dictionary[k] += (B[:, k] - A[k] @ dictionary) / A[k, k] else: - dictionary[:, k] /= atom_norm - # R <- -1.0 * U_k * V_k^T + R - R = ger(-1.0, dictionary[:, k], code[k, :], a=R, overwrite_a=True) - if return_r2: - R = nrm2(R) ** 2.0 - return dictionary, R - return dictionary + # kth atom is almost never used -> sample a new one from the data + newd = Y[random_state.choice(n_samples)] + + # add small noise to avoid making the sparse coding ill conditioned + noise_level = 0.01 * (newd.std() or 1) # avoid 0 std + noise = random_state.normal(0, noise_level, size=len(newd)) + + dictionary[k] = newd + noise + code[:, k] = 0 + n_unused += 1 + + if positive: + np.clip(dictionary[k], 0, None, out=dictionary[k]) + + # Projection on the constraint set ||V_k|| == 1 + dictionary[k] /= linalg.norm(dictionary[k]) + + if verbose and n_unused > 0: + print(f"{n_unused} unused atoms resampled.") @_deprecate_positional_args @@ -579,10 +570,9 @@ def dict_learning(X, n_components, *, alpha, max_iter=100, tol=1e-8, dictionary = np.r_[dictionary, np.zeros((n_components - r, dictionary.shape[1]))] - # Fortran-order dict, as we are going 
to access its row vectors - dictionary = np.array(dictionary, order='F') - - residuals = 0 + # Fortran-order dict better suited for the sparse coding which is the + # bottleneck of this algorithm. + dictionary = np.asfortranarray(dictionary) errors = [] current_cost = np.nan @@ -607,15 +597,14 @@ def dict_learning(X, n_components, *, alpha, max_iter=100, tol=1e-8, code = sparse_encode(X, dictionary, algorithm=method, alpha=alpha, init=code, n_jobs=n_jobs, positive=positive_code, max_iter=method_max_iter, verbose=verbose) - # Update dictionary - dictionary, residuals = _update_dict(dictionary.T, X.T, code.T, - verbose=verbose, return_r2=True, - random_state=random_state, - positive=positive_dict) - dictionary = dictionary.T + + # Update dictionary in place + _update_dict(dictionary, X, code, verbose=verbose, + random_state=random_state, positive=positive_dict) # Cost function - current_cost = 0.5 * residuals + alpha * np.sum(np.abs(code)) + current_cost = (0.5 * np.sum((X - code @ dictionary)**2) + + alpha * np.sum(np.abs(code))) errors.append(current_cost) if ii > 0: @@ -807,7 +796,9 @@ def dict_learning_online(X, n_components=2, *, alpha=1, n_iter=100, else: X_train = X - dictionary = check_array(dictionary.T, order='F', dtype=np.float64, + # Fortran-order dict better suited for the sparse coding which is the + # bottleneck of this algorithm. + dictionary = check_array(dictionary, order='F', dtype=np.float64, copy=False) dictionary = np.require(dictionary, requirements='W') @@ -839,11 +830,11 @@ def dict_learning_online(X, n_components=2, *, alpha=1, n_iter=100, print("Iteration % 3i (elapsed time: % 3is, % 4.1fmn)" % (ii, dt, dt / 60)) - this_code = sparse_encode(this_X, dictionary.T, algorithm=method, + this_code = sparse_encode(this_X, dictionary, algorithm=method, alpha=alpha, n_jobs=n_jobs, check_input=False, positive=positive_code, - max_iter=method_max_iter, verbose=verbose).T + max_iter=method_max_iter, verbose=verbose) # Update the auxiliary variables if ii < batch_size - 1: @@ -853,15 +844,13 @@ def dict_learning_online(X, n_components=2, *, alpha=1, n_iter=100, beta = (theta + 1 - batch_size) / (theta + 1) A *= beta - A += np.dot(this_code, this_code.T) + A += np.dot(this_code.T, this_code) B *= beta - B += np.dot(this_X.T, this_code.T) + B += np.dot(this_X.T, this_code) - # Update dictionary - dictionary = _update_dict(dictionary, B, A, verbose=verbose, - random_state=random_state, - positive=positive_dict) - # XXX: Can the residuals be of any use? 
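        # Why the statistics A and B are sufficient: for a code matrix C and
        # data X, the batch objective
        #     0.5 * ||X - C @ D||_F**2
        #         = 0.5 * trace(A @ D @ D.T) - trace(B @ D) + const
        # with A = C.T @ C and B = X.T @ C, so the dictionary D can be
        # updated from the accumulated A and B alone.  Zeroing the gradient
        # for atom k (a row of D) gives the in-place coordinate update
        # D[k] += (B[:, k] - A[k] @ D) / A[k, k] used in `_update_dict`.
        # A small numeric sketch of one pass over the atoms (illustrative
        # names only):
        #
        #     import numpy as np
        #     rng = np.random.RandomState(0)
        #     X, C = rng.randn(20, 5), rng.randn(20, 3)
        #     A, B = C.T @ C, X.T @ C
        #     D = rng.randn(3, 5)
        #     for k in range(3):
        #         D[k] += (B[:, k] - A[k] @ D) / A[k, k]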
+ # Update dictionary in place + _update_dict(dictionary, this_X, this_code, A, B, verbose=verbose, + random_state=random_state, positive=positive_dict) # Maybe we need a stopping criteria based on the amount of # modification in the dictionary @@ -870,15 +859,15 @@ def dict_learning_online(X, n_components=2, *, alpha=1, n_iter=100, if return_inner_stats: if return_n_iter: - return dictionary.T, (A, B), ii - iter_offset + 1 + return dictionary, (A, B), ii - iter_offset + 1 else: - return dictionary.T, (A, B) + return dictionary, (A, B) if return_code: if verbose > 1: print('Learning code...', end=' ') elif verbose == 1: print('|', end=' ') - code = sparse_encode(X, dictionary.T, algorithm=method, alpha=alpha, + code = sparse_encode(X, dictionary, algorithm=method, alpha=alpha, n_jobs=n_jobs, check_input=False, positive=positive_code, max_iter=method_max_iter, verbose=verbose) @@ -886,14 +875,14 @@ def dict_learning_online(X, n_components=2, *, alpha=1, n_iter=100, dt = (time.time() - t0) print('done (total time: % 3is, % 4.1fmn)' % (dt, dt / 60)) if return_n_iter: - return code, dictionary.T, ii - iter_offset + 1 + return code, dictionary, ii - iter_offset + 1 else: - return code, dictionary.T + return code, dictionary if return_n_iter: - return dictionary.T, ii - iter_offset + 1 + return dictionary, ii - iter_offset + 1 else: - return dictionary.T + return dictionary class _BaseSparseCoding(TransformerMixin): @@ -1286,7 +1275,7 @@ class DictionaryLearning(_BaseSparseCoding, BaseEstimator): We can check the level of sparsity of `X_transformed`: >>> np.mean(X_transformed == 0) - 0.88... + 0.87... We can compare the average squared euclidean norm of the reconstruction error of the sparse coded signal relative to the squared euclidean norm of @@ -1294,7 +1283,7 @@ class DictionaryLearning(_BaseSparseCoding, BaseEstimator): >>> X_hat = X_transformed @ dict_learner.components_ >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1)) - 0.07... + 0.08... Notes ----- @@ -1523,7 +1512,7 @@ class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): We can check the level of sparsity of `X_transformed`: >>> np.mean(X_transformed == 0) - 0.87... + 0.86... We can compare the average squared euclidean norm of the reconstruction error of the sparse coded signal relative to the squared euclidean norm of @@ -1531,7 +1520,7 @@ class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): >>> X_hat = X_transformed @ dict_learner.components_ >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1)) - 0.10... + 0.07... 
Notes ----- diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py index a13c07a6ac728..4048450a5d486 100644 --- a/sklearn/decomposition/tests/test_dict_learning.py +++ b/sklearn/decomposition/tests/test_dict_learning.py @@ -10,6 +10,7 @@ from sklearn.utils import check_array +from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import ignore_warnings @@ -25,6 +26,8 @@ from sklearn.utils.estimator_checks import check_transformer_general from sklearn.utils.estimator_checks import check_transformers_unfitted +from sklearn.decomposition._dict_learning import _update_dict + rng_global = np.random.RandomState(0) n_samples, n_features = 10, 8 @@ -575,6 +578,31 @@ def test_sparse_coder_n_features_in(): assert sc.n_features_in_ == d.shape[1] +def test_update_dict(): + # Check the dict update in batch mode vs online mode + # Non-regression test for #4866 + rng = np.random.RandomState(0) + + code = np.array([[0.5, -0.5], + [0.1, 0.9]]) + dictionary = np.array([[1., 0.], + [0.6, 0.8]]) + + X = np.dot(code, dictionary) + rng.randn(2, 2) + + # full batch update + newd_batch = dictionary.copy() + _update_dict(newd_batch, X, code) + + # online update + A = np.dot(code.T, code) + B = np.dot(X.T, code) + newd_online = dictionary.copy() + _update_dict(newd_online, X, code, A, B) + + assert_allclose(newd_batch, newd_online) + + @pytest.mark.parametrize("Estimator", [DictionaryLearning, MiniBatchDictionaryLearning]) def test_warning_default_transform_alpha(Estimator): From 962bd9a401bfee1f2d8e7e832018a75424b5bbe2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 16 Apr 2021 14:19:43 +0200 Subject: [PATCH 332/478] DOC Adds consistence in docs for univariate selection metrics (#19904) Co-authored-by: Julien Jerphanion Co-authored-by: Olivier Grisel --- .../_univariate_selection.py | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 0656e27d6e30f..be3298387f612 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -51,15 +51,15 @@ def f_oneway(*args): Parameters ---------- - *args : array-like, sparse matrices + *args : {array-like, sparse matrix} sample1, sample2... The sample measurements should be given as arguments. Returns ------- - F-value : float + f_statistic : float The computed F-value of the test. - p-value : float + p_value : float The associated p-value from the F-distribution. Notes @@ -127,19 +127,19 @@ def f_classif(X, y): Parameters ---------- - X : {array-like, sparse matrix} shape = [n_samples, n_features] + X : {array-like, sparse matrix} of shape (n_samples, n_features) The set of regressors that will be tested sequentially. - y : array of shape(n_samples) - The data matrix. + y : ndarray of shape (n_samples,) + The target vector. Returns ------- - F : array, shape = [n_features,] - The set of F values. + f_statistic : ndarray of shape (n_features,) + F-statistic for each feature. - pval : array, shape = [n_features,] - The set of p-values. + p_values : ndarray of shape (n_features,) + P-values associated with the F-statistic. See Also -------- @@ -195,10 +195,11 @@ def chi2(X, y): Returns ------- - chi2 : array, shape = (n_features,) - chi2 statistics of each feature. 
- pval : array, shape = (n_features,) - p-values of each feature. + chi2 : ndarray of shape (n_features,) + Chi2 statistics for each feature. + + p_values : ndarray of shape (n_features,) + P-values for each feature. Notes ----- From 90b399269b83531191f5f244d48182014acbc4f5 Mon Sep 17 00:00:00 2001 From: Shao Yang Hong Date: Sat, 17 Apr 2021 00:41:59 +0800 Subject: [PATCH 333/478] DOC Fix the description of some features in load_diabetes (#19366) Co-authored-by: Guillaume Lemaitre --- sklearn/datasets/_base.py | 6 ++++++ sklearn/datasets/descr/diabetes.rst | 6 +++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index 17d2db9f2075b..948b4f7cba61e 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -765,6 +765,12 @@ def load_diabetes(*, return_X_y=False, as_frame=False): Features real, -.2 < x < .2 Targets integer 25 - 346 ============== ================== + + .. note:: + The meaning of each feature (i.e. `feature_names`) might be unclear + (especially for `ltg`) as the documentation of the original dataset is + not explicit. We provide information that seems correct in regard with + the scientific literature in this field of research. Read more in the :ref:`User Guide `. diff --git a/sklearn/datasets/descr/diabetes.rst b/sklearn/datasets/descr/diabetes.rst index 771b3e5fe282a..04651c0163307 100644 --- a/sklearn/datasets/descr/diabetes.rst +++ b/sklearn/datasets/descr/diabetes.rst @@ -21,11 +21,11 @@ quantitative measure of disease progression one year after baseline. - sex - bmi body mass index - bp average blood pressure - - s1 tc, T-Cells (a type of white blood cells) + - s1 tc, total serum cholesterol - s2 ldl, low-density lipoproteins - s3 hdl, high-density lipoproteins - - s4 tch, thyroid stimulating hormone - - s5 ltg, lamotrigine + - s4 tch, total cholesterol / HDL + - s5 ltg, possibly log of serum triglycerides level - s6 glu, blood sugar level Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1). From e1f879e8eed85c5018d888c9f87f168bc44085e1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 16 Apr 2021 21:07:13 +0200 Subject: [PATCH 334/478] DOC add FAQ entry for the many linear model classes (#19861) Co-authored-by: Chiara Marmo --- doc/faq.rst | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/doc/faq.rst b/doc/faq.rst index 0ebd4df759125..4038106bc93d7 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -396,3 +396,44 @@ and not at test time, for resampling and similar uses, like at `imbalanced-learn`. In general, these use cases can be solved with a custom meta estimator rather than a Pipeline + +Why are there so many different estimators for linear models? +------------------------------------------------------------- +Usually, there is one classifier and one regressor per model type, e.g. +:class:`~ensemble.GradientBoostingClassifier` and +:class:`~ensemble.GradientBoostingRegressor`. Both have similar options and +both have the parameter `loss`, which is especially useful in the regression +case as it enables the estimation of conditional mean as well as conditional +quantiles. + +For linear models, there are many estimator classes which are very close to +each other. 
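For instance, the penalized estimators listed below are used in exactly the
same way and differ only in their penalty term; a minimal sketch::

    >>> from sklearn.datasets import make_regression
    >>> from sklearn.linear_model import Ridge, Lasso, ElasticNet
    >>> X, y = make_regression(random_state=0)
    >>> for model in (Ridge(alpha=1.0), Lasso(alpha=0.1),
    ...               ElasticNet(alpha=0.1, l1_ratio=0.5)):
    ...     _ = model.fit(X, y)  # identical API, different penalty
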
Let us have a look at + +- :class:`~linear_model.LinearRegression`, no penalty +- :class:`~linear_model.Ridge`, L2 penalty +- :class:`~linear_model.Lasso`, L1 penalty (sparse models) +- :class:`~linear_model.ElasticNet`, L1 + L2 penalty (less sparse models) +- :class:`~linear_model.SGDRegressor` with `loss='squared_loss'` + +**Maintainer perspective:** +They all do in principle the same and are different only by the penalty they +impose. This, however, has a large impact on the way the underlying +optimization problem is solved. In the end, this amounts to usage of different +methods and tricks from linear algebra. A special case is `SGDRegressor` which +comprises all 4 previous models and is different by the optimization procedure. +A further side effect is that the different estimators favor different data +layouts (`X` c-contiguous or f-contiguous, sparse csr or csc). This complexity +of the seemingly simple linear models is the reason for having different +estimator classes for different penalties. + +**User perspective:** +First, the current design is inspired by the scientific literature where linear +regression models with different regularization/penalty were given different +names, e.g. *ridge regression*. Having different model classes with according +names makes it easier for users to find those regression models. +Secondly, if all the 5 above mentioned linear models were unified into a single +class, there would be parameters with a lot of options like the ``solver`` +parameter. On top of that, there would be a lot of exclusive interactions +between different parameters. For example, the possible options of the +parameters ``solver``, ``precompute`` and ``selection`` would depend on the +chosen values of the penalty parameters ``alpha`` and ``l1_ratio``. From 9605f3b586990c51a045838dd7464cc0ef3d3e18 Mon Sep 17 00:00:00 2001 From: Ishan Mishra <33893659+legitishan@users.noreply.github.com> Date: Sun, 18 Apr 2021 17:32:17 +0530 Subject: [PATCH 335/478] DOC Fixes links in outlier_detection.html (#19917) --- doc/modules/outlier_detection.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/outlier_detection.rst b/doc/modules/outlier_detection.rst index 14495bc558dab..496b840e0c6da 100644 --- a/doc/modules/outlier_detection.rst +++ b/doc/modules/outlier_detection.rst @@ -356,7 +356,7 @@ on new unseen data when LOF is applied for novelty detection, i.e. when the This strategy is illustrated below. .. figure:: ../auto_examples/neighbors/images/sphx_glr_plot_lof_outlier_detection_001.png - :target: ../auto_examples/neighbors/sphx_glr_plot_lof_outlier_detection.html + :target: ../auto_examples/neighbors/plot_lof_outlier_detection.html :align: center :scale: 75% @@ -401,6 +401,6 @@ Note that ``fit_predict`` is not available in this case. Novelty detection with Local Outlier Factor is illustrated below. .. figure:: ../auto_examples/neighbors/images/sphx_glr_plot_lof_novelty_detection_001.png - :target: ../auto_examples/neighbors/sphx_glr_plot_lof_novelty_detection.html + :target: ../auto_examples/neighbors/plot_lof_novelty_detection.html :align: center :scale: 75% From 5d8796b91db3e85975e6cab8b779aefa9502227a Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Sun, 18 Apr 2021 11:31:50 -0400 Subject: [PATCH 336/478] FIX Fixes unknown handling for str dtypes in OrdinalEncoder.transform (#19888) * FIX Fixes unknown handling for str X in OrdinalEncoder.transform * DOC Adds whats new * DOC Move to 0.24.2 * DOC Adds reasoning in comment --- doc/whats_new/v0.24.rst | 3 +++ doc/whats_new/v1.0.rst | 2 +- sklearn/preprocessing/_encoders.py | 6 ++++++ sklearn/preprocessing/tests/test_encoders.py | 20 ++++++++++++++++++++ 4 files changed, 30 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 09f3d9bdecd3e..880d1879637ed 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -53,6 +53,9 @@ Changelog `'use_encoded_value'` strategies. :pr:`19234` by `Guillaume Lemaitre `. +- |Fix| :meth:`preprocessing.OrdinalEncoder.transfrom` correctly handles + unknown values for string dtypes. :pr:`19888` by `Thomas Fan`_. + :mod:`sklearn.multioutput` .......................... diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 0494e5f29bf39..a78cbe69b746d 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -311,7 +311,7 @@ Changelog :pr:`18649` by `Leandro Hermida ` and `Rodion Martynov `. -- |Fix| The `fit` method of the successive halving parameter search +- |Fix| The `fit` method of the successive halving parameter search (:class:`model_selection.HalvingGridSearchCV`, and :class:`model_selection.HalvingRandomSearchCV`) now correctly handles the `groups` parameter. :pr:`19847` by :user:`Xiaoyu Chai `. diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index cd05dc89bb75d..7c62cbdcbc565 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -150,6 +150,12 @@ def _transform(self, X, handle_unknown='error', force_all_finite=True, if (self.categories_[i].dtype.kind in ('U', 'S') and self.categories_[i].itemsize > Xi.itemsize): Xi = Xi.astype(self.categories_[i].dtype) + elif (self.categories_[i].dtype.kind == 'O' and + Xi.dtype.kind == 'U'): + # categories are objects and Xi are numpy strings. + # Cast Xi to an object dtype to prevent truncation + # when setting invalid values. + Xi = Xi.astype('O') else: Xi = Xi.copy() diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 9f1e331f78fec..94e2c276dcd58 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -1132,3 +1132,23 @@ def test_ordinal_encoder_sparse(): X_trans_sparse = sparse.csr_matrix(X_trans) with pytest.raises(TypeError, match=err_msg): encoder.inverse_transform(X_trans_sparse) + + +@pytest.mark.parametrize("X_train", [ + [['AA', 'B']], + np.array([['AA', 'B']], dtype='O'), + np.array([['AA', 'B']], dtype='U'), +]) +@pytest.mark.parametrize("X_test", [ + [['A', 'B']], + np.array([['A', 'B']], dtype='O'), + np.array([['A', 'B']], dtype='U'), +]) +def test_ordinal_encoder_handle_unknown_string_dtypes(X_train, X_test): + """Checks that ordinal encoder transforms string dtypes. 
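    Fixed-width NumPy unicode arrays silently truncate longer assignments
    (e.g. ``np.array(['A'])[0] = 'AA'`` stores ``'A'``), which is why the
    encoder now casts such input to object dtype before writing unknown
    values.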
Non-regression
+    test for #19872."""
+    enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-9)
+    enc.fit(X_train)
+
+    X_trans = enc.transform(X_test)
+    assert_allclose(X_trans, [[-9, 0]])

From 66ec10f341087ca05156f701e0ba90717cd252d2 Mon Sep 17 00:00:00 2001
From: Maxwell
Date: Mon, 19 Apr 2021 15:50:10 +0800
Subject: [PATCH 337/478] MAINT Clean up code in FastICA (#19796)

* FIX code cleanup in FastICA

* keep syntax X_mean

Co-authored-by: Thomas J. Fan
---
 sklearn/decomposition/_fastica.py | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py
index 27f7f646ea579..a57ddada85694 100644
--- a/sklearn/decomposition/_fastica.py
+++ b/sklearn/decomposition/_fastica.py
@@ -427,9 +427,8 @@ def _fit(self, X, compute_sources=False):
         -------
         X_new : ndarray of shape (n_samples, n_components)
         """
-
-        X = self._validate_data(X, copy=self.whiten, dtype=FLOAT_DTYPES,
-                                ensure_min_samples=2).T
+        XT = self._validate_data(X, copy=self.whiten, dtype=FLOAT_DTYPES,
+                                 ensure_min_samples=2).T
         fun_args = {} if self.fun_args is None else self.fun_args
         random_state = check_random_state(self.random_state)
 
@@ -454,7 +453,7 @@ def g(x, fun_args):
                 % self.fun
             )
 
-        n_samples, n_features = X.shape
+        n_features, n_samples = XT.shape
 
         n_components = self.n_components
         if not self.whiten and n_components is not None:
@@ -471,24 +470,24 @@ def g(x, fun_args):
             )
 
         if self.whiten:
-            # Centering the columns (ie the variables)
-            X_mean = X.mean(axis=-1)
-            X -= X_mean[:, np.newaxis]
+            # Centering the features of X
+            X_mean = XT.mean(axis=-1)
+            XT -= X_mean[:, np.newaxis]
 
             # Whitening and preprocessing by PCA
-            u, d, _ = linalg.svd(X, full_matrices=False, check_finite=False)
+            u, d, _ = linalg.svd(XT, full_matrices=False, check_finite=False)
             del _
             K = (u / d).T[:n_components]  # see (6.33) p.140
             del u, d
-            X1 = np.dot(K, X)
+            X1 = np.dot(K, XT)
             # see (13.6) p.267 Here X1 is white and data
             # in X has been projected onto a subspace by PCA
-            X1 *= np.sqrt(n_features)
+            X1 *= np.sqrt(n_samples)
         else:
             # X must be casted to floats to avoid typing issues with numpy
             # 2.0 and the line below
-            X1 = as_float_array(X, copy=False)  # copy has been taken care of
+            X1 = as_float_array(XT, copy=False)  # copy has been taken care of
 
         w_init = self.w_init
         if w_init is None:
@@ -519,9 +518,9 @@ def g(x, fun_args):
 
         if compute_sources:
             if self.whiten:
-                S = np.linalg.multi_dot([W, K, X]).T
+                S = np.linalg.multi_dot([W, K, XT]).T
             else:
-                S = np.dot(W, X).T
+                S = np.dot(W, XT).T
         else:
             S = None
 

From 969ec32c9273d77641bc9591f44cd2ae3daf4434 Mon Sep 17 00:00:00 2001
From: Isaack Mungui <41724425+isaack-mungui@users.noreply.github.com>
Date: Mon, 19 Apr 2021 11:08:36 +0300
Subject: [PATCH 338/478] DOC fix early stopping description in MLP (#19818)

* Maintenance task: Moved PolynomialFeatures to _polynomial.py

* Updated docstring including behaviour of neural network when early stopping is activated

* Revert "Maintenance task: Moved PolynomialFeatures to _polynomial.py"

This reverts commit f76df548ecd31dbe6093d4e8329711197c830542.

* reverted failing commit

* Update sklearn/neural_network/_multilayer_perceptron.py

Co-authored-by: Thomas J. Fan

* Updated doc with proposed changes

* Fixed whitespace error

Co-authored-by: Thomas J. Fan
---
 sklearn/neural_network/_multilayer_perceptron.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py
index 04822360791e7..e349dfd844f96 100644
--- a/sklearn/neural_network/_multilayer_perceptron.py
+++ b/sklearn/neural_network/_multilayer_perceptron.py
@@ -825,6 +825,9 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron):
         validation score is not improving by at least tol for
         ``n_iter_no_change`` consecutive epochs. The split is stratified,
         except in a multilabel setting.
+        If early stopping is False, then the training stops when the training
+        loss does not improve by more than tol for n_iter_no_change consecutive
+        passes over the training set.
         Only effective when solver='sgd' or 'adam'
 
     validation_fraction : float, default=0.1

From 73e8b7d0a984fac8420cb4f948d53470ef9b5abf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Carlos=20Alfaro=20Jim=C3=A9nez?=
Date: Mon, 19 Apr 2021 10:43:14 +0200
Subject: [PATCH 339/478] DOC use math mode in r2_score function (#19921)

---
 sklearn/metrics/_regression.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py
index 7edf7924e50e1..c2a0e7f7f033b 100644
--- a/sklearn/metrics/_regression.py
+++ b/sklearn/metrics/_regression.py
@@ -670,12 +670,12 @@ def explained_variance_score(y_true, y_pred, *,
 
 @_deprecate_positional_args
 def r2_score(y_true, y_pred, *, sample_weight=None,
              multioutput="uniform_average"):
-    """R^2 (coefficient of determination) regression score function.
+    """:math:`R^2` (coefficient of determination) regression score function.
 
     Best possible score is 1.0 and it can be negative (because the
     model can be arbitrarily worse). A constant model that always
     predicts the expected value of y, disregarding the input features,
-    would get a R^2 score of 0.0.
+    would get a :math:`R^2` score of 0.0.
 
     Read more in the :ref:`User Guide `.
@@ -713,15 +713,15 @@ def r2_score(y_true, y_pred, *, sample_weight=None,
 
     Returns
     -------
     z : float or ndarray of floats
-        The R^2 score or ndarray of scores if 'multioutput' is
+        The :math:`R^2` score or ndarray of scores if 'multioutput' is
         'raw_values'.
 
     Notes
     -----
     This is not a symmetric function.
 
-    Unlike most other scores, R^2 score may be negative (it need not actually
-    be the square of a quantity R).
+    Unlike most other scores, :math:`R^2` score may be negative (it need not
+    actually be the square of a quantity R).
 
     This metric is not well-defined for single samples and will return a NaN
     value if n_samples is less than two.

From 5efcb10c3e9f87103c404c32a036beb623182c83 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan"
Date: Mon, 19 Apr 2021 11:06:38 -0400
Subject: [PATCH 340/478] MAINT Vendors packaging/version.py for pep440 versioning (#19826)

* MAINT Vendors packaging/version for pep440 versioning

* ENH Uses folder structure of packaging

* ENH Uses fixes parse_version

* ENH Uses packaging name

* MAINT Adds packaging

* BLD Use vendored version [cd build]
---
 setup.py                                    |   2 +-
 sklearn/externals/_packaging/__init__.py    |   0
 sklearn/externals/_packaging/_structures.py |  90 +++
 sklearn/externals/_packaging/version.py     | 527 ++++++++++++++++++
 .../preprocessing/tests/test_polynomial.py  |   4 +-
 sklearn/setup.py                            |   1 +
 sklearn/utils/fixes.py                      |   8 +-
 7 files changed, 621 insertions(+), 11 deletions(-)
 create mode 100644 sklearn/externals/_packaging/__init__.py
 create mode 100644 sklearn/externals/_packaging/_structures.py
 create mode 100644 sklearn/externals/_packaging/version.py

diff --git a/setup.py b/setup.py
index e44f941e0a114..9758f62de1301 100755
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,6 @@
 from distutils.command.clean import clean as Clean
 from distutils.command.sdist import sdist
 
-from pkg_resources import parse_version
 import traceback
 import importlib
 try:
@@ -51,6 +50,7 @@
 # does not need the compiled code
 import sklearn
 import sklearn._min_dependencies as min_deps  # noqa
+from sklearn.externals._packaging.version import parse as parse_version  # noqa
 
 VERSION = sklearn.__version__
 
diff --git a/sklearn/externals/_packaging/__init__.py b/sklearn/externals/_packaging/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/sklearn/externals/_packaging/_structures.py b/sklearn/externals/_packaging/_structures.py
new file mode 100644
index 0000000000000..837e3a7946d70
--- /dev/null
+++ b/sklearn/externals/_packaging/_structures.py
@@ -0,0 +1,90 @@
+"""Vendoered from
+https://github.com/pypa/packaging/blob/main/packaging/_structures.py
+"""
+# Copyright (c) Donald Stufft and individual contributors.
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice,
+#    this list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+class InfinityType:
+    def __repr__(self) -> str:
+        return "Infinity"
+
+    def __hash__(self) -> int:
+        return hash(repr(self))
+
+    def __lt__(self, other: object) -> bool:
+        return False
+
+    def __le__(self, other: object) -> bool:
+        return False
+
+    def __eq__(self, other: object) -> bool:
+        return isinstance(other, self.__class__)
+
+    def __ne__(self, other: object) -> bool:
+        return not isinstance(other, self.__class__)
+
+    def __gt__(self, other: object) -> bool:
+        return True
+
+    def __ge__(self, other: object) -> bool:
+        return True
+
+    def __neg__(self: object) -> "NegativeInfinityType":
+        return NegativeInfinity
+
+
+Infinity = InfinityType()
+
+
+class NegativeInfinityType:
+    def __repr__(self) -> str:
+        return "-Infinity"
+
+    def __hash__(self) -> int:
+        return hash(repr(self))
+
+    def __lt__(self, other: object) -> bool:
+        return True
+
+    def __le__(self, other: object) -> bool:
+        return True
+
+    def __eq__(self, other: object) -> bool:
+        return isinstance(other, self.__class__)
+
+    def __ne__(self, other: object) -> bool:
+        return not isinstance(other, self.__class__)
+
+    def __gt__(self, other: object) -> bool:
+        return False
+
+    def __ge__(self, other: object) -> bool:
+        return False
+
+    def __neg__(self: object) -> InfinityType:
+        return Infinity
+
+
+NegativeInfinity = NegativeInfinityType()
diff --git a/sklearn/externals/_packaging/version.py b/sklearn/externals/_packaging/version.py
new file mode 100644
index 0000000000000..ea83bbb8b5389
--- /dev/null
+++ b/sklearn/externals/_packaging/version.py
@@ -0,0 +1,527 @@
+"""Vendoered from
+https://github.com/pypa/packaging/blob/main/packaging/version.py
+"""
+# Copyright (c) Donald Stufft and individual contributors.
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice,
+#    this list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import collections
+import itertools
+import re
+import warnings
+from typing import Callable, Iterator, List, Optional, SupportsInt, Tuple, Union
+
+from ._structures import Infinity, InfinityType, NegativeInfinity, NegativeInfinityType
+
+__all__ = ["parse", "Version", "LegacyVersion", "InvalidVersion", "VERSION_PATTERN"]
+
+InfiniteTypes = Union[InfinityType, NegativeInfinityType]
+PrePostDevType = Union[InfiniteTypes, Tuple[str, int]]
+SubLocalType = Union[InfiniteTypes, int, str]
+LocalType = Union[
+    NegativeInfinityType,
+    Tuple[
+        Union[
+            SubLocalType,
+            Tuple[SubLocalType, str],
+            Tuple[NegativeInfinityType, SubLocalType],
+        ],
+        ...,
+    ],
+]
+CmpKey = Tuple[
+    int, Tuple[int, ...], PrePostDevType, PrePostDevType, PrePostDevType, LocalType
+]
+LegacyCmpKey = Tuple[int, Tuple[str, ...]]
+VersionComparisonMethod = Callable[
+    [Union[CmpKey, LegacyCmpKey], Union[CmpKey, LegacyCmpKey]], bool
+]
+
+_Version = collections.namedtuple(
+    "_Version", ["epoch", "release", "dev", "pre", "post", "local"]
+)
+
+
+def parse(version: str) -> Union["LegacyVersion", "Version"]:
+    """
+    Parse the given version string and return either a :class:`Version` object
+    or a :class:`LegacyVersion` object depending on if the given version is
+    a valid PEP 440 version or a legacy version.
+    """
+    try:
+        return Version(version)
+    except InvalidVersion:
+        return LegacyVersion(version)
+
+
+class InvalidVersion(ValueError):
+    """
+    An invalid version was found, users should refer to PEP 440.
+    """
+
+
+class _BaseVersion:
+    _key: Union[CmpKey, LegacyCmpKey]
+
+    def __hash__(self) -> int:
+        return hash(self._key)
+
+    # Please keep the duplicated `isinstance` check
+    # in the six comparisons hereunder
+    # unless you find a way to avoid adding overhead function calls.
+    def __lt__(self, other: "_BaseVersion") -> bool:
+        if not isinstance(other, _BaseVersion):
+            return NotImplemented
+
+        return self._key < other._key
+
+    def __le__(self, other: "_BaseVersion") -> bool:
+        if not isinstance(other, _BaseVersion):
+            return NotImplemented
+
+        return self._key <= other._key
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, _BaseVersion):
+            return NotImplemented
+
+        return self._key == other._key
+
+    def __ge__(self, other: "_BaseVersion") -> bool:
+        if not isinstance(other, _BaseVersion):
+            return NotImplemented
+
+        return self._key >= other._key
+
+    def __gt__(self, other: "_BaseVersion") -> bool:
+        if not isinstance(other, _BaseVersion):
+            return NotImplemented
+
+        return self._key > other._key
+
+    def __ne__(self, other: object) -> bool:
+        if not isinstance(other, _BaseVersion):
+            return NotImplemented
+
+        return self._key != other._key
+
+
+class LegacyVersion(_BaseVersion):
+    def __init__(self, version: str) -> None:
+        self._version = str(version)
+        self._key = _legacy_cmpkey(self._version)
+
+        warnings.warn(
+            "Creating a LegacyVersion has been deprecated and will be "
+            "removed in the next major release",
+            DeprecationWarning,
+        )
+
+    def __str__(self) -> str:
+        return self._version
+
+    def __repr__(self) -> str:
+        return f""
+
+    @property
+    def public(self) -> str:
+        return self._version
+
+    @property
+    def base_version(self) -> str:
+        return self._version
+
+    @property
+    def epoch(self) -> int:
+        return -1
+
+    @property
+    def release(self) -> None:
+        return None
+
+    @property
+    def pre(self) -> None:
+        return None
+
+    @property
+    def post(self) -> None:
+        return None
+
+    @property
+    def dev(self) -> None:
+        return None
+
+    @property
+    def local(self) -> None:
+        return None
+
+    @property
+    def is_prerelease(self) -> bool:
+        return False
+
+    @property
+    def is_postrelease(self) -> bool:
+        return False
+
+    @property
+    def is_devrelease(self) -> bool:
+        return False
+
+
+_legacy_version_component_re = re.compile(r"(\d+ | [a-z]+ | \.| -)", re.VERBOSE)
+
+_legacy_version_replacement_map = {
+    "pre": "c",
+    "preview": "c",
+    "-": "final-",
+    "rc": "c",
+    "dev": "@",
+}
+
+
+def _parse_version_parts(s: str) -> Iterator[str]:
+    for part in _legacy_version_component_re.split(s):
+        part = _legacy_version_replacement_map.get(part, part)
+
+        if not part or part == ".":
+            continue
+
+        if part[:1] in "0123456789":
+            # pad for numeric comparison
+            yield part.zfill(8)
+        else:
+            yield "*" + part
+
+    # ensure that alpha/beta/candidate are before final
+    yield "*final"
+
+
+def _legacy_cmpkey(version: str) -> LegacyCmpKey:
+
+    # We hardcode an epoch of -1 here. A PEP 440 version can only have a epoch
+    # greater than or equal to 0. This will effectively put the LegacyVersion,
+    # which uses the defacto standard originally implemented by setuptools,
+    # as before all PEP 440 versions.
+    epoch = -1
+
+    # This scheme is taken from pkg_resources.parse_version setuptools prior to
+    # it's adoption of the packaging library.
+    parts: List[str] = []
+    for part in _parse_version_parts(version.lower()):
+        if part.startswith("*"):
+            # remove "-" before a prerelease tag
+            if part < "*final":
+                while parts and parts[-1] == "*final-":
+                    parts.pop()
+
+            # remove trailing zeros from each series of numeric parts
+            while parts and parts[-1] == "00000000":
+                parts.pop()
+
+        parts.append(part)
+
+    return epoch, tuple(parts)
+
+
+# Deliberately not anchored to the start and end of the string, to make it
+# easier for 3rd party code to reuse
+VERSION_PATTERN = r"""
+    v?
+    (?:
+        (?:(?P[0-9]+)!)?                           # epoch
+        (?P[0-9]+(?:\.[0-9]+)*)                  # release segment
+        (?P                                          # pre-release
+            [-_\.]?
+            (?P(a|b|c|rc|alpha|beta|pre|preview))
+            [-_\.]?
+            (?P[0-9]+)?
+        )?
+        (?P                                         # post release
+            (?:-(?P[0-9]+))
+            |
+            (?:
+                [-_\.]?
+                (?Ppost|rev|r)
+                [-_\.]?
+                (?P[0-9]+)?
+            )
+        )?
+        (?P                                          # dev release
+            [-_\.]?
+            (?Pdev)
+            [-_\.]?
+            (?P[0-9]+)?
+        )?
+    )
+    (?:\+(?P[a-z0-9]+(?:[-_\.][a-z0-9]+)*))?       # local version
+"""
+
+
+class Version(_BaseVersion):
+
+    _regex = re.compile(r"^\s*" + VERSION_PATTERN + r"\s*$", re.VERBOSE | re.IGNORECASE)
+
+    def __init__(self, version: str) -> None:
+
+        # Validate the version and parse it into pieces
+        match = self._regex.search(version)
+        if not match:
+            raise InvalidVersion(f"Invalid version: '{version}'")
+
+        # Store the parsed out pieces of the version
+        self._version = _Version(
+            epoch=int(match.group("epoch")) if match.group("epoch") else 0,
+            release=tuple(int(i) for i in match.group("release").split(".")),
+            pre=_parse_letter_version(match.group("pre_l"), match.group("pre_n")),
+            post=_parse_letter_version(
+                match.group("post_l"), match.group("post_n1") or match.group("post_n2")
+            ),
+            dev=_parse_letter_version(match.group("dev_l"), match.group("dev_n")),
+            local=_parse_local_version(match.group("local")),
+        )
+
+        # Generate a key which will be used for sorting
+        self._key = _cmpkey(
+            self._version.epoch,
+            self._version.release,
+            self._version.pre,
+            self._version.post,
+            self._version.dev,
+            self._version.local,
+        )
+
+    def __repr__(self) -> str:
+        return f""
+
+    def __str__(self) -> str:
+        parts = []
+
+        # Epoch
+        if self.epoch != 0:
+            parts.append(f"{self.epoch}!")
+
+        # Release segment
+        parts.append(".".join(str(x) for x in self.release))
+
+        # Pre-release
+        if self.pre is not None:
+            parts.append("".join(str(x) for x in self.pre))
+
+        # Post-release
+        if self.post is not None:
+            parts.append(f".post{self.post}")
+
+        # Development release
+        if self.dev is not None:
+            parts.append(f".dev{self.dev}")
+
+        # Local version segment
+        if self.local is not None:
+            parts.append(f"+{self.local}")
+
+        return "".join(parts)
+
+    @property
+    def epoch(self) -> int:
+        _epoch: int = self._version.epoch
+        return _epoch
+
+    @property
+    def release(self) -> Tuple[int, ...]:
+        _release: Tuple[int, ...] = self._version.release
+        return _release
+
+    @property
+    def pre(self) -> Optional[Tuple[str, int]]:
+        _pre: Optional[Tuple[str, int]] = self._version.pre
+        return _pre
+
+    @property
+    def post(self) -> Optional[int]:
+        return self._version.post[1] if self._version.post else None
+
+    @property
+    def dev(self) -> Optional[int]:
+        return self._version.dev[1] if self._version.dev else None
+
+    @property
+    def local(self) -> Optional[str]:
+        if self._version.local:
+            return ".".join(str(x) for x in self._version.local)
+        else:
+            return None
+
+    @property
+    def public(self) -> str:
+        return str(self).split("+", 1)[0]
+
+    @property
+    def base_version(self) -> str:
+        parts = []
+
+        # Epoch
+        if self.epoch != 0:
+            parts.append(f"{self.epoch}!")
+
+        # Release segment
+        parts.append(".".join(str(x) for x in self.release))
+
+        return "".join(parts)
+
+    @property
+    def is_prerelease(self) -> bool:
+        return self.dev is not None or self.pre is not None
+
+    @property
+    def is_postrelease(self) -> bool:
+        return self.post is not None
+
+    @property
+    def is_devrelease(self) -> bool:
+        return self.dev is not None
+
+    @property
+    def major(self) -> int:
+        return self.release[0] if len(self.release) >= 1 else 0
+
+    @property
+    def minor(self) -> int:
+        return self.release[1] if len(self.release) >= 2 else 0
+
+    @property
+    def micro(self) -> int:
+        return self.release[2] if len(self.release) >= 3 else 0
+
+
+def _parse_letter_version(
+    letter: str, number: Union[str, bytes, SupportsInt]
+) -> Optional[Tuple[str, int]]:
+
+    if letter:
+        # We consider there to be an implicit 0 in a pre-release if there is
+        # not a numeral associated with it.
+        if number is None:
+            number = 0
+
+        # We normalize any letters to their lower case form
+        letter = letter.lower()
+
+        # We consider some words to be alternate spellings of other words and
+        # in those cases we want to normalize the spellings to our preferred
+        # spelling.
+        if letter == "alpha":
+            letter = "a"
+        elif letter == "beta":
+            letter = "b"
+        elif letter in ["c", "pre", "preview"]:
+            letter = "rc"
+        elif letter in ["rev", "r"]:
+            letter = "post"
+
+        return letter, int(number)
+    if not letter and number:
+        # We assume if we are given a number, but we are not given a letter
+        # then this is using the implicit post release syntax (e.g. 1.0-1)
+        letter = "post"
+
+        return letter, int(number)
+
+    return None
+
+
+_local_version_separators = re.compile(r"[\._-]")
+
+
+def _parse_local_version(local: str) -> Optional[LocalType]:
+    """
+    Takes a string like abc.1.twelve and turns it into ("abc", 1, "twelve").
+    """
+    if local is not None:
+        return tuple(
+            part.lower() if not part.isdigit() else int(part)
+            for part in _local_version_separators.split(local)
+        )
+    return None
+
+
+def _cmpkey(
+    epoch: int,
+    release: Tuple[int, ...],
+    pre: Optional[Tuple[str, int]],
+    post: Optional[Tuple[str, int]],
+    dev: Optional[Tuple[str, int]],
+    local: Optional[Tuple[SubLocalType]],
+) -> CmpKey:
+
+    # When we compare a release version, we want to compare it with all of the
+    # trailing zeros removed. So we'll use a reverse the list, drop all the now
+    # leading zeros until we come to something non zero, then take the rest
+    # re-reverse it back into the correct order and make it a tuple and use
+    # that for our sorting key.
+    _release = tuple(
+        reversed(list(itertools.dropwhile(lambda x: x == 0, reversed(release))))
+    )
+
+    # We need to "trick" the sorting algorithm to put 1.0.dev0 before 1.0a0.
+    # We'll do this by abusing the pre segment, but we _only_ want to do this
+    # if there is not a pre or a post segment. If we have one of those then
+    # the normal sorting rules will handle this case correctly.
+    if pre is None and post is None and dev is not None:
+        _pre: PrePostDevType = NegativeInfinity
+    # Versions without a pre-release (except as noted above) should sort after
+    # those with one.
+    elif pre is None:
+        _pre = Infinity
+    else:
+        _pre = pre
+
+    # Versions without a post segment should sort before those with one.
+    if post is None:
+        _post: PrePostDevType = NegativeInfinity
+
+    else:
+        _post = post
+
+    # Versions without a development segment should sort after those with one.
+    if dev is None:
+        _dev: PrePostDevType = Infinity
+
+    else:
+        _dev = dev
+
+    if local is None:
+        # Versions without a local segment should sort before those with one.
+        _local: LocalType = NegativeInfinity
+    else:
+        # Versions with a local segment need that segment parsed to implement
+        # the sorting rules in PEP440.
+        # - Alpha numeric segments sort before numeric segments
+        # - Alpha numeric segments sort lexicographically
+        # - Numeric segments sort numerically
+        # - Shorter versions sort before longer versions when the prefixes
+        #   match exactly
+        _local = tuple(
+            (i, "") if isinstance(i, int) else (NegativeInfinity, i) for i in local
+        )
+
+    return epoch, _release, _pre, _post, _dev, _local
diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py
index 59c3a59df8873..1f70ec9854a54 100644
--- a/sklearn/preprocessing/tests/test_polynomial.py
+++ b/sklearn/preprocessing/tests/test_polynomial.py
@@ -11,9 +11,7 @@
 from sklearn.preprocessing import (
     KBinsDiscretizer, PolynomialFeatures, SplineTransformer
 )
-from sklearn.utils.fixes import linspace, sp_version
-
-from pkg_resources import parse_version
+from sklearn.utils.fixes import linspace, sp_version, parse_version
 
 
 @pytest.mark.parametrize("est", (PolynomialFeatures, SplineTransformer))
diff --git a/sklearn/setup.py b/sklearn/setup.py
index e5d7e6e26b3ab..ae8a929d6b9cb 100644
--- a/sklearn/setup.py
+++ b/sklearn/setup.py
@@ -51,6 +51,7 @@ def configuration(parent_package='', top_path=None):
     config.add_subpackage('_loss/')
     config.add_subpackage('_loss/tests')
     config.add_subpackage('externals')
+    config.add_subpackage('externals/_packaging')
 
     # submodules which have their own setup.py
     config.add_subpackage('cluster')
diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py
index 593e0eb332a99..a5a455ee7b9a1 100644
--- a/sklearn/utils/fixes.py
+++ b/sklearn/utils/fixes.py
@@ -11,7 +11,6 @@
 # License: BSD 3 clause
 
 from functools import update_wrapper
-from distutils.version import LooseVersion
 import functools
 
 import numpy as np
@@ -23,12 +22,7 @@
 from .._config import config_context, get_config
 
 from .deprecation import deprecated
-
-try:
-    from pkg_resources import parse_version  # type: ignore
-except ImportError:
-    # setuptools not installed
-    parse_version = LooseVersion  # type: ignore
+from ..externals._packaging.version import parse as parse_version
 
 
 np_version = parse_version(np.__version__)
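For context, a minimal sketch of how the vendored parser is meant to be used for PEP 440-aware version comparisons (the version strings below are illustrative only; the import path is the one introduced by the diff above):

    from sklearn.externals._packaging.version import parse as parse_version

    # Pre-releases and dev releases sort before the corresponding final release.
    assert parse_version("1.7.0rc1") < parse_version("1.7.0")
    assert parse_version("0.24.dev0") < parse_version("0.24")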

From 0df9efe2c1407f3fb887c22056452c791fd83dc9 Mon Sep 17 00:00:00 2001
From: Helder Geovane Gomes de Lima 
Date: Mon, 19 Apr 2021 18:48:39 -0300
Subject: [PATCH 341/478] DOC Fixes typo in doc/modules/cross_validation.rst
 (#19925)

---
 doc/modules/cross_validation.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst
index 0b090fd7385b6..98b3c41ee5c72 100644
--- a/doc/modules/cross_validation.rst
+++ b/doc/modules/cross_validation.rst
@@ -813,7 +813,7 @@ samples that are part of the validation set, and to -1 for all other samples.
 Using cross-validation iterators to split train and test
 --------------------------------------------------------
 
-The above group cross-validation functions may also be useful for spitting a
+The above group cross-validation functions may also be useful for splitting a
 dataset into training and testing subsets. Note that the convenience
 function :func:`train_test_split` is a wrapper around :func:`ShuffleSplit`
 and thus only allows for stratified splitting (using the class labels)

From dd7b7e5ef950ac026ac44d758af9167eafcc9ee2 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion 
Date: Tue, 20 Apr 2021 11:14:39 +0200
Subject: [PATCH 342/478] MAINT Remove `get_memview_*` helpers in
 `neighbours.BinaryTree` (#19893)

Co-authored-by: Roman Yurchak 
---
 sklearn/neighbors/_ball_tree.pyx    |   2 +-
 sklearn/neighbors/_binary_tree.pxi  | 108 +++++++++-------------------
 sklearn/neighbors/_dist_metrics.pyx |  12 +---
 sklearn/neighbors/_kd_tree.pyx      |   2 +-
 sklearn/neighbors/tests/test_lof.py |   8 +--
 5 files changed, 40 insertions(+), 92 deletions(-)

diff --git a/sklearn/neighbors/_ball_tree.pyx b/sklearn/neighbors/_ball_tree.pyx
index 81ce9606f7b80..16e9407aa72ca 100644
--- a/sklearn/neighbors/_ball_tree.pyx
+++ b/sklearn/neighbors/_ball_tree.pyx
@@ -44,7 +44,7 @@ cdef int allocate_data(BinaryTree tree, ITYPE_t n_nodes,
                        ITYPE_t n_features) except -1:
     """Allocate arrays needed for the KD Tree"""
     tree.node_bounds_arr = np.zeros((1, n_nodes, n_features), dtype=DTYPE)
-    tree.node_bounds = get_memview_DTYPE_3D(tree.node_bounds_arr)
+    tree.node_bounds = tree.node_bounds_arr
     return 0
 
 
diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi
index cabad951c4975..de85ec49166ec 100755
--- a/sklearn/neighbors/_binary_tree.pxi
+++ b/sklearn/neighbors/_binary_tree.pxi
@@ -196,47 +196,6 @@ cdef NodeData_t nd_tmp
 NodeData = np.asarray((&nd_tmp)).dtype
 
 
-######################################################################
-# Numpy 1.3-1.4 compatibility utilities
-cdef DTYPE_t[::1] get_memview_DTYPE_1D(
-                               np.ndarray[DTYPE_t, ndim=1, mode='c'] X):
-    return  ( X.data)
-
-
-cdef DTYPE_t[:, ::1] get_memview_DTYPE_2D(
-                               np.ndarray[DTYPE_t, ndim=2, mode='c'] X):
-    return  ( X.data)
-
-
-cdef DTYPE_t[:, :, ::1] get_memview_DTYPE_3D(
-                               np.ndarray[DTYPE_t, ndim=3, mode='c'] X):
-    return \
-                                                       ( X.data)
-
-
-cdef ITYPE_t[::1] get_memview_ITYPE_1D(
-                               np.ndarray[ITYPE_t, ndim=1, mode='c'] X):
-    return  ( X.data)
-
-
-cdef ITYPE_t[:, ::1] get_memview_ITYPE_2D(
-                               np.ndarray[ITYPE_t, ndim=2, mode='c'] X):
-    return  ( X.data)
-
-
-cdef NodeHeapData_t[::1] get_memview_NodeHeapData_1D(
-                    np.ndarray[NodeHeapData_t, ndim=1, mode='c'] X):
-    return  ( X.data)
-
-
-cdef NodeData_t[::1] get_memview_NodeData_1D(
-                    np.ndarray[NodeData_t, ndim=1, mode='c'] X):
-    return  ( X.data)
-
-######################################################################
-
-
-
 ######################################################################
 # Define doc strings, substituting the appropriate class name using
 # the DOC_DICT variable defined in the pyx files.
@@ -574,15 +533,15 @@ cdef class NeighborsHeap:
     def __cinit__(self):
         self.distances_arr = np.zeros((1, 1), dtype=DTYPE, order='C')
         self.indices_arr = np.zeros((1, 1), dtype=ITYPE, order='C')
-        self.distances = get_memview_DTYPE_2D(self.distances_arr)
-        self.indices = get_memview_ITYPE_2D(self.indices_arr)
+        self.distances = self.distances_arr
+        self.indices = self.indices_arr
 
     def __init__(self, n_pts, n_nbrs):
         self.distances_arr = np.full((n_pts, n_nbrs), np.inf, dtype=DTYPE,
                                      order='C')
         self.indices_arr = np.zeros((n_pts, n_nbrs), dtype=ITYPE, order='C')
-        self.distances = get_memview_DTYPE_2D(self.distances_arr)
-        self.indices = get_memview_ITYPE_2D(self.indices_arr)
+        self.distances = self.distances_arr
+        self.indices = self.indices_arr
 
     def get_arrays(self, sort=True):
         """Get the arrays of distances and indices within the heap.
@@ -806,12 +765,12 @@ cdef class NodeHeap:
 
     def __cinit__(self):
         self.data_arr = np.zeros(1, dtype=NodeHeapData, order='C')
-        self.data = get_memview_NodeHeapData_1D(self.data_arr)
+        self.data = self.data_arr
 
     def __init__(self, size_guess=100):
         size_guess = max(size_guess, 1)  # need space for at least one item
         self.data_arr = np.zeros(size_guess, dtype=NodeHeapData, order='C')
-        self.data = get_memview_NodeHeapData_1D(self.data_arr)
+        self.data = self.data_arr
         self.n = size_guess
         self.clear()
 
@@ -823,8 +782,7 @@ cdef class NodeHeap:
         cdef ITYPE_t size = self.data.shape[0]
         cdef np.ndarray new_data_arr = np.zeros(new_size,
                                                 dtype=NodeHeapData)
-        cdef NodeHeapData_t[::1] new_data =\
-                                    get_memview_NodeHeapData_1D(new_data_arr)
+        cdef NodeHeapData_t[::1] new_data = new_data_arr
 
         if size > 0 and new_size > 0:
             data_ptr = &self.data[0]
@@ -933,8 +891,8 @@ cdef class BinaryTree:
     cdef np.ndarray node_data_arr
     cdef np.ndarray node_bounds_arr
 
-    cdef readonly DTYPE_t[:, ::1] data
-    cdef readonly DTYPE_t[::1] sample_weight
+    cdef readonly const DTYPE_t[:, ::1] data
+    cdef readonly const DTYPE_t[::1] sample_weight
     cdef public DTYPE_t sum_weight
     cdef public ITYPE_t[::1] idx_array
     cdef public NodeData_t[::1] node_data
@@ -964,11 +922,11 @@ cdef class BinaryTree:
         self.node_data_arr = np.empty(1, dtype=NodeData, order='C')
         self.node_bounds_arr = np.empty((1, 1, 1), dtype=DTYPE)
 
-        self.data = get_memview_DTYPE_2D(self.data_arr)
-        self.sample_weight = get_memview_DTYPE_1D(self.sample_weight_arr)
-        self.idx_array = get_memview_ITYPE_1D(self.idx_array_arr)
-        self.node_data = get_memview_NodeData_1D(self.node_data_arr)
-        self.node_bounds = get_memview_DTYPE_3D(self.node_bounds_arr)
+        self.data = self.data_arr
+        self.sample_weight = self.sample_weight_arr
+        self.idx_array = self.idx_array_arr
+        self.node_data = self.node_data_arr
+        self.node_bounds = self.node_bounds_arr
 
         self.leaf_size = 0
         self.n_levels = 0
@@ -1028,8 +986,7 @@ cdef class BinaryTree:
         if sample_weight is not None:
             self.sample_weight_arr = np.asarray(
                 sample_weight, dtype=DTYPE, order='C')
-            self.sample_weight = get_memview_DTYPE_1D(
-                self.sample_weight_arr)
+            self.sample_weight = self.sample_weight_arr
             self.sum_weight = np.sum(self.sample_weight)
         else:
             self.sample_weight = None
@@ -1037,10 +994,10 @@ cdef class BinaryTree:
             self.sum_weight =  n_samples
 
     def _update_memviews(self):
-        self.data = get_memview_DTYPE_2D(self.data_arr)
-        self.idx_array = get_memview_ITYPE_1D(self.idx_array_arr)
-        self.node_data = get_memview_NodeData_1D(self.node_data_arr)
-        self.node_bounds = get_memview_DTYPE_3D(self.node_bounds_arr)
+        self.data = self.data_arr
+        self.idx_array = self.idx_array_arr
+        self.node_data = self.node_data_arr
+        self.node_bounds = self.node_bounds_arr
 
 
     def __reduce__(self):
@@ -1279,7 +1236,7 @@ cdef class BinaryTree:
 
         # flatten X, and save original shape information
         np_Xarr = X.reshape((-1, self.data.shape[1]))
-        cdef DTYPE_t[:, ::1] Xarr = get_memview_DTYPE_2D(np_Xarr)
+        cdef const DTYPE_t[:, ::1] Xarr = np_Xarr
         cdef DTYPE_t reduced_dist_LB
         cdef ITYPE_t i
         cdef DTYPE_t* pt
@@ -1410,8 +1367,7 @@ cdef class BinaryTree:
             raise ValueError("query data dimension must "
                              "match training data dimension")
 
-        cdef DTYPE_t[:, ::1] Xarr =\
-                get_memview_DTYPE_2D(X.reshape((-1, self.data.shape[1])))
+        cdef const DTYPE_t[:, ::1] Xarr = X.reshape((-1, self.data.shape[1]))
 
         # prepare r for query
         r = np.asarray(r, dtype=DTYPE, order='C')
@@ -1423,7 +1379,7 @@ cdef class BinaryTree:
                 raise ValueError("r must be broadcastable to X.shape")
 
         rarr_np = r.reshape(-1)  # store explicitly to keep in scope
-        cdef DTYPE_t[::1] rarr = get_memview_DTYPE_1D(rarr_np)
+        cdef DTYPE_t[::1] rarr = rarr_np
 
         if not count_only:
             indices = calloc(Xarr.shape[0], sizeof(ITYPE_t*))
@@ -1436,13 +1392,13 @@ cdef class BinaryTree:
                     raise MemoryError()
 
         np_idx_arr = np.zeros(self.data.shape[0], dtype=ITYPE)
-        idx_arr_i = get_memview_ITYPE_1D(np_idx_arr)
+        idx_arr_i = np_idx_arr
 
         np_dist_arr = np.zeros(self.data.shape[0], dtype=DTYPE)
-        dist_arr_i = get_memview_DTYPE_1D(np_dist_arr)
+        dist_arr_i = np_dist_arr
 
         counts_arr = np.zeros(Xarr.shape[0], dtype=ITYPE)
-        counts = get_memview_ITYPE_1D(counts_arr)
+        counts = counts_arr
 
         pt = &Xarr[0, 0]
         memory_error = False
@@ -1609,10 +1565,10 @@ cdef class BinaryTree:
             raise ValueError("query data dimension must "
                              "match training data dimension")
         Xarr_np = X.reshape((-1, n_features))
-        cdef DTYPE_t[:, ::1] Xarr = get_memview_DTYPE_2D(Xarr_np)
+        cdef DTYPE_t[:, ::1] Xarr = Xarr_np
 
         log_density_arr = np.zeros(Xarr.shape[0], dtype=DTYPE)
-        cdef DTYPE_t[::1] log_density = get_memview_DTYPE_1D(log_density_arr)
+        cdef DTYPE_t[::1] log_density = log_density_arr
 
         cdef DTYPE_t* pt = &Xarr[0, 0]
 
@@ -1626,9 +1582,9 @@ cdef class BinaryTree:
         #       computed between node pairs.
         if breadth_first:
             node_log_min_bounds_arr = np.full(self.n_nodes, -np.inf)
-            node_log_min_bounds = get_memview_DTYPE_1D(node_log_min_bounds_arr)
+            node_log_min_bounds = node_log_min_bounds_arr
             node_bound_widths_arr = np.zeros(self.n_nodes)
-            node_bound_widths = get_memview_DTYPE_1D(node_bound_widths_arr)
+            node_bound_widths = node_bound_widths_arr
             for i in range(Xarr.shape[0]):
                 log_density[i] = self._kde_single_breadthfirst(
                                             pt, kernel_c, h_c,
@@ -1704,7 +1660,7 @@ cdef class BinaryTree:
                              "match training data dimension")
 
         np_Xarr = X.reshape((-1, self.data.shape[1]))
-        cdef DTYPE_t[:, ::1] Xarr = get_memview_DTYPE_2D(np_Xarr)
+        cdef DTYPE_t[:, ::1] Xarr = np_Xarr
 
         # prepare r for query
         r = np.asarray(r, dtype=DTYPE, order='C')
@@ -1713,11 +1669,11 @@ cdef class BinaryTree:
             raise ValueError("r must be a 1-dimensional array")
         i_rsort = np.argsort(r)
         rarr_np = r[i_rsort]  # needed to keep memory in scope
-        cdef DTYPE_t[::1] rarr = get_memview_DTYPE_1D(rarr_np)
+        cdef DTYPE_t[::1] rarr = rarr_np
 
         # create array to hold counts
         count = np.zeros(r.shape[0], dtype=ITYPE)
-        cdef ITYPE_t[::1] carr = get_memview_ITYPE_1D(count)
+        cdef ITYPE_t[::1] carr = count
 
         cdef DTYPE_t* pt = &Xarr[0, 0]
 
diff --git a/sklearn/neighbors/_dist_metrics.pyx b/sklearn/neighbors/_dist_metrics.pyx
index 398591bcdf49f..cf0c703a5d491 100755
--- a/sklearn/neighbors/_dist_metrics.pyx
+++ b/sklearn/neighbors/_dist_metrics.pyx
@@ -14,11 +14,6 @@ np.import_array()  # required in order to use C-API
 
 ######################################################################
 # Numpy 1.3-1.4 compatibility utilities
-cdef DTYPE_t[:, ::1] get_memview_DTYPE_2D(
-                               np.ndarray[DTYPE_t, ndim=2, mode='c'] X):
-    return  ( X.data)
-
-
 cdef DTYPE_t* get_vec_ptr(np.ndarray[DTYPE_t, ndim=1, mode='c'] vec):
     return &vec[0]
 
@@ -398,16 +393,13 @@ cdef class DistanceMetric:
         if Y is None:
             Darr = np.zeros((Xarr.shape[0], Xarr.shape[0]),
                          dtype=DTYPE, order='C')
-            self.pdist(get_memview_DTYPE_2D(Xarr),
-                       get_memview_DTYPE_2D(Darr))
+            self.pdist(Xarr, Darr)
         else:
             Yarr = np.asarray(Y, dtype=DTYPE, order='C')
             self._validate_data(Yarr)
             Darr = np.zeros((Xarr.shape[0], Yarr.shape[0]),
                          dtype=DTYPE, order='C')
-            self.cdist(get_memview_DTYPE_2D(Xarr),
-                       get_memview_DTYPE_2D(Yarr),
-                       get_memview_DTYPE_2D(Darr))
+            self.cdist(Xarr, Yarr, Darr)
         return Darr
 
 
diff --git a/sklearn/neighbors/_kd_tree.pyx b/sklearn/neighbors/_kd_tree.pyx
index bc1ab764a6fcf..175b61962da99 100644
--- a/sklearn/neighbors/_kd_tree.pyx
+++ b/sklearn/neighbors/_kd_tree.pyx
@@ -38,7 +38,7 @@ cdef int allocate_data(BinaryTree tree, ITYPE_t n_nodes,
                        ITYPE_t n_features) except -1:
     """Allocate arrays needed for the KD Tree"""
     tree.node_bounds_arr = np.zeros((2, n_nodes, n_features), dtype=DTYPE)
-    tree.node_bounds = get_memview_DTYPE_3D(tree.node_bounds_arr)
+    tree.node_bounds = tree.node_bounds_arr
     return 0
 
 
diff --git a/sklearn/neighbors/tests/test_lof.py b/sklearn/neighbors/tests/test_lof.py
index 5d479d5b141f7..ec67bddae29e8 100644
--- a/sklearn/neighbors/tests/test_lof.py
+++ b/sklearn/neighbors/tests/test_lof.py
@@ -15,8 +15,8 @@
 
 from sklearn.utils import check_random_state
 from sklearn.utils._testing import assert_array_almost_equal
-from sklearn.utils.estimator_checks import check_estimator
 from sklearn.utils.estimator_checks import check_outlier_corruption
+from sklearn.utils.estimator_checks import parametrize_with_checks
 
 from sklearn.datasets import load_iris
 
@@ -208,11 +208,11 @@ def test_hasattr_prediction():
     assert not hasattr(clf, 'score_samples')
 
 
-def test_novelty_true_common_tests():
-
+@parametrize_with_checks([neighbors.LocalOutlierFactor(novelty=True)])
+def test_novelty_true_common_tests(estimator, check):
     # the common tests are run for the default LOF (novelty=False).
     # here we run these common tests for LOF when novelty=True
-    check_estimator(neighbors.LocalOutlierFactor(novelty=True))
+    check(estimator)
 
 
 @pytest.mark.parametrize('expected_outliers', [30, 53])
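As a side note, the test update above replaces a single `check_estimator` call with pytest parametrization, so every common estimator check reports as its own test case. A minimal sketch of that pattern (the test name here is illustrative, not the one used in the suite):

    from sklearn.neighbors import LocalOutlierFactor
    from sklearn.utils.estimator_checks import parametrize_with_checks

    @parametrize_with_checks([LocalOutlierFactor(novelty=True)])
    def test_lof_novelty_true_common_checks(estimator, check):
        # pytest generates one test per (estimator, check) pair
        check(estimator)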

From 9b7ff272534f130893e95933db46a3ff295190b2 Mon Sep 17 00:00:00 2001
From: Bharat Raghunathan 
Date: Tue, 20 Apr 2021 15:11:17 +0530
Subject: [PATCH 343/478] DOC improve learning-rate AdaBoost estimator (#19919)

---
 sklearn/ensemble/_weight_boosting.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py
index d5354232a4385..92c5e15731d63 100644
--- a/sklearn/ensemble/_weight_boosting.py
+++ b/sklearn/ensemble/_weight_boosting.py
@@ -313,9 +313,9 @@ class AdaBoostClassifier(ClassifierMixin, BaseWeightBoosting):
         In case of perfect fit, the learning procedure is stopped early.
 
     learning_rate : float, default=1.
-        Learning rate shrinks the contribution of each classifier by
-        ``learning_rate``. There is a trade-off between ``learning_rate`` and
-        ``n_estimators``.
+        Weight applied to each classifier at each boosting iteration. A higher
+        learning rate increases the contribution of each classifier. There is
+        a trade-off between the `learning_rate` and `n_estimators` parameters.
 
     algorithm : {'SAMME', 'SAMME.R'}, default='SAMME.R'
         If 'SAMME.R' then use the SAMME.R real boosting algorithm.
@@ -898,9 +898,9 @@ class AdaBoostRegressor(RegressorMixin, BaseWeightBoosting):
         In case of perfect fit, the learning procedure is stopped early.
 
     learning_rate : float, default=1.
-        Learning rate shrinks the contribution of each regressor by
-        ``learning_rate``. There is a trade-off between ``learning_rate`` and
-        ``n_estimators``.
+        Weight applied to each regressor at each boosting iteration. A higher
+        learning rate increases the contribution of each regressor. There is
+        a trade-off between the `learning_rate` and `n_estimators` parameters.
 
     loss : {'linear', 'square', 'exponential'}, default='linear'
         The loss function to use when updating the weights after each
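To make the documented trade-off concrete, a small illustrative sketch (the dataset and parameter values are arbitrary and not prescribed by the patch):

    from sklearn.datasets import make_classification
    from sklearn.ensemble import AdaBoostClassifier

    X, y = make_classification(random_state=0)
    # A smaller learning_rate shrinks each stage's weight, so more estimators
    # are typically needed to reach a comparable fit.
    slow = AdaBoostClassifier(learning_rate=0.1, n_estimators=200).fit(X, y)
    fast = AdaBoostClassifier(learning_rate=1.0, n_estimators=50).fit(X, y)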

From 9099c796399e5c9653d5d307b4ad8a46047b1cdb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= 
Date: Tue, 20 Apr 2021 15:57:42 +0200
Subject: [PATCH 344/478] EXA Fix plot_map_data_to_normal.py example legend
 (#19930)

Lambda was meant to be on a different line.
---
 examples/preprocessing/plot_map_data_to_normal.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/preprocessing/plot_map_data_to_normal.py b/examples/preprocessing/plot_map_data_to_normal.py
index ff465df78b0df..581ca20a83a42 100644
--- a/examples/preprocessing/plot_map_data_to_normal.py
+++ b/examples/preprocessing/plot_map_data_to_normal.py
@@ -132,7 +132,7 @@
         ax.hist(X_trans, color=color, bins=BINS)
         title = 'After {}'.format(meth_name)
         if lmbda is not None:
-            title += r'\n$\lambda$ = {}'.format(lmbda)
+            title += '\n$\\lambda$ = {}'.format(lmbda)
         ax.set_title(title, fontsize=FONT_SIZE)
         ax.tick_params(axis='both', which='major', labelsize=FONT_SIZE)
         ax.set_xlim([-3.5, 3.5])
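The underlying gotcha, reduced to a self-contained sketch (variable names are illustrative): in a raw string the `\n` never becomes a newline, which is why the legend was rendered on one line.

    broken = r'\n$\lambda$ = 1'    # backslash and 'n' stay literal characters
    fixed = '\n$\\lambda$ = 1'     # a real line break, as the title intended
    assert '\n' not in broken and fixed.startswith('\n')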

From ce217db386aaddfa5b5dde3fe47d42a1964120a0 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" 
Date: Tue, 20 Apr 2021 11:22:13 -0400
Subject: [PATCH 345/478] FIX Fixes memory view bug in distance metrics
 (#19933)

---
 sklearn/neighbors/_dist_metrics.pxd | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/neighbors/_dist_metrics.pxd b/sklearn/neighbors/_dist_metrics.pxd
index 856d5bb2dde5b..30124c309bc49 100644
--- a/sklearn/neighbors/_dist_metrics.pxd
+++ b/sklearn/neighbors/_dist_metrics.pxd
@@ -67,9 +67,9 @@ cdef class DistanceMetric:
     cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2,
                        ITYPE_t size) nogil except -1
 
-    cdef int pdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1
+    cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1
 
-    cdef int cdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] Y,
+    cdef int cdist(self, const DTYPE_t[:, ::1] X, const DTYPE_t[:, ::1] Y,
                    DTYPE_t[:, ::1] D) except -1
 
     cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1

From 7ddd6e5d34911346afe6839c16fc06fc820fc013 Mon Sep 17 00:00:00 2001
From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com>
Date: Tue, 20 Apr 2021 19:39:19 +0000
Subject: [PATCH 346/478] KBinsDiscretizer efficiency improvement to 'kmeans'
 strategy (#19934)

* efficiency improvement

* update doc

* lint

* lint
---
 doc/whats_new/v1.0.rst                   | 4 ++++
 sklearn/preprocessing/_discretization.py | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index a78cbe69b746d..8a2351b04ecc2 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -386,6 +386,10 @@ Changelog
   supporting sparse matrix and raise the appropriate error message.
   :pr:`19879` by :user:`Guillaume Lemaitre `.
 
+- |Efficiency| Changed the ``algorithm`` argument of the :class:`cluster.KMeans`
+  used internally by :class:`preprocessing.KBinsDiscretizer` from ``auto`` to
+  ``full``. :pr:`19934` by :user:`Gleb Levitskiy `.
+
 :mod:`sklearn.tree`
 ...................
 
diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py
index 22fa236f3314e..9ce95a97544a5 100644
--- a/sklearn/preprocessing/_discretization.py
+++ b/sklearn/preprocessing/_discretization.py
@@ -205,7 +205,8 @@ def fit(self, X, y=None):
                 init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5
 
                 # 1D k-means procedure
-                km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)
+                km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1,
+                            algorithm='full')
                 centers = km.fit(column[:, None]).cluster_centers_[:, 0]
                 # Must sort, centers may be unsorted even with sorted init
                 centers.sort()
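For reference, a minimal usage sketch of the affected code path (the data is synthetic); the change above only switches the KMeans variant used internally for the per-feature 1-D clustering:

    import numpy as np
    from sklearn.preprocessing import KBinsDiscretizer

    X = np.random.RandomState(0).normal(size=(100, 1))
    est = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='kmeans')
    Xt = est.fit_transform(X)           # bin edges come from 1-D k-means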

From 1a601c0f553b5cd097c086c1f4fa12bd5afaed9c Mon Sep 17 00:00:00 2001
From: TFiFiE 
Date: Tue, 20 Apr 2021 23:05:14 +0200
Subject: [PATCH 347/478] DOC Remove misleading "linear kernel" statements
 (#19937)

---
 sklearn/svm/_classes.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py
index c402779f4eeb6..674fa294dcf3c 100644
--- a/sklearn/svm/_classes.py
+++ b/sklearn/svm/_classes.py
@@ -105,7 +105,7 @@ class LinearSVC(LinearClassifierMixin,
     coef_ : ndarray of shape (1, n_features) if n_classes == 2 \
             else (n_classes, n_features)
         Weights assigned to the features (coefficients in the primal
-        problem). This is only available in the case of a linear kernel.
+        problem).
 
         ``coef_`` is a readonly property derived from ``raw_coef_`` that
         follows the internal memory layout of liblinear.
@@ -326,7 +326,7 @@ class LinearSVR(RegressorMixin, LinearModel):
     coef_ : ndarray of shape (n_features) if n_classes == 2 \
             else (n_classes, n_features)
         Weights assigned to the features (coefficients in the primal
-        problem). This is only available in the case of a linear kernel.
+        problem).
 
         `coef_` is a readonly property derived from `raw_coef_` that
         follows the internal memory layout of liblinear.

From 4946bfcd72ab45e821eadbb260b7187116f7c1ae Mon Sep 17 00:00:00 2001
From: Kei Ishikawa <30857855+kstoneriv3@users.noreply.github.com>
Date: Tue, 20 Apr 2021 23:12:33 +0200
Subject: [PATCH 348/478] FIX fix a bug in KernelPCA.inverse_transform (#19732)

Co-authored-by: Olivier Grisel 
---
 doc/whats_new/v0.24.rst                       |  6 +++++
 sklearn/decomposition/_kernel_pca.py          |  2 --
 .../decomposition/tests/test_kernel_pca.py    | 23 +++++++++++--------
 3 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index 880d1879637ed..81953db29efa3 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -26,6 +26,12 @@ Changelog
   with `sample_weight` parameter and `least_absolute_deviation` loss function.
   :pr:`19407` by :user:`Vadim Ushtanit `.
 
+:mod:`sklearn.decomposition`
+............................
+
+- |Fix| Fixed a bug in :class:`decomposition.KernelPCA`'s
+  ``inverse_transform``.  :pr:`19732` by :user:`Kei Ishikawa `.
+
 :mod:`sklearn.linear_model`
 ...........................
 
diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py
index 5655eddb0bf31..7ea1d118e4391 100644
--- a/sklearn/decomposition/_kernel_pca.py
+++ b/sklearn/decomposition/_kernel_pca.py
@@ -364,8 +364,6 @@ def inverse_transform(self, X):
                                  "the inverse transform is not available.")
 
         K = self._get_kernel(X, self.X_transformed_fit_)
-        n_samples = self.X_transformed_fit_.shape[0]
-        K.flat[::n_samples + 1] += self.alpha
         return np.dot(K, self.dual_coef_)
 
     def _more_tags(self):
diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py
index 2acccb0df6781..adf68f1db1a6c 100644
--- a/sklearn/decomposition/tests/test_kernel_pca.py
+++ b/sklearn/decomposition/tests/test_kernel_pca.py
@@ -286,16 +286,19 @@ def test_kernel_conditioning():
     assert np.all(kpca.lambdas_ == _check_psd_eigenvalues(kpca.lambdas_))
 
 
-@pytest.mark.parametrize("kernel",
-                         ["linear", "poly", "rbf", "sigmoid", "cosine"])
-def test_kernel_pca_inverse_transform(kernel):
-    X, *_ = make_blobs(n_samples=100, n_features=4, centers=[[1, 1, 1, 1]],
-                       random_state=0)
-
-    kp = KernelPCA(n_components=2, kernel=kernel, fit_inverse_transform=True)
-    X_trans = kp.fit_transform(X)
-    X_inv = kp.inverse_transform(X_trans)
-    assert_allclose(X, X_inv)
+def test_kernel_pca_inverse_transform_reconstruction():
+    # Test if the reconstruction is a good approximation.
+    # Note that in general it is not possible to get an arbitrarily good
+    # reconstruction because of kernel centering that does not
+    # preserve all the information of the original data.
+    X, *_ = make_blobs(n_samples=100, n_features=4, random_state=0)
+
+    kpca = KernelPCA(
+        n_components=20, kernel='rbf', fit_inverse_transform=True, alpha=1e-3
+    )
+    X_trans = kpca.fit_transform(X)
+    X_reconst = kpca.inverse_transform(X_trans)
+    assert np.linalg.norm(X - X_reconst) / np.linalg.norm(X) < 1e-1
 
 
 def test_32_64_decomposition_shape():
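A short end-to-end sketch of the corrected code path, mirroring the test above (the parameter values are those of the test and are not canonical):

    from sklearn.datasets import make_blobs
    from sklearn.decomposition import KernelPCA

    X, _ = make_blobs(n_samples=100, n_features=4, random_state=0)
    kpca = KernelPCA(n_components=20, kernel='rbf',
                     fit_inverse_transform=True, alpha=1e-3)
    X_low = kpca.fit_transform(X)
    X_back = kpca.inverse_transform(X_low)   # approximate pre-images of X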

From 0bd7cedababab7bd70ebacb31d46eccd2371a3bd Mon Sep 17 00:00:00 2001
From: Kei Ishikawa <30857855+kstoneriv3@users.noreply.github.com>
Date: Tue, 20 Apr 2021 23:21:54 +0200
Subject: [PATCH 349/478] ENH Enrich docstring on `inverse_transform` of
 `KernelPCA` (#19910)

---
 sklearn/decomposition/_kernel_pca.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py
index 7ea1d118e4391..415ee034c1769 100644
--- a/sklearn/decomposition/_kernel_pca.py
+++ b/sklearn/decomposition/_kernel_pca.py
@@ -346,6 +346,26 @@ def transform(self, X):
     def inverse_transform(self, X):
         """Transform X back to original space.
 
+        ``inverse_transform`` approximates the inverse transformation using
+        a learned pre-image. The pre-image is learned by kernel ridge
+        regression of the original data on their low-dimensional representation
+        vectors.
+
+        .. note::
+            :meth:`~sklearn.decomposition.KernelPCA.fit` internally uses a centered
+            kernel. As the centered kernel no longer contains the information
+            of the mean of kernel features, such information is not taken into
+            account in reconstruction.
+
+        .. note::
+            When users want to compute inverse transformation for 'linear'
+            kernel, it is recommended that they use
+            :class:`~sklearn.decomposition.PCA` instead. Unlike
+            :class:`~sklearn.decomposition.PCA`,
+            :class:`~sklearn.decomposition.KernelPCA`'s ``inverse_transform``
+            does not reconstruct the mean of data when 'linear' kernel is used
+            due to the use of centered kernel.
+
         Parameters
         ----------
         X : {array-like, sparse matrix} of shape (n_samples, n_components)
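To see the documented difference in practice, a small comparison sketch (shapes and values chosen arbitrarily); as the note above explains, only PCA adds the data mean back in:

    from sklearn.datasets import make_blobs
    from sklearn.decomposition import PCA, KernelPCA

    X, _ = make_blobs(n_samples=50, n_features=3, random_state=0)
    pca = PCA(n_components=2).fit(X)
    kpca = KernelPCA(n_components=2, kernel='linear',
                     fit_inverse_transform=True).fit(X)
    X_pca = pca.inverse_transform(pca.transform(X))     # mean restored
    X_kpca = kpca.inverse_transform(kpca.transform(X))  # centered reconstruction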

From 004b44d007408aa2db1fdaf4428990d0d7b7f85a Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" 
Date: Tue, 20 Apr 2021 17:29:02 -0400
Subject: [PATCH 350/478] FIX OneHotEncoder.fit no longer alters the drop
 parameter (#19924)

---
 doc/whats_new/v0.24.rst                      |  3 +++
 doc/whats_new/v1.0.rst                       |  2 +-
 sklearn/preprocessing/_encoders.py           | 11 +++++------
 sklearn/preprocessing/tests/test_encoders.py |  2 ++
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index 81953db29efa3..5b0b753f0f294 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -62,6 +62,9 @@ Changelog
 - |Fix| :meth:`preprocessing.OrdinalEncoder.transfrom` correctly handles
   unknown values for string dtypes. :pr:`19888` by `Thomas Fan`_.
 
+- |Fix| :meth:`preprocessing.OneHotEncoder.fit` no longer alters the `drop`
+  parameter. :pr:`19924` by `Thomas Fan`_.
+
 :mod:`sklearn.multioutput`
 ..........................
 
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 8a2351b04ecc2..458bb8dfff8a0 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -105,7 +105,7 @@ Changelog
 - |Fix| Improved convergence detection based on center change in
   :class:`cluster.MiniBatchKMeans` which was almost never achievable.
   :pr:`17622` by :user:`Jérémie du Boisberranger `.
-  
+
 - |FIX| :class:`cluster.AgglomerativeClustering` now supports readonly
   memory-mapped datasets. :pr:`19883` by `Julien Jerphanion `.
 
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 7c62cbdcbc565..36ca74ac09cdb 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -363,22 +363,21 @@ def _compute_drop_idx(self):
 
         else:
             try:
-                self.drop = np.asarray(self.drop, dtype=object)
-                droplen = len(self.drop)
+                drop_array = np.asarray(self.drop, dtype=object)
+                droplen = len(drop_array)
             except (ValueError, TypeError):
                 msg = (
                     "Wrong input for parameter `drop`. Expected "
                     "'first', 'if_binary', None or array of objects, got {}"
                     )
-                raise ValueError(msg.format(type(self.drop)))
+                raise ValueError(msg.format(type(drop_array)))
             if droplen != len(self.categories_):
                 msg = ("`drop` should have length equal to the number "
                        "of features ({}), got {}")
-                raise ValueError(msg.format(len(self.categories_),
-                                            len(self.drop)))
+                raise ValueError(msg.format(len(self.categories_), droplen))
             missing_drops = []
             drop_indices = []
-            for col_idx, (val, cat_list) in enumerate(zip(self.drop,
+            for col_idx, (val, cat_list) in enumerate(zip(drop_array,
                                                           self.categories_)):
                 if not is_scalar_nan(val):
                     drop_idx = np.where(cat_list == val)[0]
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 94e2c276dcd58..72fa46544b198 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -748,6 +748,8 @@ def test_one_hot_encoder_drop_manual(missing_value):
            [0, 1, 0, 1, 1],
            [0, 0, 0, 0, 0]]
     assert_array_equal(trans, exp)
+    assert enc.drop is cats_to_drop
+
     dropped_cats = [cat[feature]
                     for cat, feature in zip(enc.categories_,
                                             enc.drop_idx_)]
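A minimal sketch of the behaviour being asserted (the toy categories are illustrative): after the fix, `fit` leaves the user-supplied `drop` object untouched and stores the resolved indices in `drop_idx_` instead.

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = np.array([['a', 'x'], ['b', 'y'], ['a', 'y']], dtype=object)
    cats_to_drop = ['b', 'x']
    enc = OneHotEncoder(drop=cats_to_drop).fit(X)
    assert enc.drop is cats_to_drop          # not overwritten by fit
    print(enc.drop_idx_)                     # resolved per-feature indices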

From d5ebdca662b0c43c283009e3aeb5bc270c6100a2 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion 
Date: Wed, 21 Apr 2021 10:49:18 +0200
Subject: [PATCH 351/478] [MRG] Refactor `feature_selection.f_regression` and
 introduce `feature_selection.r_regression` (#17169)

Co-authored-by: Dmytro S Lituiev 
Co-authored-by: Chiara Marmo 
Co-authored-by: Guillaume Lemaitre 
Co-authored-by: Olivier Grisel 
---
 doc/modules/classes.rst                       |   1 +
 doc/whats_new/v1.0.rst                        |   8 +
 sklearn/feature_selection/__init__.py         |   2 +
 .../_univariate_selection.py                  | 149 ++++++++++++------
 .../tests/test_feature_select.py              |  59 +++++--
 5 files changed, 160 insertions(+), 59 deletions(-)

diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 45195dcedec64..5462e06f81214 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -560,6 +560,7 @@ From text
    feature_selection.chi2
    feature_selection.f_classif
    feature_selection.f_regression
+   feature_selection.r_regression
    feature_selection.mutual_info_classif
    feature_selection.mutual_info_regression
 
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 458bb8dfff8a0..270ae456b5213 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -204,6 +204,14 @@ Changelog
   input strings would result in negative indices in the transformed data.
   :pr:`19035` by :user:`Liu Yu `.
 
+:mod:`sklearn.feature_selection`
+................................
+
+- |Feature| :func:`feature_selection.r_regression` computes Pearson's R
+  correlation coefficients between the features and the target.
+  :pr:`17169` by `Dmytro Lituiev `
+  and `Julien Jerphanion `.
+
 :mod:`sklearn.inspection`
 .........................
 
diff --git a/sklearn/feature_selection/__init__.py b/sklearn/feature_selection/__init__.py
index 86e8a2af39084..ef894b40065de 100644
--- a/sklearn/feature_selection/__init__.py
+++ b/sklearn/feature_selection/__init__.py
@@ -8,6 +8,7 @@
 from ._univariate_selection import f_classif
 from ._univariate_selection import f_oneway
 from ._univariate_selection import f_regression
+from ._univariate_selection import r_regression
 from ._univariate_selection import SelectPercentile
 from ._univariate_selection import SelectKBest
 from ._univariate_selection import SelectFpr
@@ -44,6 +45,7 @@
            'f_classif',
            'f_oneway',
            'f_regression',
+           'r_regression',
            'mutual_info_classif',
            'mutual_info_regression',
            'SelectorMixin']
diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py
index be3298387f612..d9db03e479163 100644
--- a/sklearn/feature_selection/_univariate_selection.py
+++ b/sklearn/feature_selection/_univariate_selection.py
@@ -230,60 +230,53 @@ def chi2(X, y):
     return _chisquare(observed, expected)
 
 
-@_deprecate_positional_args
-def f_regression(X, y, *, center=True):
-    """Univariate linear regression tests.
+def r_regression(X, y, *, center=True):
+    """Compute Pearson's r for each feature and the target.
+
+    Pearson's r is also known as the Pearson correlation coefficient.
+
+    .. versionadded:: 1.0
 
     Linear model for testing the individual effect of each of many regressors.
     This is a scoring function to be used in a feature selection procedure, not
     a free standing feature selection procedure.
 
-    This is done in 2 steps:
-
-    1. The correlation between each regressor and the target is computed,
-       that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *
-       std(y)).
-    2. It is converted to an F score then to a p-value.
+    The cross correlation between each regressor and the target is computed
+    as ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) * std(y)).
 
     For more on usage see the :ref:`User Guide `.
 
     Parameters
     ----------
-    X : {array-like, sparse matrix}  shape = (n_samples, n_features)
-        The set of regressors that will be tested sequentially.
+    X : {array-like, sparse matrix} of shape (n_samples, n_features)
+        The data matrix.
 
-    y : array of shape(n_samples).
-        The data matrix
+    y : array-like of shape (n_samples,)
+        The target vector.
 
     center : bool, default=True
-        If true, X and y will be centered.
+        Whether or not to center the data matrix `X` and the target vector `y`.
+        By default, `X` and `y` will be centered.
 
     Returns
     -------
-    F : array, shape=(n_features,)
-        F values of features.
-
-    pval : array, shape=(n_features,)
-        p-values of F-scores.
+    correlation_coefficient : ndarray of shape (n_features,)
+        Pearson's R correlation coefficients of features.
 
     See Also
     --------
-    mutual_info_regression : Mutual information for a continuous target.
-    f_classif : ANOVA F-value between label/feature for classification tasks.
-    chi2 : Chi-squared stats of non-negative features for classification tasks.
-    SelectKBest : Select features based on the k highest scores.
-    SelectFpr : Select features based on a false positive rate test.
-    SelectFdr : Select features based on an estimated false discovery rate.
-    SelectFwe : Select features based on family-wise error rate.
-    SelectPercentile : Select features based on percentile of the highest
-        scores.
+    f_regression: Univariate linear regression tests returning F-statistic
+        and p-values.
+    mutual_info_regression: Mutual information for a continuous target.
+    f_classif: ANOVA F-value between label/feature for classification tasks.
+    chi2: Chi-squared stats of non-negative features for classification tasks.
     """
     X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                      dtype=np.float64)
     n_samples = X.shape[0]
 
-    # compute centered values
-    # note that E[(x - mean(x))*(y - mean(y))] = E[x*(y - mean(y))], so we
+    # Compute centered values
+    # Note that E[(x - mean(x))*(y - mean(y))] = E[x*(y - mean(y))], so we
     # need not center X
     if center:
         y = y - np.mean(y)
@@ -291,22 +284,86 @@ def f_regression(X, y, *, center=True):
             X_means = X.mean(axis=0).getA1()
         else:
             X_means = X.mean(axis=0)
-        # compute the scaled standard deviations via moments
+        # Compute the scaled standard deviations via moments
         X_norms = np.sqrt(row_norms(X.T, squared=True) -
                           n_samples * X_means ** 2)
     else:
         X_norms = row_norms(X.T)
 
-    # compute the correlation
-    corr = safe_sparse_dot(y, X)
-    corr /= X_norms
-    corr /= np.linalg.norm(y)
+    correlation_coefficient = safe_sparse_dot(y, X)
+    correlation_coefficient /= X_norms
+    correlation_coefficient /= np.linalg.norm(y)
+    return correlation_coefficient
+
+
+@_deprecate_positional_args
+def f_regression(X, y, *, center=True):
+    """Univariate linear regression tests returning F-statistic and p-values.
+
+    Quick linear model for testing the effect of a single regressor,
+    sequentially for many regressors.
+
+    This is done in 2 steps:
+
+    1. The cross correlation between each regressor and the target is computed,
+       that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *
+       std(y)) using the :func:`r_regression` function.
+    2. It is converted to an F score and then to a p-value.
+
+    :func:`f_regression` is derived from :func:`r_regression` and will rank
+    features in the same order if all the features are positively correlated
+    with the target.
+
+    Note however that contrary to :func:`f_regression`, :func:`r_regression`
+    values lie in [-1, 1] and can thus be negative. :func:`f_regression` is
+    therefore recommended as a feature selection criterion to identify
+    potentially predictive features for a downstream classifier, irrespective
+    of the sign of the association with the target variable.
 
-    # convert to p-value
-    degrees_of_freedom = y.size - (2 if center else 1)
-    F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
-    pv = stats.f.sf(F, 1, degrees_of_freedom)
-    return F, pv
+    Furthermore :func:`f_regression` returns p-values while
+    :func:`r_regression` does not.
+
+    Read more in the :ref:`User Guide `.
+
+    Parameters
+    ----------
+    X : {array-like, sparse matrix} of shape (n_samples, n_features)
+        The data matrix.
+
+    y : array-like of shape (n_samples,)
+        The target vector.
+
+    center : bool, default=True
+        Whether or not to center the data matrix `X` and the target vector `y`.
+        By default, `X` and `y` will be centered.
+
+    Returns
+    -------
+    f_statistic : ndarray of shape (n_features,)
+        F-statistic for each feature.
+
+    p_values : ndarray of shape (n_features,)
+        P-values associated with the F-statistic.
+
+    See Also
+    --------
+    r_regression: Pearson's R between label/feature for regression tasks.
+    f_classif: ANOVA F-value between label/feature for classification tasks.
+    chi2: Chi-squared stats of non-negative features for classification tasks.
+    SelectKBest: Select features based on the k highest scores.
+    SelectFpr: Select features based on a false positive rate test.
+    SelectFdr: Select features based on an estimated false discovery rate.
+    SelectFwe: Select features based on family-wise error rate.
+    SelectPercentile: Select features based on percentile of the highest
+        scores.
+    """
+    correlation_coefficient = r_regression(X, y, center=center)
+    deg_of_freedom = y.size - (2 if center else 1)
+
+    corr_coef_squared = correlation_coefficient ** 2
+    f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom
+    p_values = stats.f.sf(f_statistic, 1, deg_of_freedom)
+    return f_statistic, p_values
 
 
 ######################################################################
@@ -503,12 +560,12 @@ class SelectKBest(_BaseFilter):
 
     See Also
     --------
-    f_classif : ANOVA F-value between label/feature for classification tasks.
-    mutual_info_classif : Mutual information for a discrete target.
-    chi2 : Chi-squared stats of non-negative features for classification tasks.
-    f_regression : F-value between label/feature for regression tasks.
-    mutual_info_regression : Mutual information for a continuous target.
-    SelectPercentile : Select features based on percentile of the highest
+    f_classif: ANOVA F-value between label/feature for classification tasks.
+    mutual_info_classif: Mutual information for a discrete target.
+    chi2: Chi-squared stats of non-negative features for classification tasks.
+    f_regression: F-value between label/feature for regression tasks.
+    mutual_info_regression: Mutual information for a continuous target.
+    SelectPercentile: Select features based on percentile of the highest
         scores.
     SelectFpr : Select features based on a false positive rate test.
     SelectFdr : Select features based on an estimated false discovery rate.
diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py
index 61f709094147e..852c8228b2a76 100644
--- a/sklearn/feature_selection/tests/test_feature_select.py
+++ b/sklearn/feature_selection/tests/test_feature_select.py
@@ -4,11 +4,12 @@
 import itertools
 import warnings
 import numpy as np
+from numpy.testing import assert_allclose
 from scipy import stats, sparse
 
 import pytest
 
-from sklearn.utils._testing import assert_almost_equal
+from sklearn.utils._testing import assert_almost_equal, _convert_container
 from sklearn.utils._testing import assert_array_equal
 from sklearn.utils._testing import assert_array_almost_equal
 from sklearn.utils._testing import assert_warns
@@ -18,9 +19,20 @@
 
 from sklearn.datasets import make_classification, make_regression
 from sklearn.feature_selection import (
-    chi2, f_classif, f_oneway, f_regression, mutual_info_classif,
-    mutual_info_regression, SelectPercentile, SelectKBest, SelectFpr,
-    SelectFdr, SelectFwe, GenericUnivariateSelect)
+    chi2,
+    f_classif,
+    f_oneway,
+    f_regression,
+    GenericUnivariateSelect,
+    mutual_info_classif,
+    mutual_info_regression,
+    r_regression,
+    SelectPercentile,
+    SelectKBest,
+    SelectFpr,
+    SelectFdr,
+    SelectFwe,
+)
 
 
 ##############################################################################
@@ -71,6 +83,27 @@ def test_f_classif():
     assert_array_almost_equal(pv_sparse, pv)
 
 
+@pytest.mark.parametrize("center", [True, False])
+def test_r_regression(center):
+    X, y = make_regression(n_samples=2000, n_features=20, n_informative=5,
+                           shuffle=False, random_state=0)
+
+    corr_coeffs = r_regression(X, y, center=center)
+    assert ((-1 < corr_coeffs).all())
+    assert ((corr_coeffs < 1).all())
+
+    sparse_X = _convert_container(X, "sparse")
+
+    sparse_corr_coeffs = r_regression(sparse_X, y, center=center)
+    assert_allclose(sparse_corr_coeffs, corr_coeffs)
+
+    # Testing against numpy for reference
+    Z = np.hstack((X, y[:, np.newaxis]))
+    correlation_matrix = np.corrcoef(Z, rowvar=False)
+    np_corr_coeffs = correlation_matrix[:-1, -1]
+    assert_array_almost_equal(np_corr_coeffs, corr_coeffs, decimal=3)
+
+
 def test_f_regression():
     # Test whether the F test yields meaningful results
     # on a simple simulated regression problem
@@ -87,14 +120,14 @@ def test_f_regression():
     # with centering, compare with sparse
     F, pv = f_regression(X, y, center=True)
     F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=True)
-    assert_array_almost_equal(F_sparse, F)
-    assert_array_almost_equal(pv_sparse, pv)
+    assert_allclose(F_sparse, F)
+    assert_allclose(pv_sparse, pv)
 
     # again without centering, compare with sparse
     F, pv = f_regression(X, y, center=False)
     F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=False)
-    assert_array_almost_equal(F_sparse, F)
-    assert_array_almost_equal(pv_sparse, pv)
+    assert_allclose(F_sparse, F)
+    assert_allclose(pv_sparse, pv)
 
 
 def test_f_regression_input_dtype():
@@ -106,8 +139,8 @@ def test_f_regression_input_dtype():
 
     F1, pv1 = f_regression(X, y)
     F2, pv2 = f_regression(X, y.astype(float))
-    assert_array_almost_equal(F1, F2, 5)
-    assert_array_almost_equal(pv1, pv2, 5)
+    assert_allclose(F1, F2, 5)
+    assert_allclose(pv1, pv2, 5)
 
 
 def test_f_regression_center():
@@ -123,7 +156,7 @@ def test_f_regression_center():
 
     F1, _ = f_regression(X, Y, center=True)
     F2, _ = f_regression(X, Y, center=False)
-    assert_array_almost_equal(F1 * (n_samples - 1.) / (n_samples - 2.), F2)
+    assert_allclose(F1 * (n_samples - 1.) / (n_samples - 2.), F2)
     assert_almost_equal(F2[0], 0.232558139)  # value from statsmodels OLS
 
 
@@ -262,7 +295,7 @@ def test_select_heuristics_classif():
             f_classif, mode=mode, param=0.01).fit(X, y).transform(X)
         assert_array_equal(X_r, X_r2)
         support = univariate_filter.get_support()
-        assert_array_almost_equal(support, gtruth)
+        assert_allclose(support, gtruth)
 
 
 ##############################################################################
@@ -272,7 +305,7 @@ def test_select_heuristics_classif():
 def assert_best_scores_kept(score_filter):
     scores = score_filter.scores_
     support = score_filter.get_support()
-    assert_array_almost_equal(np.sort(scores[support]),
+    assert_allclose(np.sort(scores[support]),
                               np.sort(scores)[-support.sum():])
 
 

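For reference, a minimal sketch of the computation introduced above
(illustrative only, not part of the patch; it assumes dense, centered data
rather than the library code paths): Pearson's r per feature, followed by the
conversion to an F score and a p-value as done by `f_regression`.

    import numpy as np
    from scipy import stats

    rng = np.random.RandomState(0)
    X = rng.randn(200, 3)
    y = X[:, 0] + 0.1 * rng.randn(200)

    # Pearson's r: centered cross product scaled by the norms
    Xc = X - X.mean(axis=0)
    yc = y - y.mean()
    r = (Xc * yc[:, None]).sum(axis=0) / (
        np.linalg.norm(Xc, axis=0) * np.linalg.norm(yc))

    # Convert r to an F statistic and then to a p-value
    dof = y.size - 2                  # 2 degrees lost because of centering
    F = r ** 2 / (1 - r ** 2) * dof
    p = stats.f.sf(F, 1, dof)
    print(r, F, p)
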
From b84afe541c679b0cb57b3b1f7f438400392d11ee Mon Sep 17 00:00:00 2001
From: Alexandr Fonari 
Date: Wed, 21 Apr 2021 18:39:07 +0000
Subject: [PATCH 352/478] FIX prevent division by zero with constant target in
 GPR (#19703)

Co-authored-by: Sasha Fonari 
Co-authored-by: Chiara Marmo 
Co-authored-by: Guillaume Lemaitre 
---
 doc/whats_new/v0.24.rst                    | 11 +++++++++++
 sklearn/gaussian_process/_gpr.py           |  5 ++++-
 sklearn/gaussian_process/tests/test_gpr.py | 23 ++++++++++++++++++++++
 3 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index 5b0b753f0f294..305648dbdcfc9 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -32,6 +32,17 @@ Changelog
 - |Fix| Fixed a bug in :class:`decomposition.KernelPCA`'s
   ``inverse_transform``.  :pr:`19732` by :user:`Kei Ishikawa `.
 
+:mod:`sklearn.gaussian_process`
+...............................
+
+- |Fix| Avoid a division by zero when scaling a constant target in
+  :class:`gaussian_process.GaussianProcessRegressor`. It was caused by a
+  standard deviation equal to 0. Such a case is now detected and the standard
+  deviation is set to 1, avoiding the division by zero and thus the presence
+  of NaN values in the normalized target.
+  :pr:`19703` by :user:`sobkevich`, :user:`Boris Villazón-Terrazas `
+  and :user:`Alexandr Fonari `.
+
 :mod:`sklearn.linear_model`
 ...........................
 
diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py
index 4e8814dd69951..8f9575ffe42df 100644
--- a/sklearn/gaussian_process/_gpr.py
+++ b/sklearn/gaussian_process/_gpr.py
@@ -14,6 +14,7 @@
 from ..base import BaseEstimator, RegressorMixin, clone
 from ..base import MultiOutputMixin
 from .kernels import RBF, ConstantKernel as C
+from ..preprocessing._data import _handle_zeros_in_scale
 from ..utils import check_random_state
 from ..utils.optimize import _check_optimize_result
 from ..utils.validation import _deprecate_positional_args
@@ -197,7 +198,9 @@ def fit(self, X, y):
         # Normalize target value
         if self.normalize_y:
             self._y_train_mean = np.mean(y, axis=0)
-            self._y_train_std = np.std(y, axis=0)
+            self._y_train_std = _handle_zeros_in_scale(
+                np.std(y, axis=0), copy=False
+            )
 
             # Remove mean and make unit variance
             y = (y - self._y_train_mean) / self._y_train_std
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index a5bfa05c47313..440e421cb95cc 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -546,3 +546,26 @@ def test_bound_check_fixed_hyperparameter():
                         periodicity_bounds="fixed")  # seasonal component
     kernel = k1 + k2
     GaussianProcessRegressor(kernel=kernel).fit(X, y)
+
+
+# FIXME: we should test for multitargets as well. However, GPR is broken:
+# see: https://github.com/scikit-learn/scikit-learn/pull/19706
+@pytest.mark.parametrize('kernel', kernels)
+def test_constant_target(kernel):
+    """Check that the std. dev. is set to 1 when normalizing a constant
+    target.
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/18318
+    NaN values were previously assigned to the target when scaling, due to a
+    null std. dev. with a constant target.
+    """
+    y_constant = np.ones(X.shape[0], dtype=np.float64)
+
+    gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True)
+    gpr.fit(X, y_constant)
+    assert gpr._y_train_std == pytest.approx(1.0)
+
+    y_pred, y_cov = gpr.predict(X, return_cov=True)
+    assert_allclose(y_pred, y_constant)
+    # set atol because we compare to zero
+    assert_allclose(np.diag(y_cov), 0., atol=1e-9)
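
For reference, a minimal sketch of the guard this fix introduces (illustrative
only, not part of the patch; `_handle_zeros_in_scale` additionally handles
array-valued scales): a zero std. dev. is replaced by 1 before normalizing.

    import numpy as np

    y = np.ones(20)                   # constant target
    y_mean = np.mean(y, axis=0)
    y_std = np.std(y, axis=0)
    if y_std == 0.0:                  # would otherwise produce NaN values
        y_std = 1.0
    y_norm = (y - y_mean) / y_std     # all zeros instead of NaN
    print(y_norm[:3])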

From a67b284f90299989c4cc03f848dc9cc1be57c623 Mon Sep 17 00:00:00 2001
From: Andrew Delong 
Date: Wed, 21 Apr 2021 17:34:28 -0400
Subject: [PATCH 353/478] FIX Encoder should accept categories having dtype='S'
 (#19727)

Co-authored-by: Guillaume Lemaitre 
---
 doc/whats_new/v0.24.rst                      | 21 +++++---
 sklearn/preprocessing/_encoders.py           |  2 +-
 sklearn/preprocessing/tests/test_encoders.py | 17 +++---
 sklearn/utils/_encode.py                     |  4 +-
 sklearn/utils/_testing.py                    | 48 +++++++++++++----
 sklearn/utils/tests/test_testing.py          | 54 ++++++++++++++++----
 6 files changed, 108 insertions(+), 38 deletions(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index 305648dbdcfc9..41dfcfbc4d1c9 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -62,6 +62,13 @@ Changelog
   :class:`model_selection.HalvingGridSearchCV` were not properly converted to
   numpy arrays. :pr:`19211` by `Nicolas Hug`_.
 
+:mod:`sklearn.multioutput`
+..........................
+
+- |Fix| :class:`multioutput.MultiOutputRegressor` now works with estimators
+  that dynamically define `predict` during fitting, such as
+  :class:`ensemble.StackingRegressor`. :pr:`19308` by `Thomas Fan`_.
+
 :mod:`sklearn.preprocessing`
 ............................
 
@@ -70,19 +77,17 @@ Changelog
   `'use_encoded_value'` strategies.
   :pr:`19234` by `Guillaume Lemaitre `.
 
+- |Fix| Fix the handling of categories having dtype='S' in
+  :class:`preprocessing.OneHotEncoder` and
+  :class:`preprocessing.OrdinalEncoder`.
+  :pr:`19727` by :user:`Andrew Delong `.
+
 - |Fix| :meth:`preprocessing.OrdinalEncoder.transform` correctly handles
   unknown values for string dtypes. :pr:`19888` by `Thomas Fan`_.
 
 - |Fix| :meth:`preprocessing.OneHotEncoder.fit` no longer alters the `drop`
   parameter. :pr:`19924` by `Thomas Fan`_.
 
-:mod:`sklearn.multioutput`
-..........................
-
-- |Fix| :class:`multioutput.MultiOutputRegressor` now works with estimators
-  that dynamically define `predict` during fitting, such as
-  :class:`ensemble.StackingRegressor`. :pr:`19308` by `Thomas Fan`_.
-
 :mod:`sklearn.semi_supervised`
 ..............................
 
@@ -91,7 +96,7 @@ Changelog
   :pr:`19271` by :user:`Zhaowei Wang `.
 
 :mod:`sklearn.tree`
-.......................
+...................
 
 - |Fix| Fix a bug in `fit` of :class:`tree.BaseDecisionTree` that caused
   segmentation faults under certain conditions. `fit` now deep copies the
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 36ca74ac09cdb..ba1d48df175ee 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -92,7 +92,7 @@ def _fit(self, X, handle_unknown='error', force_all_finite=True):
                 cats = _unique(Xi)
             else:
                 cats = np.array(self.categories[i], dtype=Xi.dtype)
-                if Xi.dtype.kind not in 'OU':
+                if Xi.dtype.kind not in 'OUS':
                     sorted_cats = np.sort(cats)
                     error_msg = ("Unsorted categories are not "
                                  "supported for numerical categories")
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 72fa46544b198..ef2ac000a0c83 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -692,7 +692,8 @@ def test_encoder_dtypes():
 
     for X in [np.array([[1, 2], [3, 4]], dtype='int64'),
               np.array([[1, 2], [3, 4]], dtype='float64'),
-              np.array([['a', 'b'], ['c', 'd']]),  # string dtype
+              np.array([['a', 'b'], ['c', 'd']]),      # unicode dtype
+              np.array([[b'a', b'b'], [b'c', b'd']]),  # byte string dtype
               np.array([[1, 'a'], [3, 'b']], dtype='object')]:
         enc.fit(X)
         assert all([enc.categories_[i].dtype == X.dtype for i in range(2)])
@@ -827,21 +828,25 @@ def test_encoders_has_categorical_tags(Encoder):
     assert 'categorical' in Encoder()._get_tags()['X_types']
 
 
-@pytest.mark.parametrize('input_dtype', ['O', 'U'])
-@pytest.mark.parametrize('category_dtype', ['O', 'U'])
+# deliberately omit 'OS' as an invalid combo
+@pytest.mark.parametrize('input_dtype, category_dtype', ['OO', 'OU',
+                                                         'UO', 'UU', 'US',
+                                                         'SO', 'SU', 'SS'])
 @pytest.mark.parametrize('array_type', ['list', 'array', 'dataframe'])
-def test_encoders_unicode_categories(input_dtype, category_dtype, array_type):
-    """Check that encoding work with string and object dtypes.
+def test_encoders_string_categories(input_dtype, category_dtype, array_type):
+    """Check that encoding works with object, unicode, and byte string dtypes.
     Non-regression test for:
     https://github.com/scikit-learn/scikit-learn/issues/15616
     https://github.com/scikit-learn/scikit-learn/issues/15726
+    https://github.com/scikit-learn/scikit-learn/issues/19677
     """
 
     X = np.array([['b'], ['a']], dtype=input_dtype)
     categories = [np.array(['b', 'a'], dtype=category_dtype)]
     ohe = OneHotEncoder(categories=categories, sparse=False).fit(X)
 
-    X_test = _convert_container([['a'], ['a'], ['b'], ['a']], array_type)
+    X_test = _convert_container([['a'], ['a'], ['b'], ['a']], array_type,
+                                dtype=input_dtype)
     X_trans = ohe.transform(X_test)
 
     expected = np.array([[0, 1], [0, 1], [1, 0], [0, 1]])
diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py
index b43afa998698b..2295150a6626b 100644
--- a/sklearn/utils/_encode.py
+++ b/sklearn/utils/_encode.py
@@ -173,7 +173,7 @@ def _encode(values, *, uniques, check_unknown=True):
     encoded : ndarray
         Encoded values
     """
-    if values.dtype.kind in 'OU':
+    if values.dtype.kind in 'OUS':
         try:
             return _map_to_integer(values, uniques)
         except KeyError as e:
@@ -214,7 +214,7 @@ def _check_unknown(values, known_values, return_mask=False):
     """
     valid_mask = None
 
-    if values.dtype.kind in 'UO':
+    if values.dtype.kind in 'OUS':
         values_set = set(values)
         values_set, missing_in_values = _extract_missing(values_set)
 
diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py
index 779e7b6574e3e..8fc77748740d5 100644
--- a/sklearn/utils/_testing.py
+++ b/sklearn/utils/_testing.py
@@ -758,30 +758,58 @@ def assert_run_python_script(source_code, timeout=60):
         os.unlink(source_file)
 
 
-def _convert_container(container, constructor_name, columns_name=None):
+def _convert_container(
+    container, constructor_name, columns_name=None, dtype=None
+):
+    """Convert a given container to a specific array-like with a dtype.
+
+    Parameters
+    ----------
+    container : array-like
+        The container to convert.
+    constructor_name : {"list", "tuple", "array", "sparse", "dataframe", \
+            "series", "index", "slice", "sparse_csr", "sparse_csc"}
+        The type of the returned container.
+    columns_name : index or array-like, default=None
+        For pandas containers supporting column names, it will be used to
+        set the column names.
+    dtype : dtype, default=None
+        Force the dtype of the container. Does not apply to `"slice"`
+        container.
+
+    Returns
+    -------
+    converted_container
+    """
     if constructor_name == 'list':
-        return list(container)
+        if dtype is None:
+            return list(container)
+        else:
+            return np.asarray(container, dtype=dtype).tolist()
     elif constructor_name == 'tuple':
-        return tuple(container)
+        if dtype is None:
+            return tuple(container)
+        else:
+            return tuple(np.asarray(container, dtype=dtype).tolist())
     elif constructor_name == 'array':
-        return np.asarray(container)
+        return np.asarray(container, dtype=dtype)
     elif constructor_name == 'sparse':
-        return sp.sparse.csr_matrix(container)
+        return sp.sparse.csr_matrix(container, dtype=dtype)
     elif constructor_name == 'dataframe':
         pd = pytest.importorskip('pandas')
-        return pd.DataFrame(container, columns=columns_name)
+        return pd.DataFrame(container, columns=columns_name, dtype=dtype)
     elif constructor_name == 'series':
         pd = pytest.importorskip('pandas')
-        return pd.Series(container)
+        return pd.Series(container, dtype=dtype)
     elif constructor_name == 'index':
         pd = pytest.importorskip('pandas')
-        return pd.Index(container)
+        return pd.Index(container, dtype=dtype)
     elif constructor_name == 'slice':
         return slice(container[0], container[1])
     elif constructor_name == 'sparse_csr':
-        return sp.sparse.csr_matrix(container)
+        return sp.sparse.csr_matrix(container, dtype=dtype)
     elif constructor_name == 'sparse_csc':
-        return sp.sparse.csc_matrix(container)
+        return sp.sparse.csc_matrix(container, dtype=dtype)
 
 
 def raises(expected_exc_type, match=None, may_pass=False, err_msg=None):
diff --git a/sklearn/utils/tests/test_testing.py b/sklearn/utils/tests/test_testing.py
index 1d4b3780953a7..8685409a4fd44 100644
--- a/sklearn/utils/tests/test_testing.py
+++ b/sklearn/utils/tests/test_testing.py
@@ -624,19 +624,51 @@ def test_create_memmap_backed_data(monkeypatch):
 
 @pytest.mark.parametrize(
     "constructor_name, container_type",
-    [('list', list),
-     ('tuple', tuple),
-     ('array', np.ndarray),
-     ('sparse', sparse.csr_matrix),
-     ('dataframe', pytest.importorskip('pandas').DataFrame),
-     ('series', pytest.importorskip('pandas').Series),
-     ('index', pytest.importorskip('pandas').Index),
-     ('slice', slice)]
+    [
+        ('list', list),
+        ('tuple', tuple),
+        ('array', np.ndarray),
+        ('sparse', sparse.csr_matrix),
+        ('sparse_csr', sparse.csr_matrix),
+        ('sparse_csc', sparse.csc_matrix),
+        ('dataframe', lambda: pytest.importorskip('pandas').DataFrame),
+        ('series', lambda: pytest.importorskip('pandas').Series),
+        ('index', lambda: pytest.importorskip('pandas').Index),
+        ('slice', slice),
+    ]
 )
-def test_convert_container(constructor_name, container_type):
+@pytest.mark.parametrize(
+    "dtype, superdtype",
+    [
+        (np.int32, np.integer),
+        (np.int64, np.integer),
+        (np.float32, np.floating),
+        (np.float64, np.floating),
+    ]
+)
+def test_convert_container(
+    constructor_name, container_type, dtype, superdtype,
+):
+    """Check that we convert the container to the right type of array with the
+    right data type."""
+    if constructor_name in ("dataframe", "series", "index"):
+        # delay the import of pandas within the function to only skip this test
+        # instead of the whole file
+        container_type = container_type()
     container = [0, 1]
-    assert isinstance(_convert_container(container, constructor_name),
-                      container_type)
+    container_converted = _convert_container(
+        container, constructor_name, dtype=dtype,
+    )
+    assert isinstance(container_converted, container_type)
+
+    if constructor_name in ("list", "tuple", "index"):
+        # list and tuple will use Python class dtype: int, float
+        # pandas index will always use high precision: np.int64 and np.float64
+        assert np.issubdtype(type(container_converted[0]), superdtype)
+    elif hasattr(container_converted, "dtype"):
+        assert container_converted.dtype == dtype
+    elif hasattr(container_converted, "dtypes"):
+        assert container_converted.dtypes[0] == dtype
 
 
 def test_raises():
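
For reference, a minimal usage sketch of the behaviour enabled by this fix
(illustrative only, mirroring the new test): byte-string (`dtype='S'`) inputs
and categories are now handled like object and unicode ones.

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = np.array([[b'b'], [b'a']], dtype='S')
    categories = [np.array([b'b', b'a'], dtype='S')]
    ohe = OneHotEncoder(categories=categories, sparse=False).fit(X)

    X_test = np.array([[b'a'], [b'a'], [b'b'], [b'a']], dtype='S')
    print(ohe.transform(X_test))      # columns follow the order ['b', 'a']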

From dbed806a7aad5d253cf1ca0a3bca9bda5e391456 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" 
Date: Thu, 22 Apr 2021 04:49:24 -0400
Subject: [PATCH 354/478] FIX Fixes regression in CCA due to change of cutoff
 values in SciPy  (#19646)

---
 doc/whats_new/v0.24.rst                       |  6 ++++++
 sklearn/cross_decomposition/_pls.py           | 21 +++++++++++++++++--
 sklearn/cross_decomposition/tests/test_pls.py | 16 ++++++++++++++
 3 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index 41dfcfbc4d1c9..dc1727b2264a5 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -19,6 +19,12 @@ Changelog
   :term:`get_feature_names` on transformers with an empty column selection.
   :pr:`19579` by `Thomas Fan`_.
 
+:mod:`sklearn.cross_decomposition`
+..................................
+
+- |Fix| Fixed a regression in :class:`cross_decomposition.CCA`. :pr:`19646`
+  by `Thomas Fan`_.
+
 :mod:`sklearn.ensemble`
 .......................
 
diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py
index 42d727b9ae2be..3c886a0dd0c1b 100644
--- a/sklearn/cross_decomposition/_pls.py
+++ b/sklearn/cross_decomposition/_pls.py
@@ -23,6 +23,24 @@
 __all__ = ['PLSCanonical', 'PLSRegression', 'PLSSVD']
 
 
+def _pinv2_old(a):
+    # Mimics the previous scipy pinv2 behavior, which was changed in:
+    # https://github.com/scipy/scipy/pull/10067
+    # We cannot simply set `cond` or `rcond` for pinv2 in scipy >= 1.3 to keep
+    # the behavior of pinv2 for scipy < 1.3, because the condition used to
+    # determine the rank depends on the output of svd.
+    u, s, vh = svd(a, full_matrices=False, check_finite=False)
+
+    t = u.dtype.char.lower()
+    factor = {'f': 1E3, 'd': 1E6}
+    cond = np.max(s) * factor[t] * np.finfo(t).eps
+    rank = np.sum(s > cond)
+
+    u = u[:, :rank]
+    u /= s[:rank]
+    return np.transpose(np.conjugate(np.dot(u, vh[:rank])))
+
+
 def _get_first_singular_vectors_power_method(X, Y, mode="A", max_iter=500,
                                              tol=1e-06, norm_y_weights=False):
     """Return the first left and right singular vectors of X'Y.
@@ -44,8 +62,7 @@ def _get_first_singular_vectors_power_method(X, Y, mode="A", max_iter=500,
         # As a result, and as detailed in the Wegelin's review, CCA (i.e. mode
         # B) will be unstable if n_features > n_samples or n_targets >
         # n_samples
-        X_pinv = pinv2(X, check_finite=False, cond=10*eps)
-        Y_pinv = pinv2(Y, check_finite=False, cond=10*eps)
+        X_pinv, Y_pinv = _pinv2_old(X), _pinv2_old(Y)
 
     for i in range(max_iter):
         if mode == "B":
diff --git a/sklearn/cross_decomposition/tests/test_pls.py b/sklearn/cross_decomposition/tests/test_pls.py
index 04c791fd4154a..1179161b8436c 100644
--- a/sklearn/cross_decomposition/tests/test_pls.py
+++ b/sklearn/cross_decomposition/tests/test_pls.py
@@ -552,3 +552,19 @@ def test_svd_flip_1d():
 
     assert_allclose(v, v_expected.ravel())
     assert_allclose(v, [-1, -2, -3])
+
+
+def test_loadings_converges():
+    """Test that CCA converges. Non-regression test for #19549."""
+    X, y = make_regression(n_samples=200, n_features=20, n_targets=20,
+                           random_state=20)
+
+    cca = CCA(n_components=10, max_iter=500)
+
+    with pytest.warns(None) as record:
+        cca.fit(X, y)
+    # ConvergenceWarning is not raised
+    assert not record
+
+    # Loadings converges to reasonable values
+    assert np.all(np.abs(cca.x_loadings_) < 1)
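
For reference, a minimal sketch of a pinv2-style pseudo-inverse with the
explicit rank cutoff used above (illustrative only, not the `_pinv2_old`
helper itself; `pinv_with_cutoff` is a made-up name).

    import numpy as np

    def pinv_with_cutoff(a):
        u, s, vh = np.linalg.svd(a, full_matrices=False)
        factor = {'f': 1e3, 'd': 1e6}[u.dtype.char.lower()]
        cond = np.max(s) * factor * np.finfo(u.dtype).eps
        rank = int(np.sum(s > cond))      # rank determined by the cutoff
        u = u[:, :rank] / s[:rank]
        return np.transpose(np.conjugate(u @ vh[:rank]))

    A = np.random.RandomState(0).randn(6, 3)
    np.testing.assert_allclose(A @ pinv_with_cutoff(A) @ A, A, atol=1e-10)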

From efc703cb0a2e7a5bbc224aa54910d2f67a5ffb16 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre 
Date: Thu, 22 Apr 2021 15:26:04 +0200
Subject: [PATCH 355/478] DOC order whats new 0.24.2

---
 doc/whats_new/v0.24.rst | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index dc1727b2264a5..b3768f92155eb 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -25,6 +25,12 @@ Changelog
 - |Fix| Fixed a regression in :class:`cross_decomposition.CCA`. :pr:`19646`
   by `Thomas Fan`_.
 
+:mod:`sklearn.decomposition`
+............................
+
+- |Fix| Fixed a bug in :class:`decomposition.KernelPCA`'s
+  ``inverse_transform``.  :pr:`19732` by :user:`Kei Ishikawa `.
+
 :mod:`sklearn.ensemble`
 .......................
 
@@ -32,12 +38,6 @@ Changelog
   with `sample_weight` parameter and `least_absolute_deviation` loss function.
   :pr:`19407` by :user:`Vadim Ushtanit `.
 
-:mod:`sklearn.decomposition`
-............................
-
-- |Fix| Fixed a bug in :class:`decomposition.KernelPCA`'s
-  ``inverse_transform``.  :pr:`19732` by :user:`Kei Ishikawa `.
-
 :mod:`sklearn.gaussian_process`
 ...............................
 

From c88c89cffd87c34299ebb8db6192c973823bd827 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre 
Date: Thu, 22 Apr 2021 18:50:46 +0200
Subject: [PATCH 356/478] DOC move whats new entry from 1.0 to 0.24

---
 doc/whats_new/v0.24.rst | 7 ++++++-
 doc/whats_new/v1.0.rst  | 5 -----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index b3768f92155eb..34744de8a6b91 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -7,7 +7,7 @@
 Version 0.24.2
 ==============
 
-**TBD 2021**
+**April 2021**
 
 Changelog
 ---------
@@ -68,6 +68,11 @@ Changelog
   :class:`model_selection.HalvingGridSearchCV` were not properly converted to
   numpy arrays. :pr:`19211` by `Nicolas Hug`_.
 
+- |Fix| The `fit` method of the successive halving parameter search
+  (:class:`model_selection.HalvingGridSearchCV`, and
+  :class:`model_selection.HalvingRandomSearchCV`) now correctly handles the
+  `groups` parameter. :pr:`19847` by :user:`Xiaoyu Chai `.
+
 :mod:`sklearn.multioutput`
 ..........................
 
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 270ae456b5213..3b3884e68e185 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -319,11 +319,6 @@ Changelog
   :pr:`18649` by `Leandro Hermida ` and
   `Rodion Martynov `.
 
-- |Fix| The `fit` method of the successive halving parameter search
-  (:class:`model_selection.HalvingGridSearchCV`, and
-  :class:`model_selection.HalvingRandomSearchCV`) now correctly handles the
-  `groups` parameter. :pr:`19847` by :user:`Xiaoyu Chai `.
-
 :mod:`sklearn.naive_bayes`
 ..........................
 

From 09684342745cfc3509432885396e7be776e64cee Mon Sep 17 00:00:00 2001
From: Chiara Marmo 
Date: Fri, 23 Apr 2021 13:02:14 +0200
Subject: [PATCH 357/478] MAINT Remove tests for metric configuration ignoring
 pos_label (#19961)

---
 sklearn/metrics/tests/test_common.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py
index dbf1bdd458f1a..66df47a778b38 100644
--- a/sklearn/metrics/tests/test_common.py
+++ b/sklearn/metrics/tests/test_common.py
@@ -342,16 +342,6 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs):
     "weighted_average_precision_score",
     "micro_average_precision_score",
     "samples_average_precision_score",
-
-    # pos_label support deprecated; to be removed in 0.18:
-    "weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score",
-    "weighted_precision_score", "weighted_recall_score",
-
-    "micro_f0.5_score", "micro_f1_score", "micro_f2_score",
-    "micro_precision_score", "micro_recall_score",
-
-    "macro_f0.5_score", "macro_f1_score", "macro_f2_score",
-    "macro_precision_score", "macro_recall_score",
 }
 
 # Metrics with a "labels" argument

From 2641baf16d9de5191316745ec46120cc8b57a666 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" 
Date: Fri, 23 Apr 2021 07:50:50 -0400
Subject: [PATCH 358/478] FIX Fixes PLSRegression regression for constant Yk
 (#19922)

---
 doc/whats_new/v0.24.rst                       |  4 ++++
 sklearn/cross_decomposition/_pls.py           | 21 ++++++++++++++-----
 sklearn/cross_decomposition/tests/test_pls.py | 15 +++++++++++++
 3 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index 34744de8a6b91..a14a649fc94a9 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -25,6 +25,10 @@ Changelog
 - |Fix| Fixed a regression in :class:`cross_decomposition.CCA`. :pr:`19646`
   by `Thomas Fan`_.
 
+- |Fix| :class:`cross_decomposition.PLSRegression` raises a warning for
+  constant y residuals instead of a `StopIteration` error. :pr:`19922`
+  by `Thomas Fan`_.
+
 :mod:`sklearn.decomposition`
 ............................
 
diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py
index 3c886a0dd0c1b..2f6e63d556388 100644
--- a/sklearn/cross_decomposition/_pls.py
+++ b/sklearn/cross_decomposition/_pls.py
@@ -52,7 +52,11 @@ def _get_first_singular_vectors_power_method(X, Y, mode="A", max_iter=500,
     """
 
     eps = np.finfo(X.dtype).eps
-    y_score = next(col for col in Y.T if np.any(np.abs(col) > eps))
+    try:
+        y_score = next(col for col in Y.T if np.any(np.abs(col) > eps))
+    except StopIteration as e:
+        raise StopIteration("Y residual is constant") from e
+
     x_weights_old = 100  # init to big value for first convergence check
 
     if mode == 'B':
@@ -256,10 +260,17 @@ def fit(self, X, Y):
                 Yk_mask = np.all(np.abs(Yk) < 10 * Y_eps, axis=0)
                 Yk[:, Yk_mask] = 0.0
 
-                x_weights, y_weights, n_iter_ = \
-                    _get_first_singular_vectors_power_method(
-                        Xk, Yk, mode=self.mode, max_iter=self.max_iter,
-                        tol=self.tol, norm_y_weights=norm_y_weights)
+                try:
+                    x_weights, y_weights, n_iter_ = \
+                        _get_first_singular_vectors_power_method(
+                            Xk, Yk, mode=self.mode, max_iter=self.max_iter,
+                            tol=self.tol, norm_y_weights=norm_y_weights)
+                except StopIteration as e:
+                    if str(e) != "Y residual is constant":
+                        raise
+                    warnings.warn(f"Y residual is constant at iteration {k}")
+                    break
+
                 self.n_iter_.append(n_iter_)
 
             elif self.algorithm == "svd":
diff --git a/sklearn/cross_decomposition/tests/test_pls.py b/sklearn/cross_decomposition/tests/test_pls.py
index 1179161b8436c..644e1418e3edc 100644
--- a/sklearn/cross_decomposition/tests/test_pls.py
+++ b/sklearn/cross_decomposition/tests/test_pls.py
@@ -568,3 +568,18 @@ def test_loadings_converges():
 
     # Loadings converges to reasonable values
     assert np.all(np.abs(cca.x_loadings_) < 1)
+
+
+def test_pls_constant_y():
+    """Checks warning when y is constant. Non-regression test for #19831"""
+    rng = np.random.RandomState(42)
+    x = rng.rand(100, 3)
+    y = np.zeros(100)
+
+    pls = PLSRegression()
+
+    msg = "Y residual is constant at iteration"
+    with pytest.warns(UserWarning, match=msg):
+        pls.fit(x, y)
+
+    assert_allclose(pls.x_rotations_, 0)
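
For reference, a minimal usage sketch of the new behaviour (illustrative only,
not part of the patch): a constant target now triggers a warning instead of an
uncaught `StopIteration`.

    import warnings
    import numpy as np
    from sklearn.cross_decomposition import PLSRegression

    x = np.random.RandomState(42).rand(100, 3)
    y = np.zeros(100)                 # constant target

    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        PLSRegression().fit(x, y)
    assert any("Y residual is constant" in str(w.message) for w in record)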

From 6927fa26aedf48162314b675016180e3356ad557 Mon Sep 17 00:00:00 2001
From: flyingdutchman23 
Date: Mon, 26 Apr 2021 14:22:59 +0200
Subject: [PATCH 359/478] FIX mislabelling multiclass target when labels is
 provided in top_k_accuracy_score (#19721)

---
 doc/whats_new/v0.24.rst               |  8 ++++++++
 sklearn/metrics/_ranking.py           |  4 +++-
 sklearn/metrics/tests/test_ranking.py | 24 ++++++++++++++++++++++++
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index a14a649fc94a9..79f6ecb15c3d0 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -59,6 +59,14 @@ Changelog
 - |Fix|: Fixed a bug in :class:`linear_model.LogisticRegression`: the
   sample_weight object is not modified anymore. :pr:`19182` by
   :user:`Yosuke KOBAYASHI `.
+
+:mod:`sklearn.metrics`
+......................
+
+- |Fix| :func:`metrics.top_k_accuracy_score` now supports multiclass
+  problems where only two classes appear in `y_true` and all the classes
+  are specified in `labels`.
+  :pr:`19721` by :user:`Joris Clement `.
 
 :mod:`sklearn.model_selection`
 ..............................
diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py
index f1627e84fbcfe..8c458ac81e529 100644
--- a/sklearn/metrics/_ranking.py
+++ b/sklearn/metrics/_ranking.py
@@ -1598,7 +1598,7 @@ def top_k_accuracy_score(y_true, y_score, *, k=2, normalize=True,
         non-thresholded decision values (as returned by
         :term:`decision_function` on some classifiers). The binary case expects
         scores with shape (n_samples,) while the multiclass case expects scores
-        with shape (n_samples, n_classes). In the nulticlass case, the order of
+        with shape (n_samples, n_classes). In the multiclass case, the order of
         the class scores must correspond to the order of ``labels``, if
         provided, or else to the numerical or lexicographical order of the
         labels in ``y_true``.
@@ -1655,6 +1655,8 @@ def top_k_accuracy_score(y_true, y_score, *, k=2, normalize=True,
     y_true = check_array(y_true, ensure_2d=False, dtype=None)
     y_true = column_or_1d(y_true)
     y_type = type_of_target(y_true)
+    if y_type == "binary" and labels is not None and len(labels) > 2:
+        y_type = "multiclass"
     y_score = check_array(y_score, ensure_2d=False)
     y_score = column_or_1d(y_score) if y_type == 'binary' else y_score
     check_consistent_length(y_true, y_score, sample_weight)
diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py
index c37ff34feddec..85a00ca520f7b 100644
--- a/sklearn/metrics/tests/test_ranking.py
+++ b/sklearn/metrics/tests/test_ranking.py
@@ -1657,6 +1657,30 @@ def test_top_k_accuracy_score_binary(y_score, k, true_score):
     assert score == score_acc == pytest.approx(true_score)
 
 
+@pytest.mark.parametrize('y_true, true_score, labels', [
+    (np.array([0, 1, 1, 2]), 0.75, [0, 1, 2, 3]),
+    (np.array([0, 1, 1, 1]), 0.5, [0, 1, 2, 3]),
+    (np.array([1, 1, 1, 1]), 0.5, [0, 1, 2, 3]),
+    (np.array(['a', 'e', 'e', 'a']), 0.75, ['a', 'b', 'd', 'e']),
+])
+@pytest.mark.parametrize("labels_as_ndarray", [True, False])
+def test_top_k_accuracy_score_multiclass_with_labels(
+        y_true, true_score, labels, labels_as_ndarray
+):
+    """Test when labels and y_score are multiclass."""
+    if labels_as_ndarray:
+        labels = np.asarray(labels)
+    y_score = np.array([
+        [0.4, 0.3, 0.2, 0.1],
+        [0.1, 0.3, 0.4, 0.2],
+        [0.4, 0.1, 0.2, 0.3],
+        [0.3, 0.2, 0.4, 0.1],
+    ])
+
+    score = top_k_accuracy_score(y_true, y_score, k=2, labels=labels)
+    assert score == pytest.approx(true_score)
+
+
 def test_top_k_accuracy_score_increasing():
     # Make sure increasing k leads to a higher score
     X, y = datasets.make_classification(n_classes=10, n_samples=1000,
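
For reference, a minimal usage sketch of the fixed case (illustrative only,
mirroring the new test): only two classes appear in `y_true`, but passing all
four classes through `labels` means the scores are interpreted as multiclass.

    import numpy as np
    from sklearn.metrics import top_k_accuracy_score

    y_true = np.array([0, 1, 1, 1])   # only classes 0 and 1 are present
    y_score = np.array([
        [0.4, 0.3, 0.2, 0.1],
        [0.1, 0.3, 0.4, 0.2],
        [0.4, 0.1, 0.2, 0.3],
        [0.3, 0.2, 0.4, 0.1],
    ])
    score = top_k_accuracy_score(y_true, y_score, k=2, labels=[0, 1, 2, 3])
    print(score)                      # 0.5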

From ff0949907cef8e2fc1236b92e2789620ccab820a Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" 
Date: Mon, 26 Apr 2021 09:40:29 -0400
Subject: [PATCH 360/478] CI Fixes MAC ar build error (#19968)

---
 build_tools/azure/install.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh
index fbe0c90a473ab..d2711d6bd610e 100755
--- a/build_tools/azure/install.sh
+++ b/build_tools/azure/install.sh
@@ -51,6 +51,10 @@ if [[ "$DISTRIB" == "conda" ]]; then
             # sklearn/svm/_libsvm.cpython-38-darwin.so,
             # 2): Symbol not found: _svm_check_parameter error
             TO_INSTALL="$TO_INSTALL compilers>=1.0.4,!=1.1.0 llvm-openmp"
+        else
+            # Without openmp, we use the system clang. Here we use /usr/bin/ar
+            # instead because llvm-ar errors
+            export AR=/usr/bin/ar
         fi
     fi
 	make_conda $TO_INSTALL

From d22fe3e922ba5ea063fa3afe0574e86884449539 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" 
Date: Mon, 26 Apr 2021 10:39:03 -0400
Subject: [PATCH 361/478] CI Lowers precision for doctest in LinearRegression
 (#19988)

---
 sklearn/linear_model/_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py
index 5783e4740a08c..808ec9f3b3bb0 100644
--- a/sklearn/linear_model/_base.py
+++ b/sklearn/linear_model/_base.py
@@ -591,7 +591,7 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel):
     >>> reg.coef_
     array([1., 2.])
     >>> reg.intercept_
-    3.0000...
+    3.0...
     >>> reg.predict(np.array([[3, 5]]))
     array([16.])
     """

From d852aa0825769d7623d662bfa1f5a2bb5dfbae6d Mon Sep 17 00:00:00 2001
From: Alihan Zihna 
Date: Mon, 26 Apr 2021 16:55:17 +0100
Subject: [PATCH 362/478] TST Changes assert to pytest style in /mixture/tests
 (#19983)

Co-authored-by: Alihan Zihna 
---
 .../mixture/tests/test_bayesian_mixture.py    |  95 ++++----
 .../mixture/tests/test_gaussian_mixture.py    | 203 ++++++++++--------
 2 files changed, 168 insertions(+), 130 deletions(-)

diff --git a/sklearn/mixture/tests/test_bayesian_mixture.py b/sklearn/mixture/tests/test_bayesian_mixture.py
index 1d061da908e3c..dc2cbda4b66e7 100644
--- a/sklearn/mixture/tests/test_bayesian_mixture.py
+++ b/sklearn/mixture/tests/test_bayesian_mixture.py
@@ -2,12 +2,12 @@
 #         Thierry Guillemot 
 # License: BSD 3 clause
 import copy
+import re
 
 import numpy as np
 from scipy.special import gammaln
 import pytest
 
-from sklearn.utils._testing import assert_raise_message
 from sklearn.utils._testing import assert_almost_equal
 from sklearn.utils._testing import assert_array_equal
 
@@ -66,11 +66,13 @@ def test_bayesian_mixture_covariance_type():
     covariance_type = 'bad_covariance_type'
     bgmm = BayesianGaussianMixture(covariance_type=covariance_type,
                                    random_state=rng)
-    assert_raise_message(ValueError,
-                         "Invalid value for 'covariance_type': %s "
-                         "'covariance_type' should be in "
-                         "['spherical', 'tied', 'diag', 'full']"
-                         % covariance_type, bgmm.fit, X)
+
+    msg = re.escape(
+        f"Invalid value for 'covariance_type': {covariance_type} "
+        "'covariance_type' should be in ['spherical', 'tied', 'diag', 'full']"
+    )
+    with pytest.raises(ValueError, match=msg):
+        bgmm.fit(X)
 
 
 def test_bayesian_mixture_weight_concentration_prior_type():
@@ -81,11 +83,13 @@ def test_bayesian_mixture_weight_concentration_prior_type():
     bad_prior_type = 'bad_prior_type'
     bgmm = BayesianGaussianMixture(
         weight_concentration_prior_type=bad_prior_type, random_state=rng)
-    assert_raise_message(ValueError,
-                         "Invalid value for 'weight_concentration_prior_type':"
-                         " %s 'weight_concentration_prior_type' should be in "
-                         "['dirichlet_process', 'dirichlet_distribution']"
-                         % bad_prior_type, bgmm.fit, X)
+    msg = re.escape(
+        "Invalid value for 'weight_concentration_prior_type':"
+        f" {bad_prior_type} 'weight_concentration_prior_type' should be in "
+        "['dirichlet_process', 'dirichlet_distribution']"
+    )
+    with pytest.raises(ValueError, match=msg):
+        bgmm.fit(X)
 
 
 def test_bayesian_mixture_weights_prior_initialisation():
@@ -98,11 +102,12 @@ def test_bayesian_mixture_weights_prior_initialisation():
     bgmm = BayesianGaussianMixture(
         weight_concentration_prior=bad_weight_concentration_prior_,
         random_state=0)
-    assert_raise_message(ValueError,
-                         "The parameter 'weight_concentration_prior' "
-                         "should be greater than 0., but got %.3f."
-                         % bad_weight_concentration_prior_,
-                         bgmm.fit, X)
+    msg = (
+        "The parameter 'weight_concentration_prior' should be greater "
+        f"than 0., but got {bad_weight_concentration_prior_:.3f}."
+    )
+    with pytest.raises(ValueError, match=msg):
+        bgmm.fit(X)
 
     # Check correct init for a given value of weight_concentration_prior
     weight_concentration_prior = rng.rand()
@@ -128,11 +133,12 @@ def test_bayesian_mixture_mean_prior_initialisation():
     bgmm = BayesianGaussianMixture(
         mean_precision_prior=bad_mean_precision_prior_,
         random_state=rng)
-    assert_raise_message(ValueError,
-                         "The parameter 'mean_precision_prior' should be "
-                         "greater than 0., but got %.3f."
-                         % bad_mean_precision_prior_,
-                         bgmm.fit, X)
+    msg = (
+        "The parameter 'mean_precision_prior' "
+        f"should be greater than 0., but got {bad_mean_precision_prior_:.3f}."
+    )
+    with pytest.raises(ValueError, match=msg):
+        bgmm.fit(X)
 
     # Check correct init for a given value of mean_precision_prior
     mean_precision_prior = rng.rand()
@@ -150,9 +156,9 @@ def test_bayesian_mixture_mean_prior_initialisation():
     bgmm = BayesianGaussianMixture(n_components=n_components,
                                    mean_prior=mean_prior,
                                    random_state=rng)
-    assert_raise_message(ValueError,
-                         "The parameter 'means' should have the shape of ",
-                         bgmm.fit, X)
+    msg = "The parameter 'means' should have the shape of "
+    with pytest.raises(ValueError, match=msg):
+        bgmm.fit(X)
 
     # Check correct init for a given value of mean_prior
     mean_prior = rng.rand(n_features)
@@ -177,11 +183,12 @@ def test_bayesian_mixture_precisions_prior_initialisation():
     bgmm = BayesianGaussianMixture(
         degrees_of_freedom_prior=bad_degrees_of_freedom_prior_,
         random_state=rng)
-    assert_raise_message(ValueError,
-                         "The parameter 'degrees_of_freedom_prior' should be "
-                         "greater than %d, but got %.3f."
-                         % (n_features - 1, bad_degrees_of_freedom_prior_),
-                         bgmm.fit, X)
+    msg = (
+        "The parameter 'degrees_of_freedom_prior' should be greater than"
+        f" {n_features - 1}, but got {bad_degrees_of_freedom_prior_:.3f}."
+    )
+    with pytest.raises(ValueError, match=msg):
+        bgmm.fit(X)
 
     # Check correct init for a given value of degrees_of_freedom_prior
     degrees_of_freedom_prior = rng.rand() + n_features - 1.
@@ -219,11 +226,12 @@ def test_bayesian_mixture_precisions_prior_initialisation():
     bgmm = BayesianGaussianMixture(covariance_type='spherical',
                                    covariance_prior=bad_covariance_prior_,
                                    random_state=rng)
-    assert_raise_message(ValueError,
-                         "The parameter 'spherical covariance_prior' "
-                         "should be greater than 0., but got %.3f."
-                         % bad_covariance_prior_,
-                         bgmm.fit, X)
+    msg = (
+        "The parameter 'spherical covariance_prior' "
+        f"should be greater than 0., but got {bad_covariance_prior_:.3f}."
+    )
+    with pytest.raises(ValueError, match=msg):
+        bgmm.fit(X)
 
     # Check correct init for the default value of covariance_prior
     covariance_prior_default = {
@@ -247,9 +255,10 @@ def test_bayesian_mixture_check_is_fitted():
     # Check raise message
     bgmm = BayesianGaussianMixture(random_state=rng)
     X = rng.rand(n_samples, n_features)
-    assert_raise_message(ValueError,
-                         'This BayesianGaussianMixture instance is not '
-                         'fitted yet.', bgmm.score, X)
+
+    msg = "This BayesianGaussianMixture instance is not fitted yet."
+    with pytest.raises(ValueError, match=msg):
+        bgmm.score(X)
 
 
 def test_bayesian_mixture_weights():
@@ -475,11 +484,13 @@ def test_bayesian_mixture_predict_predict_proba():
                 covariance_type=covar_type)
 
             # Check a warning message arrive if we don't do fit
-            assert_raise_message(NotFittedError,
-                                 "This BayesianGaussianMixture instance"
-                                 " is not fitted yet. Call 'fit' with "
-                                 "appropriate arguments before using "
-                                 "this estimator.", bgmm.predict, X)
+            msg = (
+                "This BayesianGaussianMixture instance is not fitted yet. "
+                "Call 'fit' with appropriate arguments before using this "
+                "estimator."
+            )
+            with pytest.raises(NotFittedError, match=msg):
+                bgmm.predict(X)
 
             bgmm.fit(X)
             Y_pred = bgmm.predict(X)
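
For reference, a minimal sketch of the pattern applied throughout this patch
(illustrative only; `fit_with_bad_param` is a stand-in for the estimator's
`fit` call): `match` is interpreted as a regular expression, hence `re.escape`
for messages containing brackets.

    import re
    import pytest

    def fit_with_bad_param():
        raise ValueError(
            "'covariance_type' should be in ['spherical', 'tied', 'diag', 'full']")

    msg = re.escape(
        "'covariance_type' should be in ['spherical', 'tied', 'diag', 'full']")
    with pytest.raises(ValueError, match=msg):
        fit_with_bad_param()
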
diff --git a/sklearn/mixture/tests/test_gaussian_mixture.py b/sklearn/mixture/tests/test_gaussian_mixture.py
index ea5ea0c2eb649..2d8dc81e54275 100644
--- a/sklearn/mixture/tests/test_gaussian_mixture.py
+++ b/sklearn/mixture/tests/test_gaussian_mixture.py
@@ -2,6 +2,7 @@
 #         Thierry Guillemot 
 # License: BSD 3 clause
 
+import re
 import sys
 import copy
 import warnings
@@ -29,8 +30,6 @@
 from sklearn.utils._testing import assert_almost_equal
 from sklearn.utils._testing import assert_array_almost_equal
 from sklearn.utils._testing import assert_array_equal
-from sklearn.utils._testing import assert_raise_message
-from sklearn.utils._testing import assert_warns_message
 from sklearn.utils._testing import ignore_warnings
 
 
@@ -105,55 +104,66 @@ def test_gaussian_mixture_attributes():
 
     n_components_bad = 0
     gmm = GaussianMixture(n_components=n_components_bad)
-    assert_raise_message(ValueError,
-                         "Invalid value for 'n_components': %d "
-                         "Estimation requires at least one component"
-                         % n_components_bad, gmm.fit, X)
+    msg = (
+        f"Invalid value for 'n_components': {n_components_bad} "
+        "Estimation requires at least one component"
+    )
+    with pytest.raises(ValueError, match=msg):
+        gmm.fit(X)
 
     # covariance_type should be in [spherical, diag, tied, full]
     covariance_type_bad = 'bad_covariance_type'
     gmm = GaussianMixture(covariance_type=covariance_type_bad)
-    assert_raise_message(ValueError,
-                         "Invalid value for 'covariance_type': %s "
-                         "'covariance_type' should be in "
-                         "['spherical', 'tied', 'diag', 'full']"
-                         % covariance_type_bad,
-                         gmm.fit, X)
+    msg = re.escape(
+        f"Invalid value for 'covariance_type': {covariance_type_bad} "
+        "'covariance_type' should be in ['spherical', 'tied', 'diag', 'full']"
+    )
+    with pytest.raises(ValueError, match=msg):
+        gmm.fit(X)
 
     tol_bad = -1
     gmm = GaussianMixture(tol=tol_bad)
-    assert_raise_message(ValueError,
-                         "Invalid value for 'tol': %.5f "
-                         "Tolerance used by the EM must be non-negative"
-                         % tol_bad, gmm.fit, X)
+    msg = (
+        f"Invalid value for 'tol': {tol_bad:.5f} "
+        "Tolerance used by the EM must be non-negative"
+    )
+    with pytest.raises(ValueError, match=msg):
+        gmm.fit(X)
 
     reg_covar_bad = -1
     gmm = GaussianMixture(reg_covar=reg_covar_bad)
-    assert_raise_message(ValueError,
-                         "Invalid value for 'reg_covar': %.5f "
-                         "regularization on covariance must be "
-                         "non-negative" % reg_covar_bad, gmm.fit, X)
+    msg = (
+        f"Invalid value for 'reg_covar': {reg_covar_bad:.5f} "
+        "regularization on covariance must be non-negative"
+    )
+    with pytest.raises(ValueError, match=msg):
+        gmm.fit(X)
 
     max_iter_bad = 0
     gmm = GaussianMixture(max_iter=max_iter_bad)
-    assert_raise_message(ValueError,
-                         "Invalid value for 'max_iter': %d "
-                         "Estimation requires at least one iteration"
-                         % max_iter_bad, gmm.fit, X)
+    msg = (
+        f"Invalid value for 'max_iter': {max_iter_bad} "
+        "Estimation requires at least one iteration"
+    )
+    with pytest.raises(ValueError, match=msg):
+        gmm.fit(X)
 
     n_init_bad = 0
     gmm = GaussianMixture(n_init=n_init_bad)
-    assert_raise_message(ValueError,
-                         "Invalid value for 'n_init': %d "
-                         "Estimation requires at least one run"
-                         % n_init_bad, gmm.fit, X)
+    msg = (
+        f"Invalid value for 'n_init': {n_init_bad} "
+        "Estimation requires at least one run"
+    )
+    with pytest.raises(ValueError, match=msg):
+        gmm.fit(X)
 
     init_params_bad = 'bad_method'
     gmm = GaussianMixture(init_params=init_params_bad)
-    assert_raise_message(ValueError,
-                         "Unimplemented initialization method '%s'"
-                         % init_params_bad,
-                         gmm.fit, X)
+    msg = (
+        f"Unimplemented initialization method '{init_params_bad}'"
+    )
+    with pytest.raises(ValueError, match=msg):
+        gmm.fit(X)
 
     # test good parameters
     n_components, tol, n_init, max_iter, reg_covar = 2, 1e-4, 3, 30, 1e-1
@@ -184,31 +194,34 @@ def test_check_weights():
     # Check bad shape
     weights_bad_shape = rng.rand(n_components, 1)
     g.weights_init = weights_bad_shape
-    assert_raise_message(ValueError,
-                         "The parameter 'weights' should have the shape of "
-                         "(%d,), but got %s" %
-                         (n_components, str(weights_bad_shape.shape)),
-                         g.fit, X)
+    msg = re.escape(
+        "The parameter 'weights' should have the shape of "
+        f"({n_components},), but got {str(weights_bad_shape.shape)}"
+    )
+    with pytest.raises(ValueError, match=msg):
+        g.fit(X)
 
     # Check bad range
     weights_bad_range = rng.rand(n_components) + 1
     g.weights_init = weights_bad_range
-    assert_raise_message(ValueError,
-                         "The parameter 'weights' should be in the range "
-                         "[0, 1], but got max value %.5f, min value %.5f"
-                         % (np.min(weights_bad_range),
-                            np.max(weights_bad_range)),
-                         g.fit, X)
+    msg = re.escape(
+        "The parameter 'weights' should be in the range [0, 1], but got"
+        f" max value {np.min(weights_bad_range):.5f}, "
+        f"min value {np.max(weights_bad_range):.5f}"
+    )
+    with pytest.raises(ValueError, match=msg):
+        g.fit(X)
 
     # Check bad normalization
     weights_bad_norm = rng.rand(n_components)
     weights_bad_norm = weights_bad_norm / (weights_bad_norm.sum() + 1)
     g.weights_init = weights_bad_norm
-    assert_raise_message(ValueError,
-                         "The parameter 'weights' should be normalized, "
-                         "but got sum(weights) = %.5f"
-                         % np.sum(weights_bad_norm),
-                         g.fit, X)
+    msg = re.escape(
+        "The parameter 'weights' should be normalized, "
+        f"but got sum(weights) = {np.sum(weights_bad_norm):.5f}"
+    )
+    with pytest.raises(ValueError, match=msg):
+        g.fit(X)
 
     # Check good weights matrix
     weights = rand_data.weights
@@ -229,9 +242,9 @@ def test_check_means():
     # Check means bad shape
     means_bad_shape = rng.rand(n_components + 1, n_features)
     g.means_init = means_bad_shape
-    assert_raise_message(ValueError,
-                         "The parameter 'means' should have the shape of ",
-                         g.fit, X)
+    msg = "The parameter 'means' should have the shape of "
+    with pytest.raises(ValueError, match=msg):
+        g.fit(X)
 
     # Check good means matrix
     means = rand_data.means
@@ -278,17 +291,21 @@ def test_check_precisions():
 
         # Check precisions with bad shapes
         g.precisions_init = precisions_bad_shape[covar_type]
-        assert_raise_message(ValueError,
-                             "The parameter '%s precision' should have "
-                             "the shape of" % covar_type,
-                             g.fit, X)
+        msg = (
+            f"The parameter '{covar_type} precision' should have "
+            "the shape of"
+        )
+        with pytest.raises(ValueError, match=msg):
+            g.fit(X)
 
         # Check not positive precisions
         g.precisions_init = precisions_not_positive[covar_type]
-        assert_raise_message(ValueError,
-                             "'%s precision' should be %s"
-                             % (covar_type, not_positive_errors[covar_type]),
-                             g.fit, X)
+        msg = (
+            f"'{covar_type} precision' should be "
+            f"{not_positive_errors[covar_type]}"
+        )
+        with pytest.raises(ValueError, match=msg):
+            g.fit(X)
 
         # Check the correct init of precisions_init
         g.precisions_init = rand_data.precisions[covar_type]
@@ -532,10 +549,12 @@ def test_gaussian_mixture_predict_predict_proba():
                             covariance_type=covar_type)
 
         # Check a warning message arrive if we don't do fit
-        assert_raise_message(NotFittedError,
-                             "This GaussianMixture instance is not fitted "
-                             "yet. Call 'fit' with appropriate arguments "
-                             "before using this estimator.", g.predict, X)
+        msg = (
+            "This GaussianMixture instance is not fitted yet. Call 'fit' "
+            "with appropriate arguments before using this estimator."
+        )
+        with pytest.raises(NotFittedError, match=msg):
+            g.predict(X)
 
         g.fit(X)
         Y_pred = g.predict(X)
@@ -660,12 +679,13 @@ def test_gaussian_mixture_fit_convergence_warning():
         g = GaussianMixture(n_components=n_components, n_init=1,
                             max_iter=max_iter, reg_covar=0, random_state=rng,
                             covariance_type=covar_type)
-        assert_warns_message(ConvergenceWarning,
-                             'Initialization %d did not converge. '
-                             'Try different init parameters, '
-                             'or increase max_iter, tol '
-                             'or check for degenerate data.'
-                             % max_iter, g.fit, X)
+        msg = (
+            f"Initialization {max_iter} did not converge. Try different init "
+            "parameters, or increase max_iter, tol or check for degenerate"
+            " data."
+        )
+        with pytest.warns(ConvergenceWarning, match=msg):
+            g.fit(X)
 
 
 def test_multiple_init():
@@ -831,10 +851,12 @@ def test_score():
     gmm1 = GaussianMixture(n_components=n_components, n_init=1,
                            max_iter=1, reg_covar=0, random_state=rng,
                            covariance_type=covar_type)
-    assert_raise_message(NotFittedError,
-                         "This GaussianMixture instance is not fitted "
-                         "yet. Call 'fit' with appropriate arguments "
-                         "before using this estimator.", gmm1.score, X)
+    msg = (
+        "This GaussianMixture instance is not fitted yet. Call 'fit' with "
+        "appropriate arguments before using this estimator."
+    )
+    with pytest.raises(NotFittedError, match=msg):
+        gmm1.score(X)
 
     # Check score value
     with warnings.catch_warnings():
@@ -861,10 +883,12 @@ def test_score_samples():
     # Check the error message if we don't call fit
     gmm = GaussianMixture(n_components=n_components, n_init=1, reg_covar=0,
                           random_state=rng, covariance_type=covar_type)
-    assert_raise_message(NotFittedError,
-                         "This GaussianMixture instance is not fitted "
-                         "yet. Call 'fit' with appropriate arguments "
-                         "before using this estimator.", gmm.score_samples, X)
+    msg = (
+        "This GaussianMixture instance is not fitted yet. Call 'fit' with "
+        "appropriate arguments before using this estimator."
+    )
+    with pytest.raises(NotFittedError, match=msg):
+        gmm.score_samples(X)
 
     gmm_score_samples = gmm.fit(X).score_samples(X)
     assert gmm_score_samples.shape[0] == rand_data.n_samples
@@ -914,13 +938,14 @@ def test_regularisation():
 
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", RuntimeWarning)
-            assert_raise_message(ValueError,
-                                 "Fitting the mixture model failed because "
-                                 "some components have ill-defined empirical "
-                                 "covariance (for instance caused by "
-                                 "singleton or collapsed samples). Try to "
-                                 "decrease the number of components, or "
-                                 "increase reg_covar.", gmm.fit, X)
+            msg = re.escape(
+                "Fitting the mixture model failed because some components have"
+                " ill-defined empirical covariance (for instance caused by "
+                "singleton or collapsed samples). Try to decrease the number "
+                "of components, or increase reg_covar."
+            )
+            with pytest.raises(ValueError, match=msg):
+                gmm.fit(X)
 
             gmm.set_params(reg_covar=1e-6).fit(X)
 
@@ -958,12 +983,14 @@ def test_sample():
         gmm = GaussianMixture(n_components=n_components,
                               covariance_type=covar_type, random_state=rng)
         # To sample we need that GaussianMixture is fitted
-        assert_raise_message(NotFittedError, "This GaussianMixture instance "
-                             "is not fitted", gmm.sample, 0)
+        msg = "This GaussianMixture instance is not fitted"
+        with pytest.raises(NotFittedError, match=msg):
+            gmm.sample(0)
         gmm.fit(X)
 
-        assert_raise_message(ValueError, "Invalid value for 'n_samples",
-                             gmm.sample, 0)
+        msg = "Invalid value for 'n_samples'"
+        with pytest.raises(ValueError, match=msg):
+            gmm.sample(0)
 
         # Just to make sure the class samples correctly
         n_samples = 20000
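
A note on the conversion pattern used throughout this test file: `pytest.raises(match=...)`
treats the expected message as a regular expression, which is why `re.escape` appears wherever
the message contains literal parentheses or brackets. A minimal, standalone sketch with a
hypothetical error message (not taken from the patch):

    import re
    import pytest

    def fail():
        raise ValueError("should have the shape of (2,), but got (3, 1)")

    # Unescaped, the parentheses would be parsed as regex groups and the search
    # would fail; re.escape makes the whole message match literally.
    with pytest.raises(ValueError,
                       match=re.escape("shape of (2,), but got (3, 1)")):
        fail()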

From 8156c1082886bd23c7e6486a7d654412df1d9325 Mon Sep 17 00:00:00 2001
From: Dmitry Kobak 
Date: Mon, 26 Apr 2021 21:29:59 +0200
Subject: [PATCH 363/478] ENH Improve initialization and learning rate in t-SNE
 (#19491)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Tom Dupré la Tour 
---
 doc/modules/manifold.rst                      |  15 +-
 doc/whats_new/v1.0.rst                        |   6 +
 sklearn/manifold/_t_sne.py                    |  73 +++++++--
 sklearn/manifold/tests/test_t_sne.py          | 149 ++++++++++++++++--
 .../tests/test_neighbors_pipeline.py          |   3 +
 sklearn/tests/test_docstring_parameters.py    |   5 +
 sklearn/utils/estimator_checks.py             |   1 +
 7 files changed, 226 insertions(+), 26 deletions(-)

diff --git a/doc/modules/manifold.rst b/doc/modules/manifold.rst
index 8de2a73477c87..72e8c7485df44 100644
--- a/doc/modules/manifold.rst
+++ b/doc/modules/manifold.rst
@@ -555,7 +555,10 @@ between natural clusters in the data. If the factor is too high, the KL
 divergence could increase during this phase. Usually it does not have to be
 tuned. A critical parameter is the learning rate. If it is too low gradient
 descent will get stuck in a bad local minimum. If it is too high the KL
-divergence will increase during optimization. More tips can be found in
+divergence will increase during optimization. A heuristic suggested in
+Belkina et al. (2019) is to set the learning rate to the sample size
+divided by the early exaggeration factor. We implement this heuristic
+via the `learning_rate='auto'` option. More tips can be found in
 Laurens van der Maaten's FAQ (see references). The last parameter, angle,
 is a tradeoff between performance and accuracy. Larger angles imply that we
 can approximate larger regions by a single point, leading to better speed
@@ -614,9 +617,15 @@ the internal structure of the data.
     `_
     van der Maaten, L.J.P.
 
-  * `"Accelerating t-SNE using Tree-Based Algorithms."
+  * `"Accelerating t-SNE using Tree-Based Algorithms"
     `_
-    L.J.P. van der Maaten.  Journal of Machine Learning Research 15(Oct):3221-3245, 2014.
+    van der Maaten, L.J.P.; Journal of Machine Learning Research 15(Oct):3221-3245, 2014.
+    
+  * `"Automated optimized parameters for T-distributed stochastic neighbor
+    embedding improve visualization and analysis of large datasets"
+    `_
+    Belkina, A.C., Ciccolella, C.O., Anno, R., Halpert, R., Spidlen, J.,
+    Snyder-Cappione, J.E., Nature Communications 10, 5415 (2019). 
 
 Tips on practical use
 =====================
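
For concreteness, here is a minimal sketch of the `learning_rate='auto'` heuristic described
above, mirroring the formula this patch adds to `TSNE._fit` (a sketch only, not the patched
implementation itself):

    def auto_learning_rate(n_samples, early_exaggeration=12.0):
        # Belkina et al. (2019) suggest sample size / early exaggeration; the
        # extra division by 4 converts from the bhtsne/FIt-SNE/openTSNE
        # convention to scikit-learn's definition of learning_rate, and, as in
        # the patch, the result is floored at 50.
        return max(n_samples / early_exaggeration / 4, 50.0)

    print(auto_learning_rate(1_000))    # 50.0 -- the floor applies
    print(auto_learning_rate(100_000))  # ~2083.3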
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 3b3884e68e185..0a13d22860d07 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -279,6 +279,12 @@ Changelog
   during affinity matrix computation for :class:`manifold.TSNE`.
   :pr:`19472` by :user:`Dmitry Kobak `.
 
+- |Enhancement| Implement `'auto'` heuristic for the `learning_rate` in
+  :class:`manifold.TSNE`. It will become default in 1.2. The default
+  initialization will change to `pca` in 1.2. PCA initialization will
+  be scaled to have standard deviation 1e-4 in 1.2.
+  :pr:`19491` by :user:`Dmitry Kobak `.
+
 :mod:`sklearn.metrics`
 ......................
 
diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py
index b6072a6e198c4..682fdc095d3bf 100644
--- a/sklearn/manifold/_t_sne.py
+++ b/sklearn/manifold/_t_sne.py
@@ -517,13 +517,19 @@ class TSNE(BaseEstimator):
         optimization, the early exaggeration factor or the learning rate
         might be too high.
 
-    learning_rate : float, default=200.0
+    learning_rate : float or 'auto', default=200.0
         The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If
         the learning rate is too high, the data may look like a 'ball' with any
         point approximately equidistant from its nearest neighbours. If the
         learning rate is too low, most points may look compressed in a dense
         cloud with few outliers. If the cost function gets stuck in a bad local
         minimum increasing the learning rate may help.
+        Note that many other t-SNE implementations (bhtsne, FIt-SNE, openTSNE,
+        etc.) use a definition of learning_rate that is 4 times smaller than
+        ours. So our learning_rate=200 corresponds to learning_rate=800 in
+        those other implementations. The 'auto' option sets the learning_rate
+        to `max(N / early_exaggeration / 4, 50)` where N is the sample size,
+        following [4] and [5]. This will become default in 1.2.
 
     n_iter : int, default=1000
         Maximum number of iterations for the optimization. Should be at
@@ -559,7 +565,8 @@ class TSNE(BaseEstimator):
         Initialization of embedding. Possible options are 'random', 'pca',
         and a numpy array of shape (n_samples, n_components).
         PCA initialization cannot be used with precomputed distances and is
-        usually more globally stable than random initialization.
+        usually more globally stable than random initialization. `init='pca'`
+        will become default in 1.2.
 
     verbose : int, default=0
         Verbosity level.
@@ -631,7 +638,8 @@ class TSNE(BaseEstimator):
     >>> import numpy as np
     >>> from sklearn.manifold import TSNE
     >>> X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
-    >>> X_embedded = TSNE(n_components=2).fit_transform(X)
+    >>> X_embedded = TSNE(n_components=2, learning_rate='auto',
+    ...                   init='random').fit_transform(X)
     >>> X_embedded.shape
     (4, 2)
 
@@ -647,6 +655,14 @@ class TSNE(BaseEstimator):
     [3] L.J.P. van der Maaten. Accelerating t-SNE using Tree-Based Algorithms.
         Journal of Machine Learning Research 15(Oct):3221-3245, 2014.
         https://lvdmaaten.github.io/publications/papers/JMLR_2014.pdf
+
+    [4] Belkina, A. C., Ciccolella, C. O., Anno, R., Halpert, R., Spidlen, J.,
+        & Snyder-Cappione, J. E. (2019). Automated optimized parameters for
+        T-distributed stochastic neighbor embedding improve visualization
+        and analysis of large datasets. Nature Communications, 10(1), 1-12.
+
+    [5] Kobak, D., & Berens, P. (2019). The art of using t-SNE for single-cell
+        transcriptomics. Nature Communications, 10(1), 1-14.
     """
     # Control the number of exploration iterations with early_exaggeration on
     _EXPLORATION_N_ITER = 250
@@ -656,9 +672,9 @@ class TSNE(BaseEstimator):
 
     @_deprecate_positional_args
     def __init__(self, n_components=2, *, perplexity=30.0,
-                 early_exaggeration=12.0, learning_rate=200.0, n_iter=1000,
+                 early_exaggeration=12.0, learning_rate="warn", n_iter=1000,
                  n_iter_without_progress=300, min_grad_norm=1e-7,
-                 metric="euclidean", init="random", verbose=0,
+                 metric="euclidean", init="warn", verbose=0,
                  random_state=None, method='barnes_hut', angle=0.5,
                  n_jobs=None, square_distances='legacy'):
         self.n_components = n_components
@@ -681,12 +697,39 @@ def __init__(self, n_components=2, *, perplexity=30.0,
     def _fit(self, X, skip_num_points=0):
         """Private function to fit the model using X as training data."""
 
+        if isinstance(self.init, str) and self.init == 'warn':
+            # See issue #18018
+            warnings.warn("The default initialization in TSNE will change "
+                          "from 'random' to 'pca' in 1.2.", FutureWarning)
+            self._init = 'random'
+        else:
+            self._init = self.init
+        if self.learning_rate == 'warn':
+            # See issue #18018
+            warnings.warn("The default learning rate in TSNE will change "
+                          "from 200.0 to 'auto' in 1.2.", FutureWarning)
+            self._learning_rate = 200.0
+        else:
+            self._learning_rate = self.learning_rate
+
+        if isinstance(self._init, str) and self._init == 'pca' and issparse(X):
+            raise TypeError("PCA initialization is currently not suported "
+                            "with the sparse input matrix. Use "
+                            "init=\"random\" instead.")
         if self.method not in ['barnes_hut', 'exact']:
             raise ValueError("'method' must be 'barnes_hut' or 'exact'")
         if self.angle < 0.0 or self.angle > 1.0:
             raise ValueError("'angle' must be between 0.0 - 1.0")
         if self.square_distances not in [True, 'legacy']:
             raise ValueError("'square_distances' must be True or 'legacy'.")
+        if self._learning_rate == 'auto':
+            # See issue #18018
+            self._learning_rate = X.shape[0] / self.early_exaggeration / 4
+            self._learning_rate = np.maximum(self._learning_rate, 50)
+        else:
+            if not (self._learning_rate > 0):
+                raise ValueError("'learning_rate' must be a positive number "
+                                 "or 'auto'.")
         if self.metric != "euclidean" and self.square_distances is not True:
             warnings.warn(
                 "'square_distances' has been introduced in 0.24 to help phase "
@@ -706,7 +749,7 @@ def _fit(self, X, skip_num_points=0):
             X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'],
                                     dtype=[np.float32, np.float64])
         if self.metric == "precomputed":
-            if isinstance(self.init, str) and self.init == 'pca':
+            if isinstance(self._init, str) and self._init == 'pca':
                 raise ValueError("The parameter init=\"pca\" cannot be "
                                  "used with metric=\"precomputed\".")
             if X.shape[0] != X.shape[1]:
@@ -817,13 +860,21 @@ def _fit(self, X, skip_num_points=0):
             P = _joint_probabilities_nn(distances_nn, self.perplexity,
                                         self.verbose)
 
-        if isinstance(self.init, np.ndarray):
-            X_embedded = self.init
-        elif self.init == 'pca':
+        if isinstance(self._init, np.ndarray):
+            X_embedded = self._init
+        elif self._init == 'pca':
             pca = PCA(n_components=self.n_components, svd_solver='randomized',
                       random_state=random_state)
             X_embedded = pca.fit_transform(X).astype(np.float32, copy=False)
-        elif self.init == 'random':
+            # TODO: Update in 1.2
+            # PCA is rescaled so that PC1 has standard deviation 1e-4 which is
+            # the default value for random initialization. See issue #18018.
+            warnings.warn("The PCA initialization in TSNE will change to "
+                          "have the standard deviation of PC1 equal to 1e-4 "
+                          "in 1.2. This will ensure better convergence.",
+                          FutureWarning)
+            # X_embedded = X_embedded / np.std(X_embedded[:, 0]) * 1e-4
+        elif self._init == 'random':
             # The embedding is initialized with iid samples from Gaussians with
             # standard deviation 1e-4.
             X_embedded = 1e-4 * random_state.randn(
@@ -857,7 +908,7 @@ def _tsne(self, P, degrees_of_freedom, n_samples, X_embedded,
             "it": 0,
             "n_iter_check": self._N_ITER_CHECK,
             "min_grad_norm": self.min_grad_norm,
-            "learning_rate": self.learning_rate,
+            "learning_rate": self._learning_rate,
             "verbose": self.verbose,
             "kwargs": dict(skip_num_points=skip_num_points),
             "args": [P, degrees_of_freedom, n_samples, self.n_components],
diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py
index bd0cc3df339bf..7f0840fb7b82f 100644
--- a/sklearn/manifold/tests/test_t_sne.py
+++ b/sklearn/manifold/tests/test_t_sne.py
@@ -247,6 +247,8 @@ def test_trustworthiness():
     assert_almost_equal(trustworthiness(X, X_embedded, n_neighbors=1), 0.2)
 
 
+# TODO: Remove filterwarning in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 @pytest.mark.parametrize("method", ['exact', 'barnes_hut'])
 @pytest.mark.parametrize("init", ('random', 'pca'))
 def test_preserve_trustworthiness_approximately(method, init):
@@ -261,6 +263,8 @@ def test_preserve_trustworthiness_approximately(method, init):
     assert t > 0.85
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_optimization_minimizes_kl_divergence():
     """t-SNE should give a lower KL divergence with more iterations."""
     random_state = check_random_state(0)
@@ -275,6 +279,8 @@ def test_optimization_minimizes_kl_divergence():
     assert kl_divergences[2] <= kl_divergences[1]
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 @pytest.mark.parametrize('method', ['exact', 'barnes_hut'])
 def test_fit_csr_matrix(method):
     # X can be a sparse matrix.
@@ -289,6 +295,8 @@ def test_fit_csr_matrix(method):
                     1.0, rtol=1.1e-1)
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_preserve_trustworthiness_approximately_with_precomputed_distances():
     # Nearest neighbors should be preserved approximately.
     random_state = check_random_state(0)
@@ -298,7 +306,7 @@ def test_preserve_trustworthiness_approximately_with_precomputed_distances():
         tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0,
                     early_exaggeration=2.0, metric="precomputed",
                     random_state=i, verbose=0, n_iter=500,
-                    square_distances=True)
+                    square_distances=True, init='random')
         X_embedded = tsne.fit_transform(D)
         t = trustworthiness(D, X_embedded, n_neighbors=1, metric="precomputed")
         assert t > .95
@@ -314,6 +322,8 @@ def test_trustworthiness_not_euclidean_metric():
                             metric='precomputed'))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_early_exaggeration_too_small():
     # Early exaggeration factor must be >= 1.
     tsne = TSNE(early_exaggeration=0.99)
@@ -321,6 +331,8 @@ def test_early_exaggeration_too_small():
         tsne.fit_transform(np.array([[0.0], [0.0]]))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_too_few_iterations():
     # Number of gradient descent iterations must be at least 200.
     tsne = TSNE(n_iter=199)
@@ -328,6 +340,8 @@ def test_too_few_iterations():
         tsne.fit_transform(np.array([[0.0], [0.0]]))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 @pytest.mark.parametrize('method, retype', [
     ('exact', np.asarray),
     ('barnes_hut', np.asarray),
@@ -339,27 +353,35 @@ def test_too_few_iterations():
 ])
 def test_bad_precomputed_distances(method, D, retype, message_regex):
     tsne = TSNE(metric="precomputed", method=method,
-                square_distances=True)
+                square_distances=True, init='random', random_state=42)
     with pytest.raises(ValueError, match=message_regex):
         tsne.fit_transform(retype(D))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_exact_no_precomputed_sparse():
-    tsne = TSNE(metric='precomputed', method='exact', square_distances=True)
+    tsne = TSNE(metric='precomputed', method='exact', square_distances=True,
+                init='random', random_state=42)
     with pytest.raises(TypeError, match='sparse'):
         tsne.fit_transform(sp.csr_matrix([[0, 5], [5, 0]]))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_high_perplexity_precomputed_sparse_distances():
     # Perplexity should be less than 50
     dist = np.array([[1., 0., 0.], [0., 1., 0.], [1., 0., 0.]])
     bad_dist = sp.csr_matrix(dist)
-    tsne = TSNE(metric="precomputed", square_distances=True)
+    tsne = TSNE(metric="precomputed", square_distances=True,
+                init='random', random_state=42)
     msg = "3 neighbors per samples are required, but some samples have only 1"
     with pytest.raises(ValueError, match=msg):
         tsne.fit_transform(bad_dist)
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 @ignore_warnings(category=EfficiencyWarning)
 def test_sparse_precomputed_distance():
     """Make sure that TSNE works identically for sparse and dense matrix"""
@@ -372,7 +394,8 @@ def test_sparse_precomputed_distance():
     assert sp.issparse(D_sparse)
     assert_almost_equal(D_sparse.A, D)
 
-    tsne = TSNE(metric="precomputed", random_state=0, square_distances=True)
+    tsne = TSNE(metric="precomputed", random_state=0, square_distances=True,
+                init='random')
     Xt_dense = tsne.fit_transform(D)
 
     for fmt in ['csr', 'lil']:
@@ -380,6 +403,8 @@ def test_sparse_precomputed_distance():
         assert_almost_equal(Xt_dense, Xt_sparse)
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_non_positive_computed_distances():
     # Computed distance matrices must be positive.
     def metric(x, y):
@@ -392,6 +417,8 @@ def metric(x, y):
         tsne.fit_transform(X)
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_init_not_available():
     # 'init' must be 'pca', 'random', or numpy array.
     tsne = TSNE(init="not available")
@@ -400,6 +427,8 @@ def test_init_not_available():
         tsne.fit_transform(np.array([[0.0], [1.0]]))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_init_ndarray():
     # Initialize TSNE with ndarray and test fit
     tsne = TSNE(init=np.zeros((100, 2)))
@@ -411,10 +440,12 @@ def test_init_ndarray_precomputed():
     # Initialize TSNE with ndarray and metric 'precomputed'
     # Make sure no FutureWarning is thrown from _fit
     tsne = TSNE(init=np.zeros((100, 2)), metric="precomputed",
-                square_distances=True)
+                square_distances=True, learning_rate=50.0)
     tsne.fit(np.zeros((100, 100)))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_distance_not_available():
     # 'metric' must be valid.
     tsne = TSNE(metric="not available", method='exact', square_distances=True)
@@ -427,6 +458,8 @@ def test_distance_not_available():
         tsne.fit_transform(np.array([[0.0], [1.0]]))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_method_not_available():
     # 'method' must be 'barnes_hut' or 'exact'
     tsne = TSNE(method='not available')
@@ -434,6 +467,8 @@ def test_method_not_available():
         tsne.fit_transform(np.array([[0.0], [1.0]]))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_square_distances_not_available():
     # square_distances must be True or 'legacy'.
     tsne = TSNE(square_distances="not_available")
@@ -441,6 +476,8 @@ def test_square_distances_not_available():
         tsne.fit_transform(np.array([[0.0], [1.0]]))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_angle_out_of_range_checks():
     # check the angle parameter range
     for angle in [-1, -1e-6, 1 + 1e-6, 2]:
@@ -450,8 +487,10 @@ def test_angle_out_of_range_checks():
             tsne.fit_transform(np.array([[0.0], [1.0]]))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_pca_initialization_not_compatible_with_precomputed_kernel():
-    # Precomputed distance matrices must be square matrices.
+    # Precomputed distance matrices cannot use PCA initialization.
     tsne = TSNE(metric="precomputed", init="pca", square_distances=True)
     with pytest.raises(ValueError, match="The parameter init=\"pca\" cannot"
                                          " be used with"
@@ -459,6 +498,15 @@ def test_pca_initialization_not_compatible_with_precomputed_kernel():
         tsne.fit_transform(np.array([[0.0], [1.0]]))
 
 
+def test_pca_initialization_not_compatible_with_sparse_input():
+    # Sparse input matrices cannot use PCA initialization.
+    tsne = TSNE(init="pca", learning_rate=100.0)
+    with pytest.raises(TypeError, match="PCA initialization.*"):
+        tsne.fit_transform(sp.csr_matrix([[0, 5], [5, 0]]))
+
+
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_n_components_range():
     # barnes_hut method should only be used with n_components <= 3
     tsne = TSNE(n_components=4, method="barnes_hut")
@@ -466,6 +514,8 @@ def test_n_components_range():
         tsne.fit_transform(np.array([[0.0], [1.0]]))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_early_exaggeration_used():
     # check that the ``early_exaggeration`` parameter has an effect
     random_state = check_random_state(0)
@@ -585,6 +635,8 @@ def _run_answer_test(pos_input, pos_output, neighbors, grad_output,
     assert_array_almost_equal(grad_bh, grad_output, decimal=4)
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_verbose():
     # Verbose options write to stdout.
     random_state = check_random_state(0)
@@ -607,6 +659,8 @@ def test_verbose():
     assert("early exaggeration" in out)
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_chebyshev_metric():
     # t-SNE should allow metrics that cannot be squared (issue #3526).
     random_state = check_random_state(0)
@@ -615,6 +669,8 @@ def test_chebyshev_metric():
     tsne.fit_transform(X)
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_reduction_to_one_component():
     # t-SNE should allow reduction to one component (issue #4154).
     random_state = check_random_state(0)
@@ -624,6 +680,8 @@ def test_reduction_to_one_component():
     assert(np.all(np.isfinite(X_embedded)))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 @pytest.mark.parametrize('method', ['barnes_hut', 'exact'])
 @pytest.mark.parametrize('dt', [np.float32, np.float64])
 def test_64bit(method, dt):
@@ -642,6 +700,8 @@ def test_64bit(method, dt):
     assert effective_type == np.float32
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 @pytest.mark.parametrize('method', ['barnes_hut', 'exact'])
 def test_kl_divergence_not_nan(method):
     # Ensure kl_divergence_ is computed at last iteration
@@ -713,6 +773,8 @@ def test_n_iter_without_progress():
                 "last -1 episodes. Finished." in out)
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_min_grad_norm():
     # Make sure that the parameter min_grad_norm is used correctly
     random_state = check_random_state(0)
@@ -756,6 +818,8 @@ def test_min_grad_norm():
     assert n_smaller_gradient_norms <= 1
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_accessible_kl_divergence():
     # Ensures that the accessible kl_divergence matches the computed value
     random_state = check_random_state(0)
@@ -784,6 +848,8 @@ def test_accessible_kl_divergence():
     assert_almost_equal(tsne.kl_divergence_, float(error), decimal=5)
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 @pytest.mark.parametrize('method', ['barnes_hut', 'exact'])
 def test_uniform_grid(method):
     """Make sure that TSNE can approximately recover a uniform 2D grid
@@ -885,6 +951,8 @@ def test_gradient_bh_multithread_match_sequential():
         assert_allclose(grad_multithread, grad_multithread)
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_tsne_with_different_distance_metrics():
     """Make sure that TSNE works for different distance metrics"""
     random_state = check_random_state(0)
@@ -896,10 +964,11 @@ def test_tsne_with_different_distance_metrics():
     for metric, dist_func in zip(metrics, dist_funcs):
         X_transformed_tsne = TSNE(
             metric=metric, n_components=n_components_embedding,
-            random_state=0, n_iter=300, square_distances=True).fit_transform(X)
+            random_state=0, n_iter=300, square_distances=True,
+            init='random').fit_transform(X)
         X_transformed_tsne_precomputed = TSNE(
             metric='precomputed', n_components=n_components_embedding,
-            random_state=0, n_iter=300,
+            random_state=0, n_iter=300, init='random',
             square_distances=True).fit_transform(dist_func(X))
         assert_array_equal(X_transformed_tsne, X_transformed_tsne_precomputed)
 
@@ -926,11 +995,11 @@ def test_tsne_different_square_distances(method, metric, square_distances):
     X_transformed_tsne = TSNE(
         metric=metric, n_components=n_components_embedding,
         square_distances=square_distances, method=method,
-        random_state=0).fit_transform(X)
+        random_state=0, init='random').fit_transform(X)
     X_transformed_tsne_precomputed = TSNE(
         metric='precomputed', n_components=n_components_embedding,
         square_distances=square_distances, method=method,
-        random_state=0).fit_transform(X_precomputed)
+        random_state=0, init='random').fit_transform(X_precomputed)
 
     assert_allclose(X_transformed_tsne, X_transformed_tsne_precomputed)
 
@@ -943,7 +1012,8 @@ def test_tsne_square_distances_futurewarning(metric, square_distances):
     random_state = check_random_state(0)
 
     X = random_state.randn(5, 2)
-    tsne = TSNE(metric=metric, square_distances=square_distances)
+    tsne = TSNE(metric=metric, square_distances=square_distances,
+                learning_rate=200.0, init="random")
 
     if metric != 'euclidean' and square_distances is not True:
         with pytest.warns(FutureWarning, match="'square_distances'.*"):
@@ -954,6 +1024,61 @@ def test_tsne_square_distances_futurewarning(metric, square_distances):
         assert not record
 
 
+# TODO: Remove in 1.2
+@pytest.mark.parametrize('init', [None, 'random', 'pca'])
+def test_tsne_init_futurewarning(init):
+    """Make sure that a FutureWarning is only raised when the
+    init is not specified or is 'pca'."""
+    random_state = check_random_state(0)
+
+    X = random_state.randn(5, 2)
+    kwargs = dict(learning_rate=200.0, init=init)
+    tsne = TSNE(**{k: v for k, v in kwargs.items() if v is not None})
+
+    if init is None:
+        with pytest.warns(FutureWarning, match="The default initialization.*"):
+            tsne.fit_transform(X)
+    elif init == 'pca':
+        with pytest.warns(FutureWarning, match="The PCA initialization.*"):
+            tsne.fit_transform(X)
+    else:
+        with pytest.warns(None) as record:
+            tsne.fit_transform(X)
+        assert not record
+
+
+# TODO: Remove in 1.2
+@pytest.mark.parametrize('learning_rate', [None, 200.0])
+def test_tsne_learning_rate_futurewarning(learning_rate):
+    """Make sure that a FutureWarning is only raised when the learning rate
+    is not specified"""
+    random_state = check_random_state(0)
+
+    X = random_state.randn(5, 2)
+    kwargs = dict(learning_rate=learning_rate, init='random')
+    tsne = TSNE(**{k: v for k, v in kwargs.items() if v is not None})
+
+    if learning_rate is None:
+        with pytest.warns(FutureWarning, match="The default learning rate.*"):
+            tsne.fit_transform(X)
+    else:
+        with pytest.warns(None) as record:
+            tsne.fit_transform(X)
+        assert not record
+
+
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
+def test_tsne_negative_learning_rate():
+    """Make sure that negative learning rate results in a ValueError"""
+    random_state = check_random_state(0)
+    X = random_state.randn(5, 2)
+    with pytest.raises(ValueError, match="'learning_rate' must be.*"):
+        TSNE(learning_rate=-50.0).fit_transform(X)
+
+
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 @pytest.mark.parametrize('method', ['exact', 'barnes_hut'])
 def test_tsne_n_jobs(method):
     """Make sure that the n_jobs parameter doesn't impact the output"""
diff --git a/sklearn/neighbors/tests/test_neighbors_pipeline.py b/sklearn/neighbors/tests/test_neighbors_pipeline.py
index f8f9472bdac48..5b5f294d2d243 100644
--- a/sklearn/neighbors/tests/test_neighbors_pipeline.py
+++ b/sklearn/neighbors/tests/test_neighbors_pipeline.py
@@ -6,6 +6,7 @@
 """
 
 import numpy as np
+import pytest
 
 from sklearn.utils._testing import assert_array_almost_equal
 from sklearn.cluster.tests.common import generate_clustered_data
@@ -111,6 +112,8 @@ def test_isomap():
     assert_array_almost_equal(Xt_chain, Xt_compact)
 
 
+# TODO: Remove filterwarning in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_tsne():
     # Test chaining KNeighborsTransformer and TSNE
     n_iter = 250
diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py
index ee2fe055a4b43..719df2f4a0f77 100644
--- a/sklearn/tests/test_docstring_parameters.py
+++ b/sklearn/tests/test_docstring_parameters.py
@@ -253,6 +253,11 @@ def test_fit_docstring_attributes(name, Estimator):
     if Estimator.__name__ == 'NMF':
         est.init = 'nndsvda'
 
+    # FIXME: TO BE REMOVED for 1.2 (avoid FutureWarning)
+    if Estimator.__name__ == 'TSNE':
+        est.learning_rate = 200.0
+        est.init = 'random'
+
     X, y = make_classification(n_samples=20, n_features=3,
                                n_redundant=0, n_classes=2,
                                random_state=2)
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 71f5b3b42de42..f0c0383a7bfe8 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -3116,6 +3116,7 @@ def check_requires_y_none(name, estimator_orig):
             warnings.warn(warning_msg, FutureWarning)
 
 
+@ignore_warnings(category=FutureWarning)
 def check_n_features_in_after_fitting(name, estimator_orig):
     # Make sure that n_features_in are checked after fitting
     tags = _safe_tags(estimator_orig)
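
As a usage sketch for the deprecation cycle introduced in this patch (assuming the patched
estimator): passing `learning_rate` and `init` explicitly opts in to the new heuristic and
avoids the FutureWarnings raised for the changing defaults.

    import numpy as np
    from sklearn.manifold import TSNE

    X = np.random.RandomState(0).randn(100, 5)

    # learning_rate='auto' uses max(n_samples / early_exaggeration / 4, 50);
    # explicit init/learning_rate silence the FutureWarnings added here.
    X_embedded = TSNE(n_components=2, learning_rate="auto", init="random",
                      random_state=0).fit_transform(X)
    print(X_embedded.shape)  # (100, 2)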

From e4bb9fa86b0df873ad750b6d59090843d9d23d50 Mon Sep 17 00:00:00 2001
From: Ray Bell 
Date: Mon, 26 Apr 2021 16:04:08 -0400
Subject: [PATCH 364/478] DOC: add import in binary tree class examples
 (#19991)

Co-authored-by: Ray Bell 
---
 sklearn/neighbors/_binary_tree.pxi | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi
index de85ec49166ec..3adfa1b31006a 100755
--- a/sklearn/neighbors/_binary_tree.pxi
+++ b/sklearn/neighbors/_binary_tree.pxi
@@ -246,6 +246,7 @@ Examples
 Query for k-nearest neighbors
 
     >>> import numpy as np
+    >>> from sklearn.neighbors import {BinaryTree}
     >>> rng = np.random.RandomState(0)
     >>> X = rng.random_sample((10, 3))  # 10 points in 3 dimensions
     >>> tree = {BinaryTree}(X, leaf_size=2)              # doctest: +SKIP
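
For reference, a concrete, runnable version of the templated example above, using `KDTree`
(one of the classes generated from this `.pxi`):

    import numpy as np
    from sklearn.neighbors import KDTree

    rng = np.random.RandomState(0)
    X = rng.random_sample((10, 3))      # 10 points in 3 dimensions
    tree = KDTree(X, leaf_size=2)
    dist, ind = tree.query(X[:1], k=3)  # 3 nearest neighbours of the first point
    print(ind)                          # neighbour indices (includes the query point)
    print(dist)                         # corresponding distances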

From eecde00c7a706546271ff40d7d492b5f27046d2b Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre 
Date: Tue, 27 Apr 2021 11:38:53 +0200
Subject: [PATCH 365/478] FIX support multiple str/single category with dense
 DictVectorizer (#19982)

---
 doc/whats_new/v0.24.rst                       |  7 +++
 .../feature_extraction/_dict_vectorizer.py    | 21 +--------
 .../tests/test_dict_vectorizer.py             | 45 +++++++++++++++++++
 3 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index 79f6ecb15c3d0..bfcd134bdd2bd 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -42,6 +42,13 @@ Changelog
   with `sample_weight` parameter and `least_absolute_deviation` loss function.
   :pr:`19407` by :user:`Vadim Ushtanit `.
 
+:mod:`sklearn.feature_extraction`
+.................................
+
+- |Fix| Fixed a bug to support multiple strings for a category when
+  `sparse=False` in :class:`feature_extraction.DictVectorizer`.
+  :pr:`19982` by :user:`Guillaume Lemaitre `.
+
 :mod:`sklearn.gaussian_process`
 ...............................
 
diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py
index e0516407c205a..44b50dc45a103 100644
--- a/sklearn/feature_extraction/_dict_vectorizer.py
+++ b/sklearn/feature_extraction/_dict_vectorizer.py
@@ -347,26 +347,7 @@ def transform(self, X):
         Xa : {array, sparse matrix}
             Feature vectors; always 2-d.
         """
-        if self.sparse:
-            return self._transform(X, fitting=False)
-
-        else:
-            dtype = self.dtype
-            vocab = self.vocabulary_
-            X = _tosequence(X)
-            Xa = np.zeros((len(X), len(vocab)), dtype=dtype)
-
-            for i, x in enumerate(X):
-                for f, v in x.items():
-                    if isinstance(v, str):
-                        f = "%s%s%s" % (f, self.separator, v)
-                        v = 1
-                    try:
-                        Xa[i, vocab[f]] = dtype(v)
-                    except KeyError:
-                        pass
-
-            return Xa
+        return self._transform(X, fitting=False)
 
     def get_feature_names(self):
         """Returns a list of feature names, ordered by their indices.
diff --git a/sklearn/feature_extraction/tests/test_dict_vectorizer.py b/sklearn/feature_extraction/tests/test_dict_vectorizer.py
index 519201b580598..9984bdc5aa3da 100644
--- a/sklearn/feature_extraction/tests/test_dict_vectorizer.py
+++ b/sklearn/feature_extraction/tests/test_dict_vectorizer.py
@@ -6,6 +6,7 @@
 import numpy as np
 import scipy.sparse as sp
 from numpy.testing import assert_array_equal
+from numpy.testing import assert_allclose
 
 import pytest
 
@@ -165,3 +166,47 @@ def test_n_features_in():
     d = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
     dv.fit(d)
     assert not hasattr(dv, 'n_features_in_')
+
+
+def test_dictvectorizer_dense_sparse_equivalence():
+    """Check the equivalence between between sparse and dense DictVectorizer.
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/19978
+    """
+    movie_entry_fit = [
+        {"category": ["thriller", "drama"], "year": 2003},
+        {"category": ["animation", "family"], "year": 2011},
+        {"year": 1974},
+    ]
+    movie_entry_transform = [{"category": ["thriller"], "unseen_feature": "3"}]
+    dense_vectorizer = DictVectorizer(sparse=False)
+    sparse_vectorizer = DictVectorizer(sparse=True)
+
+    dense_vector_fit = dense_vectorizer.fit_transform(movie_entry_fit)
+    sparse_vector_fit = sparse_vectorizer.fit_transform(movie_entry_fit)
+
+    assert not sp.issparse(dense_vector_fit)
+    assert sp.issparse(sparse_vector_fit)
+
+    assert_allclose(dense_vector_fit, sparse_vector_fit.toarray())
+
+    dense_vector_transform = dense_vectorizer.transform(movie_entry_transform)
+    sparse_vector_transform = sparse_vectorizer.transform(
+        movie_entry_transform
+    )
+
+    assert not sp.issparse(dense_vector_transform)
+    assert sp.issparse(sparse_vector_transform)
+
+    assert_allclose(dense_vector_transform, sparse_vector_transform.toarray())
+
+    dense_inverse_transform = dense_vectorizer.inverse_transform(
+        dense_vector_transform
+    )
+    sparse_inverse_transform = sparse_vectorizer.inverse_transform(
+        sparse_vector_transform
+    )
+
+    expected_inverse = [{"category=thriller": 1.0}]
+    assert dense_inverse_transform == expected_inverse
+    assert sparse_inverse_transform == expected_inverse
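
A condensed sketch of the behaviour this fix restores, mirroring the non-regression test
above: with `sparse=False`, `transform` now delegates to the shared `_transform` and agrees
with the sparse output for list-of-string categories.

    from sklearn.feature_extraction import DictVectorizer

    fit_entries = [{"category": ["thriller", "drama"], "year": 2003},
                   {"category": ["animation", "family"], "year": 2011}]
    new_entry = [{"category": ["thriller"], "unseen_feature": "3"}]

    dense_vec = DictVectorizer(sparse=False)
    sparse_vec = DictVectorizer(sparse=True)
    dense_vec.fit_transform(fit_entries)
    sparse_vec.fit_transform(fit_entries)

    # With the fix, the dense transform matches the sparse code path.
    assert (dense_vec.transform(new_entry)
            == sparse_vec.transform(new_entry).toarray()).all()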

From 9694d5a4b517420f9a2953c67b8c06100b256efd Mon Sep 17 00:00:00 2001
From: Fatos Morina 
Date: Tue, 27 Apr 2021 12:37:56 +0200
Subject: [PATCH 366/478] Remove the unused import of csc_matrix (#19989)

---
 sklearn/tree/_tree.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx
index f4484ab1a3314..afd6aa8d6cf51 100644
--- a/sklearn/tree/_tree.pyx
+++ b/sklearn/tree/_tree.pyx
@@ -29,7 +29,6 @@ cimport numpy as np
 np.import_array()
 
 from scipy.sparse import issparse
-from scipy.sparse import csc_matrix
 from scipy.sparse import csr_matrix
 
 from ._utils cimport Stack

From 674e7dff128700de7b1e9588b18a78ad6f38f12a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
 <34657725+jeremiedbb@users.noreply.github.com>
Date: Tue, 27 Apr 2021 12:47:28 +0200
Subject: [PATCH 367/478] DOC retroactive changed model entry (#19992)

---
 doc/whats_new/v0.24.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index bfcd134bdd2bd..72a96aa74f470 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -209,6 +209,9 @@ random sampling procedures.
 
 - |Fix| :class:`linear_model.Perceptron` when `penalty='elasticnet'`.
 
+- |Fix| Change in the random sampling procedures for the center initialization
+  of :class:`cluster.KMeans`.
+
 Details are listed in the changelog below.
 
 (While we are trying to better inform users by providing this information, we

From bf0886bae0ccbc8c5d285b6e2affe7e40474f970 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sylvain=20Mari=C3=A9?=
 
Date: Tue, 27 Apr 2021 16:06:10 +0200
Subject: [PATCH 368/478] [MRG after #12145] Add "Randomized SVD" solver option
 to KernelPCA for faster partial decompositions, like in PCA (#12069)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sylvain MARIE 
Co-authored-by: Thomas J Fan 
Co-authored-by: Nicolas Hug 
Co-authored-by: Joel Nothman 
Co-authored-by: Olivier Grisel 
Co-authored-by: Olivier Grisel 
Co-authored-by: Tom Dupré la Tour 
---
 ...kernel_pca_solvers_time_vs_n_components.py | 148 ++++++++++
 ...ch_kernel_pca_solvers_time_vs_n_samples.py | 153 ++++++++++
 doc/modules/decomposition.rst                 | 122 ++++++--
 doc/whats_new/v1.0.rst                        |  13 +-
 sklearn/decomposition/_kernel_pca.py          |  86 +++++-
 .../decomposition/tests/test_kernel_pca.py    | 276 +++++++++++++++---
 sklearn/utils/extmath.py                      | 172 ++++++++++-
 sklearn/utils/tests/test_extmath.py           | 129 +++++++-
 8 files changed, 1017 insertions(+), 82 deletions(-)
 create mode 100644 benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py
 create mode 100644 benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py

diff --git a/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py b/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py
new file mode 100644
index 0000000000000..d871967ad1327
--- /dev/null
+++ b/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py
@@ -0,0 +1,148 @@
+"""
+=============================================================
+Kernel PCA Solvers comparison benchmark: time vs n_components
+=============================================================
+
+This benchmark shows that the approximate solvers provided in Kernel PCA can
+help significantly improve its execution speed when an approximate solution
+(small `n_components`) is acceptable. In many real-world datasets, a few
+hundred principal components are indeed sufficient to capture the
+underlying distribution.
+
+Description:
+------------
+A fixed number of training (default: 2000) and test (default: 1000) samples
+with 2 features is generated using the `make_circles` helper method.
+
+KernelPCA models are trained on the training set with an increasing number of
+principal components, between 1 and `max_n_compo` (default: 1999), with
+`n_compo_grid_size` positions (default: 10). For each value of `n_components`
+to try, KernelPCA models are trained for the various possible `eigen_solver`
+values. The execution times are displayed in a plot at the end of the
+experiment.
+
+What you can observe:
+---------------------
+When the number of requested principal components is small, the dense solver
+takes more time to complete, while the randomized method returns similar
+results with shorter execution times.
+
+Going further:
+--------------
+You can adjust `max_n_compo` and `n_compo_grid_size` if you wish to explore a
+different range of values for `n_components`.
+
+You can also set `arpack_all=True` to activate the arpack solver for large
+numbers of components (this takes more time).
+"""
+# Authors: Sylvain MARIE, Schneider Electric
+
+import time
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+from numpy.testing import assert_array_almost_equal
+from sklearn.decomposition import KernelPCA
+from sklearn.datasets import make_circles
+
+
+print(__doc__)
+
+
+# 1- Design the Experiment
+# ------------------------
+n_train, n_test = 2000, 1000            # the sample sizes to use
+max_n_compo = 1999                      # max n_components to try
+n_compo_grid_size = 10                  # nb of positions in the grid to try
+# generate the grid
+n_compo_range = [np.round(np.exp((x / (n_compo_grid_size - 1))
+                                 * np.log(max_n_compo)))
+                 for x in range(0, n_compo_grid_size)]
+
+n_iter = 3          # the number of times each experiment will be repeated
+arpack_all = False  # set to True if you wish to run arpack for all n_compo
+
+
+# 2- Generate random data
+# -----------------------
+n_features = 2
+X, y = make_circles(n_samples=(n_train + n_test), factor=.3, noise=.05,
+                    random_state=0)
+X_train, X_test = X[:n_train, :], X[n_train:, :]
+
+
+# 3- Benchmark
+# ------------
+# init
+ref_time = np.empty((len(n_compo_range), n_iter)) * np.nan
+a_time = np.empty((len(n_compo_range), n_iter)) * np.nan
+r_time = np.empty((len(n_compo_range), n_iter)) * np.nan
+# loop
+for j, n_components in enumerate(n_compo_range):
+
+    n_components = int(n_components)
+    print("Performing kPCA with n_components = %i" % n_components)
+
+    # A- reference (dense)
+    print("  - dense solver")
+    for i in range(n_iter):
+        start_time = time.perf_counter()
+        ref_pred = KernelPCA(n_components, eigen_solver="dense") \
+            .fit(X_train).transform(X_test)
+        ref_time[j, i] = time.perf_counter() - start_time
+
+    # B- arpack (for small number of components only, too slow otherwise)
+    if arpack_all or n_components < 100:
+        print("  - arpack solver")
+        for i in range(n_iter):
+            start_time = time.perf_counter()
+            a_pred = KernelPCA(n_components, eigen_solver="arpack") \
+                .fit(X_train).transform(X_test)
+            a_time[j, i] = time.perf_counter() - start_time
+            # check that the result is still correct despite the approx
+            assert_array_almost_equal(np.abs(a_pred), np.abs(ref_pred))
+
+    # C- randomized
+    print("  - randomized solver")
+    for i in range(n_iter):
+        start_time = time.perf_counter()
+        r_pred = KernelPCA(n_components, eigen_solver="randomized") \
+            .fit(X_train).transform(X_test)
+        r_time[j, i] = time.perf_counter() - start_time
+        # check that the result is still correct despite the approximation
+        assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred))
+
+# Compute statistics for the 3 methods
+avg_ref_time = ref_time.mean(axis=1)
+std_ref_time = ref_time.std(axis=1)
+avg_a_time = a_time.mean(axis=1)
+std_a_time = a_time.std(axis=1)
+avg_r_time = r_time.mean(axis=1)
+std_r_time = r_time.std(axis=1)
+
+
+# 4- Plots
+# --------
+fig, ax = plt.subplots(figsize=(12, 8))
+
+# Display 1 plot with error bars per method
+ax.errorbar(n_compo_range, avg_ref_time, yerr=std_ref_time,
+            marker='x', linestyle='', color='r', label='full')
+ax.errorbar(n_compo_range, avg_a_time, yerr=std_a_time, marker='x',
+            linestyle='', color='g', label='arpack')
+ax.errorbar(n_compo_range, avg_r_time, yerr=std_r_time, marker='x',
+            linestyle='', color='b', label='randomized')
+ax.legend(loc='upper left')
+
+# customize axes
+ax.set_xscale('log')
+ax.set_xlim(1, max(n_compo_range) * 1.1)
+ax.set_ylabel("Execution time (s)")
+ax.set_xlabel("n_components")
+
+ax.set_title("kPCA Execution time comparison on %i samples with %i "
+             "features, according to the choice of `eigen_solver`"
+             "" % (n_train, n_features))
+
+plt.show()
diff --git a/benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py b/benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py
new file mode 100644
index 0000000000000..d238802a68d64
--- /dev/null
+++ b/benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py
@@ -0,0 +1,153 @@
+"""
+==========================================================
+Kernel PCA Solvers comparison benchmark: time vs n_samples
+==========================================================
+
+This benchmark shows that the approximate solvers provided in Kernel PCA can
+help significantly improve its execution speed when an approximate solution
+(small `n_components`) is acceptable. In many real-world datasets the number of
+samples is very large, but a few hundred principal components are
+sufficient to capture the underlying distribution.
+
+Description:
+------------
+An increasing number of examples is used to train a KernelPCA, between
+`min_n_samples` (default: 101) and `max_n_samples` (default: 4000) with
+`n_samples_grid_size` positions (default: 4). Samples have 2 features, and are
+generated using `make_circles`. For each training sample size, KernelPCA models
+are trained for the various possible `eigen_solver` values. All of them are
+trained to obtain `n_components` principal components (default: 100). The
+execution times are displayed in a plot at the end of the experiment.
+
+What you can observe:
+---------------------
+When the number of samples provided gets large, the dense solver takes a lot
+of time to complete, while the randomized method returns similar results in
+much shorter execution times.
+
+Going further:
+--------------
+You can increase `max_n_samples` and `n_samples_grid_size` if you wish to
+explore a wider range of values for `n_samples`.
+
+You can also set `include_arpack=True` to add this other solver in the
+experiments (much slower).
+
+Finally you can have a look at the second example of this series, "Kernel PCA
+Solvers comparison benchmark: time vs n_components", where this time the number
+of examples is fixed, and the desired number of components varies.
+"""
+# Author: Sylvain MARIE, Schneider Electric
+
+import time
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+from numpy.testing import assert_array_almost_equal
+from sklearn.decomposition import KernelPCA
+from sklearn.datasets import make_circles
+
+
+print(__doc__)
+
+
+# 1- Design the Experiment
+# ------------------------
+min_n_samples, max_n_samples = 101, 4000  # min and max n_samples to try
+n_samples_grid_size = 4                   # nb of positions in the grid to try
+# generate the grid
+n_samples_range = [min_n_samples + np.floor((x / (n_samples_grid_size - 1))
+                                            * (max_n_samples - min_n_samples))
+                   for x in range(0, n_samples_grid_size)]
+
+n_components = 100      # the number of principal components we want to use
+n_iter = 3              # the number of times each experiment will be repeated
+include_arpack = False  # set this to True to include arpack solver (slower)
+
+
+# 2- Generate random data
+# -----------------------
+n_features = 2
+X, y = make_circles(n_samples=max_n_samples, factor=.3, noise=.05,
+                    random_state=0)
+
+
+# 3- Benchmark
+# ------------
+# init
+ref_time = np.empty((len(n_samples_range), n_iter)) * np.nan
+a_time = np.empty((len(n_samples_range), n_iter)) * np.nan
+r_time = np.empty((len(n_samples_range), n_iter)) * np.nan
+
+# loop
+for j, n_samples in enumerate(n_samples_range):
+
+    n_samples = int(n_samples)
+    print("Performing kPCA with n_samples = %i" % n_samples)
+
+    X_train = X[:n_samples, :]
+    X_test = X_train
+
+    # A- reference (dense)
+    print("  - dense")
+    for i in range(n_iter):
+        start_time = time.perf_counter()
+        ref_pred = KernelPCA(n_components, eigen_solver="dense") \
+            .fit(X_train).transform(X_test)
+        ref_time[j, i] = time.perf_counter() - start_time
+
+    # B- arpack
+    if include_arpack:
+        print("  - arpack")
+        for i in range(n_iter):
+            start_time = time.perf_counter()
+            a_pred = KernelPCA(n_components, eigen_solver="arpack") \
+                .fit(X_train).transform(X_test)
+            a_time[j, i] = time.perf_counter() - start_time
+            # check that the result is still correct despite the approx
+            assert_array_almost_equal(np.abs(a_pred), np.abs(ref_pred))
+
+    # C- randomized
+    print("  - randomized")
+    for i in range(n_iter):
+        start_time = time.perf_counter()
+        r_pred = KernelPCA(n_components, eigen_solver="randomized") \
+            .fit(X_train).transform(X_test)
+        r_time[j, i] = time.perf_counter() - start_time
+        # check that the result is still correct despite the approximation
+        assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred))
+
+# Compute statistics for the 3 methods
+avg_ref_time = ref_time.mean(axis=1)
+std_ref_time = ref_time.std(axis=1)
+avg_a_time = a_time.mean(axis=1)
+std_a_time = a_time.std(axis=1)
+avg_r_time = r_time.mean(axis=1)
+std_r_time = r_time.std(axis=1)
+
+
+# 4- Plots
+# --------
+fig, ax = plt.subplots(figsize=(12, 8))
+
+# Display 1 plot with error bars per method
+ax.errorbar(n_samples_range, avg_ref_time, yerr=std_ref_time,
+            marker='x', linestyle='', color='r', label='full')
+if include_arpack:
+    ax.errorbar(n_samples_range, avg_a_time, yerr=std_a_time, marker='x',
+                linestyle='', color='g', label='arpack')
+ax.errorbar(n_samples_range, avg_r_time, yerr=std_r_time, marker='x',
+            linestyle='', color='b', label='randomized')
+ax.legend(loc='upper left')
+
+# customize axes
+ax.set_xlim(min(n_samples_range) * 0.9, max(n_samples_range) * 1.1)
+ax.set_ylabel("Execution time (s)")
+ax.set_xlabel("n_samples")
+
+ax.set_title("Execution time comparison of kPCA with %i components on samples "
+             "with %i features, according to the choice of `eigen_solver`"
+             "" % (n_components, n_features))
+
+plt.show()
diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst
index e971d784c63d6..fd51f60d8bfc6 100644
--- a/doc/modules/decomposition.rst
+++ b/doc/modules/decomposition.rst
@@ -166,32 +166,16 @@ Note: the implementation of ``inverse_transform`` in :class:`PCA` with
 
 .. topic:: References:
 
-    * `"Finding structure with randomness: Stochastic algorithms for
+    * Algorithm 4.3 in
+      `"Finding structure with randomness: Stochastic algorithms for
       constructing approximate matrix decompositions"
       `_
       Halko, et al., 2009
 
-
-.. _kernel_PCA:
-
-Kernel PCA
-----------
-
-:class:`KernelPCA` is an extension of PCA which achieves non-linear
-dimensionality reduction through the use of kernels (see :ref:`metrics`). It
-has many applications including denoising, compression and structured
-prediction (kernel dependency estimation). :class:`KernelPCA` supports both
-``transform`` and ``inverse_transform``.
-
-.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_kernel_pca_001.png
-    :target: ../auto_examples/decomposition/plot_kernel_pca.html
-    :align: center
-    :scale: 75%
-
-.. topic:: Examples:
-
-    * :ref:`sphx_glr_auto_examples_decomposition_plot_kernel_pca.py`
-
+    * `"An implementation of a randomized algorithm for principal component
+      analysis"
+      `_
+      A. Szlam et al. 2014
 
 .. _SparsePCA:
 
@@ -278,6 +262,100 @@ factorization, while larger values shrink many coefficients to zero.
      R. Jenatton, G. Obozinski, F. Bach, 2009
 
 
+.. _kernel_PCA:
+
+Kernel Principal Component Analysis (kPCA)
+==========================================
+
+Exact Kernel PCA
+----------------
+
+:class:`KernelPCA` is an extension of PCA which achieves non-linear
+dimensionality reduction through the use of kernels (see :ref:`metrics`). It
+has many applications including denoising, compression and structured
+prediction (kernel dependency estimation). :class:`KernelPCA` supports both
+``transform`` and ``inverse_transform``.
+
+.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_kernel_pca_001.png
+    :target: ../auto_examples/decomposition/plot_kernel_pca.html
+    :align: center
+    :scale: 75%
+
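+A minimal usage sketch (the toy circles data and the parameter values below
+are illustrative assumptions, not taken from the linked example)::
+
+    >>> from sklearn.datasets import make_circles
+    >>> from sklearn.decomposition import KernelPCA
+    >>> X, _ = make_circles(n_samples=100, factor=.3, noise=.05,
+    ...                     random_state=0)
+    >>> kpca = KernelPCA(n_components=2, kernel="rbf", gamma=10,
+    ...                  fit_inverse_transform=True)
+    >>> X_kpca = kpca.fit_transform(X)
+    >>> X_back = kpca.inverse_transform(X_kpca)
+    >>> X_kpca.shape, X_back.shape
+    ((100, 2), (100, 2))
+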
+.. topic:: Examples:
+
+    * :ref:`sphx_glr_auto_examples_decomposition_plot_kernel_pca.py`
+
+.. topic:: References:
+
+    * Kernel PCA was introduced in "Kernel principal component analysis"
+      Bernhard Schoelkopf, Alexander J. Smola, and Klaus-Robert Mueller. 1999.
+      In Advances in kernel methods, MIT Press, Cambridge, MA, USA 327-352.
+
+
+.. _kPCA_Solvers:
+
+Choice of solver for Kernel PCA
+-------------------------------
+
+While in :class:`PCA` the number of components is bounded by the number of
+features, in :class:`KernelPCA` the number of components is bounded by the
+number of samples. Many real-world datasets have a large number of samples! In
+these cases finding *all* the components with a full kPCA is a waste of
+computation time, as data is mostly described by the first few components
+(e.g. ``n_components<=100``). In other words, the centered Gram matrix that
+is eigendecomposed in the Kernel PCA fitting process has an effective rank that
+is much smaller than its size. This is a situation where approximate
+eigensolvers can provide speedup with very low precision loss.
+
+The optional parameter ``eigen_solver='randomized'`` can be used to
+*significantly* reduce the computation time when the number of requested
+``n_components`` is small compared with the number of samples. It relies on
+randomized decomposition methods to find an approximate solution in a shorter
+time.
+
+The time complexity of the randomized :class:`KernelPCA` is
+:math:`O(n_{\mathrm{samples}}^2 \cdot n_{\mathrm{components}})`
+instead of :math:`O(n_{\mathrm{samples}}^3)` for the exact method
+implemented with ``eigen_solver='dense'``.
+
+The memory footprint of randomized :class:`KernelPCA` is also proportional to
+:math:`2 \cdot n_{\mathrm{samples}} \cdot n_{\mathrm{components}}` instead of
+:math:`n_{\mathrm{samples}}^2` for the exact method.
+
+Note: this technique is the same as in :ref:`RandomizedPCA`.
+
+In addition to the above two solvers, ``eigen_solver='arpack'`` can be used as
+an alternate way to get an approximate decomposition. In practice, this method
+only provides reasonable execution times when the number of components to find
+is extremely small. It is enabled by default when the desired number of
+components is less than 10 (strict) and the number of samples is more than 200
+(strict). See :class:`KernelPCA` for details.
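+
+A minimal sketch of requesting the randomized solver (the random 1000-by-10
+data matrix below is only an illustrative assumption)::
+
+    >>> import numpy as np
+    >>> from sklearn.decomposition import KernelPCA
+    >>> rng = np.random.RandomState(0)
+    >>> X = rng.random_sample((1000, 10))
+    >>> kpca = KernelPCA(n_components=20, kernel="rbf",
+    ...                  eigen_solver="randomized", random_state=0)
+    >>> X_reduced = kpca.fit_transform(X)
+    >>> X_reduced.shape
+    (1000, 20)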
+
+.. topic:: References:
+
+    * *dense* solver:
+      `scipy.linalg.eigh documentation
+      `_
+
+    * *randomized* solver:
+
+        - Algorithm 4.3 in
+          `"Finding structure with randomness: Stochastic algorithms for
+          constructing approximate matrix decompositions"
+          `_
+          Halko, et al., 2009
+
+        - `"An implementation of a randomized algorithm for principal component
+          analysis"
+          `_
+          A. Szlam et al. 2014
+
+    * *arpack* solver:
+      `scipy.sparse.linalg.eigsh documentation
+      `_
+      R. B. Lehoucq, D. C. Sorensen, and C. Yang, 1998
+
+
 .. _LSA:
 
 Truncated singular value decomposition and latent semantic analysis
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 0a13d22860d07..0cd1d6a89d158 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -159,14 +159,17 @@ Changelog
 - |Fix| Fixes incorrect multiple data-conversion warnings when clustering
   boolean data. :pr:`19046` by :user:`Surya Prakash `.
 
-:mod:`sklearn.decomposition`
-............................
-
 - |Fix| Fixed :func:`dict_learning`, used by :class:`DictionaryLearning`, to
   ensure determinism of the output. Achieved by flipping signs of the SVD
   output which is used to initialize the code.
   :pr:`18433` by :user:`Bruno Charron `.
 
+- |Enhancement| added a new approximate solver (randomized SVD, available with
+  `eigen_solver='randomized'`) to :class:`decomposition.KernelPCA`. This
+  significantly accelerates computation when the number of samples is much
+  larger than the desired number of components.
+  :pr:`12069` by :user:`Sylvain Marié `.
+
 - |Fix| Fixed a bug in :class:`MiniBatchDictionaryLearning`,
   :class:`MiniBatchSparsePCA` and :func:`dict_learning_online` where the
   update of the dictionary was incorrect. :pr:`19198` by
@@ -395,8 +398,8 @@ Changelog
   supporting sparse matrix and raise the appropriate error message.
   :pr:`19879` by :user:`Guillaume Lemaitre `.
 
-- |Efficiency| Changed ``algorithm`` argument for :class:`cluster.KMeans` in 
-  :class:`preprocessing.KBinsDiscretizer` from ``auto`` to ``full``. 
+- |Efficiency| Changed ``algorithm`` argument for :class:`cluster.KMeans` in
+  :class:`preprocessing.KBinsDiscretizer` from ``auto`` to ``full``.
   :pr:`19934` by :user:`Gleb Levitskiy `.
 
 :mod:`sklearn.tree`
diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py
index 415ee034c1769..8663193a8383e 100644
--- a/sklearn/decomposition/_kernel_pca.py
+++ b/sklearn/decomposition/_kernel_pca.py
@@ -1,6 +1,7 @@
 """Kernel Principal Components Analysis."""
 
 # Author: Mathieu Blondel 
+#         Sylvain Marie 
 # License: BSD 3 clause
 
 import numpy as np
@@ -8,7 +9,7 @@
 from scipy.sparse.linalg import eigsh
 
 from ..utils._arpack import _init_arpack_v0
-from ..utils.extmath import svd_flip
+from ..utils.extmath import svd_flip, _randomized_eigsh
 from ..utils.validation import check_is_fitted, _check_psd_eigenvalues
 from ..utils.deprecation import deprecated
 from ..exceptions import NotFittedError
@@ -24,6 +25,12 @@ class KernelPCA(TransformerMixin, BaseEstimator):
     Non-linear dimensionality reduction through the use of kernels (see
     :ref:`metrics`).
 
+    It uses the `scipy.linalg.eigh` LAPACK implementation of the full SVD or
+    the `scipy.sparse.linalg.eigsh` ARPACK implementation of the truncated SVD,
+    depending on the shape of the input data and the number of components to
+    extract. It can also use a randomized truncated SVD by the method of
+    Halko et al. 2009, see `eigen_solver`.
+
     Read more in the :ref:`User Guide `.
 
     Parameters
@@ -59,10 +66,37 @@ class KernelPCA(TransformerMixin, BaseEstimator):
         Learn the inverse transform for non-precomputed kernels.
         (i.e. learn to find the pre-image of a point)
 
-    eigen_solver : {'auto', 'dense', 'arpack'}, default='auto'
-        Select eigensolver to use. If n_components is much less than
-        the number of training samples, arpack may be more efficient
-        than the dense eigensolver.
+    eigen_solver : {'auto', 'dense', 'arpack', 'randomized'}, \
+        default='auto'
+        Select eigensolver to use. If `n_components` is much
+        less than the number of training samples, randomized (or arpack to a
+        smaller extent) may be more efficient than the dense eigensolver.
+        Randomized SVD is performed according to the method of Halko et al.
+
+        auto :
+            the solver is selected by a default policy based on n_samples
+            (the number of training samples) and `n_components`:
+            if the number of components to extract is less than 10 (strict) and
+            the number of samples is more than 200 (strict), the 'arpack'
+            method is enabled. Otherwise the exact full eigenvalue
+            decomposition is computed and optionally truncated afterwards
+            ('dense' method).
+        dense :
+            run exact full eigenvalue decomposition calling the standard
+            LAPACK solver via `scipy.linalg.eigh`, and select the components
+            by postprocessing
+        arpack :
+            run SVD truncated to n_components calling ARPACK solver using
+            `scipy.sparse.linalg.eigsh`. It requires strictly
+            0 < n_components < n_samples
+        randomized :
+            run randomized SVD by the method of Halko et al. The current
+            implementation selects eigenvalues based on their module; therefore
+            using this method can lead to unexpected results if the kernel is
+            not positive semi-definite.
+
+        .. versionchanged:: 1.0
+           `'randomized'` was added.
 
     tol : float, default=0
         Convergence tolerance for arpack.
@@ -72,6 +106,13 @@ class KernelPCA(TransformerMixin, BaseEstimator):
         Maximum number of iterations for arpack.
         If None, optimal value will be chosen by arpack.
 
+    iterated_power : int >= 0, or 'auto', default='auto'
+        Number of iterations for the power method computed when
+        `eigen_solver='randomized'`. When 'auto', it is set to 7 when
+        `n_components < 0.1 * min(X.shape)`, otherwise it is set to 4.
+
+        .. versionadded:: 1.0
+
     remove_zero_eig : bool, default=False
         If True, then all components with zero eigenvalues are removed, so
         that the number of components in the output may be < n_components
@@ -80,8 +121,8 @@ class KernelPCA(TransformerMixin, BaseEstimator):
         with zero eigenvalues are removed regardless.
 
     random_state : int, RandomState instance or None, default=None
-        Used when ``eigen_solver`` == 'arpack'. Pass an int for reproducible
-        results across multiple function calls.
+        Used when ``eigen_solver`` == 'arpack' or 'randomized'. Pass an int
+        for reproducible results across multiple function calls.
         See :term:`Glossary `.
 
         .. versionadded:: 0.18
@@ -141,12 +182,22 @@ class KernelPCA(TransformerMixin, BaseEstimator):
         and Klaus-Robert Mueller. 1999. Kernel principal
         component analysis. In Advances in kernel methods,
         MIT Press, Cambridge, MA, USA 327-352.
+
+    For eigen_solver == 'arpack', refer to `scipy.sparse.linalg.eigsh`.
+
+    For eigen_solver == 'randomized', see:
+        Finding structure with randomness: Stochastic algorithms
+        for constructing approximate matrix decompositions Halko, et al., 2009
+        (arXiv:0909.4061)
+        A randomized algorithm for the decomposition of matrices
+        Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert
     """
     @_deprecate_positional_args
     def __init__(self, n_components=None, *, kernel="linear",
                  gamma=None, degree=3, coef0=1, kernel_params=None,
                  alpha=1.0, fit_inverse_transform=False, eigen_solver='auto',
-                 tol=0, max_iter=None, remove_zero_eig=False,
+                 tol=0, max_iter=None, iterated_power='auto',
+                 remove_zero_eig=False,
                  random_state=None, copy_X=True, n_jobs=None):
         if fit_inverse_transform and kernel == 'precomputed':
             raise ValueError(
@@ -160,9 +211,10 @@ def __init__(self, n_components=None, *, kernel="linear",
         self.alpha = alpha
         self.fit_inverse_transform = fit_inverse_transform
         self.eigen_solver = eigen_solver
-        self.remove_zero_eig = remove_zero_eig
         self.tol = tol
         self.max_iter = max_iter
+        self.iterated_power = iterated_power
+        self.remove_zero_eig = remove_zero_eig
         self.random_state = random_state
         self.n_jobs = n_jobs
         self.copy_X = copy_X
@@ -191,9 +243,14 @@ def _fit_transform(self, K):
         # center kernel
         K = self._centerer.fit_transform(K)
 
+        # adjust n_components according to user inputs
         if self.n_components is None:
-            n_components = K.shape[0]
+            n_components = K.shape[0]  # use all dimensions
         else:
+            if self.n_components < 1:
+                raise ValueError(
+                    f"`n_components` should be >= 1, got: {self.n_components}"
+                )
             n_components = min(K.shape[0], self.n_components)
 
         # compute eigenvectors
@@ -206,6 +263,7 @@ def _fit_transform(self, K):
             eigen_solver = self.eigen_solver
 
         if eigen_solver == 'dense':
+            # Note: eigvals specifies the indices of smallest/largest to return
             self.lambdas_, self.alphas_ = linalg.eigh(
                 K, eigvals=(K.shape[0] - n_components, K.shape[0] - 1))
         elif eigen_solver == 'arpack':
@@ -215,6 +273,14 @@ def _fit_transform(self, K):
                                                 tol=self.tol,
                                                 maxiter=self.max_iter,
                                                 v0=v0)
+        elif eigen_solver == 'randomized':
+            self.lambdas_, self.alphas_ = _randomized_eigsh(
+                K, n_components=n_components, n_iter=self.iterated_power,
+                random_state=self.random_state, selection='module'
+            )
+        else:
+            raise ValueError("Unsupported value for `eigen_solver`: %r"
+                             % eigen_solver)
 
         # make sure that the eigenvalues are ok and fix numerical issues
         self.lambdas_ = _check_psd_eigenvalues(self.lambdas_,
diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py
index adf68f1db1a6c..5c8d052a7aa14 100644
--- a/sklearn/decomposition/tests/test_kernel_pca.py
+++ b/sklearn/decomposition/tests/test_kernel_pca.py
@@ -3,11 +3,13 @@
 import pytest
 
 from sklearn.utils._testing import (assert_array_almost_equal,
-                                   assert_allclose)
+                                    assert_array_equal,
+                                    assert_allclose)
 
 from sklearn.decomposition import PCA, KernelPCA
 from sklearn.datasets import make_circles
 from sklearn.datasets import make_blobs
+from sklearn.exceptions import NotFittedError
 from sklearn.linear_model import Perceptron
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler
@@ -17,6 +19,12 @@
 
 
 def test_kernel_pca():
+    """Nominal test for all solvers and all known kernels + a custom one
+
+    It tests
+     - that fit_transform is equivalent to fit+transform
+     - that the shapes of transforms and inverse transforms are correct
+    """
     rng = np.random.RandomState(0)
     X_fit = rng.random_sample((5, 4))
     X_pred = rng.random_sample((2, 4))
@@ -26,7 +34,7 @@ def histogram(x, y, **kwargs):
         assert kwargs == {}    # no kernel_params that we didn't ask for
         return np.minimum(x, y).sum()
 
-    for eigen_solver in ("auto", "dense", "arpack"):
+    for eigen_solver in ("auto", "dense", "arpack", "randomized"):
         for kernel in ("linear", "rbf", "poly", histogram):
             # histogram kernel produces singular matrix inside linalg.solve
             # XXX use a least-squares approximation?
@@ -55,12 +63,31 @@ def histogram(x, y, **kwargs):
                 assert X_pred2.shape == X_pred.shape
 
 
+def test_kernel_pca_invalid_solver():
+    """Check that kPCA raises an error if the solver parameter is invalid
+
+    """
+    with pytest.raises(ValueError):
+        KernelPCA(eigen_solver="unknown").fit(np.random.randn(10, 10))
+
+
 def test_kernel_pca_invalid_parameters():
+    """Check that kPCA raises an error if the parameters are invalid
+
+    Tests fitting inverse transform with a precomputed kernel raises a
+    ValueError.
+    """
     with pytest.raises(ValueError):
         KernelPCA(10, fit_inverse_transform=True, kernel='precomputed')
 
 
 def test_kernel_pca_consistent_transform():
+    """Check robustness to mutations in the original training array
+
+    Test that after fitting a kPCA model, it stays independent of any
+    mutation of the values of the original data object by relying on an
+    internal copy.
+    """
     # X_fit_ needs to retain the old, unmodified copy of X
     state = np.random.RandomState(0)
     X = state.rand(10, 10)
@@ -74,6 +101,10 @@ def test_kernel_pca_consistent_transform():
 
 
 def test_kernel_pca_deterministic_output():
+    """Test that Kernel PCA produces deterministic output
+
+    Tests that the same inputs and random state produce the same output.
+    """
     rng = np.random.RandomState(0)
     X = rng.rand(10, 10)
     eigen_solver = ('arpack', 'dense')
@@ -89,15 +120,20 @@ def test_kernel_pca_deterministic_output():
 
 
 def test_kernel_pca_sparse():
+    """Test that kPCA works on a sparse data input.
+
+    Same test as ``test_kernel_pca`` except for ``inverse_transform``, which
+    is not implemented for sparse matrices.
+    """
     rng = np.random.RandomState(0)
     X_fit = sp.csr_matrix(rng.random_sample((5, 4)))
     X_pred = sp.csr_matrix(rng.random_sample((2, 4)))
 
-    for eigen_solver in ("auto", "arpack"):
+    for eigen_solver in ("auto", "arpack", "randomized"):
         for kernel in ("linear", "rbf", "poly"):
             # transform fit data
             kpca = KernelPCA(4, kernel=kernel, eigen_solver=eigen_solver,
-                             fit_inverse_transform=False)
+                             fit_inverse_transform=False, random_state=0)
             X_fit_transformed = kpca.fit_transform(X_fit)
             X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
             assert_array_almost_equal(np.abs(X_fit_transformed),
@@ -108,31 +144,47 @@ def test_kernel_pca_sparse():
             assert (X_pred_transformed.shape[1] ==
                          X_fit_transformed.shape[1])
 
-            # inverse transform
-            # X_pred2 = kpca.inverse_transform(X_pred_transformed)
-            # assert X_pred2.shape == X_pred.shape)
+            # inverse transform: not available for sparse matrices
+            # XXX: should we raise another exception type here? For instance:
+            # NotImplementedError.
+            with pytest.raises(NotFittedError):
+                kpca.inverse_transform(X_pred_transformed)
 
 
-def test_kernel_pca_linear_kernel():
+@pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"])
+@pytest.mark.parametrize("n_features", [4, 10])
+def test_kernel_pca_linear_kernel(solver, n_features):
+    """Test that kPCA with linear kernel is equivalent to PCA for all solvers.
+
+    KernelPCA with linear kernel should produce the same output as PCA.
+    """
     rng = np.random.RandomState(0)
-    X_fit = rng.random_sample((5, 4))
-    X_pred = rng.random_sample((2, 4))
+    X_fit = rng.random_sample((5, n_features))
+    X_pred = rng.random_sample((2, n_features))
 
     # for a linear kernel, kernel PCA should find the same projection as PCA
     # modulo the sign (direction)
     # fit only the first four components: fifth is near zero eigenvalue, so
     # can be trimmed due to roundoff error
+    n_comps = 3 if solver == "arpack" else 4
     assert_array_almost_equal(
-        np.abs(KernelPCA(4).fit(X_fit).transform(X_pred)),
-        np.abs(PCA(4).fit(X_fit).transform(X_pred)))
+        np.abs(KernelPCA(n_comps, eigen_solver=solver).fit(X_fit)
+               .transform(X_pred)),
+        np.abs(PCA(n_comps, svd_solver=solver if solver != "dense" else "full")
+               .fit(X_fit).transform(X_pred)))
 
 
 def test_kernel_pca_n_components():
+    """Test that `n_components` is correctly taken into account for projections
+
+    For all solvers this tests that the output has the correct shape depending
+    on the selected number of components.
+    """
     rng = np.random.RandomState(0)
     X_fit = rng.random_sample((5, 4))
     X_pred = rng.random_sample((2, 4))
 
-    for eigen_solver in ("dense", "arpack"):
+    for eigen_solver in ("dense", "arpack", "randomized"):
         for c in [1, 2, 4]:
             kpca = KernelPCA(n_components=c, eigen_solver=eigen_solver)
             shape = kpca.fit(X_fit).transform(X_pred).shape
@@ -141,6 +193,11 @@ def test_kernel_pca_n_components():
 
 
 def test_remove_zero_eig():
+    """Check that the ``remove_zero_eig`` parameter works correctly.
+
+    Tests that the null-space (Zero) eigenvalues are removed when
+    remove_zero_eig=True, whereas they are not by default.
+    """
     X = np.array([[1 - 1e-30, 1], [1, 1], [1, 1 - 1e-20]])
 
     # n_components=None (default) => remove_zero_eig is True
@@ -158,9 +215,11 @@ def test_remove_zero_eig():
 
 
 def test_leave_zero_eig():
-    """This test checks that fit().transform() returns the same result as
+    """Non-regression test for issue #12141 (PR #12143)
+
+    This test checks that fit().transform() returns the same result as
     fit_transform() in case of non-removed zero eigenvalue.
-    Non-regression test for issue #12141 (PR #12143)"""
+    """
     X_fit = np.array([[1, 1], [0, 0]])
 
     # Assert that even with all np warnings on, there is no div by zero warning
@@ -184,23 +243,29 @@ def test_leave_zero_eig():
 
 
 def test_kernel_pca_precomputed():
+    """Test that kPCA works with a precomputed kernel, for all solvers
+
+    """
     rng = np.random.RandomState(0)
     X_fit = rng.random_sample((5, 4))
     X_pred = rng.random_sample((2, 4))
 
-    for eigen_solver in ("dense", "arpack"):
-        X_kpca = KernelPCA(4, eigen_solver=eigen_solver).\
-            fit(X_fit).transform(X_pred)
+    for eigen_solver in ("dense", "arpack", "randomized"):
+        X_kpca = KernelPCA(
+            4, eigen_solver=eigen_solver, random_state=0
+        ).fit(X_fit).transform(X_pred)
+
         X_kpca2 = KernelPCA(
-            4, eigen_solver=eigen_solver, kernel='precomputed').fit(
-                np.dot(X_fit, X_fit.T)).transform(np.dot(X_pred, X_fit.T))
+            4, eigen_solver=eigen_solver, kernel='precomputed', random_state=0
+        ).fit(np.dot(X_fit, X_fit.T)).transform(np.dot(X_pred, X_fit.T))
 
         X_kpca_train = KernelPCA(
-            4, eigen_solver=eigen_solver,
-            kernel='precomputed').fit_transform(np.dot(X_fit, X_fit.T))
+            4, eigen_solver=eigen_solver, kernel='precomputed', random_state=0
+        ).fit_transform(np.dot(X_fit, X_fit.T))
+
         X_kpca_train2 = KernelPCA(
-            4, eigen_solver=eigen_solver, kernel='precomputed').fit(
-                np.dot(X_fit, X_fit.T)).transform(np.dot(X_fit, X_fit.T))
+            4, eigen_solver=eigen_solver, kernel='precomputed', random_state=0
+        ).fit(np.dot(X_fit, X_fit.T)).transform(np.dot(X_fit, X_fit.T))
 
         assert_array_almost_equal(np.abs(X_kpca),
                                   np.abs(X_kpca2))
@@ -209,7 +274,42 @@ def test_kernel_pca_precomputed():
                                   np.abs(X_kpca_train2))
 
 
+@pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"])
+def test_kernel_pca_precomputed_non_symmetric(solver):
+    """Check that the kernel centerer works.
+
+    Tests that a non-symmetric precomputed kernel is actually accepted
+    because the kernel centerer does its job correctly.
+    """
+
+    # a non-symmetric Gram matrix
+    K = [
+        [1, 2],
+        [3, 40]
+    ]
+    kpca = KernelPCA(kernel="precomputed", eigen_solver=solver,
+                     n_components=1, random_state=0)
+    kpca.fit(K)  # no error
+
+    # same test with centered kernel
+    Kc = [
+        [9, -9],
+        [-9, 9]
+    ]
+    kpca_c = KernelPCA(kernel="precomputed", eigen_solver=solver,
+                       n_components=1, random_state=0)
+    kpca_c.fit(Kc)
+
+    # comparison between the non-centered and centered versions
+    assert_array_equal(kpca.alphas_, kpca_c.alphas_)
+    assert_array_equal(kpca.lambdas_, kpca_c.lambdas_)
+
+
 def test_kernel_pca_invalid_kernel():
+    """Tests that using an invalid kernel name raises a ValueError
+
+    An invalid kernel name should raise a ValueError at fit time.
+    """
     rng = np.random.RandomState(0)
     X_fit = rng.random_sample((2, 4))
     kpca = KernelPCA(kernel="tototiti")
@@ -218,8 +318,11 @@ def test_kernel_pca_invalid_kernel():
 
 
 def test_gridsearch_pipeline():
-    # Test if we can do a grid-search to find parameters to separate
-    # circles with a perceptron model.
+    """Check that kPCA works as expected in a grid search pipeline
+
+    Test if we can do a grid-search to find parameters to separate
+    circles with a perceptron model.
+    """
     X, y = make_circles(n_samples=400, factor=.3, noise=.05,
                         random_state=0)
     kpca = KernelPCA(kernel="rbf", n_components=2)
@@ -232,8 +335,11 @@ def test_gridsearch_pipeline():
 
 
 def test_gridsearch_pipeline_precomputed():
-    # Test if we can do a grid-search to find parameters to separate
-    # circles with a perceptron model using a precomputed kernel.
+    """Check that kPCA works as expected in a grid search pipeline (2)
+
+    Test if we can do a grid-search to find parameters to separate
+    circles with a perceptron model. This test uses a precomputed kernel.
+    """
     X, y = make_circles(n_samples=400, factor=.3, noise=.05,
                         random_state=0)
     kpca = KernelPCA(kernel="precomputed", n_components=2)
@@ -247,7 +353,12 @@ def test_gridsearch_pipeline_precomputed():
 
 
 def test_nested_circles():
-    # Test the linear separability of the first 2D KPCA transform
+    """Check that kPCA projects in a space where nested circles are separable
+
+    Tests that 2D nested circles become separable with a perceptron when
+    projected in the first 2 kPCA using an RBF kernel, while raw samples
+    are not directly separable in the original space.
+    """
     X, y = make_circles(n_samples=400, factor=.3, noise=.05,
                         random_state=0)
 
@@ -270,8 +381,10 @@ def test_nested_circles():
 
 
 def test_kernel_conditioning():
-    """ Test that ``_check_psd_eigenvalues`` is correctly called
-    Non-regression test for issue #12140 (PR #12145)"""
+    """Check that ``_check_psd_eigenvalues`` is correctly called in kPCA
+
+    Non-regression test for issue #12140 (PR #12145).
+    """
 
     # create a pathological X leading to small non-zero eigenvalue
     X = [[5, 1],
@@ -286,11 +399,93 @@ def test_kernel_conditioning():
     assert np.all(kpca.lambdas_ == _check_psd_eigenvalues(kpca.lambdas_))
 
 
+@pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"])
+def test_precomputed_kernel_not_psd(solver):
+    """Check how KernelPCA works with non-PSD kernels depending on n_components
+
+    Tests for all methods what happens with a non PSD gram matrix (this
+    can happen in an isomap scenario, or with custom kernel functions, or
+    maybe with ill-posed datasets).
+
+    When ``n_components`` is large enough to capture a negative eigenvalue, an
+    error should be raised. Otherwise, KernelPCA should run without error
+    since the negative eigenvalues are not selected.
+    """
+
+    # a non PSD kernel with large eigenvalues, already centered
+    # it was captured from an isomap call and multiplied by 100 for compactness
+    K = [
+        [4.48, -1., 8.07, 2.33, 2.33, 2.33, -5.76, -12.78],
+        [-1., -6.48, 4.5, -1.24, -1.24, -1.24, -0.81, 7.49],
+        [8.07, 4.5, 15.48, 2.09, 2.09, 2.09, -11.1, -23.23],
+        [2.33, -1.24, 2.09, 4., -3.65, -3.65, 1.02, -0.9],
+        [2.33, -1.24, 2.09, -3.65, 4., -3.65, 1.02, -0.9],
+        [2.33, -1.24, 2.09, -3.65, -3.65, 4., 1.02, -0.9],
+        [-5.76, -0.81, -11.1, 1.02, 1.02, 1.02, 4.86, 9.75],
+        [-12.78, 7.49, -23.23, -0.9, -0.9, -0.9, 9.75, 21.46]
+    ]
+    # this gram matrix has 5 positive eigenvalues and 3 negative ones
+    # [ 52.72,   7.65,   7.65,   5.02,   0.  ,  -0.  ,  -6.13, -15.11]
+
+    # 1. ask for enough components to get a significant negative one
+    kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, n_components=7)
+    # make sure that the appropriate error is raised
+    with pytest.raises(ValueError,
+                       match="There are significant negative eigenvalues"):
+        kpca.fit(K)
+
+    # 2. ask for a small enough n_components to get only positive ones
+    kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, n_components=2)
+    if solver == 'randomized':
+        # the randomized method is still inconsistent with the others on this
+        # since it selects the eigenvalues based on the 2 largest moduli
+        # (absolute values), not on the 2 largest values.
+        #
+        # At least we can ensure that we return an error instead of returning
+        # the wrong eigenvalues
+        with pytest.raises(ValueError,
+                           match="There are significant negative eigenvalues"):
+            kpca.fit(K)
+    else:
+        # general case: make sure that it works
+        kpca.fit(K)
+
+
+@pytest.mark.parametrize("n_components", [4, 10, 20])
+def test_kernel_pca_solvers_equivalence(n_components):
+    """Check that the 'dense', 'arpack' and 'randomized' solvers give
+    similar results.
+    """
+
+    # Generate random data
+    n_train, n_test = 2000, 100
+    X, _ = make_circles(n_samples=(n_train + n_test), factor=.3, noise=.05,
+                        random_state=0)
+    X_fit, X_pred = X[:n_train, :], X[n_train:, :]
+
+    # reference (full)
+    ref_pred = KernelPCA(n_components, eigen_solver="dense", random_state=0
+                         ).fit(X_fit).transform(X_pred)
+
+    # arpack
+    a_pred = KernelPCA(n_components, eigen_solver="arpack", random_state=0
+                       ).fit(X_fit).transform(X_pred)
+    # check that the result is still correct despite the approx
+    assert_array_almost_equal(np.abs(a_pred), np.abs(ref_pred))
+
+    # randomized
+    r_pred = KernelPCA(n_components, eigen_solver="randomized", random_state=0
+                       ).fit(X_fit).transform(X_pred)
+    # check that the result is still correct despite the approximation
+    assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred))
+
+
 def test_kernel_pca_inverse_transform_reconstruction():
-    # Test if the reconstruction is a good approximation.
-    # Note that in general it is not possible to get an arbitrarily good
-    # reconstruction because of kernel centering that does not
-    # preserve all the information of the original data.
+    """Test if the reconstruction is a good approximation.
+
+    Note that in general it is not possible to get an arbitrarily good
+    reconstruction because of kernel centering that does not
+    preserve all the information of the original data.
+    """
     X, *_ = make_blobs(n_samples=100, n_features=4, random_state=0)
 
     kpca = KernelPCA(
@@ -302,8 +497,11 @@ def test_kernel_pca_inverse_transform_reconstruction():
 
 
 def test_32_64_decomposition_shape():
-    """ Test that the decomposition is similar for 32 and 64 bits data """
-    # see https://github.com/scikit-learn/scikit-learn/issues/18146
+    """Test that the decomposition is similar for 32 and 64 bits data
+
+    Non regression test for
+    https://github.com/scikit-learn/scikit-learn/issues/18146
+    """
     X, y = make_blobs(
         n_samples=30,
         centers=[[0, 0, 0], [1, 1, 1]],
@@ -321,6 +519,10 @@ def test_32_64_decomposition_shape():
 
 # TODO: Remove in 1.1
 def test_kernel_pcc_pairwise_is_deprecated():
+    """Check that `_pairwise` is correctly marked with deprecation warning
+
+    Tests that a `FutureWarning` is issued when `_pairwise` is accessed.
+    """
     kp = KernelPCA(kernel='precomputed')
     msg = r"Attribute _pairwise was deprecated in version 0\.24"
     with pytest.warns(FutureWarning, match=msg):
diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py
index add8c5883a751..c72c54bd1aa4d 100644
--- a/sklearn/utils/extmath.py
+++ b/sklearn/utils/extmath.py
@@ -249,6 +249,9 @@ def randomized_svd(M, n_components, *, n_oversamples=10, n_iter='auto',
                    flip_sign=True, random_state='warn'):
     """Computes a truncated randomized SVD.
 
+    This method solves the fixed-rank approximation problem described in the
+    Halko et al paper (problem (1.5), p5).
+
     Parameters
     ----------
     M : {ndarray, sparse matrix}
@@ -262,13 +265,23 @@ def randomized_svd(M, n_components, *, n_oversamples=10, n_iter='auto',
         to ensure proper conditioning. The total number of random vectors
         used to find the range of M is n_components + n_oversamples. Smaller
         number can improve speed but can negatively impact the quality of
-        approximation of singular vectors and singular values.
+        approximation of singular vectors and singular values. Users might wish
+        to increase this parameter up to `2*k - n_components` where k is the
+        effective rank, for large matrices, noisy problems, matrices with
+        slowly decaying spectra, or to improve precision. See Halko
+        et al. (pages 5, 23 and 26).
 
     n_iter : int or 'auto', default='auto'
         Number of power iterations. It can be used to deal with very noisy
         problems. When 'auto', it is set to 4, unless `n_components` is small
-        (< .1 * min(X.shape)) `n_iter` in which case is set to 7.
-        This improves precision with few components.
+        (< .1 * min(X.shape)) in which case `n_iter` is set to 7.
+        This improves precision with few components. Note that in general
+        users should rather increase `n_oversamples` before increasing `n_iter`
+        as the principle of the randomized method is to avoid these more
+        costly power iteration steps. When `n_components` is equal to or
+        greater than the effective matrix rank and the spectrum does not
+        present a slow decay, `n_iter=0` or `1` should even work fine in
+        theory (see the Halko et al. paper, page 9).
 
         .. versionchanged:: 0.18
 
@@ -316,12 +329,15 @@ def randomized_svd(M, n_components, *, n_oversamples=10, n_iter='auto',
     computations. It is particularly fast on large matrices on which
     you wish to extract only a small number of components. In order to
     obtain further speed up, `n_iter` can be set <=2 (at the cost of
-    loss of precision).
+    loss of precision). To increase the precision it is recommended to
+    increase `n_oversamples`, up to `2*k-n_components` where k is the
+    effective rank. Usually, `n_components` is chosen to be greater than k
+    so increasing `n_oversamples` up to `n_components` should be enough.
 
     References
     ----------
     * Finding structure with randomness: Stochastic algorithms for constructing
-      approximate matrix decompositions
+      approximate matrix decompositions (Algorithm 4.3)
       Halko, et al., 2009 https://arxiv.org/abs/0909.4061
 
     * A randomized algorithm for the decomposition of matrices
@@ -393,6 +409,152 @@ def randomized_svd(M, n_components, *, n_oversamples=10, n_iter='auto',
         return U[:, :n_components], s[:n_components], Vt[:n_components, :]
 
 
+@_deprecate_positional_args
+def _randomized_eigsh(M, n_components, *, n_oversamples=10, n_iter='auto',
+                      power_iteration_normalizer='auto',
+                      selection='module', random_state=None):
+    """Computes a truncated eigendecomposition using randomized methods
+
+    This method solves the fixed-rank approximation problem described in the
+    Halko et al paper.
+
+    The choice of which components to select can be tuned with the `selection`
+    parameter.
+
+    .. versionadded:: 1.0
+
+    Parameters
+    ----------
+    M : ndarray or sparse matrix
+        Matrix to decompose; it should be real symmetric square or complex
+        Hermitian.
+
+    n_components : int
+        Number of eigenvalues and vectors to extract.
+
+    n_oversamples : int, default=10
+        Additional number of random vectors to sample the range of M so as
+        to ensure proper conditioning. The total number of random vectors
+        used to find the range of M is n_components + n_oversamples. Smaller
+        number can improve speed but can negatively impact the quality of
+        approximation of eigenvectors and eigenvalues. Users might wish
+        to increase this parameter up to `2*k - n_components` where k is the
+        effective rank, for large matrices, noisy problems, matrices with
+        slowly decaying spectra, or to improve precision. See Halko
+        et al. (pages 5, 23 and 26).
+
+    n_iter : int or 'auto', default='auto'
+        Number of power iterations. It can be used to deal with very noisy
+        problems. When 'auto', it is set to 4, unless `n_components` is small
+        (< .1 * min(X.shape)) in which case `n_iter` is set to 7.
+        This improves precision with few components. Note that in general
+        users should rather increase `n_oversamples` before increasing `n_iter`
+        as the principle of the randomized method is to avoid these more
+        costly power iteration steps. When `n_components` is equal to or
+        greater than the effective matrix rank and the spectrum does not
+        present a slow decay, `n_iter=0` or `1` should even work fine in
+        theory (see the Halko et al. paper, page 9).
+
+    power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'
+        Whether the power iterations are normalized with step-by-step
+        QR factorization (the slowest but most accurate), 'none'
+        (the fastest but numerically unstable when `n_iter` is large, e.g.
+        typically 5 or larger), or 'LU' factorization (numerically stable
+        but can lose slightly in accuracy). The 'auto' mode applies no
+        normalization if `n_iter` <= 2 and switches to LU otherwise.
+
+    selection : {'value', 'module'}, default='module'
+        Strategy used to select the n components. When `selection` is `'value'`
+        (not yet implemented, will become the default when implemented), the
+        components corresponding to the n largest eigenvalues are returned.
+        When `selection` is `'module'`, the components corresponding to the n
+        eigenvalues with the largest moduli (absolute values) are returned.
+
+    random_state : int, RandomState instance, default=None
+        The seed of the pseudo random number generator to use when shuffling
+        the data, i.e. getting the random vectors to initialize the algorithm.
+        Pass an int for reproducible results across multiple function calls.
+        See :term:`Glossary `.
+
+    Notes
+    -----
+    This algorithm finds a (usually very good) approximate truncated
+    eigendecomposition using randomized methods to speed up the computations.
+
+    This method is particularly fast on large matrices on which
+    you wish to extract only a small number of components. In order to
+    obtain further speed up, `n_iter` can be set <=2 (at the cost of
+    loss of precision). To increase the precision it is recommended to
+    increase `n_oversamples`, up to `2*k-n_components` where k is the
+    effective rank. Usually, `n_components` is chosen to be greater than k
+    so increasing `n_oversamples` up to `n_components` should be enough.
+
+    Strategy 'value': not implemented yet.
+    Algorithms 5.3, 5.4 and 5.5 in the Halko et al paper should provide good
+    candidates for a future implementation.
+
+    Strategy 'module':
+    The principle is that for diagonalizable matrices, the singular values and
+    eigenvalues are related: if t is an eigenvalue of A, then :math:`|t|` is a
+    singular value of A. This method relies on a randomized SVD to find the n
+    singular components corresponding to the n singular values with the
+    largest moduli, and then uses the signs of the singular vectors to recover
+    the true sign of t: if the signs of the left and right singular vectors
+    differ, the corresponding eigenvalue is negative.
+
+    Returns
+    -------
+    eigvals : 1D array of shape (n_components,) containing the `n_components`
+        eigenvalues selected (see ``selection`` parameter).
+    eigvecs : 2D array of shape (M.shape[0], n_components) containing the
+        `n_components` eigenvectors corresponding to the `eigvals`, in the
+        corresponding order. Note that this follows the `scipy.linalg.eigh`
+        convention.
+
+    See Also
+    --------
+    :func:`randomized_svd`
+
+    References
+    ----------
+    * Finding structure with randomness: Stochastic algorithms for constructing
+      approximate matrix decompositions (Algorithm 4.3 for strategy 'module')
+      Halko, et al., 2009 https://arxiv.org/abs/0909.4061
+
+    """
+    if selection == 'value':  # pragma: no cover
+        # to do : an algorithm can be found in the Halko et al reference
+        raise NotImplementedError()
+
+    elif selection == 'module':
+        # Note: no need for deterministic U and Vt (flip_sign=True),
+        # as we only use the dot product UVt afterwards
+        U, S, Vt = randomized_svd(
+            M, n_components=n_components, n_oversamples=n_oversamples,
+            n_iter=n_iter,
+            power_iteration_normalizer=power_iteration_normalizer,
+            flip_sign=False, random_state=random_state)
+
+        eigvecs = U[:, :n_components]
+        eigvals = S[:n_components]
+
+        # Conversion of Singular values into Eigenvalues:
+        # For any eigenvalue t, the corresponding singular value is |t|.
+        # So if there is a negative eigenvalue t, the corresponding singular
+        # value will be -t, and the left (U) and right (V) singular vectors
+        # will have opposite signs.
+        # Fastest way: see 
+        diag_VtU = np.einsum('ji,ij->j',
+                             Vt[:n_components, :], U[:, :n_components])
+        signs = np.sign(diag_VtU)
+        eigvals = eigvals * signs
+
+    else:  # pragma: no cover
+        raise ValueError("Invalid `selection`: %r" % selection)
+
+    return eigvals, eigvecs
+
+
 @_deprecate_positional_args
 def weighted_mode(a, w, *, axis=0):
     """Returns an array of the weighted modal (most common) value in a.
diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py
index 8e53d94d911f0..1a77d08b12388 100644
--- a/sklearn/utils/tests/test_extmath.py
+++ b/sklearn/utils/tests/test_extmath.py
@@ -8,11 +8,12 @@
 from scipy import sparse
 from scipy import linalg
 from scipy import stats
+from scipy.sparse.linalg import eigsh
 from scipy.special import expit
 
 import pytest
 from sklearn.utils import gen_batches
-
+from sklearn.utils._arpack import _init_arpack_v0
 from sklearn.utils._testing import assert_almost_equal
 from sklearn.utils._testing import assert_allclose
 from sklearn.utils._testing import assert_allclose_dense_sparse
@@ -23,7 +24,7 @@
 from sklearn.utils._testing import skip_if_32bit
 
 from sklearn.utils.extmath import density, _safe_accumulator_op
-from sklearn.utils.extmath import randomized_svd
+from sklearn.utils.extmath import randomized_svd, _randomized_eigsh
 from sklearn.utils.extmath import row_norms
 from sklearn.utils.extmath import weighted_mode
 from sklearn.utils.extmath import cartesian
@@ -34,7 +35,7 @@
 from sklearn.utils.extmath import softmax
 from sklearn.utils.extmath import stable_cumsum
 from sklearn.utils.extmath import safe_sparse_dot
-from sklearn.datasets import make_low_rank_matrix
+from sklearn.datasets import make_low_rank_matrix, make_sparse_spd_matrix
 
 
 def test_density():
@@ -161,6 +162,128 @@ def test_randomized_svd_low_rank_all_dtypes(dtype):
     check_randomized_svd_low_rank(dtype)
 
 
+@pytest.mark.parametrize('dtype',
+                         (np.int32, np.int64, np.float32, np.float64))
+def test_randomized_eigsh(dtype):
+    """Test that `_randomized_eigsh` returns the appropriate components"""
+
+    rng = np.random.RandomState(42)
+    X = np.diag(np.array([1., -2., 0., 3.], dtype=dtype))
+    # random rotation that preserves the eigenvalues of X
+    rand_rot = np.linalg.qr(rng.normal(size=X.shape))[0]
+    X = rand_rot @ X @ rand_rot.T
+
+    # with 'module' selection method, the negative eigenvalue shows up
+    eigvals, eigvecs = _randomized_eigsh(X, n_components=2, selection='module')
+    # eigenvalues
+    assert eigvals.shape == (2,)
+    assert_array_almost_equal(eigvals, [3., -2.])  # negative eigenvalue here
+    # eigenvectors
+    assert eigvecs.shape == (4, 2)
+
+    # the 'value' selection method is not implemented yet and should raise
+    with pytest.raises(NotImplementedError):
+        _randomized_eigsh(X, n_components=2, selection='value')
+
+
+@pytest.mark.parametrize('k', (10, 50, 100, 199, 200))
+def test_randomized_eigsh_compared_to_others(k):
+    """Check that `_randomized_eigsh` is similar to other `eigsh`
+
+    Tests that for a random PSD matrix, `_randomized_eigsh` provides results
+    comparable to LAPACK (scipy.linalg.eigh) and ARPACK
+    (scipy.sparse.linalg.eigsh).
+
+    Note: some versions of ARPACK do not support k=n_features.
+    """
+
+    # make a random PSD matrix
+    n_features = 200
+    X = make_sparse_spd_matrix(n_features, random_state=0)
+
+    # compare two versions of randomized
+    # rough and fast
+    eigvals, eigvecs = _randomized_eigsh(X, n_components=k, selection='module',
+                                         n_iter=25, random_state=0)
+    # more accurate but slow (TODO find realistic settings here)
+    eigvals_qr, eigvecs_qr = _randomized_eigsh(
+        X, n_components=k, n_iter=25, n_oversamples=20, random_state=0,
+        power_iteration_normalizer="QR", selection='module'
+    )
+
+    # with LAPACK
+    eigvals_lapack, eigvecs_lapack = linalg.eigh(X, eigvals=(n_features - k,
+                                                             n_features - 1))
+    indices = eigvals_lapack.argsort()[::-1]
+    eigvals_lapack = eigvals_lapack[indices]
+    eigvecs_lapack = eigvecs_lapack[:, indices]
+
+    # -- eigenvalues comparison
+    assert eigvals_lapack.shape == (k,)
+    # comparison precision
+    assert_array_almost_equal(eigvals, eigvals_lapack, decimal=6)
+    assert_array_almost_equal(eigvals_qr, eigvals_lapack, decimal=6)
+
+    # -- eigenvectors comparison
+    assert eigvecs_lapack.shape == (n_features, k)
+    # flip eigenvectors' sign to enforce deterministic output
+    dummy_vecs = np.zeros_like(eigvecs).T
+    eigvecs, _ = svd_flip(eigvecs, dummy_vecs)
+    eigvecs_qr, _ = svd_flip(eigvecs_qr, dummy_vecs)
+    eigvecs_lapack, _ = svd_flip(eigvecs_lapack, dummy_vecs)
+    assert_array_almost_equal(eigvecs, eigvecs_lapack, decimal=4)
+    assert_array_almost_equal(eigvecs_qr, eigvecs_lapack, decimal=6)
+
+    # comparison ARPACK ~ LAPACK (some ARPACK versions do not support k=n)
+    if k < n_features:
+        v0 = _init_arpack_v0(n_features, random_state=0)
+        # "LA" largest algebraic <=> selection="value" in randomized_eigsh
+        eigvals_arpack, eigvecs_arpack = eigsh(X, k, which="LA", tol=0,
+                                               maxiter=None, v0=v0)
+        indices = eigvals_arpack.argsort()[::-1]
+        # eigenvalues
+        eigvals_arpack = eigvals_arpack[indices]
+        assert_array_almost_equal(eigvals_lapack, eigvals_arpack, decimal=10)
+        # eigenvectors
+        eigvecs_arpack = eigvecs_arpack[:, indices]
+        eigvecs_arpack, _ = svd_flip(eigvecs_arpack, dummy_vecs)
+        assert_array_almost_equal(eigvecs_arpack, eigvecs_lapack, decimal=8)
+
+
+@pytest.mark.parametrize("n,rank", [
+    (10, 7),
+    (100, 10),
+    (100, 80),
+    (500, 10),
+    (500, 250),
+    (500, 400),
+])
+def test_randomized_eigsh_reconst_low_rank(n, rank):
+    """Check that randomized_eigsh can reconstruct a low-rank PSD matrix
+
+    Tests that the decomposition provided by `_randomized_eigsh` leads to
+    orthonormal eigenvectors, and that a low rank PSD matrix can be effectively
+    reconstructed with good accuracy using it.
+    """
+    assert rank < n
+
+    # create a low rank PSD
+    rng = np.random.RandomState(69)
+    X = rng.randn(n, rank)
+    A = X @ X.T
+
+    # approximate A with the "right" number of components
+    S, V = _randomized_eigsh(A, n_components=rank, random_state=rng)
+    # orthonormality checks
+    assert_array_almost_equal(np.linalg.norm(V, axis=0), np.ones(S.shape))
+    assert_array_almost_equal(V.T @ V, np.diag(np.ones(S.shape)))
+    # reconstruction
+    A_reconstruct = V @ np.diag(S) @ V.T
+
+    # test that the approximation is good
+    assert_array_almost_equal(A_reconstruct, A, decimal=6)
+
+
 @pytest.mark.parametrize('dtype',
                          (np.float32, np.float64))
 def test_row_norms(dtype):

From 6a562d31e4488b067bff89fbce21382076823fa3 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre 
Date: Tue, 27 Apr 2021 17:45:20 +0200
Subject: [PATCH 369/478] ENH improve KernelCenterer documentation and tests
 (#19901)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Julien Jerphanion 
Co-authored-by: Sylvain Marié 
---
 doc/modules/preprocessing.rst            | 63 ++++++++++++++++++--
 sklearn/preprocessing/_data.py           | 36 +++++++++---
 sklearn/preprocessing/tests/test_data.py | 74 ++++++++++++++++++++++++
 3 files changed, 162 insertions(+), 11 deletions(-)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index cdde7479b1a4f..0afd79b754608 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -240,10 +240,65 @@ Centering kernel matrices
 -------------------------
 
 If you have a kernel matrix of a kernel :math:`K` that computes a dot product
-in a feature space defined by function :math:`\phi`,
-a :class:`KernelCenterer` can transform the kernel matrix
-so that it contains inner products in the feature space
-defined by :math:`\phi` followed by removal of the mean in that space.
+in a feature space (possibly implicitly) defined by a function
+:math:`\phi(\cdot)`, a :class:`KernelCenterer` can transform the kernel matrix
+so that it contains inner products in the feature space defined by :math:`\phi`
+followed by the removal of the mean in that space. In other words,
+:class:`KernelCenterer` computes the centered Gram matrix associated to a
+positive semidefinite kernel :math:`K`.
+
+**Mathematical formulation**
+
+We can have a look at the mathematical formulation now that we have the
+intuition. Let :math:`K` be a kernel matrix of shape `(n_samples, n_samples)`
+computed from :math:`X`, a data matrix of shape `(n_samples, n_features)`,
+during the `fit` step. :math:`K` is defined by
+
+.. math::
+  K(X, X) = \phi(X) . \phi(X)^{T}
+
+:math:`\phi(X)` is a function mapping of :math:`X` to a Hilbert space. A
+centered kernel :math:`\tilde{K}` is defined as:
+
+.. math::
+  \tilde{K}(X, X) = \tilde{\phi}(X) . \tilde{\phi}(X)^{T}
+
+where :math:`\tilde{\phi}(X)` results from centering :math:`\phi(X)` in the
+Hilbert space.
+
+Thus, one could compute :math:`\tilde{K}` by mapping :math:`X` using the
+function :math:`\phi(\cdot)` and centering the data in this new space. However,
+kernels are often used because they allow some algebraic calculations that
+avoid explicitly computing this mapping using :math:`\phi(\cdot)`. Indeed, one
+can implicitly center as shown in Appendix B in [Scholkopf1998]_:
+
+.. math::
+  \tilde{K} = K - 1_{\text{n}_{samples}} K - K 1_{\text{n}_{samples}} + 1_{\text{n}_{samples}} K 1_{\text{n}_{samples}}
+
+:math:`1_{\text{n}_{samples}}` is a matrix of `(n_samples, n_samples)` where
+all entries are equal to :math:`\frac{1}{\text{n}_{samples}}`. In the
+`transform` step, the kernel becomes :math:`K_{test}(X, Y)` defined as:
+
+.. math::
+  K_{test}(X, Y) = \phi(Y) . \phi(X)^{T}
+
+:math:`Y` is the test dataset of shape `(n_samples_test, n_features)` and thus
+:math:`K_{test}` is of shape `(n_samples_test, n_samples)`. In this case,
+centering :math:`K_{test}` is done as:
+
+.. math::
+  \tilde{K}_{test}(X, Y) = K_{test} - 1'_{\text{n}_{samples}} K - K_{test} 1_{\text{n}_{samples}} + 1'_{\text{n}_{samples}} K 1_{\text{n}_{samples}}
+
+:math:`1'_{\text{n}_{samples}}` is a matrix of shape
+`(n_samples_test, n_samples)` where all entries are equal to
+:math:`\frac{1}{\text{n}_{samples}}`.
+
+.. topic:: References
+
+  .. [Scholkopf1998] B. Schölkopf, A. Smola, and K.R. Müller,
+    `"Nonlinear component analysis as a kernel eigenvalue problem."
+    `_
+    Neural computation 10.5 (1998): 1299-1319.
 
 .. _preprocessing_transformer:
 
diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py
index 80cb132174328..befd3e61b96fc 100644
--- a/sklearn/preprocessing/_data.py
+++ b/sklearn/preprocessing/_data.py
@@ -1951,24 +1951,46 @@ def _more_tags(self):
 
 
 class KernelCenterer(TransformerMixin, BaseEstimator):
-    """Center a kernel matrix.
+    r"""Center an arbitrary kernel matrix :math:`K`.
 
-    Let K(x, z) be a kernel defined by phi(x)^T phi(z), where phi is a
-    function mapping x to a Hilbert space. KernelCenterer centers (i.e.,
-    normalize to have zero mean) the data without explicitly computing phi(x).
-    It is equivalent to centering phi(x) with
-    sklearn.preprocessing.StandardScaler(with_std=False).
+    Let us define a kernel :math:`K` such that:
+
+    .. math::
+        K(X, Y) = \phi(X) . \phi(Y)^{T}
+
+    :math:`\phi(X)` is a function mapping of rows of :math:`X` to a
+    Hilbert space and :math:`K` is of shape `(n_samples, n_samples)`.
+
+    This class allows one to compute :math:`\tilde{K}(X, Y)` such that:
+
+    .. math::
+        \tilde{K}(X, Y) = \tilde{\phi}(X) . \tilde{\phi}(Y)^{T}
+
+    :math:`\tilde{\phi}(X)` is the centered mapped data in the Hilbert
+    space.
+
+    `KernelCenterer` centers the features without explicitly computing the
+    mapping :math:`\phi(\cdot)`. Working with centered kernels is sometimes
+    expected when dealing with algebraic computations such as the
+    eigendecomposition in :class:`~sklearn.decomposition.KernelPCA`, for instance.
 
     Read more in the :ref:`User Guide `.
 
     Attributes
     ----------
-    K_fit_rows_ : array of shape (n_samples,)
+    K_fit_rows_ : ndarray of shape (n_samples,)
         Average of each column of kernel matrix.
 
     K_fit_all_ : float
         Average of kernel matrix.
 
+    References
+    ----------
+    .. [1] `Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller.
+       "Nonlinear component analysis as a kernel eigenvalue problem."
+       Neural computation 10.5 (1998): 1299-1319.
+       `_
+
     Examples
     --------
     >>> from sklearn.preprocessing import KernelCenterer
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 45d967d5f39a2..2cc51a4208675 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -2167,6 +2167,80 @@ def test_center_kernel():
     K_pred_centered2 = centerer.transform(K_pred)
     assert_array_almost_equal(K_pred_centered, K_pred_centered2)
 
+    # check the results coherence with the method proposed in:
+    # B. Schölkopf, A. Smola, and K.R. Müller,
+    # "Nonlinear component analysis as a kernel eigenvalue problem"
+    # equation (B.3)
+
+    # K_centered3 = (I - 1_M) K (I - 1_M)
+    #             =  K - 1_M K - K 1_M + 1_M K 1_M
+    ones_M = np.ones_like(K_fit) / K_fit.shape[0]
+    K_fit_centered3 = (
+        K_fit - ones_M @ K_fit - K_fit @ ones_M + ones_M @ K_fit @ ones_M
+    )
+    assert_allclose(K_fit_centered, K_fit_centered3)
+
+    # K_test_centered3 = (K_test - 1'_M K)(I - 1_M)
+    #                  = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M
+    ones_prime_M = np.ones_like(K_pred) / K_fit.shape[0]
+    K_pred_centered3 = (
+        K_pred - ones_prime_M @ K_fit - K_pred @ ones_M +
+        ones_prime_M @ K_fit @ ones_M
+    )
+    assert_allclose(K_pred_centered, K_pred_centered3)
+
+
+def test_kernelcenterer_non_linear_kernel():
+    """Check kernel centering for non-linear kernel."""
+    rng = np.random.RandomState(0)
+    X, X_test = rng.randn(100, 50), rng.randn(20, 50)
+
+    def phi(X):
+        """Our mapping function phi."""
+        return np.vstack([
+            np.clip(X, a_min=0, a_max=None),
+            -np.clip(X, a_min=None, a_max=0),
+        ])
+
+    phi_X = phi(X)
+    phi_X_test = phi(X_test)
+
+    # centered the projection
+    scaler = StandardScaler(with_std=False)
+    phi_X_center = scaler.fit_transform(phi_X)
+    phi_X_test_center = scaler.transform(phi_X_test)
+
+    # create the different kernel
+    K = phi_X @ phi_X.T
+    K_test = phi_X_test @ phi_X.T
+    K_center = phi_X_center @ phi_X_center.T
+    K_test_center = phi_X_test_center @ phi_X_center.T
+
+    kernel_centerer = KernelCenterer()
+    kernel_centerer.fit(K)
+
+    assert_allclose(kernel_centerer.transform(K), K_center)
+    assert_allclose(kernel_centerer.transform(K_test), K_test_center)
+
+    # check the results coherence with the method proposed in:
+    # B. Schölkopf, A. Smola, and K.R. Müller,
+    # "Nonlinear component analysis as a kernel eigenvalue problem"
+    # equation (B.3)
+
+    # K_centered = (I - 1_M) K (I - 1_M)
+    #            =  K - 1_M K - K 1_M + 1_M K 1_M
+    ones_M = np.ones_like(K) / K.shape[0]
+    K_centered = K - ones_M @ K - K @ ones_M + ones_M @ K @ ones_M
+    assert_allclose(kernel_centerer.transform(K), K_centered)
+
+    # K_test_centered = (K_test - 1'_M K)(I - 1_M)
+    #                 = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M
+    ones_prime_M = np.ones_like(K_test) / K.shape[0]
+    K_test_centered = (
+        K_test - ones_prime_M @ K - K_test @ ones_M + ones_prime_M @ K @ ones_M
+    )
+    assert_allclose(kernel_centerer.transform(K_test), K_test_centered)
+
 
 def test_cv_pipeline_precomputed():
     # Cross-validate a regression on four coplanar points with the same

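The centering identity documented and tested above (equation (B.3) in [Scholkopf1998]) can be checked in a few lines using only public APIs; a minimal sketch, with a plain linear kernel standing in for an arbitrary kernel::

    import numpy as np
    from sklearn.preprocessing import KernelCenterer

    rng = np.random.RandomState(0)
    Phi, Phi_test = rng.randn(20, 5), rng.randn(8, 5)
    K = Phi @ Phi.T            # kernel between training points
    K_test = Phi_test @ Phi.T  # kernel between test and training points

    centerer = KernelCenterer().fit(K)

    # K_tilde = K - 1_M K - K 1_M + 1_M K 1_M
    ones_M = np.full_like(K, 1.0 / K.shape[0])
    K_tilde = K - ones_M @ K - K @ ones_M + ones_M @ K @ ones_M
    assert np.allclose(centerer.transform(K), K_tilde)

    # K_test_tilde = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M
    ones_prime_M = np.full((K_test.shape[0], K.shape[0]), 1.0 / K.shape[0])
    K_test_tilde = (K_test - ones_prime_M @ K - K_test @ ones_M
                    + ones_prime_M @ K @ ones_M)
    assert np.allclose(centerer.transform(K_test), K_test_tilde)
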
From 7b1c9afcaf34e622de76d1f5d5e929e5aaffc514 Mon Sep 17 00:00:00 2001
From: iwhalvic 
Date: Tue, 27 Apr 2021 11:23:08 -0500
Subject: [PATCH 370/478] FIX Use cho_solve when return_std=True for
 GaussianProcessRegressor (#19939)

---
 doc/whats_new/v0.24.rst                    |  7 +++
 sklearn/gaussian_process/_gpr.py           | 24 ++++-----
 sklearn/gaussian_process/tests/test_gpr.py | 57 +++++++++++++---------
 3 files changed, 49 insertions(+), 39 deletions(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index 72a96aa74f470..f54e20e5154bc 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -52,6 +52,13 @@ Changelog
 :mod:`sklearn.gaussian_process`
 ...............................
 
+- |Fix| Avoid explicitly forming the inverse covariance matrix in
+  :class:`gaussian_process.GaussianProcessRegressor` when set to output the
+  standard deviation. With certain covariance matrices this inverse is
+  numerically unstable to compute explicitly. Using a Cholesky solve instead
+  mitigates this issue.
+  :pr:`19939` by :user:`Ian Halvic `.
+
 - |Fix| Avoid division by zero when scaling constant target in
   :class:`gaussian_process.GaussianProcessRegressor`. It was due to a std. dev.
   equal to 0. Now, such case is detected and the std. dev. is affected to 1
diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py
index 8f9575ffe42df..9b1d0ae409526 100644
--- a/sklearn/gaussian_process/_gpr.py
+++ b/sklearn/gaussian_process/_gpr.py
@@ -8,7 +8,7 @@
 from operator import itemgetter
 
 import numpy as np
-from scipy.linalg import cholesky, cho_solve, solve_triangular
+from scipy.linalg import cholesky, cho_solve
 import scipy.optimize
 
 from ..base import BaseEstimator, RegressorMixin, clone
@@ -270,8 +270,6 @@ def obj_func(theta, eval_gradient=True):
         K[np.diag_indices_from(K)] += self.alpha
         try:
             self.L_ = cholesky(K, lower=True)  # Line 2
-            # self.L_ changed, self._K_inv needs to be recomputed
-            self._K_inv = None
         except np.linalg.LinAlgError as exc:
             exc.args = ("The kernel, %s, is not returning a "
                         "positive definite matrix. Try gradually "
@@ -345,31 +343,27 @@ def predict(self, X, return_std=False, return_cov=False):
         else:  # Predict based on GP posterior
             K_trans = self.kernel_(X, self.X_train_)
             y_mean = K_trans.dot(self.alpha_)  # Line 4 (y_mean = f_star)
-
             # undo normalisation
             y_mean = self._y_train_std * y_mean + self._y_train_mean
 
             if return_cov:
-                v = cho_solve((self.L_, True), K_trans.T)  # Line 5
-                y_cov = self.kernel_(X) - K_trans.dot(v)  # Line 6
+                # Solve K @ V = K_trans.T
+                V = cho_solve((self.L_, True), K_trans.T)  # Line 5
+                y_cov = self.kernel_(X) - K_trans.dot(V)  # Line 6
 
                 # undo normalisation
                 y_cov = y_cov * self._y_train_std**2
 
                 return y_mean, y_cov
             elif return_std:
-                # cache result of K_inv computation
-                if self._K_inv is None:
-                    # compute inverse K_inv of K based on its Cholesky
-                    # decomposition L and its inverse L_inv
-                    L_inv = solve_triangular(self.L_.T,
-                                             np.eye(self.L_.shape[0]))
-                    self._K_inv = L_inv.dot(L_inv.T)
+                # Solve K @ V = K_trans.T
+                V = cho_solve((self.L_, True), K_trans.T)  # Line 5
 
                 # Compute variance of predictive distribution
+                # Use einsum to avoid explicitly forming the large matrix
+                # K_trans @ V just to extract its diagonal afterward.
                 y_var = self.kernel_.diag(X)
-                y_var -= np.einsum("ij,ij->i",
-                                   np.dot(K_trans, self._K_inv), K_trans)
+                y_var -= np.einsum("ij,ji->i", K_trans, V)
 
                 # Check if any of the variances is negative because of
                 # numerical issues. If yes: set the variance to 0.
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index 440e421cb95cc..66e3c96a8f029 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -20,10 +20,12 @@
 from sklearn.gaussian_process.tests._mini_sequence_kernel import MiniSeqKernel
 from sklearn.exceptions import ConvergenceWarning
 
-from sklearn.utils._testing \
-    import (assert_array_less,
-            assert_almost_equal, assert_array_almost_equal,
-            assert_array_equal, assert_allclose)
+from sklearn.utils._testing import (
+    assert_array_less,
+    assert_almost_equal,
+    assert_array_almost_equal,
+    assert_allclose
+)
 
 
 def f(x):
@@ -185,7 +187,8 @@ def test_no_optimizer():
 
 
 @pytest.mark.parametrize('kernel', kernels)
-def test_predict_cov_vs_std(kernel):
+@pytest.mark.parametrize("target", [y, np.ones(X.shape[0], dtype=np.float64)])
+def test_predict_cov_vs_std(kernel, target):
     if sys.maxsize <= 2 ** 32 and sys.version_info[:2] == (3, 6):
         pytest.xfail("This test may fail on 32bit Py3.6")
 
@@ -455,25 +458,6 @@ def test_no_fit_default_predict():
     assert_array_almost_equal(y_cov1, y_cov2)
 
 
-@pytest.mark.parametrize('kernel', kernels)
-def test_K_inv_reset(kernel):
-    y2 = f(X2).ravel()
-
-    # Test that self._K_inv is reset after a new fit
-    gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
-    assert hasattr(gpr, '_K_inv')
-    assert gpr._K_inv is None
-    gpr.predict(X, return_std=True)
-    assert gpr._K_inv is not None
-    gpr.fit(X2, y2)
-    assert gpr._K_inv is None
-    gpr.predict(X2, return_std=True)
-    gpr2 = GaussianProcessRegressor(kernel=kernel).fit(X2, y2)
-    gpr2.predict(X2, return_std=True)
-    # the value of K_inv should be independent of the first fit
-    assert_array_equal(gpr._K_inv, gpr2._K_inv)
-
-
 def test_warning_bounds():
     kernel = RBF(length_scale_bounds=[1e-5, 1e-3])
     gpr = GaussianProcessRegressor(kernel=kernel)
@@ -569,3 +553,28 @@ def test_constant_target(kernel):
     assert_allclose(y_pred, y_constant)
     # set atol because we compare to zero
     assert_allclose(np.diag(y_cov), 0., atol=1e-9)
+
+
+def test_gpr_consistency_std_cov_non_invertible_kernel():
+    """Check the consistency between the returned std. dev. and the covariance.
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/19936
+    Inconsistencies were observed when the kernel cannot be inverted (or its
+    inversion is not numerically stable).
+    """
+    kernel = (C(8.98576054e+05, (1e-12, 1e12)) *
+              RBF([5.91326520e+02, 1.32584051e+03], (1e-12, 1e12)) +
+              WhiteKernel(noise_level=1e-5))
+    gpr = GaussianProcessRegressor(kernel=kernel, alpha=0, optimizer=None)
+    X_train = np.array([[0., 0.], [1.54919334, -0.77459667], [-1.54919334, 0.],
+                        [0., -1.54919334], [0.77459667, 0.77459667],
+                        [-0.77459667, 1.54919334]])
+    y_train = np.array([[-2.14882017e-10], [-4.66975823e+00], [4.01823986e+00],
+                        [-1.30303674e+00], [-1.35760156e+00],
+                        [3.31215668e+00]])
+    gpr.fit(X_train, y_train)
+    X_test = np.array([[-1.93649167, -1.93649167], [1.93649167, -1.93649167],
+                       [-1.93649167, 1.93649167], [1.93649167, 1.93649167]])
+    pred1, std = gpr.predict(X_test, return_std=True)
+    pred2, cov = gpr.predict(X_test, return_cov=True)
+    assert_allclose(std, np.sqrt(np.diagonal(cov)), rtol=1e-5)

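The numerical identity behind this change is that `cho_solve((L, True), K_trans.T)` computes `K^{-1} @ K_trans.T` directly from the Cholesky factor, so the predictive variance can be obtained without ever materialising `K^{-1}`. A small self-contained sketch comparing the two routes, written outside the estimator so no GaussianProcessRegressor internals are assumed::

    import numpy as np
    from scipy.linalg import cholesky, cho_solve, solve_triangular

    rng = np.random.RandomState(0)
    X = rng.randn(30, 6)
    K = X @ X.T + 1e-6 * np.eye(30)   # well-conditioned training kernel
    K_trans = rng.randn(10, 30)       # kernel between test and train points

    L = cholesky(K, lower=True)

    # Previous route: build K^{-1} explicitly from the Cholesky factor.
    L_inv = solve_triangular(L.T, np.eye(L.shape[0]))
    K_inv = L_inv @ L_inv.T
    diag_old = np.einsum("ij,ij->i", K_trans @ K_inv, K_trans)

    # New route: a Cholesky solve, no explicit inverse.
    V = cho_solve((L, True), K_trans.T)
    diag_new = np.einsum("ij,ji->i", K_trans, V)

    assert np.allclose(diag_old, diag_new)
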
From 23032e72882647f7a54c4c8e10440567dfb53e80 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" 
Date: Tue, 27 Apr 2021 12:52:39 -0400
Subject: [PATCH 371/478] ENH Makes global configuration thread local (#18736)

---
 doc/whats_new/v1.0.rst       |  5 ++++
 sklearn/_config.py           | 29 +++++++++++++------
 sklearn/tests/test_config.py | 55 ++++++++++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+), 8 deletions(-)

diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 0cd1d6a89d158..6e3c063a45dcb 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -76,6 +76,11 @@ Changelog
   - For :class:`tree.ExtraTreeRegressor`, `criterion="mse"` is deprecated,
     use `"squared_error"` instead which is now the default.
 
+:mod:`sklearn.base`
+...................
+
+- |Fix| :func:`config_context` is now threadsafe. :pr:`18736` by `Thomas Fan`_.
+
 :mod:`sklearn.calibration`
 ..........................
 
diff --git a/sklearn/_config.py b/sklearn/_config.py
index feb5e86287c38..e81d50849db05 100644
--- a/sklearn/_config.py
+++ b/sklearn/_config.py
@@ -2,6 +2,7 @@
 """
 import os
 from contextlib import contextmanager as contextmanager
+import threading
 
 _global_config = {
     'assume_finite': bool(os.environ.get('SKLEARN_ASSUME_FINITE', False)),
@@ -9,6 +10,15 @@
     'print_changed_only': True,
     'display': 'text',
 }
+_threadlocal = threading.local()
+
+
+def _get_threadlocal_config():
+    """Get a threadlocal **mutable** configuration. If the configuration
+    does not exist, copy the default global configuration."""
+    if not hasattr(_threadlocal, 'global_config'):
+        _threadlocal.global_config = _global_config.copy()
+    return _threadlocal.global_config
 
 
 def get_config():
@@ -24,7 +34,9 @@ def get_config():
     config_context : Context manager for global scikit-learn configuration.
     set_config : Set global scikit-learn configuration.
     """
-    return _global_config.copy()
+    # Return a copy of the threadlocal configuration so that users will
+    # not be able to modify the configuration with the returned dict.
+    return _get_threadlocal_config().copy()
 
 
 def set_config(assume_finite=None, working_memory=None,
@@ -72,14 +84,16 @@ def set_config(assume_finite=None, working_memory=None,
     config_context : Context manager for global scikit-learn configuration.
     get_config : Retrieve current values of the global configuration.
     """
+    local_config = _get_threadlocal_config()
+
     if assume_finite is not None:
-        _global_config['assume_finite'] = assume_finite
+        local_config['assume_finite'] = assume_finite
     if working_memory is not None:
-        _global_config['working_memory'] = working_memory
+        local_config['working_memory'] = working_memory
     if print_changed_only is not None:
-        _global_config['print_changed_only'] = print_changed_only
+        local_config['print_changed_only'] = print_changed_only
     if display is not None:
-        _global_config['display'] = display
+        local_config['display'] = display
 
 
 @contextmanager
@@ -120,8 +134,7 @@ def config_context(**new_config):
     Notes
     -----
     All settings, not just those presently modified, will be returned to
-    their previous values when the context manager is exited. This is not
-    thread-safe.
+    their previous values when the context manager is exited.
 
     Examples
     --------
@@ -141,7 +154,7 @@ def config_context(**new_config):
     set_config : Set global scikit-learn configuration.
     get_config : Retrieve current values of the global configuration.
     """
-    old_config = get_config().copy()
+    old_config = get_config()
     set_config(**new_config)
 
     try:
diff --git a/sklearn/tests/test_config.py b/sklearn/tests/test_config.py
index 22ec862ef24a3..6d458088a37a8 100644
--- a/sklearn/tests/test_config.py
+++ b/sklearn/tests/test_config.py
@@ -1,5 +1,13 @@
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+from joblib import Parallel
+import joblib
 import pytest
+
 from sklearn import get_config, set_config, config_context
+from sklearn.utils.fixes import delayed
+from sklearn.utils.fixes import parse_version
 
 
 def test_config_context():
@@ -76,3 +84,50 @@ def test_set_config():
     # No unknown arguments
     with pytest.raises(TypeError):
         set_config(do_something_else=True)
+
+
+def set_assume_finite(assume_finite, sleep_duration):
+    """Return the value of assume_finite after waiting `sleep_duration`."""
+    with config_context(assume_finite=assume_finite):
+        time.sleep(sleep_duration)
+        return get_config()['assume_finite']
+
+
+@pytest.mark.parametrize("backend",
+                         ["loky", "multiprocessing", "threading"])
+def test_config_threadsafe_joblib(backend):
+    """Test that the global config is threadsafe with all joblib backends.
+    Two jobs are spawned and each sets assume_finite to a different value.
+    When the job with a duration of 0.1s completes, the assume_finite value
+    should be the same as the value passed to the function. In other words,
+    it is not influenced by the other job setting assume_finite to True.
+    """
+
+    if (parse_version(joblib.__version__) < parse_version('0.12')
+            and backend == 'loky'):
+        pytest.skip('loky backend does not exist in joblib <0.12')  # noqa
+
+    assume_finites = [False, True]
+    sleep_durations = [0.1, 0.2]
+
+    items = Parallel(backend=backend, n_jobs=2)(
+        delayed(set_assume_finite)(assume_finite, sleep_dur)
+        for assume_finite, sleep_dur
+        in zip(assume_finites, sleep_durations))
+
+    assert items == [False, True]
+
+
+def test_config_threadsafe():
+    """Uses threads directly to test that the global config does not change
+    between threads. Same test as `test_config_threadsafe_joblib` but with
+    `ThreadPoolExecutor`."""
+
+    assume_finites = [False, True]
+    sleep_durations = [0.1, 0.2]
+
+    with ThreadPoolExecutor(max_workers=2) as e:
+        items = [output for output in
+                 e.map(set_assume_finite, assume_finites, sleep_durations)]
+
+    assert items == [False, True]

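Because the configuration now lives in `threading.local()`, a thread that has never called `set_config` or `config_context` starts from a copy of the default global configuration, and changes made in one thread are invisible to the others. A condensed version of the new test, using only public APIs::

    from concurrent.futures import ThreadPoolExecutor

    from sklearn import config_context, get_config

    def read_assume_finite(value):
        # Each worker toggles the flag in its own context; with a
        # thread-local configuration the other worker's value never leaks in.
        with config_context(assume_finite=value):
            return get_config()["assume_finite"]

    with ThreadPoolExecutor(max_workers=2) as executor:
        results = list(executor.map(read_assume_finite, [False, True]))

    print(results)  # expected: [False, True]
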
From a45c0c99a38cffca6724cb8fd38b12edd4fb6b35 Mon Sep 17 00:00:00 2001
From: Maria Telenczuk 
Date: Tue, 27 Apr 2021 19:22:16 +0200
Subject: [PATCH 372/478] DEP Deprecates 'normalize' in _bayes.py (#17746)

---
 doc/whats_new/v1.0.rst                    |  2 ++
 sklearn/linear_model/_bayes.py            | 32 ++++++++++++++++++-----
 sklearn/linear_model/tests/test_bayes.py  |  2 ++
 sklearn/linear_model/tests/test_common.py |  5 +++-
 4 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 6e3c063a45dcb..977d83890e0c0 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -269,6 +269,8 @@ Changelog
   Ridge, RidgeClassifier, RidgeCV or RidgeClassifierCV were deprecated in:
   :pr:`17772` by :user:`Maria Telenczuk ` and
   :user:`Alexandre Gramfort `.
+  BayesianRidge and ARDRegression were deprecated in:
+  :pr:`17746` by :user:`Maria Telenczuk `.
 
 - |Fix|: `sample_weight` are now fully taken into account in linear models
   when `normalize=True` for both feature centering and feature
diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py
index 634417e2b0efa..2eae8b5c13cee 100644
--- a/sklearn/linear_model/_bayes.py
+++ b/sklearn/linear_model/_bayes.py
@@ -11,6 +11,7 @@
 
 from ._base import LinearModel, _rescale_data
 from ..base import RegressorMixin
+from ._base import _deprecate_normalize
 from ..utils.extmath import fast_logdet
 from scipy.linalg import pinvh
 from ..utils.validation import _check_sample_weight
@@ -84,6 +85,10 @@ class BayesianRidge(RegressorMixin, LinearModel):
         :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``
         on an estimator with ``normalize=False``.
 
+        .. deprecated:: 1.0
+            ``normalize`` was deprecated in version 1.0 and will be removed in
+            1.2.
+
     copy_X : bool, default=True
         If True, X will be copied; else, it may be overwritten.
 
@@ -158,7 +163,7 @@ class BayesianRidge(RegressorMixin, LinearModel):
     def __init__(self, *, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6,
                  lambda_1=1.e-6, lambda_2=1.e-6, alpha_init=None,
                  lambda_init=None, compute_score=False, fit_intercept=True,
-                 normalize=False, copy_X=True, verbose=False):
+                 normalize='deprecated', copy_X=True, verbose=False):
         self.n_iter = n_iter
         self.tol = tol
         self.alpha_1 = alpha_1
@@ -193,6 +198,10 @@ def fit(self, X, y, sample_weight=None):
         -------
         self : returns an instance of self.
         """
+        self._normalize = _deprecate_normalize(
+            self.normalize, default=False,
+            estimator_name=self.__class__.__name__
+        )
 
         if self.n_iter < 1:
             raise ValueError('n_iter should be greater than or equal to 1.'
@@ -205,7 +214,7 @@ def fit(self, X, y, sample_weight=None):
                                                  dtype=X.dtype)
 
         X, y, X_offset_, y_offset_, X_scale_ = self._preprocess_data(
-            X, y, self.fit_intercept, self.normalize, self.copy_X,
+            X, y, self.fit_intercept, self._normalize, self.copy_X,
             sample_weight=sample_weight)
 
         if sample_weight is not None:
@@ -325,7 +334,7 @@ def predict(self, X, return_std=False):
         if return_std is False:
             return y_mean
         else:
-            if self.normalize:
+            if self._normalize:
                 X = (X - self.X_offset_) / self.X_scale_
             sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
             y_std = np.sqrt(sigmas_squared_data + (1. / self.alpha_))
@@ -445,6 +454,10 @@ class ARDRegression(RegressorMixin, LinearModel):
         :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``
         on an estimator with ``normalize=False``.
 
+        .. deprecated:: 1.0
+            ``normalize`` was deprecated in version 1.0 and will be removed in
+            1.2.
+
     copy_X : bool, default=True
         If True, X will be copied; else, it may be overwritten.
 
@@ -510,8 +523,8 @@ class ARDRegression(RegressorMixin, LinearModel):
     @_deprecate_positional_args
     def __init__(self, *, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6,
                  lambda_1=1.e-6, lambda_2=1.e-6, compute_score=False,
-                 threshold_lambda=1.e+4, fit_intercept=True, normalize=False,
-                 copy_X=True, verbose=False):
+                 threshold_lambda=1.e+4, fit_intercept=True,
+                 normalize='deprecated', copy_X=True, verbose=False):
         self.n_iter = n_iter
         self.tol = tol
         self.fit_intercept = fit_intercept
@@ -543,6 +556,11 @@ def fit(self, X, y):
         -------
         self : returns an instance of self.
         """
+        self._normalize = _deprecate_normalize(
+            self.normalize, default=False,
+            estimator_name=self.__class__.__name__
+        )
+
         X, y = self._validate_data(X, y, dtype=np.float64, y_numeric=True,
                                    ensure_min_samples=2)
 
@@ -550,7 +568,7 @@ def fit(self, X, y):
         coef_ = np.zeros(n_features)
 
         X, y, X_offset_, y_offset_, X_scale_ = self._preprocess_data(
-            X, y, self.fit_intercept, self.normalize, self.copy_X)
+            X, y, self.fit_intercept, self._normalize, self.copy_X)
 
         self.X_offset_ = X_offset_
         self.X_scale_ = X_scale_
@@ -686,7 +704,7 @@ def predict(self, X, return_std=False):
         if return_std is False:
             return y_mean
         else:
-            if self.normalize:
+            if self._normalize:
                 X = (X - self.X_offset_) / self.X_scale_
             X = X[:, self.lambda_ < self.threshold_lambda]
             sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
diff --git a/sklearn/linear_model/tests/test_bayes.py b/sklearn/linear_model/tests/test_bayes.py
index 529702ff752ac..a22a0243cdcb7 100644
--- a/sklearn/linear_model/tests/test_bayes.py
+++ b/sklearn/linear_model/tests/test_bayes.py
@@ -274,6 +274,8 @@ def test_update_sigma(seed):
     np.testing.assert_allclose(sigma, sigma_woodbury)
 
 
+# FIXME: 'normalize' to be removed in 1.2 in LinearRegression
+@pytest.mark.filterwarnings("ignore:'normalize' was deprecated")
 def test_ard_regression_predict_normalize_true():
     """Check that we can predict with `normalize=True` and `return_std=True`.
     Non-regression test for:
diff --git a/sklearn/linear_model/tests/test_common.py b/sklearn/linear_model/tests/test_common.py
index 96a996d18dac7..f255384be4167 100644
--- a/sklearn/linear_model/tests/test_common.py
+++ b/sklearn/linear_model/tests/test_common.py
@@ -12,6 +12,8 @@
 from sklearn.linear_model import RidgeCV
 from sklearn.linear_model import RidgeClassifier
 from sklearn.linear_model import RidgeClassifierCV
+from sklearn.linear_model import BayesianRidge
+from sklearn.linear_model import ARDRegression
 
 from sklearn.utils import check_random_state
 
@@ -24,7 +26,8 @@
 )
 @pytest.mark.parametrize(
     "estimator",
-    [LinearRegression, Ridge, RidgeCV, RidgeClassifier, RidgeClassifierCV]
+    [LinearRegression, Ridge, RidgeCV, RidgeClassifier, RidgeClassifierCV,
+     BayesianRidge, ARDRegression]
 )
 # FIXME remove test in 1.2
 def test_linear_model_normalize_deprecation_message(

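For users of these estimators, the usual migration once ``normalize`` is removed is to move the scaling into a preprocessing step. A hedged sketch of that pattern follows; note that ``StandardScaler`` standardises by the standard deviation, which is not numerically identical to the old ``normalize=True`` scaling by the column norm::

    from sklearn.datasets import make_regression
    from sklearn.linear_model import BayesianRidge
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    X, y = make_regression(n_samples=100, n_features=5, random_state=0)

    # Scale in a pipeline step instead of passing normalize=True to the model.
    model = make_pipeline(StandardScaler(), BayesianRidge())
    model.fit(X, y)
    print(model.predict(X[:3]))
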
From a9cc0ed86fca1480acbd8aaf211f062ee2abd5b7 Mon Sep 17 00:00:00 2001
From: Maria Telenczuk 
Date: Wed, 28 Apr 2021 15:37:41 +0200
Subject: [PATCH 373/478] DOC correct the orders of the x labels (#19997)

---
 examples/impute/plot_missing_values.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py
index f5d75f68c3d09..3ea5c61427ff0 100644
--- a/examples/impute/plot_missing_values.py
+++ b/examples/impute/plot_missing_values.py
@@ -121,11 +121,7 @@ def get_scores_for_imputer(imputer, X_missing, y_missing):
     return impute_scores
 
 
-x_labels = ['Full data',
-            'Zero imputation',
-            'Mean Imputation',
-            'KNN Imputation',
-            'Iterative Imputation']
+x_labels = []
 
 mses_california = np.zeros(5)
 stds_california = np.zeros(5)
@@ -149,6 +145,7 @@ def get_full_score(X_full, y_full):
 mses_california[0], stds_california[0] = get_full_score(X_california,
                                                         y_california)
 mses_diabetes[0], stds_diabetes[0] = get_full_score(X_diabetes, y_diabetes)
+x_labels.append('Full data')
 
 
 # %%
@@ -172,6 +169,7 @@ def get_impute_zero_score(X_missing, y_missing):
     X_miss_california, y_miss_california)
 mses_diabetes[1], stds_diabetes[1] = get_impute_zero_score(X_miss_diabetes,
                                                            y_miss_diabetes)
+x_labels.append('Zero imputation')
 
 
 # %%
@@ -191,6 +189,7 @@ def get_impute_knn_score(X_missing, y_missing):
     X_miss_california, y_miss_california)
 mses_diabetes[2], stds_diabetes[2] = get_impute_knn_score(X_miss_diabetes,
                                                           y_miss_diabetes)
+x_labels.append('KNN Imputation')
 
 
 # %%
@@ -209,6 +208,7 @@ def get_impute_mean(X_missing, y_missing):
                                                          y_miss_california)
 mses_diabetes[3], stds_diabetes[3] = get_impute_mean(X_miss_diabetes,
                                                      y_miss_diabetes)
+x_labels.append('Mean Imputation')
 
 
 # %%
@@ -237,6 +237,7 @@ def get_impute_iterative(X_missing, y_missing):
     X_miss_california, y_miss_california)
 mses_diabetes[4], stds_diabetes[4] = get_impute_iterative(X_miss_diabetes,
                                                           y_miss_diabetes)
+x_labels.append('Iterative Imputation')
 
 mses_diabetes = mses_diabetes * -1
 mses_california = mses_california * -1

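The underlying bug was that the up-front ``x_labels`` list was ordered differently from the order in which the scores were filled in. Appending the label at the same place where the corresponding score is computed makes such a mismatch impossible; a minimal sketch of the pattern, where ``score_funcs`` and its values are hypothetical placeholders::

    labels, scores = [], []
    score_funcs = {
        "Full data": lambda: 0.81,        # hypothetical scores
        "Zero imputation": lambda: 0.55,
        "KNN Imputation": lambda: 0.63,
    }
    for label, func in score_funcs.items():
        scores.append(func())
        labels.append(label)  # label recorded alongside its score
    assert len(labels) == len(scores)
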
From 9c3b402f0082cfc17da3ab9430a203ecc2ac4dfc Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" 
Date: Thu, 29 Apr 2021 15:46:56 -0400
Subject: [PATCH 374/478] ENH Makes ColumnTransformer more flexible by only
 checking for non-dropped columns (#19263)

---
 doc/modules/compose.rst                       |  14 ++
 doc/whats_new/v1.0.rst                        |   5 +
 sklearn/compose/_column_transformer.py        | 106 ++++++++----
 .../compose/tests/test_column_transformer.py  | 151 +++++++++---------
 4 files changed, 164 insertions(+), 112 deletions(-)

diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst
index 6e827304c38cd..a9195ba9ab022 100644
--- a/doc/modules/compose.rst
+++ b/doc/modules/compose.rst
@@ -527,6 +527,20 @@ above example would be::
                                   ('countvectorizer', CountVectorizer(),
                                    'title')])
 
+If :class:`~sklearn.compose.ColumnTransformer` is fitted with a dataframe
+and the dataframe only has string column names, then transforming a dataframe
+will use the column names to select the columns::
+
+
+  >>> ct = ColumnTransformer(
+  ...          [("scale", StandardScaler(), ["expert_rating"])]).fit(X)
+  >>> X_new = pd.DataFrame({"expert_rating": [5, 6, 1],
+  ...                       "ignored_new_col": [1.2, 0.3, -0.1]})
+  >>> ct.transform(X_new)
+  array([[ 0.9...],
+         [ 2.1...],
+         [-3.9...]])
+
 .. _visualizing_composite_estimators:
 
 Visualizing Composite Estimators
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 977d83890e0c0..d26c5dd0c347d 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -131,6 +131,11 @@ Changelog
   of each transformer in `output_indices_`. :pr:`18393` by
   :user:`Luca Bittarello `.
 
+- |Enhancement| :class:`compose.ColumnTransformer` now allows DataFrame input to
+  have its columns appear in a changed order in `transform`. Further, columns that
+  are dropped will not be required in transform, and additional columns will be
+  ignored if `remainder='drop'`. :pr:`19263` by `Thomas Fan`_
+
 - |FIX| :meth:`compose.ColumnTransformer.get_feature_names` supports
   non-string feature names returned by any of its transformers.
   :pr:`18459` by :user:`Albert Villanova del Moral ` and
diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
index 2f2da882652c0..441fc95a106f1 100644
--- a/sklearn/compose/_column_transformer.py
+++ b/sklearn/compose/_column_transformer.py
@@ -244,7 +244,8 @@ def set_params(self, **kwargs):
         self._set_params('_transformers', **kwargs)
         return self
 
-    def _iter(self, fitted=False, replace_strings=False):
+    def _iter(self, fitted=False, replace_strings=False,
+              column_as_strings=False):
         """
         Generate (name, trans, column, weight) tuples.
 
@@ -262,11 +263,11 @@ def _iter(self, fitted=False, replace_strings=False):
                 in zip(self.transformers, self._columns)
             ]
             # add transformer tuple for remainder
-            if self._remainder[2] is not None:
+            if self._remainder[2]:
                 transformers = chain(transformers, [self._remainder])
         get_weight = (self.transformer_weights or {}).get
 
-        for name, trans, column in transformers:
+        for name, trans, columns in transformers:
             if replace_strings:
                 # replace 'passthrough' with identity transformer and
                 # skip in case of 'drop'
@@ -276,10 +277,21 @@ def _iter(self, fitted=False, replace_strings=False):
                     )
                 elif trans == 'drop':
                     continue
-                elif _is_empty_column_selection(column):
+                elif _is_empty_column_selection(columns):
                     continue
 
-            yield (name, trans, column, get_weight(name))
+            if column_as_strings and self._only_str_columns:
+                # Convert all columns to using their string labels
+                columns_is_scalar = np.isscalar(columns)
+
+                indices = self._transformer_to_input_indices[name]
+                columns = self._feature_names_in[indices]
+
+                if columns_is_scalar:
+                    # selection is done with one dimension
+                    columns = columns[0]
+
+            yield (name, trans, columns, get_weight(name))
 
     def _validate_transformers(self):
         if not self.transformers:
@@ -305,12 +317,17 @@ def _validate_column_callables(self, X):
         """
         Converts callable column specifications.
         """
-        columns = []
-        for _, _, column in self.transformers:
-            if callable(column):
-                column = column(X)
-            columns.append(column)
-        self._columns = columns
+        all_columns = []
+        transformer_to_input_indices = {}
+        for name, _, columns in self.transformers:
+            if callable(columns):
+                columns = columns(X)
+            all_columns.append(columns)
+            transformer_to_input_indices[name] = _get_column_indices(X,
+                                                                     columns)
+
+        self._columns = all_columns
+        self._transformer_to_input_indices = transformer_to_input_indices
 
     def _validate_remainder(self, X):
         """
@@ -328,12 +345,10 @@ def _validate_remainder(self, X):
                 self.remainder)
 
         self._n_features = X.shape[1]
-        cols = []
-        for columns in self._columns:
-            cols.extend(_get_column_indices(X, columns))
-
-        remaining_idx = sorted(set(range(self._n_features)) - set(cols))
-        self._remainder = ('remainder', self.remainder, remaining_idx or None)
+        cols = set(chain(*self._transformer_to_input_indices.values()))
+        remaining = sorted(set(range(self._n_features)) - cols)
+        self._remainder = ('remainder', self.remainder, remaining)
+        self._transformer_to_input_indices['remainder'] = remaining
 
     @property
     def named_transformers_(self):
@@ -443,7 +458,8 @@ def _log_message(self, name, idx, total):
             return None
         return '(%d of %d) Processing %s' % (idx, total, name)
 
-    def _fit_transform(self, X, y, func, fitted=False):
+    def _fit_transform(self, X, y, func, fitted=False,
+                       column_as_strings=False):
         """
         Private function to fit and/or transform on demand.
 
@@ -452,7 +468,9 @@ def _fit_transform(self, X, y, func, fitted=False):
         ``fitted=True`` ensures the fitted transformers are used.
         """
         transformers = list(
-            self._iter(fitted=fitted, replace_strings=True))
+            self._iter(
+                fitted=fitted, replace_strings=True,
+                column_as_strings=column_as_strings))
         try:
             return Parallel(n_jobs=self.n_jobs)(
                 delayed(func)(
@@ -518,6 +536,8 @@ def fit_transform(self, X, y=None):
         # TODO: this should be `feature_names_in_` when we start having it
         if hasattr(X, "columns"):
             self._feature_names_in = np.asarray(X.columns)
+            self._only_str_columns = all(isinstance(col, str)
+                                         for col in self._feature_names_in)
         else:
             self._feature_names_in = None
         X = _check_X(X)
@@ -572,20 +592,34 @@ def transform(self, X):
         """
         check_is_fitted(self)
         X = _check_X(X)
-        if hasattr(X, "columns"):
-            X_feature_names = np.asarray(X.columns)
+
+        fit_dataframe_and_transform_dataframe = (
+            self._feature_names_in is not None and hasattr(X, "columns"))
+
+        if fit_dataframe_and_transform_dataframe:
+            named_transformers = self.named_transformers_
+            # check that all names seen in fit are in transform, unless
+            # they were dropped
+            non_dropped_indices = [
+                ind for name, ind in self._transformer_to_input_indices.items()
+                if name in named_transformers and
+                isinstance(named_transformers[name], str) and
+                named_transformers[name] != 'drop']
+
+            all_indices = set(chain(*non_dropped_indices))
+            all_names = set(self._feature_names_in[ind] for ind in all_indices)
+
+            diff = all_names - set(X.columns)
+            if diff:
+                raise ValueError(f"columns are missing: {diff}")
         else:
-            X_feature_names = None
-
-        self._check_n_features(X, reset=False)
-        if (self._feature_names_in is not None and
-            X_feature_names is not None and
-                np.any(self._feature_names_in != X_feature_names)):
-            raise RuntimeError(
-                "Given feature/column names do not match the ones for the "
-                "data given during fit."
-            )
-        Xs = self._fit_transform(X, None, _transform_one, fitted=True)
+            # ndarray was used for fitting or transforming, thus we only
+            # check that n_features_in_ is consistent
+            self._check_n_features(X, reset=False)
+
+        Xs = self._fit_transform(
+            X, None, _transform_one, fitted=True,
+            column_as_strings=fit_dataframe_and_transform_dataframe)
         self._validate_output(Xs)
 
         if not Xs:
@@ -629,10 +663,12 @@ def _sk_visual_block_(self):
             transformers = self.transformers
         elif hasattr(self, "_remainder"):
             remainder_columns = self._remainder[2]
-            if self._feature_names_in is not None:
+            if (self._feature_names_in is not None and
+                    remainder_columns and
+                    not all(isinstance(col, str)
+                            for col in remainder_columns)):
                 remainder_columns = (
-                    self._feature_names_in[remainder_columns].tolist()
-                )
+                    self._feature_names_in[remainder_columns].tolist())
             transformers = chain(self.transformers,
                                  [('remainder', self.remainder,
                                    remainder_columns)])
diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py
index 549292ab51445..9278d67296ec5 100644
--- a/sklearn/compose/tests/test_column_transformer.py
+++ b/sklearn/compose/tests/test_column_transformer.py
@@ -4,7 +4,6 @@
 import re
 import pickle
 
-import warnings
 import numpy as np
 from scipy import sparse
 import pytest
@@ -1260,82 +1259,6 @@ def test_column_transformer_negative_column_indexes():
     assert_array_equal(tf_1.fit_transform(X), tf_2.fit_transform(X))
 
 
-@pytest.mark.parametrize("explicit_colname", ['first', 'second'])
-def test_column_transformer_reordered_column_names_remainder(explicit_colname):
-    """Regression test for issue #14223: 'Named col indexing fails with
-       ColumnTransformer remainder on changing DataFrame column ordering'
-
-       Should raise error on changed order combined with remainder.
-       Should allow for added columns in `transform` input DataFrame
-       as long as all preceding columns match.
-    """
-    pd = pytest.importorskip('pandas')
-
-    X_fit_array = np.array([[0, 1, 2], [2, 4, 6]]).T
-    X_fit_df = pd.DataFrame(X_fit_array, columns=['first', 'second'])
-
-    X_trans_array = np.array([[2, 4, 6], [0, 1, 2]]).T
-    X_trans_df = pd.DataFrame(X_trans_array, columns=['second', 'first'])
-
-    tf = ColumnTransformer([('bycol', Trans(), explicit_colname)],
-                           remainder=Trans())
-
-    tf.fit(X_fit_df)
-    err_msg = ("Given feature/column names do not match the ones for the "
-               "data given during fit.")
-    with pytest.raises(RuntimeError, match=err_msg):
-        tf.transform(X_trans_df)
-
-    # ValueError for added columns
-    X_extended_df = X_fit_df.copy()
-    X_extended_df['third'] = [3, 6, 9]
-    err_msg = ("X has 3 features, but ColumnTransformer is expecting 2 "
-               "features as input.")
-    with pytest.raises(ValueError, match=err_msg):
-        tf.transform(X_extended_df)
-
-    # No 'columns' AttributeError when transform input is a numpy array
-    X_array = X_fit_array.copy()
-    err_msg = 'Specifying the columns'
-    with pytest.raises(ValueError, match=err_msg):
-        tf.transform(X_array)
-
-
-def test_feature_name_validation():
-    """Tests if the proper warning/error is raised if the columns do not match
-    during fit and transform."""
-    pd = pytest.importorskip("pandas")
-
-    X = np.ones(shape=(3, 2))
-    X_extra = np.ones(shape=(3, 3))
-    df = pd.DataFrame(X, columns=['a', 'b'])
-    df_extra = pd.DataFrame(X_extra, columns=['a', 'b', 'c'])
-
-    tf = ColumnTransformer([('bycol', Trans(), ['a', 'b'])])
-    tf.fit(df)
-
-    msg = ("X has 3 features, but ColumnTransformer is expecting 2 features "
-           "as input.")
-    with pytest.raises(ValueError, match=msg):
-        tf.transform(df_extra)
-
-    tf = ColumnTransformer([('bycol', Trans(), [0])])
-    tf.fit(df)
-
-    with pytest.raises(ValueError, match=msg):
-        tf.transform(X_extra)
-
-    with warnings.catch_warnings(record=True) as warns:
-        tf.transform(X)
-    assert not warns
-
-    tf = ColumnTransformer([('bycol', Trans(), ['a'])],
-                           remainder=Trans())
-    tf.fit(df)
-    with pytest.raises(ValueError, match=msg):
-        tf.transform(df_extra)
-
-
 @pytest.mark.parametrize("array_type", [np.asarray, sparse.csr_matrix])
 def test_column_transformer_mask_indexing(array_type):
     # Regression test for #14510
@@ -1516,6 +1439,80 @@ def test_sk_visual_block_remainder_fitted_numpy(remainder):
     assert visual_block.estimators == (scaler, remainder)
 
 
+@pytest.mark.parametrize("explicit_colname", ['first', 'second', 0, 1])
+@pytest.mark.parametrize("remainder", [Trans(), 'passthrough', 'drop'])
+def test_column_transformer_reordered_column_names_remainder(explicit_colname,
+                                                             remainder):
+    """Test the interaction between remainder and column transformer"""
+    pd = pytest.importorskip('pandas')
+
+    X_fit_array = np.array([[0, 1, 2], [2, 4, 6]]).T
+    X_fit_df = pd.DataFrame(X_fit_array, columns=['first', 'second'])
+
+    X_trans_array = np.array([[2, 4, 6], [0, 1, 2]]).T
+    X_trans_df = pd.DataFrame(X_trans_array, columns=['second', 'first'])
+
+    tf = ColumnTransformer([('bycol', Trans(), explicit_colname)],
+                           remainder=remainder)
+
+    tf.fit(X_fit_df)
+    X_fit_trans = tf.transform(X_fit_df)
+
+    # Changing the order still works
+    X_trans = tf.transform(X_trans_df)
+    assert_allclose(X_trans, X_fit_trans)
+
+    # extra columns are ignored
+    X_extended_df = X_fit_df.copy()
+    X_extended_df['third'] = [3, 6, 9]
+    X_trans = tf.transform(X_extended_df)
+    assert_allclose(X_trans, X_fit_trans)
+
+    if isinstance(explicit_colname, str):
+        # Raise error if columns are specified by names but input only allows
+        # to specify by position, e.g. numpy array instead of a pandas df.
+        X_array = X_fit_array.copy()
+        err_msg = 'Specifying the columns'
+        with pytest.raises(ValueError, match=err_msg):
+            tf.transform(X_array)
+
+
+def test_feature_name_validation_missing_columns_drop_passthough():
+    """Test the interaction between {'drop', 'passthrough'} and
+    missing column names."""
+    pd = pytest.importorskip("pandas")
+
+    X = np.ones(shape=(3, 4))
+    df = pd.DataFrame(X, columns=['a', 'b', 'c', 'd'])
+
+    df_dropped = df.drop('c', axis=1)
+
+    # with remainder='passthrough', all columns seen during `fit` must be
+    # present
+    tf = ColumnTransformer([('bycol', Trans(), [1])], remainder='passthrough')
+    tf.fit(df)
+    msg = r"columns are missing: {'c'}"
+    with pytest.raises(ValueError, match=msg):
+        tf.transform(df_dropped)
+
+    # with remainder='drop', it is allowed to have column 'c' missing
+    tf = ColumnTransformer([('bycol', Trans(), [1])],
+                           remainder='drop')
+    tf.fit(df)
+
+    df_dropped_trans = tf.transform(df_dropped)
+    df_fit_trans = tf.transform(df)
+    assert_allclose(df_dropped_trans, df_fit_trans)
+
+    # bycol drops 'c', thus it is allowed for 'c' to be missing
+    tf = ColumnTransformer([('bycol', 'drop', ['c'])],
+                           remainder='passthrough')
+    tf.fit(df)
+    df_dropped_trans = tf.transform(df_dropped)
+    df_fit_trans = tf.transform(df)
+    assert_allclose(df_dropped_trans, df_fit_trans)
+
+
 @pytest.mark.parametrize("selector", [[], [False, False]])
 def test_get_feature_names_empty_selection(selector):
     """Test that get_feature_names is only called for transformers that

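The behaviour pinned down by these tests can be summarised with a short usage sketch: after fitting on a dataframe with string column names, `transform` selects columns by name, so reordered or additional columns are handled as long as the columns actually used during `fit` are present::

    import pandas as pd
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler

    X_fit = pd.DataFrame({"a": [0.0, 1.0, 2.0], "b": [2.0, 4.0, 6.0]})
    ct = ColumnTransformer([("scale", StandardScaler(), ["a"])],
                           remainder="drop").fit(X_fit)

    # Reordered plus an extra column: accepted, because only "a" is needed
    # and remainder="drop" ignores everything else.
    X_new = pd.DataFrame({"extra": [9.0, 9.0, 9.0],
                          "b": [2.0, 4.0, 6.0],
                          "a": [0.0, 1.0, 2.0]})
    print(ct.transform(X_new))
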
From 36a9257b9ae797ff150034e6054a3081d2941ae5 Mon Sep 17 00:00:00 2001
From: Helder Geovane Gomes de Lima 
Date: Thu, 29 Apr 2021 20:33:33 -0300
Subject: [PATCH 375/478] DOC Fix typo on comment about t-SNE (#20009)

---
 doc/modules/manifold.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/modules/manifold.rst b/doc/modules/manifold.rst
index 72e8c7485df44..e6e8e842fa7fc 100644
--- a/doc/modules/manifold.rst
+++ b/doc/modules/manifold.rst
@@ -602,7 +602,7 @@ be well separated by non linear methods that focus on the local structure (e.g.
 an SVM with a Gaussian RBF kernel). However, failing to visualize well
 separated homogeneously labeled groups with t-SNE in 2D does not necessarily
 imply that the data cannot be correctly classified by a supervised model. It
-might be the case that 2 dimensions are not low enough to accurately represents
+might be the case that 2 dimensions are not high enough to accurately represent
 the internal structure of the data.
 
 

From 4023a0f94bde429456f45b983c84c5f35475480f Mon Sep 17 00:00:00 2001
From: Haoyin Xu 
Date: Thu, 29 Apr 2021 19:38:53 -0400
Subject: [PATCH 376/478] CLN Fix _add_node parameter name (#20008)

---
 sklearn/tree/_tree.pxd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd
index 8957f0342892a..a6be4ece56970 100644
--- a/sklearn/tree/_tree.pxd
+++ b/sklearn/tree/_tree.pxd
@@ -58,7 +58,7 @@ cdef class Tree:
     cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf,
                           SIZE_t feature, double threshold, double impurity,
                           SIZE_t n_node_samples,
-                          double weighted_n_samples) nogil except -1
+                          double weighted_n_node_samples) nogil except -1
     cdef int _resize(self, SIZE_t capacity) nogil except -1
     cdef int _resize_c(self, SIZE_t capacity=*) nogil except -1
 

From 86bda0ae8472687e19bc071c4cbb957a21738650 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre 
Date: Mon, 3 May 2021 09:25:54 +0200
Subject: [PATCH 377/478] DOC add bug fixes releases in News web section

---
 doc/templates/index.html | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/doc/templates/index.html b/doc/templates/index.html
index c098fc05948af..ccc6ef51a08e0 100644
--- a/doc/templates/index.html
+++ b/doc/templates/index.html
@@ -155,6 +155,10 @@ News
           • On-going development: What's new (Changelog)
+          • April 2021. scikit-learn 0.24.2 is available for download (Changelog).
+          • January 2021. scikit-learn 0.24.1 is available for download (Changelog).
           • December 2020. scikit-learn 0.24.0 is available for download (Changelog).
          • August 2020. scikit-learn 0.23.2 is available for download (Changelog).

From a9ce392f3a58da5caf5ac9bd287205e220082fc5 Mon Sep 17 00:00:00 2001
From: Chiara Marmo 
Date: Mon, 3 May 2021 16:05:25 +0200
Subject: [PATCH 378/478] DOC Add figure tag properties in css (#20028)

---
 doc/themes/scikit-learn-modern/static/css/theme.css | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css
index ed7a86a20fa3b..4d2b78c6a7322 100644
--- a/doc/themes/scikit-learn-modern/static/css/theme.css
+++ b/doc/themes/scikit-learn-modern/static/css/theme.css
@@ -849,7 +849,8 @@ div.body dd > p {
   hyphens: none;
 }
 
-img.align-center, .figure.align-center, object.align-center {
+img.align-center, figure.align-center,
+.figure.align-center, object.align-center {
   display: block;
   margin-left: auto;
   margin-right: auto;
@@ -857,7 +858,8 @@ img.align-center, .figure.align-center, object.align-center {
   text-align: center;
 }
 
-img.align-right, .figure.align-right, object.align-right {
+img.align-right, figure.align-right,
+.figure.align-right, object.align-right {
   clear: right;
   float: right;
   margin-left: 1em;

From 4803a0adfe0be1f4de788a77471a6b34df71b43f Mon Sep 17 00:00:00 2001
From: Chiara Marmo 
Date: Wed, 5 May 2021 19:43:44 +0200
Subject: [PATCH 379/478] DOC Add a note about the involvement of the
 contributor in maintenance. (#20044)

Co-authored-by: Julien Jerphanion 
Co-authored-by: Nicolas Hug 
Co-authored-by: Nicolas Hug 
---
 build_tools/circle/build_doc.sh | 2 +-
 doc/developers/contributing.rst | 9 ++++++---
 doc/developers/tips.rst         | 2 +-
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh
index 37afb1841d368..c447db180697c 100755
--- a/build_tools/circle/build_doc.sh
+++ b/build_tools/circle/build_doc.sh
@@ -1,4 +1,4 @@
-#!/usr/bin/env bash
+q#!/usr/bin/env bash
 
 set -x
 set -e
diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst
index 7f3aeb9537413..2a7ec7afe48a4 100644
--- a/doc/developers/contributing.rst
+++ b/doc/developers/contributing.rst
@@ -455,9 +455,12 @@ complies with the following rules before marking a PR as ``[MRG]``. The
     functionality is useful in practice and, if possible, compare it to
     other methods available in scikit-learn.
 
-10. New features often need to be illustrated with narrative documentation in
-    the user guide, with small code snippets. If relevant, please also add
-    references in the literature, with PDF links when possible.
+10. New features have some maintenance overhead. We expect PR authors
+    to take part in the maintenance for the code they submit, at least
+    initially. New features need to be illustrated with narrative
+    documentation in the user guide, with small code snippets.
+    If relevant, please also add references in the literature, with PDF links
+    when possible.
 
 11. The user guide should also include expected time and space complexity of
     the algorithm and scalability, e.g. "this algorithm can scale to a
diff --git a/doc/developers/tips.rst b/doc/developers/tips.rst
index 8cf5bd5b5d094..36e2cd4a58779 100644
--- a/doc/developers/tips.rst
+++ b/doc/developers/tips.rst
@@ -190,7 +190,7 @@ PR-NEW: Fix #
 PR-NEW or Issue: Maintenance cost
     ::
 
-        Every feature we include has a [maintenance cost](http://scikit-learn.org/dev/faq.html#why-are-you-so-selective-on-what-algorithms-you-include-in-scikit-learn). Our maintainers are mostly volunteers. For a new feature to be included, we need evidence that it is often useful and, ideally, [well-established](http://scikit-learn.org/dev/faq.html#what-are-the-inclusion-criteria-for-new-algorithms) in the literature or in practice. That doesn't stop you implementing it for yourself and publishing it in a separate repository, or even [scikit-learn-contrib](https://scikit-learn-contrib.github.io).
+        Every feature we include has a [maintenance cost](http://scikit-learn.org/dev/faq.html#why-are-you-so-selective-on-what-algorithms-you-include-in-scikit-learn). Our maintainers are mostly volunteers. For a new feature to be included, we need evidence that it is often useful and, ideally, [well-established](http://scikit-learn.org/dev/faq.html#what-are-the-inclusion-criteria-for-new-algorithms) in the literature or in practice. Also, we expect PR authors to take part in the maintenance for the code they submit, at least initially. That doesn't stop you implementing it for yourself and publishing it in a separate repository, or even [scikit-learn-contrib](https://scikit-learn-contrib.github.io).
 
 PR-WIP: What's needed before merge?
     ::

From 0eb9ad73c53c8f3cc0ea03d33312035853bee29b Mon Sep 17 00:00:00 2001
From: Nicolas Hug 
Date: Wed, 5 May 2021 20:59:59 +0100
Subject: [PATCH 380/478] MNT fix bad shebang in build_doc.sh (#20050)

---
 build_tools/circle/build_doc.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh
index c447db180697c..37afb1841d368 100755
--- a/build_tools/circle/build_doc.sh
+++ b/build_tools/circle/build_doc.sh
@@ -1,4 +1,4 @@
-q#!/usr/bin/env bash
+#!/usr/bin/env bash
 
 set -x
 set -e

From de1262c35e2aa4ee062d050281ee576ce9e35c94 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" 
Date: Wed, 5 May 2021 21:38:17 -0400
Subject: [PATCH 381/478] CLN Remove **kwargs in Neighbors estiamtors (#20013)

---
 sklearn/neighbors/_classification.py      |  7 +++----
 sklearn/neighbors/_regression.py          | 10 ++++------
 sklearn/neighbors/tests/test_neighbors.py |  4 +---
 3 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py
index 71b869977f6aa..83078e9f77ba9 100644
--- a/sklearn/neighbors/_classification.py
+++ b/sklearn/neighbors/_classification.py
@@ -147,14 +147,13 @@ class KNeighborsClassifier(KNeighborsMixin,
     @_deprecate_positional_args
     def __init__(self, n_neighbors=5, *,
                  weights='uniform', algorithm='auto', leaf_size=30,
-                 p=2, metric='minkowski', metric_params=None, n_jobs=None,
-                 **kwargs):
+                 p=2, metric='minkowski', metric_params=None, n_jobs=None):
         super().__init__(
             n_neighbors=n_neighbors,
             algorithm=algorithm,
             leaf_size=leaf_size, metric=metric, p=p,
             metric_params=metric_params,
-            n_jobs=n_jobs, **kwargs)
+            n_jobs=n_jobs)
         self.weights = _check_weights(weights)
 
     def fit(self, X, y):
@@ -415,7 +414,7 @@ def __init__(self, radius=1.0, *, weights='uniform',
             algorithm=algorithm,
             leaf_size=leaf_size,
             metric=metric, p=p, metric_params=metric_params,
-            n_jobs=n_jobs, **kwargs)
+            n_jobs=n_jobs)
         self.weights = _check_weights(weights)
         self.outlier_label = outlier_label
 
diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py
index d3878cd54aa06..62d6cf33575e4 100644
--- a/sklearn/neighbors/_regression.py
+++ b/sklearn/neighbors/_regression.py
@@ -146,13 +146,12 @@ class KNeighborsRegressor(KNeighborsMixin,
     @_deprecate_positional_args
     def __init__(self, n_neighbors=5, *, weights='uniform',
                  algorithm='auto', leaf_size=30,
-                 p=2, metric='minkowski', metric_params=None, n_jobs=None,
-                 **kwargs):
+                 p=2, metric='minkowski', metric_params=None, n_jobs=None):
         super().__init__(
             n_neighbors=n_neighbors,
             algorithm=algorithm,
             leaf_size=leaf_size, metric=metric, p=p,
-            metric_params=metric_params, n_jobs=n_jobs, **kwargs)
+            metric_params=metric_params, n_jobs=n_jobs)
         self.weights = _check_weights(weights)
 
     def _more_tags(self):
@@ -346,14 +345,13 @@ class RadiusNeighborsRegressor(RadiusNeighborsMixin,
     @_deprecate_positional_args
     def __init__(self, radius=1.0, *, weights='uniform',
                  algorithm='auto', leaf_size=30,
-                 p=2, metric='minkowski', metric_params=None, n_jobs=None,
-                 **kwargs):
+                 p=2, metric='minkowski', metric_params=None, n_jobs=None):
         super().__init__(
             radius=radius,
             algorithm=algorithm,
             leaf_size=leaf_size,
             p=p, metric=metric, metric_params=metric_params,
-            n_jobs=n_jobs, **kwargs)
+            n_jobs=n_jobs)
         self.weights = _check_weights(weights)
 
     def fit(self, X, y):
diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index 8ce52119faa02..5df7a6419b0b5 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -978,7 +978,6 @@ def test_RadiusNeighborsRegressor_multioutput_with_uniform_weight():
 def test_RadiusNeighborsRegressor_multioutput(n_samples=40,
                                               n_features=5,
                                               n_test_pts=10,
-                                              n_neighbors=3,
                                               random_state=0):
     # Test k-neighbors in multi-output regression with various weight
     rng = np.random.RandomState(random_state)
@@ -991,8 +990,7 @@ def test_RadiusNeighborsRegressor_multioutput(n_samples=40,
     weights = ['uniform', 'distance', _weight_func]
 
     for algorithm, weights in product(ALGORITHMS, weights):
-        rnn = neighbors.RadiusNeighborsRegressor(n_neighbors=n_neighbors,
-                                                 weights=weights,
+        rnn = neighbors.RadiusNeighborsRegressor(weights=weights,
                                                  algorithm=algorithm)
         rnn.fit(X, y)
         epsilon = 1E-5 * (2 * rng.rand(1, n_features) - 1)

From 99754cd4f6525b41d489fd541453fbd0c2e02fd5 Mon Sep 17 00:00:00 2001
From: Haoyin Xu 
Date: Mon, 10 May 2021 08:06:35 -0400
Subject: [PATCH 382/478] DOC Add note for videos & improve syncing instructions (#20022)

Co-authored-by: Guillaume Lemaitre 
---
 doc/developers/contributing.rst | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst
index 2a7ec7afe48a4..0284ad179fc19 100644
--- a/doc/developers/contributing.rst
+++ b/doc/developers/contributing.rst
@@ -218,6 +218,13 @@ latest up-to-date workflow.
 
 `Transcript
 `__
 
+.. note::
+  In January 2021, the default branch name changed from ``master`` to ``main``
+  for the scikit-learn GitHub repository to use more inclusive terms.
+  These videos were created prior to the renaming of the branch.
+  For contributors who are viewing these videos to set up their
+  working environment and submitting a PR, ``master`` should be replaced to ``main``.
+
 How to contribute
 -----------------
 
@@ -274,12 +281,14 @@ You should now have a working installation of scikit-learn, and your git
 repository properly configured. The next steps now describe the process of
 modifying code and submitting a PR:
 
-7. Synchronize your main branch with the upstream main branch:
+7. Synchronize your ``main`` branch with the ``upstream/main`` branch,
+   more details on `GitHub Docs `_:
 
    .. prompt:: bash $
 
        git checkout main
-       git pull upstream main
+       git fetch upstream
+       git merge upstream/main
 
 8. Create a feature branch to hold your development changes:
 
@@ -416,12 +425,12 @@ complies with the following rules before marking a PR as ``[MRG]``. The
    verify the correct behavior of the fix or feature. In this manner, further
    modifications on the code base are granted to be consistent with the
    desired behavior. In the case of bug fixes, at the time of the PR, the
-   non-regression tests should fail for the code base in the main branch
+   non-regression tests should fail for the code base in the ``main`` branch
   and pass for the PR code.
 
 5. **Make sure that your PR does not add PEP8 violations**. To check the
    code that you changed, you can run the following command (see
-   :ref:`above ` to set up the upstream remote):
+   :ref:`above ` to set up the ``upstream`` remote):
 
    .. prompt:: bash $
 
@@ -934,7 +943,7 @@ scikit-learn. Make sure it is up to date:
 
 In the benchmark suite, the benchmarks are organized following the same
 structure as scikit-learn. For example, you can compare the performance of a
-specific estimator between upstream/main and the branch you are working on:
+specific estimator between ``upstream/main`` and the branch you are working on:
 
 ..
prompt:: bash $ From 6cc3afbe0221df9a0b51eedc714a8b46c21f5c50 Mon Sep 17 00:00:00 2001 From: Chuliang Xiao Date: Mon, 10 May 2021 09:57:19 -0400 Subject: [PATCH 383/478] CLN Replace self.steps[-1][-1] with self.steps[-1][1] (#20063) --- sklearn/pipeline.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 1c9a62d02b7d0..024bfe4f1dd38 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -416,7 +416,7 @@ def predict(self, X, **predict_params): Xt = X for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt) - return self.steps[-1][-1].predict(Xt, **predict_params) + return self.steps[-1][1].predict(Xt, **predict_params) @if_delegate_has_method(delegate='_final_estimator') def fit_predict(self, X, y=None, **fit_params): @@ -451,7 +451,7 @@ def fit_predict(self, X, y=None, **fit_params): fit_params_last_step = fit_params_steps[self.steps[-1][0]] with _print_elapsed_time('Pipeline', self._log_message(len(self.steps) - 1)): - y_pred = self.steps[-1][-1].fit_predict(Xt, y, + y_pred = self.steps[-1][1].fit_predict(Xt, y, **fit_params_last_step) return y_pred @@ -476,7 +476,7 @@ def predict_proba(self, X, **predict_proba_params): Xt = X for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt) - return self.steps[-1][-1].predict_proba(Xt, **predict_proba_params) + return self.steps[-1][1].predict_proba(Xt, **predict_proba_params) @if_delegate_has_method(delegate='_final_estimator') def decision_function(self, X): @@ -495,7 +495,7 @@ def decision_function(self, X): Xt = X for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt) - return self.steps[-1][-1].decision_function(Xt) + return self.steps[-1][1].decision_function(Xt) @if_delegate_has_method(delegate='_final_estimator') def score_samples(self, X): @@ -514,7 +514,7 @@ def score_samples(self, X): Xt = X for _, _, transformer in self._iter(with_final=False): Xt = transformer.transform(Xt) - return self.steps[-1][-1].score_samples(Xt) + return self.steps[-1][1].score_samples(Xt) @if_delegate_has_method(delegate='_final_estimator') def predict_log_proba(self, X, **predict_log_proba_params): @@ -537,7 +537,7 @@ def predict_log_proba(self, X, **predict_log_proba_params): Xt = X for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt) - return self.steps[-1][-1].predict_log_proba( + return self.steps[-1][1].predict_log_proba( Xt, **predict_log_proba_params ) @@ -629,11 +629,11 @@ def score(self, X, y=None, sample_weight=None): score_params = {} if sample_weight is not None: score_params['sample_weight'] = sample_weight - return self.steps[-1][-1].score(Xt, y, **score_params) + return self.steps[-1][1].score(Xt, y, **score_params) @property def classes_(self): - return self.steps[-1][-1].classes_ + return self.steps[-1][1].classes_ def _more_tags(self): # check if first estimator expects pairwise input From ee2298213a74c0b120de5a5bd2b2f83a84a134d3 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Mon, 10 May 2021 12:11:49 -0400 Subject: [PATCH 384/478] TST Adjust atol in test_ridge_regression_check_arguments based on 32bit-ness (#20071) --- sklearn/linear_model/tests/test_ridge.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index b812788239b14..d83248cfae4af 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -5,6 +5,7 @@ import pytest +from sklearn.utils import _IS_32BIT from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_almost_equal @@ -1279,7 +1280,8 @@ def test_ridge_regression_check_arguments_validity(return_intercept, y += true_intercept X_testing = arr_type(X) - alpha, atol, tol = 1e-3, 1e-4, 1e-6 + alpha, tol = 1e-3, 1e-6 + atol = 1e-3 if _IS_32BIT else 1e-4 if solver not in ['sag', 'auto'] and return_intercept: with pytest.raises(ValueError, match="In Ridge, only 'sag' solver"): From 2bd3a4db529d707a9862d69cc1ddbcbe7a6054b8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 10 May 2021 22:10:21 +0200 Subject: [PATCH 385/478] ENH Consistent loss name for absolute error (#19733) --- doc/modules/ensemble.rst | 2 +- doc/whats_new/v1.0.rst | 29 ++++++++++++ sklearn/ensemble/_base.py | 12 ++--- sklearn/ensemble/_forest.py | 47 ++++++++++++------- sklearn/ensemble/_gb.py | 32 ++++++++----- sklearn/ensemble/_gb_losses.py | 3 +- .../gradient_boosting.py | 19 ++++++-- .../ensemble/_hist_gradient_boosting/loss.py | 2 +- .../tests/test_compare_lightgbm.py | 2 +- .../tests/test_gradient_boosting.py | 25 +++++----- .../tests/test_loss.py | 10 ++-- .../_hist_gradient_boosting/utils.pyx | 6 +-- sklearn/ensemble/tests/test_forest.py | 24 +++++++--- .../ensemble/tests/test_gradient_boosting.py | 30 +++++++----- sklearn/linear_model/_ransac.py | 24 +++++++--- sklearn/linear_model/tests/test_ransac.py | 12 +++-- sklearn/tree/_classes.py | 28 ++++++++--- sklearn/tree/tests/test_tree.py | 20 ++++---- 18 files changed, 224 insertions(+), 103 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 329215406c39c..21610228b9b37 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -944,7 +944,7 @@ controls the number of iterations of the boosting process:: 0.8965 Available losses for regression are 'squared_error', -'least_absolute_deviation', which is less sensitive to outliers, and +'absolute_error', which is less sensitive to outliers, and 'poisson', which is well suited to model counts and frequencies. For classification, 'binary_crossentropy' is used for binary classification and 'categorical_crossentropy' is used for multiclass classification. By default diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index d26c5dd0c347d..8ad8a295d72e0 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -76,6 +76,35 @@ Changelog - For :class:`tree.ExtraTreeRegressor`, `criterion="mse"` is deprecated, use `"squared_error"` instead which is now the default. +- |API| The option for using the absolute error via ``loss`` and + ``criterion`` parameters was made more consistent. The preferred way is by + setting the value to `"absolute_error"`. Old option names are still valid, + produce the same models, but are deprecated and will be removed in version + 1.2. + :pr:`19733` by :user:`Christian Lorentzen `. 
+ + - For :class:`ensemble.ExtraTreesRegressor`, `criterion="mae"` is deprecated, + use `"absolute_error"` instead. + + - For :class:`ensemble.GradientBoostingRegressor`, `loss="lad"` is deprecated, + use `"absolute_error"` instead. + + - For :class:`ensemble.RandomForestRegressor`, `criterion="mae"` is deprecated, + use `"absolute_error"` instead. + + - For :class:`ensemble.HistGradientBoostingRegressor`, + `loss="least_absolute_deviation"` is deprecated, use `"absolute_error"` + instead. + + - For :class:`linear_model.RANSACRegressor`, `loss="absolute_loss"` is + deprecated, use `"absolute_error"` instead which is now the default. + + - For :class:`tree.DecisionTreeRegressor`, `criterion="mae"` is deprecated, + use `"absolute_error"` instead. + + - For :class:`tree.ExtraTreeRegressor`, `criterion="mae"` is deprecated, + use `"absolute_error"` instead. + :mod:`sklearn.base` ................... diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 095d801de166d..c58a0c7dbe9c7 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -153,13 +153,13 @@ def _make_estimator(self, append=True, random_state=None): for p in self.estimator_params}) # TODO: Remove in v1.2 - # criterion "mse" would cause warnings in every call to + # criterion "mse" and "mae" would cause warnings in every call to # DecisionTreeRegressor.fit(..) - if ( - isinstance(estimator, (DecisionTreeRegressor, ExtraTreeRegressor)) - and getattr(estimator, "criterion", None) == "mse" - ): - estimator.set_params(criterion="squared_error") + if isinstance(estimator, (DecisionTreeRegressor, ExtraTreeRegressor)): + if getattr(estimator, "criterion", None) == "mse": + estimator.set_params(criterion="squared_error") + elif getattr(estimator, "criterion", None) == "mae": + estimator.set_params(criterion="absolute_error") if random_state is not None: _set_random_states(estimator, random_state) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 140c1c93e8eef..8eef1f3429227 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -346,16 +346,21 @@ def fit(self, X, y, sample_weight=None): # Check parameters self._validate_estimator() # TODO: Remove in v1.2 - if ( - isinstance(self, (RandomForestRegressor, ExtraTreesRegressor)) - and self.criterion == "mse" - ): - warn( - "Criterion 'mse' was deprecated in v1.0 and will be " - "removed in version 1.2. Use `criterion='squared_error'` " - "which is equivalent.", - FutureWarning - ) + if isinstance(self, (RandomForestRegressor, ExtraTreesRegressor)): + if self.criterion == "mse": + warn( + "Criterion 'mse' was deprecated in v1.0 and will be " + "removed in version 1.2. Use `criterion='squared_error'` " + "which is equivalent.", + FutureWarning + ) + elif self.criterion == "mae": + warn( + "Criterion 'mae' was deprecated in v1.0 and will be " + "removed in version 1.2. Use `criterion='absolute_error'` " + "which is equivalent.", + FutureWarning + ) if not self.bootstrap and self.oob_score: raise ValueError("Out of bag estimation only available" @@ -1321,11 +1326,12 @@ class RandomForestRegressor(ForestRegressor): The default value of ``n_estimators`` changed from 10 to 100 in 0.22. - criterion : {"squared_error", "mse", "mae"}, default="squared_error" + criterion : {"squared_error", "mse", "absolute_error", "mae"}, \ + default="squared_error" The function to measure the quality of a split. 
Supported criteria are "squared_error" for the mean squared error, which is equal to - variance reduction as feature selection criterion, and "mae" for the - mean absolute error. + variance reduction as feature selection criterion, and "absolute_error" + for the mean absolute error. .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion. @@ -1334,6 +1340,10 @@ class RandomForestRegressor(ForestRegressor): Criterion "mse" was deprecated in v1.0 and will be removed in version 1.2. Use `criterion="squared_error"` which is equivalent. + .. deprecated:: 1.0 + Criterion "mae" was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion="absolute_error"` which is equivalent. + max_depth : int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than @@ -1936,10 +1946,11 @@ class ExtraTreesRegressor(ForestRegressor): The default value of ``n_estimators`` changed from 10 to 100 in 0.22. - criterion : {"squared_error", "mse", "mae"}, default="squared_error" + criterion : {"squared_error", "mse", "absolute_error", "mae"}, \ + default="squared_error" The function to measure the quality of a split. Supported criteria - are "squared_error" and "mse" for the mean squared error, which is - equal to variance reduction as feature selection criterion, and "mae" + are "squared_error" for the mean squared error, which is equal to + variance reduction as feature selection criterion, and "absolute_error" for the mean absolute error. .. versionadded:: 0.18 @@ -1949,6 +1960,10 @@ class ExtraTreesRegressor(ForestRegressor): Criterion "mse" was deprecated in v1.0 and will be removed in version 1.2. Use `criterion="squared_error"` which is equivalent. + .. deprecated:: 1.0 + Criterion "mae" was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion="absolute_error"` which is equivalent. + max_depth : int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 4984575bce8c3..527bbcb559b5f 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -238,11 +238,17 @@ def _check_params(self): or self.loss not in _gb_losses.LOSS_FUNCTIONS): raise ValueError("Loss '{0:s}' not supported. ".format(self.loss)) + # TODO: Remove in v1.2 if self.loss == "ls": warnings.warn("The loss 'ls' was deprecated in v1.0 and " "will be removed in version 1.2. Use 'squared_error'" " which is equivalent.", FutureWarning) + elif self.loss == "lad": + warnings.warn("The loss 'lad' was deprecated in v1.0 and " + "will be removed in version 1.2. Use " + "'absolute_error' which is equivalent.", + FutureWarning) if self.loss == 'deviance': loss_class = (_gb_losses.MultinomialDeviance @@ -403,7 +409,7 @@ def fit(self, X, y, sample_weight=None, monitor=None): ------- self : object """ - if self.criterion == 'mae': + if self.criterion in ('absolute_error', 'mae'): # TODO: This should raise an error from 1.1 self._warn_mae_for_criterion() @@ -1340,19 +1346,22 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): Parameters ---------- - loss : {'squared_error', 'ls', 'lad', 'huber', 'quantile'}, \ - default='squared_error' + loss : {'squared_error', 'ls', 'absolute_error', 'lad', 'huber', \ + 'quantile'}, default='squared_error' Loss function to be optimized. 'squared_error' refers to the squared - error for regression. 
- 'lad' (least absolute deviation) is a highly robust - loss function solely based on order information of the input - variables. 'huber' is a combination of the two. 'quantile' - allows quantile regression (use `alpha` to specify the quantile). + error for regression. 'absolute_error' refers to the absolute error of + regression and is a robust loss function. 'huber' is a + combination of the two. 'quantile' allows quantile regression (use + `alpha` to specify the quantile). .. deprecated:: 1.0 The loss 'ls' was deprecated in v1.0 and will be removed in version 1.2. Use `loss='squared_error'` which is equivalent. + .. deprecated:: 1.0 + The loss 'lad' was deprecated in v1.0 and will be removed in + version 1.2. Use `loss='absolute_error'` which is equivalent. + learning_rate : float, default=0.1 Learning rate shrinks the contribution of each tree by `learning_rate`. There is a trade-off between learning_rate and n_estimators. @@ -1383,7 +1392,7 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): .. deprecated:: 0.24 `criterion='mae'` is deprecated and will be removed in version 1.1 (renaming of 0.26). The correct way of minimizing the absolute - error is to use `loss='lad'` instead. + error is to use `loss='absolute_error'` instead. .. deprecated:: 1.0 Criterion 'mse' was deprecated in v1.0 and will be removed in @@ -1644,7 +1653,8 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): """ # TODO: remove "ls" in verion 1.2 - _SUPPORTED_LOSS = ("squared_error", 'ls', 'lad', 'huber', 'quantile') + _SUPPORTED_LOSS = ("squared_error", 'ls', "absolute_error", 'lad', 'huber', + 'quantile') @_deprecate_positional_args def __init__(self, *, loss="squared_error", learning_rate=0.1, @@ -1681,7 +1691,7 @@ def _warn_mae_for_criterion(self): warnings.warn("criterion='mae' was deprecated in version 0.24 and " "will be removed in version 1.1 (renaming of 0.26). The " "correct way of minimizing the absolute error is to use " - " loss='lad' instead.", FutureWarning) + " loss='absolute_error' instead.", FutureWarning) def predict(self, X): """Predict regression target for X. diff --git a/sklearn/ensemble/_gb_losses.py b/sklearn/ensemble/_gb_losses.py index f33c7086b596b..67a3b1b364f47 100644 --- a/sklearn/ensemble/_gb_losses.py +++ b/sklearn/ensemble/_gb_losses.py @@ -856,10 +856,11 @@ def get_init_raw_predictions(self, X, estimator): return raw_predictions.reshape(-1, 1).astype(np.float64) -# TODO: Remove entry 'ls' in version 1.2. +# TODO: Remove entry 'ls' and 'lad' in version 1.2. LOSS_FUNCTIONS = { "squared_error": LeastSquaresError, 'ls': LeastSquaresError, + "absolute_error": LeastAbsoluteError, 'lad': LeastAbsoluteError, 'huber': HuberLossFunction, 'quantile': QuantileLossFunction, diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index d3b62a5df784a..6d5de978add9b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -893,8 +893,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): Parameters ---------- - loss : {'squared_error', 'least_squares', 'least_absolute_deviation', \ - 'poisson'}, default='squared_error' + loss : {'squared_error', 'least_squares', 'absolute_error', \ + 'least_absolute_deviation', 'poisson'}, default='squared_error' The loss function to use in the boosting process. 
Note that the "least squares" and "poisson" losses actually implement "half least squares loss" and "half poisson deviance" to simplify the @@ -908,6 +908,11 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): The loss 'least_squares' was deprecated in v1.0 and will be removed in version 1.2. Use `loss='squared_error'` which is equivalent. + .. deprecated:: 1.0 + The loss 'least_absolute_deviation' was deprecated in v1.0 and will + be removed in version 1.2. Use `loss='absolute_error'` which is + equivalent. + learning_rate : float, default=0.1 The learning rate, also known as *shrinkage*. This is used as a multiplicative factor for the leaves values. Use ``1`` for no @@ -1037,7 +1042,7 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): 0.92... """ - _VALID_LOSSES = ('squared_error', 'least_squares', + _VALID_LOSSES = ('squared_error', 'least_squares', 'absolute_error', 'least_absolute_deviation', 'poisson') @_deprecate_positional_args @@ -1113,6 +1118,7 @@ def _encode_y(self, y): return y def _get_loss(self, sample_weight): + # TODO: Remove in v1.2 if self.loss == "least_squares": warnings.warn( "The loss 'least_squares' was deprecated in v1.0 and will be " @@ -1120,6 +1126,13 @@ def _get_loss(self, sample_weight): "equivalent.", FutureWarning) return _LOSSES["squared_error"](sample_weight=sample_weight) + elif self.loss == "least_absolute_deviation": + warnings.warn( + "The loss 'least_absolute_deviation' was deprecated in v1.0 " + " and will be removed in version 1.2. Use 'absolute_error' " + "which is equivalent.", + FutureWarning) + return _LOSSES["absolute_error"](sample_weight=sample_weight) return _LOSSES[self.loss](sample_weight=sample_weight) diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index c336bd347e4cf..036f075bdabd8 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -420,7 +420,7 @@ def predict_proba(self, raw_predictions): _LOSSES = { 'squared_error': LeastSquares, - 'least_absolute_deviation': LeastAbsoluteDeviation, + 'absolute_error': LeastAbsoluteDeviation, 'binary_crossentropy': BinaryCrossEntropy, 'categorical_crossentropy': CategoricalCrossEntropy, 'poisson': Poisson, diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py index f34dffab2671c..ac58f39422687 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py @@ -34,7 +34,7 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, # and max_leaf_nodes is low enough. # - To ignore discrepancies caused by small differences the binning # strategy, data is pre-binned if n_samples > 255. - # - We don't check the least_absolute_deviation loss here. This is because + # - We don't check the absolute_error loss here. This is because # LightGBM's computation of the median (used for the initial value of # raw_prediction) is a bit off (they'll e.g. return midpoints when there # is no need to.). 
Since these tests only run 1 iteration, the diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index b2322f29f85d1..213d46cf58f04 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -192,26 +192,26 @@ def test_should_stop(scores, n_iter_no_change, tol, stopping): assert gbdt._should_stop(scores) == stopping -def test_least_absolute_deviation(): +def test_absolute_error(): # For coverage only. X, y = make_regression(n_samples=500, random_state=0) - gbdt = HistGradientBoostingRegressor(loss='least_absolute_deviation', + gbdt = HistGradientBoostingRegressor(loss='absolute_error', random_state=0) gbdt.fit(X, y) assert gbdt.score(X, y) > .9 -def test_least_absolute_deviation_sample_weight(): +def test_absolute_error_sample_weight(): # non regression test for issue #19400 # make sure no error is thrown during fit of - # HistGradientBoostingRegressor with least_absolute_deviation loss function + # HistGradientBoostingRegressor with absolute_error loss function # and passing sample_weight rng = np.random.RandomState(0) n_samples = 100 X = rng.uniform(-1, 1, size=(n_samples, 2)) y = rng.uniform(-1, 1, size=n_samples) sample_weight = rng.uniform(0, 1, size=n_samples) - gbdt = HistGradientBoostingRegressor(loss='least_absolute_deviation') + gbdt = HistGradientBoostingRegressor(loss='absolute_error') gbdt.fit(X, y, sample_weight=sample_weight) @@ -650,8 +650,7 @@ def test_sample_weight_effect(problem, duplication): est_dup._raw_predict(X_dup)) -@pytest.mark.parametrize('loss_name', ('squared_error', - 'least_absolute_deviation')) +@pytest.mark.parametrize('loss_name', ('squared_error', 'absolute_error')) def test_sum_hessians_are_sample_weight(loss_name): # For losses with constant hessians, the sum_hessians field of the # histograms must be equal to the sum of the sample weight of samples at @@ -993,14 +992,18 @@ def test_uint8_predict(Est): # TODO: Remove in v1.2 -def test_loss_least_squares_deprecated(): +@pytest.mark.parametrize("old_loss, new_loss", [ + ("least_squares", "squared_error"), + ("least_absolute_deviation", "absolute_error"), +]) +def test_loss_deprecated(old_loss, new_loss): X, y = make_regression(n_samples=50, random_state=0) - est1 = HistGradientBoostingRegressor(loss="least_squares", random_state=0) + est1 = HistGradientBoostingRegressor(loss=old_loss, random_state=0) with pytest.warns(FutureWarning, - match="The loss 'least_squares' was deprecated"): + match=f"The loss '{old_loss}' was deprecated"): est1.fit(X, y) - est2 = HistGradientBoostingRegressor(loss="squared_error", random_state=0) + est2 = HistGradientBoostingRegressor(loss=new_loss, random_state=0) est2.fit(X, y) assert_allclose(est1.predict(X), est2.predict(X)) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index ce7b4acedbae5..345e72c642668 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -103,7 +103,7 @@ def fprime2(x: np.ndarray) -> np.ndarray: @pytest.mark.parametrize('loss, n_classes, prediction_dim', [ ("squared_error", 0, 1), - ('least_absolute_deviation', 0, 1), + ("absolute_error", 0, 1), ('binary_crossentropy', 2, 1), ('categorical_crossentropy', 3, 3), ('poisson', 0, 1), @@ -118,7 +118,7 @@ def 
test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): rng = np.random.RandomState(seed) n_samples = 100 - if loss in ("squared_error", 'least_absolute_deviation'): + if loss in ("squared_error", "absolute_error"): y_true = rng.normal(size=n_samples).astype(Y_DTYPE) elif loss in ('poisson'): y_true = rng.poisson(size=n_samples).astype(Y_DTYPE) @@ -172,10 +172,10 @@ def test_baseline_least_squares(): baseline_prediction) -def test_baseline_least_absolute_deviation(): +def test_baseline_absolute_error(): rng = np.random.RandomState(0) - loss = _LOSSES['least_absolute_deviation'](sample_weight=None) + loss = _LOSSES["absolute_error"](sample_weight=None) y_train = rng.normal(size=100) baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) assert baseline_prediction.shape == tuple() # scalar @@ -256,7 +256,7 @@ def test_baseline_categorical_crossentropy(): @pytest.mark.parametrize('loss, problem', [ ("squared_error", 'regression'), - ('least_absolute_deviation', 'regression'), + ("absolute_error", 'regression'), ('binary_crossentropy', 'classification'), ('categorical_crossentropy', 'classification'), ('poisson', 'poisson_regression'), diff --git a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx index d1168acf94835..3b323b3e298b8 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx @@ -43,7 +43,7 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): lightgbm_loss_mapping = { 'squared_error': 'regression_l2', - 'least_absolute_deviation': 'regression_l1', + 'absolute_error': 'regression_l1', 'binary_crossentropy': 'binary', 'categorical_crossentropy': 'multiclass' } @@ -76,7 +76,7 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): # XGB xgboost_loss_mapping = { 'squared_error': 'reg:linear', - 'least_absolute_deviation': 'LEAST_ABSOLUTE_DEV_NOT_SUPPORTED', + 'absolute_error': 'LEAST_ABSOLUTE_DEV_NOT_SUPPORTED', 'binary_crossentropy': 'reg:logistic', 'categorical_crossentropy': 'multi:softmax' } @@ -101,7 +101,7 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): catboost_loss_mapping = { 'squared_error': 'RMSE', # catboost does not support MAE when leaf_estimation_method is Newton - 'least_absolute_deviation': 'LEAST_ASBOLUTE_DEV_NOT_SUPPORTED', + 'absolute_error': 'LEAST_ASBOLUTE_DEV_NOT_SUPPORTED', 'binary_crossentropy': 'Logloss', 'categorical_crossentropy': 'MultiClass' } diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index b6c1fea0e2f29..c74a1ca0c603e 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -176,7 +176,9 @@ def check_regression_criterion(name, criterion): @pytest.mark.parametrize('name', FOREST_REGRESSORS) -@pytest.mark.parametrize('criterion', ("squared_error", "mae", "friedman_mse")) +@pytest.mark.parametrize('criterion', ( + "squared_error", "absolute_error", "friedman_mse" +)) def test_regression(name, criterion): check_regression_criterion(name, criterion) @@ -261,10 +263,14 @@ def check_importances(name, criterion, dtype, tolerance): itertools.chain(product(FOREST_CLASSIFIERS, ["gini", "entropy"]), product(FOREST_REGRESSORS, - ["squared_error", "friedman_mse", "mae"]))) + [ + "squared_error", + "friedman_mse", + "absolute_error" + ]))) def test_importances(dtype, name, criterion): tolerance = 0.01 - if name in FOREST_REGRESSORS and criterion == "mae": + if name in FOREST_REGRESSORS and criterion == 
"absolute_error": tolerance = 0.05 check_importances(name, criterion, dtype, tolerance) @@ -1498,14 +1504,18 @@ def test_n_features_deprecation(Estimator): # TODO: Remove in v1.2 -def test_mse_deprecated(): - est1 = RandomForestRegressor(criterion="mse", random_state=0) +@pytest.mark.parametrize("old_criterion, new_criterion", [ + ("mse", "squared_error"), + ("mae", "absolute_error"), +]) +def test_criterion_deprecated(old_criterion, new_criterion): + est1 = RandomForestRegressor(criterion=old_criterion, random_state=0) with pytest.warns(FutureWarning, - match="Criterion 'mse' was deprecated"): + match=f"Criterion '{old_criterion}' was deprecated"): est1.fit(X, y) - est2 = RandomForestRegressor(criterion="squared_error", random_state=0) + est2 = RandomForestRegressor(criterion=new_criterion, random_state=0) est2.fit(X, y) assert_allclose(est1.predict(X), est2.predict(X)) diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 166d6bdfc5c11..30c0cdc0cc8fd 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -133,7 +133,7 @@ def test_gbdt_loss_alpha_error(params, err_msg): @pytest.mark.parametrize( "GradientBoosting, loss", [(GradientBoostingClassifier, "ls"), - (GradientBoostingClassifier, "lad"), + (GradientBoostingClassifier, "absolute_error"), (GradientBoostingClassifier, "quantile"), (GradientBoostingClassifier, "huber"), (GradientBoostingRegressor, "deviance"), @@ -171,7 +171,7 @@ def test_classification_synthetic(loss): assert error_rate < 0.08 -@pytest.mark.parametrize('loss', ('squared_error', 'lad', 'huber')) +@pytest.mark.parametrize('loss', ('squared_error', 'absolute_error', 'huber')) @pytest.mark.parametrize('subsample', (1.0, 0.5)) def test_regression_dataset(loss, subsample): # Check consistency on regression dataset with least squares @@ -508,7 +508,7 @@ def test_degenerate_targets(): def test_quantile_loss(): - # Check if quantile loss with alpha=0.5 equals lad. + # Check if quantile loss with alpha=0.5 equals absolute_error. 
clf_quantile = GradientBoostingRegressor(n_estimators=100, loss='quantile', max_depth=4, alpha=0.5, random_state=7) @@ -516,12 +516,12 @@ def test_quantile_loss(): clf_quantile.fit(X_reg, y_reg) y_quantile = clf_quantile.predict(X_reg) - clf_lad = GradientBoostingRegressor(n_estimators=100, loss='lad', - max_depth=4, random_state=7) + clf_ae = GradientBoostingRegressor(n_estimators=100, loss='absolute_error', + max_depth=4, random_state=7) - clf_lad.fit(X_reg, y_reg) - y_lad = clf_lad.predict(X_reg) - assert_array_almost_equal(y_quantile, y_lad, decimal=4) + clf_ae.fit(X_reg, y_reg) + y_ae = clf_ae.predict(X_reg) + assert_array_almost_equal(y_quantile, y_ae, decimal=4) def test_symbol_labels(): @@ -1067,7 +1067,7 @@ def test_non_uniform_weights_toy_edge_case_reg(): y = [0, 0, 1, 0] # ignore the first 2 training samples by setting their weight to 0 sample_weight = [0, 0, 1, 1] - for loss in ('huber', 'squared_error', 'lad', 'quantile'): + for loss in ('huber', 'squared_error', 'absolute_error', 'quantile'): gb = GradientBoostingRegressor(learning_rate=1.0, n_estimators=2, loss=loss) gb.fit(X, y, sample_weight=sample_weight) @@ -1390,13 +1390,17 @@ def test_criterion_mse_deprecated(Estimator): # TODO: Remove in v1.2 -def test_loss_ls_deprecated(): - est1 = GradientBoostingRegressor(loss="ls", random_state=0) +@pytest.mark.parametrize("old_loss, new_loss", [ + ("ls", "squared_error"), + ("lad", "absolute_error"), +]) +def test_loss_deprecated(old_loss, new_loss): + est1 = GradientBoostingRegressor(loss=old_loss, random_state=0) with pytest.warns(FutureWarning, - match="The loss 'ls' was deprecated"): + match=f"The loss '{old_loss}' was deprecated"): est1.fit(X, y) - est2 = GradientBoostingRegressor(loss="squared_error", random_state=0) + est2 = GradientBoostingRegressor(loss=new_loss, random_state=0) est2.fit(X, y) assert_allclose(est1.predict(X), est2.predict(X)) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 2fc8143f432c8..3cde1f1235ec8 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -137,9 +137,9 @@ class RANSACRegressor(MetaEstimatorMixin, RegressorMixin, as 0.99 (the default) and e is the current fraction of inliers w.r.t. the total number of samples. - loss : string, callable, default='absolute_loss' - String inputs, 'absolute_loss' and 'squared_error' are supported which - find the absolute loss and squared error per sample respectively. + loss : string, callable, default='absolute_error' + String inputs, 'absolute_error' and 'squared_error' are supported which + find the absolute error and squared error per sample respectively. If ``loss`` is a callable, then it should be a function that takes two arrays as inputs, the true and predicted value and returns a 1-D @@ -155,6 +155,10 @@ class RANSACRegressor(MetaEstimatorMixin, RegressorMixin, The loss 'squared_loss' was deprecated in v1.0 and will be removed in version 1.2. Use `loss='squared_error'` which is equivalent. + .. deprecated:: 1.0 + The loss 'absolute_loss' was deprecated in v1.0 and will be removed + in version 1.2. Use `loss='absolute_error'` which is equivalent. + random_state : int, RandomState instance, default=None The generator used to initialize the centers. Pass an int for reproducible output across multiple function calls. 
@@ -212,7 +216,7 @@ def __init__(self, base_estimator=None, *, min_samples=None, residual_threshold=None, is_data_valid=None, is_model_valid=None, max_trials=100, max_skips=np.inf, stop_n_inliers=np.inf, stop_score=np.inf, - stop_probability=0.99, loss='absolute_loss', + stop_probability=0.99, loss='absolute_error', random_state=None): self.base_estimator = base_estimator @@ -293,7 +297,15 @@ def fit(self, X, y, sample_weight=None): else: residual_threshold = self.residual_threshold - if self.loss == "absolute_loss": + # TODO: Remove absolute_loss in v1.2. + if self.loss in ("absolute_error", "absolute_loss"): + if self.loss == "absolute_loss": + warnings.warn( + "The loss 'absolute_loss' was deprecated in v1.0 and will " + "be removed in version 1.2. Use `loss='absolute_error'` " + "which is equivalent.", + FutureWarning + ) if y.ndim == 1: loss_function = lambda y_true, y_pred: np.abs(y_true - y_pred) else: @@ -319,7 +331,7 @@ def fit(self, X, y, sample_weight=None): else: raise ValueError( - "loss should be 'absolute_loss', 'squared_error' or a " + "loss should be 'absolute_error', 'squared_error' or a " "callable. Got %s. " % self.loss) random_state = check_random_state(self.random_state) diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index 857696bf387d5..071a67efcf28f 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -539,13 +539,17 @@ def test_ransac_final_model_fit_sample_weight(): # TODO: Remove in v1.2 -def test_loss_squared_loss_deprecated(): - est1 = RANSACRegressor(loss="squared_loss", random_state=0) +@pytest.mark.parametrize("old_loss, new_loss", [ + ("absolute_loss", "squared_error"), + ("squared_loss", "absolute_error"), +]) +def test_loss_deprecated(old_loss, new_loss): + est1 = RANSACRegressor(loss=old_loss, random_state=0) with pytest.warns(FutureWarning, - match="The loss 'squared_loss' was deprecated"): + match=f"The loss '{old_loss}' was deprecated"): est1.fit(X, y) - est2 = RANSACRegressor(loss="squared_error", random_state=0) + est2 = RANSACRegressor(loss=new_loss, random_state=0) est2.fit(X, y) assert_allclose(est1.predict(X), est2.predict(X)) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 420292881f7db..de5aebfa8a6e3 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -62,10 +62,11 @@ CRITERIA_CLF = {"gini": _criterion.Gini, "entropy": _criterion.Entropy} -# TODO: Remove "mse" in version 1.2. +# TODO: Remove "mse" and "mae" in version 1.2. CRITERIA_REG = {"squared_error": _criterion.MSE, "mse": _criterion.MSE, "friedman_mse": _criterion.FriedmanMSE, + "absolute_error": _criterion.MAE, "mae": _criterion.MAE, "poisson": _criterion.Poisson} @@ -360,6 +361,13 @@ def fit(self, X, y, sample_weight=None, check_input=True, "which is equivalent.", FutureWarning ) + elif self.criterion == "mae": + warnings.warn( + "Criterion 'mae' was deprecated in v1.0 and will be " + "removed in version 1.2. 
Use `criterion='absolute_error'` " + "which is equivalent.", + FutureWarning + ) else: # Make a deepcopy in case the criterion has mutable attributes that # might be shared and modified concurrently during parallel fitting @@ -1001,16 +1009,16 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): Parameters ---------- - criterion : {"squared_error", "mse", "friedman_mse", "mae", "poisson"}, \ - default="squared_error" + criterion : {"squared_error", "mse", "friedman_mse", "absolute_error", \ + "mae", "poisson"}, default="squared_error" The function to measure the quality of a split. Supported criteria are "squared_error" for the mean squared error, which is equal to variance reduction as feature selection criterion and minimizes the L2 loss using the mean of each terminal node, "friedman_mse", which uses mean squared error with Friedman's improvement score for potential - splits, "mae" for the mean absolute error, which minimizes the L1 loss - using the median of each terminal node, and "poisson" which uses - reduction in Poisson deviance to find splits. + splits, "absolute_error" for the mean absolute error, which minimizes + the L1 loss using the median of each terminal node, and "poisson" which + uses reduction in Poisson deviance to find splits. .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion. @@ -1022,6 +1030,10 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): Criterion "mse" was deprecated in v1.0 and will be removed in version 1.2. Use `criterion="squared_error"` which is equivalent. + .. deprecated:: 1.0 + Criterion "mae" was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion="absolute_error"` which is equivalent. + splitter : {"best", "random"}, default="best" The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose @@ -1577,6 +1589,10 @@ class ExtraTreeRegressor(DecisionTreeRegressor): Criterion "mse" was deprecated in v1.0 and will be removed in version 1.2. Use `criterion="squared_error"` which is equivalent. + .. deprecated:: 1.0 + Criterion "mae" was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion="absolute_error"` which is equivalent. + splitter : {"random", "best"}, default="random" The strategy used to choose the split at each node. 
Supported strategies are "best" to choose the best split and "random" to choose diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 2a1da1e2bfce0..a6e30a9941756 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -51,7 +51,7 @@ from sklearn.utils import compute_sample_weight CLF_CRITERIONS = ("gini", "entropy") -REG_CRITERIONS = ("squared_error", "mae", "friedman_mse", "poisson") +REG_CRITERIONS = ("squared_error", "absolute_error", "friedman_mse", "poisson") CLF_TREES = { "DecisionTreeClassifier": DecisionTreeClassifier, @@ -294,7 +294,7 @@ def test_diabetes_overfit(name, Tree, criterion): @pytest.mark.parametrize( "criterion, max_depth, metric, max_loss", [("squared_error", 15, mean_squared_error, 60), - ("mae", 20, mean_squared_error, 60), + ("absolute_error", 20, mean_squared_error, 60), ("friedman_mse", 15, mean_squared_error, 60), ("poisson", 15, mean_poisson_deviance, 30)] ) @@ -1772,7 +1772,7 @@ def test_mae(): = 0.75 ------ """ - dt_mae = DecisionTreeRegressor(random_state=0, criterion="mae", + dt_mae = DecisionTreeRegressor(random_state=0, criterion="absolute_error", max_leaf_nodes=2) # Test MAE where sample weights are non-uniform (as illustrated above): @@ -2121,12 +2121,16 @@ def test_X_idx_sorted_deprecated(TreeEstimator): # TODO: Remove in v1.2 @pytest.mark.parametrize("Tree", REG_TREES.values()) -def test_mse_deprecated(Tree): - tree = Tree(criterion="mse") +@pytest.mark.parametrize("old_criterion, new_criterion", [ + ("mse", "squared_error"), + ("mae", "absolute_error"), +]) +def test_criterion_deprecated(Tree, old_criterion, new_criterion): + tree = Tree(criterion=old_criterion) with pytest.warns(FutureWarning, - match="Criterion 'mse' was deprecated"): + match=f"Criterion '{old_criterion}' was deprecated"): tree.fit(X, y) - tree_sqer = Tree(criterion="squared_error").fit(X, y) - assert_allclose(tree.predict(X), tree_sqer.predict(X)) + tree_new = Tree(criterion=new_criterion).fit(X, y) + assert_allclose(tree.predict(X), tree_new.predict(X)) From 5e85941bd66e651c17ce8d37a24e52fd0b00af28 Mon Sep 17 00:00:00 2001 From: kmatt10 Date: Tue, 11 May 2021 16:10:41 +0800 Subject: [PATCH 386/478] DOC Added return value info to SimpleImputer.transform docstring (#20005) Co-authored-by: Thomas J. Fan Co-authored-by: Guillaume Lemaitre --- sklearn/impute/_base.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 46f2301a1879d..e345fe44f0895 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -426,6 +426,12 @@ def transform(self, X): ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data to complete. + + Returns + ------- + X_imputed : {ndarray, sparse matrix} of shape \ + (n_samples, n_features_out) + `X` with imputed values. 
""" check_is_fitted(self) From 6d67937b3ce28fd3fc966d3d417df56c08c98502 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Tue, 11 May 2021 11:08:13 +0200 Subject: [PATCH 387/478] cln deprecations fixes (#19323) Co-authored-by: Olivier Grisel --- sklearn/utils/fixes.py | 11 ----------- sklearn/utils/tests/test_fixes.py | 6 ------ 2 files changed, 17 deletions(-) diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index a5a455ee7b9a1..13ecba4afc472 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -18,10 +18,7 @@ import scipy import scipy.stats from scipy.sparse.linalg import lsqr as sparse_lsqr # noqa -from numpy.ma import MaskedArray as _MaskedArray # TODO: remove in 1.0 from .._config import config_context, get_config - -from .deprecation import deprecated from ..externals._packaging.version import parse as parse_version @@ -151,14 +148,6 @@ class loguniform(scipy.stats.reciprocal): """ -@deprecated( - 'MaskedArray is deprecated in version 0.23 and will be removed in version ' - '1.0 (renaming of 0.25). Use numpy.ma.MaskedArray instead.' -) -class MaskedArray(_MaskedArray): - pass # TODO: remove in 1.0 - - def _take_along_axis(arr, indices, axis): """Implements a simplified version of np.take_along_axis if numpy version < 1.15""" diff --git a/sklearn/utils/tests/test_fixes.py b/sklearn/utils/tests/test_fixes.py index 03e11f5bc1a08..bcd57379fcff6 100644 --- a/sklearn/utils/tests/test_fixes.py +++ b/sklearn/utils/tests/test_fixes.py @@ -14,7 +14,6 @@ from sklearn.utils.fixes import _joblib_parallel_args from sklearn.utils.fixes import _object_dtype_isnan from sklearn.utils.fixes import loguniform -from sklearn.utils.fixes import MaskedArray from sklearn.utils.fixes import linspace, parse_version, np_version @@ -87,11 +86,6 @@ def test_loguniform(low, high, base): ) -def test_masked_array_deprecated(): # TODO: remove in 1.0 - with pytest.warns(FutureWarning, match='is deprecated'): - MaskedArray() - - def test_linspace(): """Test that linespace works like np.linespace as of numpy version 1.16.""" start, stop = 0, 10 From 8635580e5cec7afeab4d8d647a705d66ac5c4adc Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 11 May 2021 15:46:10 +0200 Subject: [PATCH 388/478] DOC Add communication guidelines. (#20048) * Add communication guidelines. * Address comments. Fix sphinx warning. * Some clarification. * Update doc/developers/contributing.rst Co-authored-by: Nicolas Hug * Address comments. * Address comments. * Update doc/developers/contributing.rst Co-authored-by: Nicolas Hug Co-authored-by: Nicolas Hug --- doc/developers/contributing.rst | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 0284ad179fc19..c808a806b3076 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -1262,6 +1262,38 @@ from high-level questions to a more detailed check-list. :ref:`saved_replies` includes some frequent comments that reviewers may make. +.. _communication: + +Communication Guidelines +------------------------ + +Reviewing open pull requests (PRs) helps move the project forward. It is a +great way to get familiar with the codebase and should motivate the +contributor to keep involved in the project. [1]_ + +- Every PR, good or bad, is an act of generosity. 
Opening with a positive + comment will help the author feel rewarded, and your subsequent remarks may + be heard more clearly. You may feel good also. +- Begin if possible with the large issues, so the author knows they’ve been + understood. Resist the temptation to immediately go line by line, or to open + with small pervasive issues. +- Do not let perfect be the enemy of the good. If you find yourself making + many small suggestions that are a matter of subjective taste rather than + somewhat objective, the following approaches are suggested: + + - refrain from submitting these; + - prefix them as "Nit" so that the contributor knows it's OK not to address; + - follow up in a subsequent PR, out of courtesy, you may want to let the + original contributor know. + +- Do not rush, take the time to make your comments clear and justify your + suggestions. +- You are the face of the project. Bad days occur to everyone, in that + occasion you deserve a break: try to take your time and stay offline. + +.. [1] Adapted from the numpy `communication guidelines + `_. + Reading the existing code base ============================== From 1991531116a2cde1d44a7390a598cfff2957d010 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Tue, 11 May 2021 22:38:11 +0200 Subject: [PATCH 389/478] MNT Clean deprecations for 1.0 | SGD (#19320) Co-authored-by: Olivier Grisel --- sklearn/linear_model/_stochastic_gradient.py | 49 ------------------- .../tests/test_passive_aggressive.py | 14 ------ sklearn/linear_model/tests/test_sgd.py | 13 ----- 3 files changed, 76 deletions(-) diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 44ecf564ffcc5..92b02155246df 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -37,7 +37,6 @@ from ._sgd_fast import EpsilonInsensitive from ._sgd_fast import SquaredEpsilonInsensitive from ..utils.fixes import _joblib_parallel_args -from ..utils import deprecated LEARNING_RATE_TYPES = {"constant": 1, "optimal": 2, "invscaling": 3, "adaptive": 4, "pa1": 5, "pa2": 6} @@ -309,39 +308,6 @@ def _make_validation_score_cb(self, validation_mask, X, y, sample_weight, self, X[validation_mask], y[validation_mask], sample_weight[validation_mask], classes=classes) - # mypy error: Decorated property not supported - @deprecated("Attribute standard_coef_ was deprecated " # type: ignore - "in version 0.23 and will be removed in 1.0 " - "(renaming of 0.25).") - @property - def standard_coef_(self): - return self._standard_coef - - # mypy error: Decorated property not supported - @deprecated( # type: ignore - "Attribute standard_intercept_ was deprecated " - "in version 0.23 and will be removed in 1.0 (renaming of 0.25)." - ) - @property - def standard_intercept_(self): - return self._standard_intercept - - # mypy error: Decorated property not supported - @deprecated("Attribute average_coef_ was deprecated " # type: ignore - "in version 0.23 and will be removed in 1.0 " - "(renaming of 0.25).") - @property - def average_coef_(self): - return self._average_coef - - # mypy error: Decorated property not supported - @deprecated("Attribute average_intercept_ was deprecated " # type: ignore - "in version 0.23 and will be removed in 1.0 " - "(renaming of 0.25).") - @property - def average_intercept_(self): - return self._average_intercept - def _prepare_fit_binary(est, y, i): """Initialization for fit_binary. 
@@ -1570,21 +1536,6 @@ class SGDRegressor(BaseSGDRegressor): intercept_ : ndarray of shape (1,) The intercept term. - average_coef_ : ndarray of shape (n_features,) - Averaged weights assigned to the features. Only available - if ``average=True``. - - .. deprecated:: 0.23 - Attribute ``average_coef_`` was deprecated - in version 0.23 and will be removed in 1.0 (renaming of 0.25). - - average_intercept_ : ndarray of shape (1,) - The averaged intercept term. Only available if ``average=True``. - - .. deprecated:: 0.23 - Attribute ``average_intercept_`` was deprecated - in version 0.23 and will be removed in 1.0 (renaming of 0.25). - n_iter_ : int The actual number of iterations before reaching the stopping criterion. diff --git a/sklearn/linear_model/tests/test_passive_aggressive.py b/sklearn/linear_model/tests/test_passive_aggressive.py index d0d099eeacc8d..251e4408464e2 100644 --- a/sklearn/linear_model/tests/test_passive_aggressive.py +++ b/sklearn/linear_model/tests/test_passive_aggressive.py @@ -270,17 +270,3 @@ def test_regressor_undefined_methods(): for meth in ("transform",): with pytest.raises(AttributeError): getattr(reg, meth) - - -# TODO: remove in 1.0 -@pytest.mark.parametrize('klass', [PassiveAggressiveClassifier, - PassiveAggressiveRegressor]) -def test_passive_aggressive_deprecated_attr(klass): - est = klass(average=True) - est.fit(X, y) - - msg = "Attribute {} was deprecated" - for att in ['average_coef_', 'average_intercept_', - 'standard_coef_', 'standard_intercept_']: - with pytest.warns(FutureWarning, match=msg.format(att)): - getattr(est, att) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 8465631828613..1fcf99997a031 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -299,19 +299,6 @@ def test_plain_has_no_average_attr(klass): assert not hasattr(clf, '_standard_coef') -# TODO: remove in 1.0 -@pytest.mark.parametrize('klass', [SGDClassifier, SGDRegressor]) -def test_sgd_deprecated_attr(klass): - est = klass(average=True, eta0=.01) - est.fit(X, Y) - - msg = "Attribute {} was deprecated" - for att in ['average_coef_', 'average_intercept_', - 'standard_coef_', 'standard_intercept_']: - with pytest.warns(FutureWarning, match=msg.format(att)): - getattr(est, att) - - @pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor, SGDOneClassSVM, SparseSGDOneClassSVM]) From 32b60fb9769f7e43915e51e66a94994024a72764 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Tue, 11 May 2021 22:40:50 +0200 Subject: [PATCH 390/478] MNT Clean deprecations for 1.0 | plot_tree (#19324) --- sklearn/tree/_export.py | 37 +++++++++---------------------- sklearn/tree/tests/test_export.py | 11 --------- 2 files changed, 10 insertions(+), 38 deletions(-) diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index affe1b68cfe9a..17680db2b855d 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -25,8 +25,6 @@ from ._reingold_tilford import buchheim, Tree from . import DecisionTreeClassifier -import warnings - def _color_brew(n): """Generate n colors with equally spaced hues. 
@@ -80,10 +78,9 @@ def __repr__(self): @_deprecate_positional_args def plot_tree(decision_tree, *, max_depth=None, feature_names=None, - class_names=None, label='all', filled=False, - impurity=True, node_ids=False, - proportion=False, rotate='deprecated', rounded=False, - precision=3, ax=None, fontsize=None): + class_names=None, label='all', filled=False, impurity=True, + node_ids=False, proportion=False, rounded=False, precision=3, + ax=None, fontsize=None): """Plot a decision tree. The sample counts that are shown are weighted with any sample_weights that @@ -135,14 +132,6 @@ def plot_tree(decision_tree, *, max_depth=None, feature_names=None, When set to ``True``, change the display of 'values' and/or 'samples' to be proportions and percentages respectively. - rotate : bool, default=False - This parameter has no effect on the matplotlib tree visualisation and - it is kept here for backward compatibility. - - .. deprecated:: 0.23 - ``rotate`` is deprecated in 0.23 and will be removed in 1.0 - (renaming of 0.25). - rounded : bool, default=False When set to ``True``, draw node boxes with rounded corners and use Helvetica fonts instead of Times-Roman. @@ -180,16 +169,10 @@ def plot_tree(decision_tree, *, max_depth=None, feature_names=None, check_is_fitted(decision_tree) - if rotate != 'deprecated': - warnings.warn(("'rotate' has no effect and is deprecated in 0.23. " - "It will be removed in 1.0 (renaming of 0.25)."), - FutureWarning) - exporter = _MPLTreeExporter( max_depth=max_depth, feature_names=feature_names, - class_names=class_names, label=label, filled=filled, - impurity=impurity, node_ids=node_ids, - proportion=proportion, rotate=rotate, rounded=rounded, + class_names=class_names, label=label, filled=filled, impurity=impurity, + node_ids=node_ids, proportion=proportion, rounded=rounded, precision=precision, fontsize=fontsize) return exporter.export(decision_tree, ax=ax) @@ -198,7 +181,7 @@ class _BaseTreeExporter: def __init__(self, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, impurity=True, node_ids=False, - proportion=False, rotate=False, rounded=False, + proportion=False, rounded=False, precision=3, fontsize=None): self.max_depth = max_depth self.feature_names = feature_names @@ -208,7 +191,6 @@ def __init__(self, max_depth=None, feature_names=None, self.impurity = impurity self.node_ids = node_ids self.proportion = proportion - self.rotate = rotate self.rounded = rounded self.precision = precision self.fontsize = fontsize @@ -380,11 +362,12 @@ def __init__(self, out_file=SENTINEL, max_depth=None, max_depth=max_depth, feature_names=feature_names, class_names=class_names, label=label, filled=filled, impurity=impurity, node_ids=node_ids, proportion=proportion, - rotate=rotate, rounded=rounded, precision=precision) + rounded=rounded, precision=precision) self.leaves_parallel = leaves_parallel self.out_file = out_file self.special_characters = special_characters self.fontname = fontname + self.rotate = rotate # PostScript compatibility for special characters if special_characters: @@ -531,14 +514,14 @@ class _MPLTreeExporter(_BaseTreeExporter): def __init__(self, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, impurity=True, node_ids=False, - proportion=False, rotate=False, rounded=False, + proportion=False, rounded=False, precision=3, fontsize=None): super().__init__( max_depth=max_depth, feature_names=feature_names, class_names=class_names, label=label, filled=filled, impurity=impurity, node_ids=node_ids, 
proportion=proportion, - rotate=rotate, rounded=rounded, precision=precision) + rounded=rounded, precision=precision) self.fontsize = fontsize # validate diff --git a/sklearn/tree/tests/test_export.py b/sklearn/tree/tests/test_export.py index 7b94fbb527dc9..d12daeaa657be 100644 --- a/sklearn/tree/tests/test_export.py +++ b/sklearn/tree/tests/test_export.py @@ -463,17 +463,6 @@ def test_plot_tree_gini(pyplot): assert nodes[2].get_text() == "gini = 0.0\nsamples = 3\nvalue = [0, 3]" -# FIXME: to be removed in 1.0 -def test_plot_tree_rotate_deprecation(pyplot): - tree = DecisionTreeClassifier() - tree.fit(X, y) - # test that a warning is raised when rotate is used. - match = (r"'rotate' has no effect and is deprecated in 0.23. " - r"It will be removed in 1.0 \(renaming of 0.25\).") - with pytest.warns(FutureWarning, match=match): - plot_tree(tree, rotate=True) - - def test_not_fitted_tree(pyplot): # Testing if not fitted tree throws the correct error From 0f85e6b32fd230320ca79926dc278d036a4b853a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Tue, 11 May 2021 22:41:36 +0200 Subject: [PATCH 391/478] MNT Clean deprecations for 1.0 | Search (#19321) Co-authored-by: Olivier Grisel --- sklearn/model_selection/__init__.py | 2 - sklearn/model_selection/_search.py | 74 +------------------- sklearn/model_selection/tests/test_search.py | 52 +------------- 3 files changed, 3 insertions(+), 125 deletions(-) diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py index f79db2a5acc17..2dfb295f5c14c 100644 --- a/sklearn/model_selection/__init__.py +++ b/sklearn/model_selection/__init__.py @@ -30,7 +30,6 @@ from ._search import RandomizedSearchCV from ._search import ParameterGrid from ._search import ParameterSampler -from ._search import fit_grid_point if typing.TYPE_CHECKING: # Avoid errors in type checkers (e.g. mypy) for experimental estimators. @@ -64,7 +63,6 @@ 'cross_val_predict', 'cross_val_score', 'cross_validate', - 'fit_grid_point', 'learning_curve', 'permutation_test_score', 'train_test_split', diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 6e837a2f97b24..d4444ce09dcb5 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -44,7 +44,7 @@ from ..metrics import check_scoring from ..utils import deprecated -__all__ = ['GridSearchCV', 'ParameterGrid', 'fit_grid_point', +__all__ = ['GridSearchCV', 'ParameterGrid', 'ParameterSampler', 'RandomizedSearchCV'] @@ -314,78 +314,6 @@ def __len__(self): return self.n_iter -# FIXME Remove fit_grid_point in 1.0 -@deprecated( - "fit_grid_point is deprecated in version 0.23 " - "and will be removed in version 1.0 (renaming of 0.25)" -) -def fit_grid_point(X, y, estimator, parameters, train, test, scorer, - verbose, error_score=np.nan, **fit_params): - """Run fit on one set of parameters. - - Parameters - ---------- - X : array-like, sparse matrix or list - Input data. - - y : array-like or None - Targets for input data. - - estimator : estimator object - A object of that type is instantiated for each grid point. - This is assumed to implement the scikit-learn estimator interface. - Either estimator needs to provide a ``score`` function, - or ``scoring`` must be passed. - - parameters : dict - Parameters to be set on estimator for this grid point. - - train : ndarray, dtype int or bool - Boolean mask or indices for training set. 
- - test : ndarray, dtype int or bool - Boolean mask or indices for test set. - - scorer : callable or None - The scorer callable object / function must have its signature as - ``scorer(estimator, X, y)``. - - If ``None`` the estimator's score method is used. - - verbose : int - Verbosity level. - - **fit_params : kwargs - Additional parameter passed to the fit function of the estimator. - - error_score : 'raise' or numeric, default=np.nan - Value to assign to the score if an error occurs in estimator fitting. - If set to 'raise', the error is raised. If a numeric value is given, - FitFailedWarning is raised. This parameter does not affect the refit - step, which will always raise the error. - - Returns - ------- - score : float - Score of this parameter setting on given test split. - - parameters : dict - The parameters that have been evaluated. - - n_samples_test : int - Number of test samples in this split. - """ - # NOTE we are not using the return value as the scorer by itself should be - # validated before. We use check_scoring only to reject multimetric scorer - check_scoring(estimator, scorer) - results = _fit_and_score(estimator, X, y, scorer, train, - test, verbose, parameters, - fit_params=fit_params, - return_n_test_samples=True, - error_score=error_score) - return results["test_scores"], parameters, results["n_test_samples"] - - def _check_param_grid(param_grid): if hasattr(param_grid, 'items'): param_grid = [param_grid] diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index b74e250e94192..2576d5f24006d 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -28,13 +28,12 @@ from scipy.stats import bernoulli, expon, uniform from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.base import clone, is_classifier +from sklearn.base import is_classifier from sklearn.exceptions import NotFittedError from sklearn.datasets import make_classification from sklearn.datasets import make_blobs from sklearn.datasets import make_multilabel_classification -from sklearn.model_selection import fit_grid_point from sklearn.model_selection import train_test_split from sklearn.model_selection import KFold from sklearn.model_selection import StratifiedKFold @@ -129,6 +128,7 @@ def score(self): def assert_grid_iter_equals_getitem(grid): assert list(grid) == [grid[i] for i in range(len(grid))] + @pytest.mark.parametrize("klass", [ParameterGrid, partial(ParameterSampler, n_iter=10)]) @pytest.mark.parametrize( @@ -1271,54 +1271,6 @@ def test_grid_search_correct_score_results(): assert_almost_equal(correct_score, cv_scores[i]) -# FIXME remove test_fit_grid_point as the function will be removed on 1.0 -@ignore_warnings(category=FutureWarning) -def test_fit_grid_point(): - X, y = make_classification(random_state=0) - cv = StratifiedKFold() - svc = LinearSVC(random_state=0) - scorer = make_scorer(accuracy_score) - - for params in ({'C': 0.1}, {'C': 0.01}, {'C': 0.001}): - for train, test in cv.split(X, y): - this_scores, this_params, n_test_samples = fit_grid_point( - X, y, clone(svc), params, train, test, - scorer, verbose=False) - - est = clone(svc).set_params(**params) - est.fit(X[train], y[train]) - expected_score = scorer(est, X[test], y[test]) - - # Test the return values of fit_grid_point - assert_almost_equal(this_scores, expected_score) - assert params == this_params - assert n_test_samples == test.size - - # Should raise an error upon multimetric scorer - error_msg = 
("For evaluating multiple scores, use " - "sklearn.model_selection.cross_validate instead.") - with pytest.raises(ValueError, match=error_msg): - fit_grid_point( - X, y, svc, params, train, test, {'score': scorer}, - verbose=True - ) - - -# FIXME remove test_fit_grid_point_deprecated as -# fit_grid_point will be removed on 1.0 -def test_fit_grid_point_deprecated(): - X, y = make_classification(random_state=0) - svc = LinearSVC(random_state=0) - scorer = make_scorer(accuracy_score) - msg = ("fit_grid_point is deprecated in version 0.23 " - "and will be removed in version 1.0") - params = {'C': 0.1} - train, test = next(StratifiedKFold().split(X, y)) - - with pytest.warns(FutureWarning, match=msg): - fit_grid_point(X, y, svc, params, train, test, scorer, verbose=False) - - def test_pickle(): # Test that a fit search can be pickled clf = MockClassifier() From 0012d845f579ff62493a5cc9492e77c204f195d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Tue, 11 May 2021 22:42:23 +0200 Subject: [PATCH 392/478] MNT Clean deprecations for 1.0 | AffinityPropagation (#19318) --- sklearn/cluster/_affinity_propagation.py | 18 ++++-------------- .../cluster/tests/test_affinity_propagation.py | 8 -------- sklearn/tests/test_docstring_parameters.py | 4 ---- 3 files changed, 4 insertions(+), 26 deletions(-) diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index cb9230cd2382f..93b98d8aff7ee 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -35,7 +35,7 @@ def all_equal_similarities(): @_deprecate_positional_args def affinity_propagation(S, *, preference=None, convergence_iter=15, max_iter=200, damping=0.5, copy=True, verbose=False, - return_n_iter=False, random_state='warn'): + return_n_iter=False, random_state=None): """Perform Affinity Propagation Clustering of data. Read more in the :ref:`User Guide `. @@ -75,7 +75,7 @@ def affinity_propagation(S, *, preference=None, convergence_iter=15, return_n_iter : bool, default=False Whether or not to return the number of iterations. - random_state : int, RandomState instance or None, default=0 + random_state : int, RandomState instance or None, default=None Pseudo-random number generator to control the starting state. Use an int for reproducible results across function calls. See the :term:`Glossary `. @@ -144,16 +144,6 @@ def affinity_propagation(S, *, preference=None, convergence_iter=15, if return_n_iter else (np.array([0]), np.array([0] * n_samples))) - if random_state == 'warn': - warnings.warn( - "'random_state' has been introduced in 0.23. It will be set to " - "None starting from 1.0 (renaming of 0.25) which means that " - "results will differ at every function call. Set 'random_state' " - "to None to silence this warning, or to 0 to keep the behavior of " - "versions <0.23.", - FutureWarning - ) - random_state = 0 random_state = check_random_state(random_state) # Place preference on the diagonal of S @@ -295,7 +285,7 @@ class AffinityPropagation(ClusterMixin, BaseEstimator): verbose : bool, default=False Whether to be verbose. - random_state : int, RandomState instance or None, default=0 + random_state : int, RandomState instance or None, default=None Pseudo-random number generator to control the starting state. Use an int for reproducible results across function calls. See the :term:`Glossary `. 
@@ -365,7 +355,7 @@ class AffinityPropagation(ClusterMixin, BaseEstimator): @_deprecate_positional_args def __init__(self, *, damping=.5, max_iter=200, convergence_iter=15, copy=True, preference=None, affinity='euclidean', - verbose=False, random_state='warn'): + verbose=False, random_state=None): self.damping = damping self.max_iter = max_iter diff --git a/sklearn/cluster/tests/test_affinity_propagation.py b/sklearn/cluster/tests/test_affinity_propagation.py index 51b4fd425349e..ae2806bf38e59 100644 --- a/sklearn/cluster/tests/test_affinity_propagation.py +++ b/sklearn/cluster/tests/test_affinity_propagation.py @@ -209,14 +209,6 @@ def test_affinity_propagation_random_state(): assert np.mean((centers0 - centers76) ** 2) > 1 -# FIXME: to be removed in 1.0 -def test_affinity_propagation_random_state_warning(): - # test that a warning is raised when random_state is not defined. - X = np.array([[0, 0], [1, 1], [-2, -2]]) - match = "'random_state' has been introduced in 0.23." - with pytest.warns(FutureWarning, match=match): - AffinityPropagation().fit(X) - @pytest.mark.parametrize('centers', [csr_matrix(np.zeros((1, 10))), np.zeros((1, 10))]) def test_affinity_propagation_convergence_warning_dense_sparse(centers): diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 719df2f4a0f77..099c27341927e 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -245,10 +245,6 @@ def test_fit_docstring_attributes(name, Estimator): if 'PLS' in Estimator.__name__ or 'CCA' in Estimator.__name__: est.n_components = 1 # default = 2 is invalid for single target. - # FIXME: TO BE REMOVED for 1.0 (avoid FutureWarning) - if Estimator.__name__ == 'AffinityPropagation': - est.random_state = 63 - # FIXME: TO BE REMOVED for 1.1 (avoid FutureWarning) if Estimator.__name__ == 'NMF': est.init = 'nndsvda' From 3bcbf85b3a133a6b27c272bf8566e8c26660903e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Tue, 11 May 2021 22:42:53 +0200 Subject: [PATCH 393/478] MNT Clean deprecations for 1.0 | SVM (#19322) --- sklearn/svm/_classes.py | 33 --------------------------------- sklearn/svm/tests/test_svm.py | 15 --------------- 2 files changed, 48 deletions(-) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 674fa294dcf3c..b151f5267da50 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -7,7 +7,6 @@ from ..utils.validation import _num_samples from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import check_classification_targets -from ..utils.deprecation import deprecated class LinearSVC(LinearClassifierMixin, @@ -1045,22 +1044,6 @@ def __init__(self, *, kernel='rbf', degree=3, gamma='scale', shrinking=shrinking, probability=False, cache_size=cache_size, class_weight=None, max_iter=max_iter, random_state=None) - # mypy error: Decorated property not supported - @deprecated( # type: ignore - "The probA_ attribute is deprecated in version 0.23 and will be " - "removed in version 1.0 (renaming of 0.25).") - @property - def probA_(self): - return self._probA - - # mypy error: Decorated property not supported - @deprecated( # type: ignore - "The probB_ attribute is deprecated in version 0.23 and will be " - "removed in version 1.0 (renaming of 0.25).") - @property - def probB_(self): - return self._probB - def _more_tags(self): return { '_xfail_checks': { @@ -1435,22 +1418,6 @@ def 
predict(self, X): y = super().predict(X) return np.asarray(y, dtype=np.intp) - # mypy error: Decorated property not supported - @deprecated( # type: ignore - "The probA_ attribute is deprecated in version 0.23 and will be " - "removed in version 1.0.") - @property - def probA_(self): - return self._probA - - # mypy error: Decorated property not supported - @deprecated( # type: ignore - "The probB_ attribute is deprecated in version 0.23 and will be " - "removed in version 1.0.") - @property - def probB_(self): - return self._probB - def _more_tags(self): return { '_xfail_checks': { diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index f1e2cea4be2dc..3fe57ad1b8375 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -1252,21 +1252,6 @@ def test_n_support_oneclass_svr(): assert reg.n_support_ == 4 -# TODO: Remove in 1.0 when probA_ and probB_ are deprecated -@pytest.mark.parametrize("SVMClass, data", [ - (svm.OneClassSVM, (X, )), - (svm.SVR, (X, Y)) -]) -@pytest.mark.parametrize("deprecated_prob", ["probA_", "probB_"]) -def test_svm_probA_proB_deprecated(SVMClass, data, deprecated_prob): - clf = SVMClass().fit(*data) - - msg = ("The {} attribute is deprecated in version 0.23 and will be " - "removed in version 1.0").format(deprecated_prob) - with pytest.warns(FutureWarning, match=msg): - getattr(clf, deprecated_prob) - - @pytest.mark.parametrize("Estimator", [svm.SVC, svm.SVR]) def test_custom_kernel_not_array_input(Estimator): """Test using a custom kernel that is not fed with array-like for floats""" From 847fc6a27431d96eaef926773608168e8edb9e12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Wed, 12 May 2021 00:00:03 +0200 Subject: [PATCH 394/478] MNT Clean deprecations for 1.0 | KMeans (#19317) Co-authored-by: Olivier Grisel --- sklearn/cluster/_bicluster.py | 43 ++----------- sklearn/cluster/_kmeans.py | 86 +++---------------------- sklearn/cluster/tests/test_bicluster.py | 13 ---- sklearn/cluster/tests/test_k_means.py | 26 -------- 4 files changed, 13 insertions(+), 155 deletions(-) diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index 3bde33399a8e0..2b5184fae40ae 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -3,7 +3,6 @@ # License: BSD 3 clause from abc import ABCMeta, abstractmethod -import warnings import numpy as np @@ -89,14 +88,13 @@ class BaseSpectral(BiclusterMixin, BaseEstimator, metaclass=ABCMeta): @abstractmethod def __init__(self, n_clusters=3, svd_method="randomized", n_svd_vecs=None, mini_batch=False, init="k-means++", - n_init=10, n_jobs='deprecated', random_state=None): + n_init=10, random_state=None): self.n_clusters = n_clusters self.svd_method = svd_method self.n_svd_vecs = n_svd_vecs self.mini_batch = mini_batch self.init = init self.n_init = n_init - self.n_jobs = n_jobs self.random_state = random_state def _check_parameters(self): @@ -116,10 +114,6 @@ def fit(self, X, y=None): y : Ignored """ - if self.n_jobs != 'deprecated': - warnings.warn("'n_jobs' was deprecated in version 0.23 and will be" - " removed in 1.0 (renaming of 0.25).", FutureWarning) - X = self._validate_data(X, accept_sparse='csr', dtype=np.float64) self._check_parameters() self._fit(X) @@ -171,8 +165,7 @@ def _k_means(self, data, n_clusters): random_state=self.random_state) else: model = KMeans(n_clusters, init=self.init, - n_init=self.n_init, n_jobs=self.n_jobs, - 
random_state=self.random_state) + n_init=self.n_init, random_state=self.random_state) model.fit(data) centroid = model.cluster_centers_ labels = model.labels_ @@ -242,19 +235,6 @@ class SpectralCoclustering(BaseSpectral): chosen and the algorithm runs once. Otherwise, the algorithm is run for each initialization and the best solution chosen. - n_jobs : int, default=None - The number of jobs to use for the computation. This works by breaking - down the pairwise matrix into n_jobs even slices and computing them in - parallel. - - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. - - .. deprecated:: 0.23 - ``n_jobs`` was deprecated in version 0.23 and will be removed in - 1.0 (renaming of 0.25). - random_state : int, RandomState instance, default=None Used for randomizing the singular value decomposition and the k-means initialization. Use an int to make the randomness deterministic. @@ -300,14 +280,13 @@ class SpectralCoclustering(BaseSpectral): @_deprecate_positional_args def __init__(self, n_clusters=3, *, svd_method='randomized', n_svd_vecs=None, mini_batch=False, init='k-means++', - n_init=10, n_jobs='deprecated', random_state=None): + n_init=10, random_state=None): super().__init__(n_clusters, svd_method, n_svd_vecs, mini_batch, init, n_init, - n_jobs, random_state) def _fit(self, X): @@ -394,19 +373,6 @@ class SpectralBiclustering(BaseSpectral): chosen and the algorithm runs once. Otherwise, the algorithm is run for each initialization and the best solution chosen. - n_jobs : int, default=None - The number of jobs to use for the computation. This works by breaking - down the pairwise matrix into n_jobs even slices and computing them in - parallel. - - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. - - .. deprecated:: 0.23 - ``n_jobs`` was deprecated in version 0.23 and will be removed in - 1.0 (renaming of 0.25). - random_state : int, RandomState instance, default=None Used for randomizing the singular value decomposition and the k-means initialization. Use an int to make the randomness deterministic. @@ -453,14 +419,13 @@ class SpectralBiclustering(BaseSpectral): def __init__(self, n_clusters=3, *, method='bistochastic', n_components=6, n_best=3, svd_method='randomized', n_svd_vecs=None, mini_batch=False, init='k-means++', - n_init=10, n_jobs='deprecated', random_state=None): + n_init=10, random_state=None): super().__init__(n_clusters, svd_method, n_svd_vecs, mini_batch, init, n_init, - n_jobs, random_state) self.method = method self.n_components = n_components diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 44c2837a8802a..ccb472b7f94dc 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -254,9 +254,9 @@ def _tolerance(X, tol): @_deprecate_positional_args def k_means(X, n_clusters, *, sample_weight=None, init='k-means++', - precompute_distances='deprecated', n_init=10, max_iter=300, - verbose=False, tol=1e-4, random_state=None, copy_x=True, - n_jobs='deprecated', algorithm="auto", return_n_iter=False): + n_init=10, max_iter=300, verbose=False, tol=1e-4, + random_state=None, copy_x=True, algorithm="auto", + return_n_iter=False): """K-means clustering algorithm. Read more in the :ref:`User Guide `. 
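Since the removed n_jobs parameter only controlled the OpenMP thread pool used by the main Cython assignment loop, the replacement is to limit that pool from the outside, for example with the threadpoolctl package or the OMP_NUM_THREADS environment variable. A hedged sketch of the equivalent of the old n_jobs=1 behaviour on random toy data:

    import numpy as np
    from threadpoolctl import threadpool_limits

    from sklearn.cluster import KMeans

    X = np.random.RandomState(0).rand(100, 2)

    # KMeans no longer accepts n_jobs; the sample-to-center assignment loop
    # always uses the OpenMP thread pool. Capping the pool reproduces the
    # old single-threaded (n_jobs=1) behaviour.
    with threadpool_limits(limits=1):
        km = KMeans(n_clusters=2, n_init=1, random_state=0).fit(X)

    print(km.cluster_centers_)
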
@@ -293,21 +293,6 @@ def k_means(X, n_clusters, *, sample_weight=None, init='k-means++', If a callable is passed, it should take arguments X, n_clusters and a random state and return an initialization. - precompute_distances : {'auto', True, False} - Precompute distances (faster but takes more memory). - - 'auto' : do not precompute distances if n_samples * n_clusters > 12 - million. This corresponds to about 100MB overhead per job using - double precision. - - True : always precompute distances - - False : never precompute distances - - .. deprecated:: 0.23 - 'precompute_distances' was deprecated in version 0.23 and will be - removed in 1.0 (renaming of 0.25). It has no effect. - n_init : int, default=10 Number of time the k-means algorithm will be run with different centroid seeds. The final results will be the best output of @@ -339,17 +324,6 @@ def k_means(X, n_clusters, *, sample_weight=None, init='k-means++', copy_x is False. If the original data is sparse, but not in CSR format, a copy will be made even if copy_x is False. - n_jobs : int, default=None - The number of OpenMP threads to use for the computation. Parallelism is - sample-wise on the main cython loop which assigns each sample to its - closest center. - - ``None`` or ``-1`` means using all processors. - - .. deprecated:: 0.23 - ``n_jobs`` was deprecated in version 0.23 and will be removed in - 1.0 (renaming of 0.25). - algorithm : {"auto", "full", "elkan"}, default="auto" K-means algorithm to use. The classical EM-style algorithm is "full". The "elkan" variation is more efficient on data with well-defined @@ -382,8 +356,7 @@ def k_means(X, n_clusters, *, sample_weight=None, init='k-means++', """ est = KMeans( n_clusters=n_clusters, init=init, n_init=n_init, max_iter=max_iter, - verbose=verbose, precompute_distances=precompute_distances, tol=tol, - random_state=random_state, copy_x=copy_x, n_jobs=n_jobs, + verbose=verbose, tol=tol, random_state=random_state, copy_x=copy_x, algorithm=algorithm ).fit(X, sample_weight=sample_weight) if return_n_iter: @@ -747,21 +720,6 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): in the cluster centers of two consecutive iterations to declare convergence. - precompute_distances : {'auto', True, False}, default='auto' - Precompute distances (faster but takes more memory). - - 'auto' : do not precompute distances if n_samples * n_clusters > 12 - million. This corresponds to about 100MB overhead per job using - double precision. - - True : always precompute distances. - - False : never precompute distances. - - .. deprecated:: 0.23 - 'precompute_distances' was deprecated in version 0.22 and will be - removed in 1.0 (renaming of 0.25). It has no effect. - verbose : int, default=0 Verbosity mode. @@ -780,17 +738,6 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): copy_x is False. If the original data is sparse, but not in CSR format, a copy will be made even if copy_x is False. - n_jobs : int, default=None - The number of OpenMP threads to use for the computation. Parallelism is - sample-wise on the main cython loop which assigns each sample to its - closest center. - - ``None`` or ``-1`` means using all processors. - - .. deprecated:: 0.23 - ``n_jobs`` was deprecated in version 0.23 and will be removed in - 1.0 (renaming of 0.25). - algorithm : {"auto", "full", "elkan"}, default="auto" K-means algorithm to use. The classical EM-style algorithm is "full". 
The "elkan" variation is more efficient on data with well-defined @@ -868,38 +815,20 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): """ @_deprecate_positional_args def __init__(self, n_clusters=8, *, init='k-means++', n_init=10, - max_iter=300, tol=1e-4, precompute_distances='deprecated', - verbose=0, random_state=None, copy_x=True, - n_jobs='deprecated', algorithm='auto'): + max_iter=300, tol=1e-4, verbose=0, random_state=None, + copy_x=True, algorithm='auto'): self.n_clusters = n_clusters self.init = init self.max_iter = max_iter self.tol = tol - self.precompute_distances = precompute_distances self.n_init = n_init self.verbose = verbose self.random_state = random_state self.copy_x = copy_x - self.n_jobs = n_jobs self.algorithm = algorithm def _check_params(self, X): - # precompute_distances - if self.precompute_distances != 'deprecated': - warnings.warn("'precompute_distances' was deprecated in version " - "0.23 and will be removed in 1.0 (renaming of 0.25)" - ". It has no effect", FutureWarning) - - # n_jobs - if self.n_jobs != 'deprecated': - warnings.warn("'n_jobs' was deprecated in version 0.23 and will be" - " removed in 1.0 (renaming of 0.25).", FutureWarning) - self._n_threads = self.n_jobs - else: - self._n_threads = None - self._n_threads = _openmp_effective_n_threads(self._n_threads) - # n_init if self.n_init <= 0: raise ValueError( @@ -1088,6 +1017,7 @@ def fit(self, X, y=None, sample_weight=None): self._check_params(X) random_state = check_random_state(self.random_state) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + self._n_threads = _openmp_effective_n_threads() # Validate init array init = self.init @@ -1757,6 +1687,7 @@ def fit(self, X, y=None, sample_weight=None): self._check_params(X) random_state = check_random_state(self.random_state) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + self._n_threads = _openmp_effective_n_threads() n_samples, n_features = X.shape # Validate init array @@ -1906,6 +1837,7 @@ def partial_fit(self, X, y=None, sample_weight=None): if not has_centers: # this instance has not been fitted yet (fit or partial_fit) self._check_params(X) + self._n_threads = _openmp_effective_n_threads() # Validate init array init = self.init diff --git a/sklearn/cluster/tests/test_bicluster.py b/sklearn/cluster/tests/test_bicluster.py index 97ca3db0201b6..93e9a00c7bce8 100644 --- a/sklearn/cluster/tests/test_bicluster.py +++ b/sklearn/cluster/tests/test_bicluster.py @@ -262,16 +262,3 @@ def test_n_features_in_(est): assert not hasattr(est, 'n_features_in_') est.fit(X) assert est.n_features_in_ == 3 - - -@pytest.mark.parametrize("klass", [SpectralBiclustering, SpectralCoclustering]) -@pytest.mark.parametrize("n_jobs", [None, 1]) -def test_n_jobs_deprecated(klass, n_jobs): - # FIXME: remove in 1.0 - depr_msg = ("'n_jobs' was deprecated in version 0.23 and will be removed " - "in 1.0") - S, _, _ = make_biclusters((30, 30), 3, noise=0.5, random_state=0) - est = klass(random_state=0, n_jobs=n_jobs) - - with pytest.warns(FutureWarning, match=depr_msg): - est.fit(S) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 248b2e1ddd498..8ba7f45691b70 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -884,32 +884,6 @@ def test_result_equal_in_diff_n_threads(Estimator): assert_array_equal(result_1, result_2) -@pytest.mark.parametrize("precompute_distances", ["auto", False, True]) -def 
test_precompute_distance_deprecated(precompute_distances): - # FIXME: remove in 1.0 - depr_msg = ("'precompute_distances' was deprecated in version 0.23 and " - "will be removed in 1.0") - X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) - kmeans = KMeans(n_clusters=2, n_init=1, init='random', random_state=0, - precompute_distances=precompute_distances) - - with pytest.warns(FutureWarning, match=depr_msg): - kmeans.fit(X) - - -@pytest.mark.parametrize("n_jobs", [None, 1]) -def test_n_jobs_deprecated(n_jobs): - # FIXME: remove in 1.0 - depr_msg = ("'n_jobs' was deprecated in version 0.23 and will be removed " - "in 1.0") - X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) - kmeans = KMeans(n_clusters=2, n_init=1, init='random', random_state=0, - n_jobs=n_jobs) - - with pytest.warns(FutureWarning, match=depr_msg): - kmeans.fit(X) - - @pytest.mark.parametrize("attr", ["counts_", "init_size_", "random_state_"]) def test_minibatch_kmeans_deprecated_attributes(attr): # check that we raise a deprecation warning when accessing `init_size_` From bfb5b39d06d29b965238411a07d462ab69d7b38c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Fauchereau?= Date: Thu, 13 May 2021 13:10:54 +0000 Subject: [PATCH 395/478] DOC typo in doc/developers/tips.rst (#20088) --- doc/developers/tips.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/developers/tips.rst b/doc/developers/tips.rst index 36e2cd4a58779..7bef6580c1a6e 100644 --- a/doc/developers/tips.rst +++ b/doc/developers/tips.rst @@ -229,7 +229,7 @@ Debugging memory errors in Cython with valgrind While python/numpy's built-in memory management is relatively robust, it can lead to performance penalties for some routines. For this reason, much of -the high-performance code in scikit-learn in written in cython. This +the high-performance code in scikit-learn is written in cython. This performance gain comes with a tradeoff, however: it is very easy for memory bugs to crop up in cython code, especially in situations where that code relies heavily on pointer arithmetic. From 48ab1bf71aea9b7036108179e00e0b2e1c3fcf7e Mon Sep 17 00:00:00 2001 From: ZeyuSun <38712452+ZeyuSun@users.noreply.github.com> Date: Thu, 13 May 2021 11:43:56 -0400 Subject: [PATCH 396/478] DOC correct behavior of needs_threshold in make_score (#20079) Co-authored-by: Thomas J. Fan --- sklearn/metrics/_scorer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 8a814242cb6f1..39c4523f9bde6 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -611,7 +611,8 @@ def make_scorer(score_func, *, greater_is_better=True, needs_proba=False, output of :term:`predict_proba` (For binary `y_true`, the score function is supposed to accept probability of the positive class). If `needs_threshold=True`, the score function is supposed to accept the - output of :term:`decision_function`. + output of :term:`decision_function` or :term:`predict_proba` when + :term:`decision_function` is not present. """ sign = 1 if greater_is_better else -1 if needs_proba and needs_threshold: From f6e6ad2d9e9172c55c778392b27b69c6af87bd98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=20Dupr=C3=A9=20la=20Tour?= Date: Fri, 14 May 2021 08:30:27 -0700 Subject: [PATCH 397/478] MNT clean futurewarning for 1.0 | _deprecate_positional_args (#20002) Co-authored-by: Olivier Grisel Co-authored-by: Thomas J. 
Fan --- azure-pipelines.yml | 3 +- doc/modules/learning_curve.rst | 6 ++-- doc/modules/model_evaluation.rst | 2 +- doc/whats_new/v1.0.rst | 12 +++++++ examples/manifold/plot_compare_methods.py | 6 ++-- examples/manifold/plot_manifold_sphere.py | 10 +++--- sklearn/base.py | 2 -- sklearn/calibration.py | 3 -- sklearn/cluster/_affinity_propagation.py | 4 +-- sklearn/cluster/_agglomerative.py | 5 +-- sklearn/cluster/_bicluster.py | 4 +-- sklearn/cluster/_birch.py | 3 +- sklearn/cluster/_dbscan.py | 4 +-- sklearn/cluster/_kmeans.py | 4 --- sklearn/cluster/_mean_shift.py | 5 +-- sklearn/cluster/_optics.py | 4 --- sklearn/cluster/_spectral.py | 4 --- sklearn/cluster/tests/test_hierarchical.py | 3 +- sklearn/compose/_column_transformer.py | 3 -- sklearn/compose/_target.py | 2 -- sklearn/covariance/_elliptic_envelope.py | 2 -- sklearn/covariance/_empirical_covariance.py | 3 -- sklearn/covariance/_graph_lasso.py | 4 --- sklearn/covariance/_robust_covariance.py | 2 -- sklearn/covariance/_shrunk_covariance.py | 5 --- sklearn/cross_decomposition/_pls.py | 5 --- sklearn/datasets/_base.py | 9 ------ sklearn/datasets/_california_housing.py | 2 -- sklearn/datasets/_covtype.py | 2 -- sklearn/datasets/_kddcup99.py | 2 -- sklearn/datasets/_lfw.py | 3 -- sklearn/datasets/_olivetti_faces.py | 2 -- sklearn/datasets/_openml.py | 2 -- sklearn/datasets/_rcv1.py | 2 -- sklearn/datasets/_samples_generator.py | 21 ------------- sklearn/datasets/_species_distributions.py | 2 -- sklearn/datasets/_svmlight_format_io.py | 4 --- sklearn/datasets/_twenty_newsgroups.py | 3 -- sklearn/decomposition/_dict_learning.py | 8 +---- sklearn/decomposition/_factor_analysis.py | 3 +- sklearn/decomposition/_fastica.py | 3 -- sklearn/decomposition/_incremental_pca.py | 2 -- sklearn/decomposition/_kernel_pca.py | 2 -- sklearn/decomposition/_lda.py | 2 -- sklearn/decomposition/_nmf.py | 3 -- sklearn/decomposition/_pca.py | 2 -- sklearn/decomposition/_sparse_pca.py | 3 -- sklearn/decomposition/_truncated_svd.py | 2 -- sklearn/discriminant_analysis.py | 2 -- sklearn/dummy.py | 3 -- sklearn/ensemble/_bagging.py | 4 +-- sklearn/ensemble/_forest.py | 6 ---- sklearn/ensemble/_gb.py | 3 -- .../gradient_boosting.py | 5 +-- sklearn/ensemble/_iforest.py | 2 -- sklearn/ensemble/_stacking.py | 3 -- sklearn/ensemble/_voting.py | 3 -- sklearn/ensemble/_weight_boosting.py | 3 -- .../feature_extraction/_dict_vectorizer.py | 2 -- sklearn/feature_extraction/_hash.py | 2 -- sklearn/feature_extraction/image.py | 5 --- sklearn/feature_extraction/text.py | 5 --- sklearn/feature_selection/_from_model.py | 2 -- sklearn/feature_selection/_mutual_info.py | 3 -- sklearn/feature_selection/_rfe.py | 3 -- .../_univariate_selection.py | 18 +++-------- sklearn/gaussian_process/_gpc.py | 3 -- sklearn/gaussian_process/_gpr.py | 2 -- sklearn/impute/_base.py | 3 -- sklearn/impute/_knn.py | 2 -- sklearn/inspection/_partial_dependence.py | 2 -- sklearn/inspection/_permutation_importance.py | 2 -- .../inspection/_plot/partial_dependence.py | 2 -- sklearn/isotonic.py | 4 +-- sklearn/kernel_approximation.py | 6 +--- sklearn/kernel_ridge.py | 2 -- sklearn/linear_model/_base.py | 2 -- sklearn/linear_model/_bayes.py | 3 -- sklearn/linear_model/_coordinate_descent.py | 11 ------- sklearn/linear_model/_huber.py | 2 -- sklearn/linear_model/_least_angle.py | 8 ----- sklearn/linear_model/_logistic.py | 3 -- sklearn/linear_model/_omp.py | 5 --- sklearn/linear_model/_passive_aggressive.py | 3 -- sklearn/linear_model/_perceptron.py | 2 -- sklearn/linear_model/_ransac.py | 2 -- 
sklearn/linear_model/_ridge.py | 8 ----- sklearn/linear_model/_sag.py | 2 -- sklearn/linear_model/_stochastic_gradient.py | 6 ---- sklearn/linear_model/_theil_sen.py | 2 -- sklearn/linear_model/tests/test_omp.py | 2 +- sklearn/manifold/_isomap.py | 2 -- sklearn/manifold/_locally_linear.py | 3 -- sklearn/manifold/_mds.py | 3 -- sklearn/manifold/_spectral_embedding.py | 3 -- sklearn/manifold/_t_sne.py | 3 -- sklearn/metrics/_classification.py | 19 ------------ sklearn/metrics/_plot/confusion_matrix.py | 4 --- .../metrics/_plot/precision_recall_curve.py | 4 --- sklearn/metrics/_plot/roc_curve.py | 4 --- sklearn/metrics/_ranking.py | 10 ------ sklearn/metrics/_regression.py | 10 ------ sklearn/metrics/_scorer.py | 3 -- sklearn/metrics/cluster/_bicluster.py | 2 -- sklearn/metrics/cluster/_supervised.py | 8 ----- sklearn/metrics/cluster/_unsupervised.py | 3 -- sklearn/metrics/cluster/tests/test_common.py | 5 +-- .../metrics/cluster/tests/test_supervised.py | 31 +++++++++++-------- sklearn/metrics/pairwise.py | 11 ------- sklearn/metrics/tests/test_classification.py | 6 ++-- sklearn/mixture/_bayesian_mixture.py | 2 -- sklearn/mixture/_gaussian_mixture.py | 2 -- sklearn/model_selection/_search.py | 6 ---- sklearn/model_selection/_split.py | 13 -------- sklearn/model_selection/_validation.py | 7 ----- sklearn/model_selection/tests/test_split.py | 6 ++-- sklearn/multiclass.py | 4 --- sklearn/multioutput.py | 6 +--- sklearn/naive_bayes.py | 6 ---- sklearn/neighbors/_classification.py | 3 -- sklearn/neighbors/_graph.py | 6 +--- sklearn/neighbors/_kde.py | 2 -- sklearn/neighbors/_lof.py | 2 -- sklearn/neighbors/_nca.py | 2 -- sklearn/neighbors/_nearest_centroid.py | 2 -- sklearn/neighbors/_regression.py | 3 -- sklearn/neighbors/_unsupervised.py | 2 -- .../neural_network/_multilayer_perceptron.py | 4 +-- sklearn/neural_network/_rbm.py | 3 +- sklearn/pipeline.py | 3 -- sklearn/preprocessing/_data.py | 18 +---------- sklearn/preprocessing/_discretization.py | 2 -- sklearn/preprocessing/_encoders.py | 3 -- .../preprocessing/_function_transformer.py | 2 -- sklearn/preprocessing/_label.py | 4 --- sklearn/preprocessing/_polynomial.py | 4 +-- sklearn/random_projection.py | 4 --- sklearn/semi_supervised/_label_propagation.py | 4 --- sklearn/svm/_bounds.py | 2 -- sklearn/svm/_classes.py | 8 ----- sklearn/tree/_classes.py | 6 ---- sklearn/tree/_export.py | 4 --- sklearn/tree/tests/test_tree.py | 2 +- sklearn/utils/__init__.py | 7 +---- sklearn/utils/class_weight.py | 4 --- sklearn/utils/extmath.py | 6 ---- sklearn/utils/graph.py | 2 -- sklearn/utils/sparsefuncs.py | 2 -- sklearn/utils/validation.py | 11 ++----- 149 files changed, 84 insertions(+), 586 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 3cd2b5bb4cd9f..412de99f5e57d 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -44,7 +44,8 @@ jobs: inputs: versionSpec: '3.9' - bash: | - pip install flake8 mypy==0.782 + # Include pytest compatibility with mypy + pip install pytest flake8 mypy==0.782 displayName: Install linters - bash: | ./build_tools/circle/linting.sh diff --git a/doc/modules/learning_curve.rst b/doc/modules/learning_curve.rst index 4fb90df937e15..249571aa2320a 100644 --- a/doc/modules/learning_curve.rst +++ b/doc/modules/learning_curve.rst @@ -79,9 +79,9 @@ The function :func:`validation_curve` can help in this case:: >>> np.random.shuffle(indices) >>> X, y = X[indices], y[indices] - >>> train_scores, valid_scores = validation_curve(Ridge(), X, y, "alpha", - ... np.logspace(-7, 3, 3), - ... 
cv=5) + >>> train_scores, valid_scores = validation_curve( + ... Ridge(), X, y, param_name="alpha", param_range=np.logspace(-7, 3, 3), + ... cv=5) >>> train_scores array([[0.93..., 0.94..., 0.92..., 0.91..., 0.92...], [0.93..., 0.94..., 0.92..., 0.91..., 0.92...], diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index c807af982e277..bc781efc35d58 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1095,7 +1095,7 @@ with a svm classifier in a multiclass problem:: LinearSVC() >>> pred_decision = est.decision_function([[-1], [2], [3]]) >>> y_true = [0, 2, 3] - >>> hinge_loss(y_true, pred_decision, labels) + >>> hinge_loss(y_true, pred_decision, labels=labels) 0.56... .. _log_loss: diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 8ad8a295d72e0..f94e7001fdc97 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -12,6 +12,18 @@ Version 1.0.0 .. include:: changelog_legend.inc +Enforcing keyword-only arguments +-------------------------------- + +In an effort to promote clear and non-ambiguous use of the library, most +constructor and function parameters must now be passed as keyword arguments +(i.e. using the `param=value` syntax) instead of positional. If a keyword-only +parameter is used as positional, a `TypeError` is now raised. +:issue:`15005` :pr:`20002` by `Joel Nothman`_, `Adrin Jalali`_, `Thomas Fan`_, +`Nicolas Hug`_, and `Tom Dupre la Tour`_. See `SLEP009 +`_ +for more details. + Put the changes in their relevant module. Changed models diff --git a/examples/manifold/plot_compare_methods.py b/examples/manifold/plot_compare_methods.py index ed01e8ac19b89..c78ecc234186a 100644 --- a/examples/manifold/plot_compare_methods.py +++ b/examples/manifold/plot_compare_methods.py @@ -53,14 +53,16 @@ # Set-up manifold methods LLE = partial(manifold.LocallyLinearEmbedding, - n_neighbors, n_components, eigen_solver='auto') + n_neighbors=n_neighbors, n_components=n_components, + eigen_solver='auto') methods = OrderedDict() methods['LLE'] = LLE(method='standard') methods['LTSA'] = LLE(method='ltsa') methods['Hessian LLE'] = LLE(method='hessian') methods['Modified LLE'] = LLE(method='modified') -methods['Isomap'] = manifold.Isomap(n_neighbors, n_components) +methods['Isomap'] = manifold.Isomap(n_neighbors=n_neighbors, + n_components=n_components) methods['MDS'] = manifold.MDS(n_components, max_iter=100, n_init=1) methods['SE'] = manifold.SpectralEmbedding(n_components=n_components, n_neighbors=n_neighbors) diff --git a/examples/manifold/plot_manifold_sphere.py b/examples/manifold/plot_manifold_sphere.py index 2b6566c4ecd92..fbc125fb8773f 100644 --- a/examples/manifold/plot_manifold_sphere.py +++ b/examples/manifold/plot_manifold_sphere.py @@ -78,9 +78,9 @@ for i, method in enumerate(methods): t0 = time() - trans_data = manifold\ - .LocallyLinearEmbedding(n_neighbors, 2, - method=method).fit_transform(sphere_data).T + trans_data = manifold.LocallyLinearEmbedding( + n_neighbors=n_neighbors, n_components=2, + method=method).fit_transform(sphere_data).T t1 = time() print("%s: %.2g sec" % (methods[i], t1 - t0)) @@ -93,8 +93,8 @@ # Perform Isomap Manifold learning. 
t0 = time() -trans_data = manifold.Isomap(n_neighbors, n_components=2)\ - .fit_transform(sphere_data).T +trans_data = manifold.Isomap(n_neighbors=n_neighbors, + n_components=2).fit_transform(sphere_data).T t1 = time() print("%s: %.2g sec" % ('ISO', t1 - t0)) diff --git a/sklearn/base.py b/sklearn/base.py index ec264b0cf5edc..e8b51df634a1f 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -23,10 +23,8 @@ from .utils.validation import check_array from .utils.validation import _num_features from .utils._estimator_html_repr import estimator_html_repr -from .utils.validation import _deprecate_positional_args -@_deprecate_positional_args def clone(estimator, *, safe=True): """Constructs a new unfitted estimator with the same parameters. diff --git a/sklearn/calibration.py b/sklearn/calibration.py index c6289d1df2936..084f3bf242e3c 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -36,7 +36,6 @@ from .isotonic import IsotonicRegression from .svm import LinearSVC from .model_selection import check_cv, cross_val_predict -from .utils.validation import _deprecate_positional_args class CalibratedClassifierCV(ClassifierMixin, @@ -215,7 +214,6 @@ class CalibratedClassifierCV(ClassifierMixin, .. [4] Predicting Good Probabilities with Supervised Learning, A. Niculescu-Mizil & R. Caruana, ICML 2005 """ - @_deprecate_positional_args def __init__(self, base_estimator=None, *, method='sigmoid', cv=None, n_jobs=None, ensemble=True): self.base_estimator = base_estimator @@ -788,7 +786,6 @@ def predict(self, T): return expit(-(self.a_ * T + self.b_)) -@_deprecate_positional_args def calibration_curve(y_true, y_prob, *, normalize=False, n_bins=5, strategy='uniform'): """Compute true and predicted probabilities for a calibration curve. diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 93b98d8aff7ee..ccae0b7538b58 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -12,7 +12,7 @@ from ..base import BaseEstimator, ClusterMixin from ..utils import as_float_array, check_random_state from ..utils.deprecation import deprecated -from ..utils.validation import check_is_fitted, _deprecate_positional_args +from ..utils.validation import check_is_fitted from ..metrics import euclidean_distances from ..metrics import pairwise_distances_argmin from .._config import config_context @@ -32,7 +32,6 @@ def all_equal_similarities(): return all_equal_preferences() and all_equal_similarities() -@_deprecate_positional_args def affinity_propagation(S, *, preference=None, convergence_iter=15, max_iter=200, damping=0.5, copy=True, verbose=False, return_n_iter=False, random_state=None): @@ -352,7 +351,6 @@ class AffinityPropagation(ClusterMixin, BaseEstimator): array([[1, 2], [4, 2]]) """ - @_deprecate_positional_args def __init__(self, *, damping=.5, max_iter=200, convergence_iter=15, copy=True, preference=None, affinity='euclidean', verbose=False, random_state=None): diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index ee0a117824dd8..4b0089b707233 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -21,7 +21,7 @@ from ..utils import check_array from ..utils._fast_dict import IntFloatDict from ..utils.fixes import _astype_copy_false -from ..utils.validation import _deprecate_positional_args, check_memory +from ..utils.validation import check_memory # mypy error: Module 'sklearn.cluster' has no attribute '_hierarchical_fast' from . 
import _hierarchical_fast as _hierarchical # type: ignore from ._feature_agglomeration import AgglomerationTransform @@ -134,7 +134,6 @@ def _single_linkage_tree(connectivity, n_samples, n_nodes, n_clusters, ############################################################################### # Hierarchical tree building functions -@_deprecate_positional_args def ward_tree(X, *, connectivity=None, n_clusters=None, return_distance=False): """Ward clustering based on a Feature matrix. @@ -800,7 +799,6 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): array([1, 1, 1, 0, 0, 0]) """ - @_deprecate_positional_args def __init__(self, n_clusters=2, *, affinity="euclidean", memory=None, connectivity=None, compute_full_tree='auto', @@ -1068,7 +1066,6 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): >>> X_reduced.shape (1797, 32) """ - @_deprecate_positional_args def __init__(self, n_clusters=2, *, affinity="euclidean", memory=None, connectivity=None, compute_full_tree='auto', diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index 2b5184fae40ae..c8ff1bb036662 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -17,7 +17,7 @@ from ..utils.extmath import (make_nonnegative, randomized_svd, safe_sparse_dot) -from ..utils.validation import assert_all_finite, _deprecate_positional_args +from ..utils.validation import assert_all_finite __all__ = ['SpectralCoclustering', @@ -277,7 +277,6 @@ class SpectralCoclustering(BaseSpectral): `__. """ - @_deprecate_positional_args def __init__(self, n_clusters=3, *, svd_method='randomized', n_svd_vecs=None, mini_batch=False, init='k-means++', n_init=10, random_state=None): @@ -415,7 +414,6 @@ class SpectralBiclustering(BaseSpectral): `__. """ - @_deprecate_positional_args def __init__(self, n_clusters=3, *, method='bistochastic', n_components=6, n_best=3, svd_method='randomized', n_svd_vecs=None, mini_batch=False, init='k-means++', diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 0587fe075a952..da1bf894f03f8 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -14,7 +14,7 @@ from ..base import TransformerMixin, ClusterMixin, BaseEstimator from ..utils.extmath import row_norms from ..utils import deprecated -from ..utils.validation import check_is_fitted, _deprecate_positional_args +from ..utils.validation import check_is_fitted from ..exceptions import ConvergenceWarning from . 
import AgglomerativeClustering from .._config import config_context @@ -440,7 +440,6 @@ class Birch(ClusterMixin, TransformerMixin, BaseEstimator): >>> brc.predict(X) array([0, 0, 0, 1, 1, 1]) """ - @_deprecate_positional_args def __init__(self, *, threshold=0.5, branching_factor=50, n_clusters=3, compute_labels=True, copy=True): self.threshold = threshold diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index a841a9b7c213c..bbc3470256e90 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -14,13 +14,12 @@ from scipy import sparse from ..base import BaseEstimator, ClusterMixin -from ..utils.validation import _check_sample_weight, _deprecate_positional_args +from ..utils.validation import _check_sample_weight from ..neighbors import NearestNeighbors from ._dbscan_inner import dbscan_inner -@_deprecate_positional_args def dbscan(X, eps=0.5, *, min_samples=5, metric='minkowski', metric_params=None, algorithm='auto', leaf_size=30, p=2, sample_weight=None, n_jobs=None): @@ -269,7 +268,6 @@ class DBSCAN(ClusterMixin, BaseEstimator): DBSCAN revisited, revisited: why and how you should (still) use DBSCAN. ACM Transactions on Database Systems (TODS), 42(3), 19. """ - @_deprecate_positional_args def __init__(self, eps=0.5, *, min_samples=5, metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None): diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index ccb472b7f94dc..8b24be6ace987 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -24,7 +24,6 @@ from ..utils.extmath import row_norms, stable_cumsum from ..utils.sparsefuncs_fast import assign_rows_csr from ..utils.sparsefuncs import mean_variance_axis -from ..utils.validation import _deprecate_positional_args from ..utils import check_array from ..utils import check_random_state from ..utils import deprecated @@ -252,7 +251,6 @@ def _tolerance(X, tol): return np.mean(variances) * tol -@_deprecate_positional_args def k_means(X, n_clusters, *, sample_weight=None, init='k-means++', n_init=10, max_iter=300, verbose=False, tol=1e-4, random_state=None, copy_x=True, algorithm="auto", @@ -813,7 +811,6 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): array([[10., 2.], [ 1., 2.]]) """ - @_deprecate_positional_args def __init__(self, n_clusters=8, *, init='k-means++', n_init=10, max_iter=300, tol=1e-4, verbose=0, random_state=None, copy_x=True, algorithm='auto'): @@ -1508,7 +1505,6 @@ class MiniBatchKMeans(KMeans): >>> kmeans.predict([[0, 0], [4, 4]]) array([0, 1], dtype=int32) """ - @_deprecate_positional_args def __init__(self, n_clusters=8, *, init='k-means++', max_iter=100, batch_size=1024, verbose=0, compute_labels=True, random_state=None, tol=0.0, max_no_improvement=10, diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index fa62d2c8d9fe7..f48ef46e8dbef 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -19,7 +19,7 @@ from joblib import Parallel from collections import defaultdict -from ..utils.validation import check_is_fitted, _deprecate_positional_args +from ..utils.validation import check_is_fitted from ..utils.fixes import delayed from ..utils import check_random_state, gen_batches, check_array from ..base import BaseEstimator, ClusterMixin @@ -28,7 +28,6 @@ from .._config import config_context -@_deprecate_positional_args def estimate_bandwidth(X, *, quantile=0.3, n_samples=None, random_state=0, n_jobs=None): """Estimate the bandwidth to use with the 
mean-shift algorithm. @@ -109,7 +108,6 @@ def _mean_shift_single_seed(my_mean, X, nbrs, max_iter): return tuple(my_mean), len(points_within), completed_iterations -@_deprecate_positional_args def mean_shift(X, *, bandwidth=None, seeds=None, bin_seeding=False, min_bin_freq=1, cluster_all=True, max_iter=300, n_jobs=None): @@ -352,7 +350,6 @@ class MeanShift(ClusterMixin, BaseEstimator): Machine Intelligence. 2002. pp. 603-619. """ - @_deprecate_positional_args def __init__(self, *, bandwidth=None, seeds=None, bin_seeding=False, min_bin_freq=1, cluster_all=True, n_jobs=None, max_iter=300): self.bandwidth = bandwidth diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index d0b94f43454b3..af0e8531aa7b8 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -17,7 +17,6 @@ from ..exceptions import DataConversionWarning from ..metrics.pairwise import PAIRWISE_BOOLEAN_FUNCTIONS from ..utils import gen_batches, get_chunk_n_rows -from ..utils.validation import _deprecate_positional_args from ..neighbors import NearestNeighbors from ..base import BaseEstimator, ClusterMixin from ..metrics import pairwise_distances @@ -204,7 +203,6 @@ class OPTICS(ClusterMixin, BaseEstimator): >>> clustering.labels_ array([0, 0, 0, 1, 1, 1]) """ - @_deprecate_positional_args def __init__(self, *, min_samples=5, max_eps=np.inf, metric='minkowski', p=2, metric_params=None, cluster_method='xi', eps=None, xi=0.05, predecessor_correction=True, min_cluster_size=None, @@ -348,7 +346,6 @@ def _compute_core_distances_(X, neighbors, min_samples, working_memory): return core_distances -@_deprecate_positional_args def compute_optics_graph(X, *, min_samples, max_eps, metric, p, metric_params, algorithm, leaf_size, n_jobs): """Computes the OPTICS reachability graph. @@ -552,7 +549,6 @@ def _set_reach_dist(core_distances_, reachability_, predecessor_, predecessor_[unproc[improved]] = point_index -@_deprecate_positional_args def cluster_optics_dbscan(*, reachability, core_distances, ordering, eps): """Performs DBSCAN extraction for an arbitrary epsilon. diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index e9a5d7a7b4302..a1371b925595d 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -11,7 +11,6 @@ from ..base import BaseEstimator, ClusterMixin from ..utils import check_random_state, as_float_array -from ..utils.validation import _deprecate_positional_args from ..utils.deprecation import deprecated from ..metrics.pairwise import pairwise_kernels from ..neighbors import kneighbors_graph, NearestNeighbors @@ -19,7 +18,6 @@ from ._kmeans import k_means -@_deprecate_positional_args def discretize(vectors, *, copy=True, max_svd_restarts=30, n_iter_max=20, random_state=None): """Search for a partition matrix (clustering) which is closest to the @@ -158,7 +156,6 @@ def discretize(vectors, *, copy=True, max_svd_restarts=30, n_iter_max=20, return labels -@_deprecate_positional_args def spectral_clustering(affinity, *, n_clusters=8, n_components=None, eigen_solver=None, random_state=None, n_init=10, eigen_tol=0.0, assign_labels='kmeans', @@ -455,7 +452,6 @@ class SpectralClustering(ClusterMixin, BaseEstimator): Stella X. 
Yu, Jianbo Shi https://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf """ - @_deprecate_positional_args def __init__(self, n_clusters=8, *, eigen_solver=None, n_components=None, random_state=None, n_init=10, gamma=1., affinity='rbf', n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans', diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 513dbf8e9218e..bd70b2c1aac54 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -670,7 +670,8 @@ def test_n_components(): connectivity = np.eye(5) for linkage_func in _TREE_BUILDERS.values(): - assert ignore_warnings(linkage_func)(X, connectivity)[1] == 5 + assert ignore_warnings(linkage_func)( + X, connectivity=connectivity)[1] == 5 def test_agg_n_clusters(): diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 441fc95a106f1..6c15b81be98c2 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -21,7 +21,6 @@ from ..utils import _get_column_indices from ..utils.metaestimators import _BaseComposition from ..utils.validation import check_array, check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed @@ -179,7 +178,6 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): """ _required_parameters = ['transformers'] - @_deprecate_positional_args def __init__(self, transformers, *, remainder='drop', @@ -867,7 +865,6 @@ class make_column_selector: [-0.30151134, 0. , 1. , 0. ], [ 0.90453403, 0. , 0. , 1. ]]) """ - @_deprecate_positional_args def __init__(self, pattern=None, *, dtype_include=None, dtype_exclude=None): self.pattern = pattern diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index 1a80046c66376..12fe13ee848b9 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -10,7 +10,6 @@ from ..utils.validation import check_is_fitted from ..utils import check_array, _safe_indexing from ..preprocessing import FunctionTransformer -from ..utils.validation import _deprecate_positional_args from ..exceptions import NotFittedError __all__ = ['TransformedTargetRegressor'] @@ -109,7 +108,6 @@ class TransformedTargetRegressor(RegressorMixin, BaseEstimator): `. """ - @_deprecate_positional_args def __init__(self, regressor=None, *, transformer=None, func=None, inverse_func=None, check_inverse=True): self.regressor = regressor diff --git a/sklearn/covariance/_elliptic_envelope.py b/sklearn/covariance/_elliptic_envelope.py index e599f0435f48c..ad7904dc7831a 100644 --- a/sklearn/covariance/_elliptic_envelope.py +++ b/sklearn/covariance/_elliptic_envelope.py @@ -5,7 +5,6 @@ import numpy as np from . 
import MinCovDet from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..metrics import accuracy_score from ..base import OutlierMixin @@ -120,7 +119,6 @@ class EllipticEnvelope(OutlierMixin, MinCovDet): minimum covariance determinant estimator" Technometrics 41(3), 212 (1999) """ - @_deprecate_positional_args def __init__(self, *, store_precision=True, assume_centered=False, support_fraction=None, contamination=0.1, random_state=None): diff --git a/sklearn/covariance/_empirical_covariance.py b/sklearn/covariance/_empirical_covariance.py index fb1797e50f96a..02bddd0f50330 100644 --- a/sklearn/covariance/_empirical_covariance.py +++ b/sklearn/covariance/_empirical_covariance.py @@ -19,7 +19,6 @@ from ..utils import check_array from ..utils.extmath import fast_logdet from ..metrics.pairwise import pairwise_distances -from ..utils.validation import _deprecate_positional_args def log_likelihood(emp_cov, precision): @@ -49,7 +48,6 @@ def log_likelihood(emp_cov, precision): return log_likelihood_ -@_deprecate_positional_args def empirical_covariance(X, *, assume_centered=False): """Computes the Maximum likelihood covariance estimator @@ -146,7 +144,6 @@ class EmpiricalCovariance(BaseEstimator): array([0.0622..., 0.0193...]) """ - @_deprecate_positional_args def __init__(self, *, store_precision=True, assume_centered=False): self.store_precision = store_precision self.assume_centered = assume_centered diff --git a/sklearn/covariance/_graph_lasso.py b/sklearn/covariance/_graph_lasso.py index 57167b81fe9e0..091d4f82e7e3e 100644 --- a/sklearn/covariance/_graph_lasso.py +++ b/sklearn/covariance/_graph_lasso.py @@ -19,7 +19,6 @@ from ..exceptions import ConvergenceWarning from ..utils.validation import check_random_state -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed # mypy error: Module 'sklearn.linear_model' has no attribute '_cd_fast' from ..linear_model import _cd_fast as cd_fast # type: ignore @@ -77,7 +76,6 @@ def alpha_max(emp_cov): # The g-lasso algorithm -@_deprecate_positional_args def graphical_lasso(emp_cov, alpha, *, cov_init=None, mode='cd', tol=1e-4, enet_tol=1e-4, max_iter=100, verbose=False, return_costs=False, eps=np.finfo(np.float64).eps, @@ -366,7 +364,6 @@ class GraphicalLasso(EmpiricalCovariance): -------- graphical_lasso, GraphicalLassoCV """ - @_deprecate_positional_args def __init__(self, alpha=.01, *, mode='cd', tol=1e-4, enet_tol=1e-4, max_iter=100, verbose=False, assume_centered=False): super().__init__(assume_centered=assume_centered) @@ -675,7 +672,6 @@ class GraphicalLassoCV(GraphicalLasso): values of alpha then come out as missing values, but the optimum may be close to these missing values. """ - @_deprecate_positional_args def __init__(self, *, alphas=4, n_refinements=4, cv=None, tol=1e-4, enet_tol=1e-4, max_iter=100, mode='cd', n_jobs=None, verbose=False, assume_centered=False): diff --git a/sklearn/covariance/_robust_covariance.py b/sklearn/covariance/_robust_covariance.py index d4331b591e43f..337ba23f19059 100644 --- a/sklearn/covariance/_robust_covariance.py +++ b/sklearn/covariance/_robust_covariance.py @@ -17,7 +17,6 @@ from . 
import empirical_covariance, EmpiricalCovariance from ..utils.extmath import fast_logdet from ..utils import check_random_state, check_array -from ..utils.validation import _deprecate_positional_args # Minimum Covariance Determinant @@ -615,7 +614,6 @@ class MinCovDet(EmpiricalCovariance): """ _nonrobust_covariance = staticmethod(empirical_covariance) - @_deprecate_positional_args def __init__(self, *, store_precision=True, assume_centered=False, support_fraction=None, random_state=None): self.store_precision = store_precision diff --git a/sklearn/covariance/_shrunk_covariance.py b/sklearn/covariance/_shrunk_covariance.py index 72b13681200ff..5fe590b33a1db 100644 --- a/sklearn/covariance/_shrunk_covariance.py +++ b/sklearn/covariance/_shrunk_covariance.py @@ -18,7 +18,6 @@ from . import empirical_covariance, EmpiricalCovariance from ..utils import check_array -from ..utils.validation import _deprecate_positional_args # ShrunkCovariance estimator @@ -118,7 +117,6 @@ class ShrunkCovariance(EmpiricalCovariance): where mu = trace(cov) / n_features """ - @_deprecate_positional_args def __init__(self, *, store_precision=True, assume_centered=False, shrinkage=0.1): super().__init__(store_precision=store_precision, @@ -253,7 +251,6 @@ def ledoit_wolf_shrinkage(X, assume_centered=False, block_size=1000): return shrinkage -@_deprecate_positional_args def ledoit_wolf(X, *, assume_centered=False, block_size=1000): """Estimates the shrunk Ledoit-Wolf covariance matrix. @@ -391,7 +388,6 @@ class LedoitWolf(EmpiricalCovariance): Ledoit and Wolf, Journal of Multivariate Analysis, Volume 88, Issue 2, February 2004, pages 365-411. """ - @_deprecate_positional_args def __init__(self, *, store_precision=True, assume_centered=False, block_size=1000): super().__init__(store_precision=store_precision, @@ -431,7 +427,6 @@ def fit(self, X, y=None): # OAS estimator -@_deprecate_positional_args def oas(X, *, assume_centered=False): """Estimate covariance with the Oracle Approximating Shrinkage algorithm. diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index 2f6e63d556388..11e5d7bb8c335 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -16,7 +16,6 @@ from ..utils import check_array, check_consistent_length from ..utils.extmath import svd_flip from ..utils.validation import check_is_fitted, FLOAT_DTYPES -from ..utils.validation import _deprecate_positional_args from ..exceptions import ConvergenceWarning from ..utils.deprecation import deprecated @@ -588,7 +587,6 @@ class PLSRegression(_PLS): # - "plspm " with function plsreg2(X, Y) # - "pls" with function oscorespls.fit(X, Y) - @_deprecate_positional_args def __init__(self, n_components=2, *, scale=True, max_iter=500, tol=1e-06, copy=True): super().__init__( @@ -705,7 +703,6 @@ class PLSCanonical(_PLS): # exactly implement the Wold algorithm since it does not normalize # y_weights to one. 
- @_deprecate_positional_args def __init__(self, n_components=2, *, scale=True, algorithm="nipals", max_iter=500, tol=1e-06, copy=True): super().__init__( @@ -807,7 +804,6 @@ class CCA(_PLS): PLSSVD """ - @_deprecate_positional_args def __init__(self, n_components=2, *, scale=True, max_iter=500, tol=1e-06, copy=True): super().__init__(n_components=n_components, scale=scale, @@ -893,7 +889,6 @@ class PLSSVD(TransformerMixin, BaseEstimator): PLSCanonical CCA """ - @_deprecate_positional_args def __init__(self, n_components=2, *, scale=True, copy=True): self.n_components = n_components self.scale = scale diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index 948b4f7cba61e..da64faac54a36 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -17,7 +17,6 @@ from ..utils import Bunch from ..utils import check_random_state from ..utils import check_pandas_support -from ..utils.validation import _deprecate_positional_args import numpy as np @@ -88,7 +87,6 @@ def _convert_data_dataframe(caller_name, data, target, return combined_df, X, y -@_deprecate_positional_args def load_files(container_path, *, description=None, categories=None, load_content=True, shuffle=True, encoding=None, decode_error='strict', random_state=0): @@ -276,7 +274,6 @@ def load_data(module_path, data_file_name): return data, target, target_names -@_deprecate_positional_args def load_wine(*, return_X_y=False, as_frame=False): """Load and return the wine dataset (classification). @@ -391,7 +388,6 @@ def load_wine(*, return_X_y=False, as_frame=False): feature_names=feature_names) -@_deprecate_positional_args def load_iris(*, return_X_y=False, as_frame=False): """Load and return the iris dataset (classification). @@ -506,7 +502,6 @@ def load_iris(*, return_X_y=False, as_frame=False): filename=iris_csv_filename) -@_deprecate_positional_args def load_breast_cancer(*, return_X_y=False, as_frame=False): """Load and return the breast cancer wisconsin dataset (classification). @@ -631,7 +626,6 @@ def load_breast_cancer(*, return_X_y=False, as_frame=False): filename=csv_filename) -@_deprecate_positional_args def load_digits(*, n_class=10, return_X_y=False, as_frame=False): """Load and return the digits dataset (classification). @@ -755,7 +749,6 @@ def load_digits(*, n_class=10, return_X_y=False, as_frame=False): DESCR=descr) -@_deprecate_positional_args def load_diabetes(*, return_X_y=False, as_frame=False): """Load and return the diabetes dataset (regression). @@ -854,7 +847,6 @@ def load_diabetes(*, return_X_y=False, as_frame=False): target_filename=target_filename) -@_deprecate_positional_args def load_linnerud(*, return_X_y=False, as_frame=False): """Load and return the physical excercise linnerud dataset. @@ -958,7 +950,6 @@ def load_linnerud(*, return_X_y=False, as_frame=False): target_filename=target_filename) -@_deprecate_positional_args def load_boston(*, return_X_y=False): """Load and return the boston house-prices dataset (regression). 
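Each hunk in this part of the series drops the `_deprecate_positional_args` import and decorator while keeping the bare `*` already present in the signature, so keyword-only enforcement now comes from Python itself rather than from the shim (which, prior to this change, appears to have accepted positional values and issued a FutureWarning). A minimal sketch of the resulting call-site behavior, assuming a build that includes these changes (the exact TypeError wording varies by Python version):

    from sklearn.cluster import DBSCAN

    # `eps` precedes the bare `*`, so it may still be passed positionally;
    # everything after `*` (min_samples, metric, ...) must be a keyword.
    DBSCAN(0.3, min_samples=10)        # accepted

    try:
        DBSCAN(0.3, 10)                # min_samples is keyword-only
    except TypeError as exc:
        print(f"rejected: {exc}")
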
diff --git a/sklearn/datasets/_california_housing.py b/sklearn/datasets/_california_housing.py index a25f8d63eceef..dd0b4ff25014b 100644 --- a/sklearn/datasets/_california_housing.py +++ b/sklearn/datasets/_california_housing.py @@ -36,7 +36,6 @@ from ._base import _pkl_filepath from ._base import RemoteFileMetadata from ..utils import Bunch -from ..utils.validation import _deprecate_positional_args # The original data can be found at: @@ -50,7 +49,6 @@ logger = logging.getLogger(__name__) -@_deprecate_positional_args def fetch_california_housing(*, data_home=None, download_if_missing=True, return_X_y=False, as_frame=False): """Load the California housing dataset (regression). diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index 305f465369604..85d0c0732e15f 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -29,7 +29,6 @@ from ..utils import Bunch from ._base import _pkl_filepath from ..utils import check_random_state -from ..utils.validation import _deprecate_positional_args # The original data can be found in: @@ -59,7 +58,6 @@ TARGET_NAMES = ["Cover_Type"] -@_deprecate_positional_args def fetch_covtype(*, data_home=None, download_if_missing=True, random_state=None, shuffle=False, return_X_y=False, as_frame=False): diff --git a/sklearn/datasets/_kddcup99.py b/sklearn/datasets/_kddcup99.py index 26fb14197a211..f7bf454cc420e 100644 --- a/sklearn/datasets/_kddcup99.py +++ b/sklearn/datasets/_kddcup99.py @@ -24,7 +24,6 @@ from ..utils import Bunch from ..utils import check_random_state from ..utils import shuffle as shuffle_method -from ..utils.validation import _deprecate_positional_args # The original data can be found at: @@ -46,7 +45,6 @@ logger = logging.getLogger(__name__) -@_deprecate_positional_args def fetch_kddcup99(*, subset=None, data_home=None, shuffle=False, random_state=None, percent10=True, download_if_missing=True, return_X_y=False, diff --git a/sklearn/datasets/_lfw.py b/sklearn/datasets/_lfw.py index dd56e532afdc3..73e5ac66bb4d4 100644 --- a/sklearn/datasets/_lfw.py +++ b/sklearn/datasets/_lfw.py @@ -19,7 +19,6 @@ from ._base import get_data_home, _fetch_remote, RemoteFileMetadata from ..utils import Bunch -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import parse_version logger = logging.getLogger(__name__) @@ -216,7 +215,6 @@ def _fetch_lfw_people(data_folder_path, slice_=None, color=False, resize=None, return faces, target, target_names -@_deprecate_positional_args def fetch_lfw_people(*, data_home=None, funneled=True, resize=0.5, min_faces_per_person=0, color=False, slice_=(slice(70, 195), slice(78, 172)), @@ -387,7 +385,6 @@ def _fetch_lfw_pairs(index_file_path, data_folder_path, slice_=None, return pairs, target, np.array(['Different persons', 'Same person']) -@_deprecate_positional_args def fetch_lfw_pairs(*, subset='train', data_home=None, funneled=True, resize=0.5, color=False, slice_=(slice(70, 195), slice(78, 172)), diff --git a/sklearn/datasets/_olivetti_faces.py b/sklearn/datasets/_olivetti_faces.py index 76388a4a92a42..53609439bba90 100644 --- a/sklearn/datasets/_olivetti_faces.py +++ b/sklearn/datasets/_olivetti_faces.py @@ -25,7 +25,6 @@ from ._base import RemoteFileMetadata from ._base import _pkl_filepath from ..utils import check_random_state, Bunch -from ..utils.validation import _deprecate_positional_args # The original data can be found at: # https://cs.nyu.edu/~roweis/data/olivettifaces.mat @@ -36,7 +35,6 @@ 'd5fca46a4b8906c18e454d41af987794')) 
-@_deprecate_positional_args def fetch_olivetti_faces(*, data_home=None, shuffle=False, random_state=0, download_if_missing=True, return_X_y=False): """Load the Olivetti faces data-set from AT&T (classification). diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index b589c9faa5213..ec3c3a9ae961d 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -27,7 +27,6 @@ from ..utils import get_chunk_n_rows from ..utils import _chunk_generator from ..utils import check_pandas_support # noqa -from ..utils.validation import _deprecate_positional_args __all__ = ['fetch_openml'] @@ -690,7 +689,6 @@ def _valid_data_column_names(features_list, target_columns): return valid_data_column_names -@_deprecate_positional_args def fetch_openml( name: Optional[str] = None, *, diff --git a/sklearn/datasets/_rcv1.py b/sklearn/datasets/_rcv1.py index 2d3a809848e83..4d1bd8e9ba44f 100644 --- a/sklearn/datasets/_rcv1.py +++ b/sklearn/datasets/_rcv1.py @@ -25,7 +25,6 @@ from ._svmlight_format_io import load_svmlight_files from ..utils import shuffle as shuffle_ from ..utils import Bunch -from ..utils.validation import _deprecate_positional_args # The original vectorized data can be found at: @@ -76,7 +75,6 @@ logger = logging.getLogger(__name__) -@_deprecate_positional_args def fetch_rcv1(*, data_home=None, subset='all', download_if_missing=True, random_state=None, shuffle=False, return_X_y=False): """Load the RCV1 multilabel dataset (classification). diff --git a/sklearn/datasets/_samples_generator.py b/sklearn/datasets/_samples_generator.py index 66d1baaaa9cb2..3a9e1812cb1e7 100644 --- a/sklearn/datasets/_samples_generator.py +++ b/sklearn/datasets/_samples_generator.py @@ -18,7 +18,6 @@ from ..utils import check_array, check_random_state from ..utils import shuffle as util_shuffle from ..utils.random import sample_without_replacement -from ..utils.validation import _deprecate_positional_args def _generate_hypercube(samples, dimensions, rng): @@ -34,7 +33,6 @@ def _generate_hypercube(samples, dimensions, rng): return out -@_deprecate_positional_args def make_classification(n_samples=100, n_features=20, *, n_informative=2, n_redundant=2, n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, @@ -264,7 +262,6 @@ def make_classification(n_samples=100, n_features=20, *, n_informative=2, return X, y -@_deprecate_positional_args def make_multilabel_classification(n_samples=100, n_features=20, *, n_classes=5, n_labels=2, length=50, allow_unlabeled=True, @@ -427,7 +424,6 @@ def sample_example(): return X, Y -@_deprecate_positional_args def make_hastie_10_2(n_samples=12000, *, random_state=None): """Generates data for binary classification used in Hastie et al. 2009, Example 10.2. @@ -476,7 +472,6 @@ def make_hastie_10_2(n_samples=12000, *, random_state=None): return X, y -@_deprecate_positional_args def make_regression(n_samples=100, n_features=100, *, n_informative=10, n_targets=1, bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0, shuffle=True, coef=False, @@ -600,7 +595,6 @@ def make_regression(n_samples=100, n_features=100, *, n_informative=10, return X, y -@_deprecate_positional_args def make_circles(n_samples=100, *, shuffle=True, noise=None, random_state=None, factor=.8): """Make a large circle containing a smaller circle in 2d. 
@@ -680,7 +674,6 @@ def make_circles(n_samples=100, *, shuffle=True, noise=None, random_state=None, return X, y -@_deprecate_positional_args def make_moons(n_samples=100, *, shuffle=True, noise=None, random_state=None): """Make two interleaving half circles. @@ -747,7 +740,6 @@ def make_moons(n_samples=100, *, shuffle=True, noise=None, random_state=None): return X, y -@_deprecate_positional_args def make_blobs(n_samples=100, n_features=2, *, centers=None, cluster_std=1.0, center_box=(-10.0, 10.0), shuffle=True, random_state=None, return_centers=False): @@ -906,7 +898,6 @@ def make_blobs(n_samples=100, n_features=2, *, centers=None, cluster_std=1.0, return X, y -@_deprecate_positional_args def make_friedman1(n_samples=100, n_features=10, *, noise=0.0, random_state=None): """Generate the "Friedman #1" regression problem. @@ -970,7 +961,6 @@ def make_friedman1(n_samples=100, n_features=10, *, noise=0.0, return X, y -@_deprecate_positional_args def make_friedman2(n_samples=100, *, noise=0.0, random_state=None): """Generate the "Friedman #2" regression problem. @@ -1036,7 +1026,6 @@ def make_friedman2(n_samples=100, *, noise=0.0, random_state=None): return X, y -@_deprecate_positional_args def make_friedman3(n_samples=100, *, noise=0.0, random_state=None): """Generate the "Friedman #3" regression problem. @@ -1101,7 +1090,6 @@ def make_friedman3(n_samples=100, *, noise=0.0, random_state=None): return X, y -@_deprecate_positional_args def make_low_rank_matrix(n_samples=100, n_features=100, *, effective_rank=10, tail_strength=0.5, random_state=None): """Generate a mostly low rank matrix with bell-shaped singular values. @@ -1173,7 +1161,6 @@ def make_low_rank_matrix(n_samples=100, n_features=100, *, effective_rank=10, return np.dot(np.dot(u, s), v.T) -@_deprecate_positional_args def make_sparse_coded_signal(n_samples, *, n_components, n_features, n_nonzero_coefs, random_state=None): """Generate a signal as a sparse combination of dictionary elements. @@ -1236,7 +1223,6 @@ def make_sparse_coded_signal(n_samples, *, n_components, n_features, return map(np.squeeze, (Y, D, X)) -@_deprecate_positional_args def make_sparse_uncorrelated(n_samples=100, n_features=10, *, random_state=None): """Generate a random regression problem with sparse uncorrelated design. @@ -1289,7 +1275,6 @@ def make_sparse_uncorrelated(n_samples=100, n_features=10, *, return X, y -@_deprecate_positional_args def make_spd_matrix(n_dim, *, random_state=None): """Generate a random symmetric, positive-definite matrix. @@ -1323,7 +1308,6 @@ def make_spd_matrix(n_dim, *, random_state=None): return X -@_deprecate_positional_args def make_sparse_spd_matrix(dim=1, *, alpha=0.95, norm_diag=False, smallest_coef=.1, largest_coef=.9, random_state=None): @@ -1398,7 +1382,6 @@ def make_sparse_spd_matrix(dim=1, *, alpha=0.95, norm_diag=False, return prec -@_deprecate_positional_args def make_swiss_roll(n_samples=100, *, noise=0.0, random_state=None): """Generate a swiss roll dataset. @@ -1451,7 +1434,6 @@ def make_swiss_roll(n_samples=100, *, noise=0.0, random_state=None): return X, t -@_deprecate_positional_args def make_s_curve(n_samples=100, *, noise=0.0, random_state=None): """Generate an S curve dataset. 
@@ -1494,7 +1476,6 @@ def make_s_curve(n_samples=100, *, noise=0.0, random_state=None): return X, t -@_deprecate_positional_args def make_gaussian_quantiles(*, mean=None, cov=1., n_samples=100, n_features=2, n_classes=3, shuffle=True, random_state=None): @@ -1590,7 +1571,6 @@ def _shuffle(data, random_state=None): return result, row_idx, col_idx -@_deprecate_positional_args def make_biclusters(shape, n_clusters, *, noise=0.0, minval=10, maxval=100, shuffle=True, random_state=None): """Generate an array with constant block diagonal structure for @@ -1682,7 +1662,6 @@ def make_biclusters(shape, n_clusters, *, noise=0.0, minval=10, return result, rows, cols -@_deprecate_positional_args def make_checkerboard(shape, n_clusters, *, noise=0.0, minval=10, maxval=100, shuffle=True, random_state=None): """Generate an array with block checkerboard structure for diff --git a/sklearn/datasets/_species_distributions.py b/sklearn/datasets/_species_distributions.py index bc3fa3bcc7a04..039883ca4b06a 100644 --- a/sklearn/datasets/_species_distributions.py +++ b/sklearn/datasets/_species_distributions.py @@ -50,7 +50,6 @@ from ._base import _fetch_remote from ._base import RemoteFileMetadata from ..utils import Bunch -from ..utils.validation import _deprecate_positional_args from ._base import _pkl_filepath # The original data can be found at: @@ -138,7 +137,6 @@ def construct_grids(batch): return (xgrid, ygrid) -@_deprecate_positional_args def fetch_species_distributions(*, data_home=None, download_if_missing=True): """Loader for species distribution dataset from Phillips et. al. (2006) diff --git a/sklearn/datasets/_svmlight_format_io.py b/sklearn/datasets/_svmlight_format_io.py index 8997624da0755..4a1d1eb02e6da 100644 --- a/sklearn/datasets/_svmlight_format_io.py +++ b/sklearn/datasets/_svmlight_format_io.py @@ -25,7 +25,6 @@ from .. import __version__ from ..utils import check_array, IS_PYPY -from ..utils.validation import _deprecate_positional_args if not IS_PYPY: from ._svmlight_format_fast import _load_svmlight_file @@ -38,7 +37,6 @@ def _load_svmlight_file(*args, **kwargs): 'for the status updates).') -@_deprecate_positional_args def load_svmlight_file(f, *, n_features=None, dtype=np.float64, multilabel=False, zero_based="auto", query_id=False, offset=0, length=-1): @@ -202,7 +200,6 @@ def _open_and_load(f, dtype, multilabel, zero_based, query_id, return data, indices, indptr, labels, query -@_deprecate_positional_args def load_svmlight_files(files, *, n_features=None, dtype=np.float64, multilabel=False, zero_based="auto", query_id=False, offset=0, length=-1): @@ -387,7 +384,6 @@ def _dump_svmlight(X, y, f, multilabel, one_based, comment, query_id): f.write((line_pattern % feat).encode('ascii')) -@_deprecate_positional_args def dump_svmlight_file(X, y, f, *, zero_based=True, comment=None, query_id=None, multilabel=False): diff --git a/sklearn/datasets/_twenty_newsgroups.py b/sklearn/datasets/_twenty_newsgroups.py index cb181d2108403..c41bf767d9ed5 100644 --- a/sklearn/datasets/_twenty_newsgroups.py +++ b/sklearn/datasets/_twenty_newsgroups.py @@ -46,7 +46,6 @@ from ..feature_extraction.text import CountVectorizer from .. 
import preprocessing from ..utils import check_random_state, Bunch -from ..utils.validation import _deprecate_positional_args logger = logging.getLogger(__name__) @@ -148,7 +147,6 @@ def strip_newsgroup_footer(text): return text -@_deprecate_positional_args def fetch_20newsgroups(*, data_home=None, subset='train', categories=None, shuffle=True, random_state=42, remove=(), @@ -326,7 +324,6 @@ def fetch_20newsgroups(*, data_home=None, subset='train', categories=None, return data -@_deprecate_positional_args def fetch_20newsgroups_vectorized(*, subset="train", remove=(), data_home=None, download_if_missing=True, return_X_y=False, normalize=True, as_frame=False): diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index bd8a95e37dbaf..1c48542a1c9ec 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -19,7 +19,7 @@ from ..utils import (check_array, check_random_state, gen_even_slices, gen_batches) from ..utils.extmath import randomized_svd, row_norms, svd_flip -from ..utils.validation import check_is_fitted, _deprecate_positional_args +from ..utils.validation import check_is_fitted from ..utils.fixes import delayed from ..linear_model import Lasso, orthogonal_mp_gram, LassoLars, Lars @@ -193,7 +193,6 @@ def _sparse_encode(X, dictionary, gram, cov=None, algorithm='lasso_lars', # XXX : could be moved to the linear_model module -@_deprecate_positional_args def sparse_encode(X, dictionary, *, gram=None, cov=None, algorithm='lasso_lars', n_nonzero_coefs=None, alpha=None, copy_cov=True, init=None, max_iter=1000, n_jobs=None, @@ -427,7 +426,6 @@ def _update_dict(dictionary, Y, code, A=None, B=None, verbose=False, print(f"{n_unused} unused atoms resampled.") -@_deprecate_positional_args def dict_learning(X, n_components, *, alpha, max_iter=100, tol=1e-8, method='lars', n_jobs=None, dict_init=None, code_init=None, callback=None, verbose=False, random_state=None, @@ -626,7 +624,6 @@ def dict_learning(X, n_components, *, alpha, max_iter=100, tol=1e-8, return code, dictionary, errors -@_deprecate_positional_args def dict_learning_online(X, n_components=2, *, alpha=1, n_iter=100, return_code=True, dict_init=None, callback=None, batch_size=3, verbose=False, shuffle=True, @@ -1063,7 +1060,6 @@ class SparseCoder(_BaseSparseCoding, BaseEstimator): """ _required_parameters = ["dictionary"] - @_deprecate_positional_args def __init__(self, dictionary, *, transform_algorithm='omp', transform_n_nonzero_coefs=None, transform_alpha=None, split_sign=False, n_jobs=None, positive_code=False, @@ -1299,7 +1295,6 @@ class DictionaryLearning(_BaseSparseCoding, BaseEstimator): SparsePCA MiniBatchSparsePCA """ - @_deprecate_positional_args def __init__(self, n_components=None, *, alpha=1, max_iter=1000, tol=1e-8, fit_algorithm='lars', transform_algorithm='omp', transform_n_nonzero_coefs=None, transform_alpha=None, @@ -1537,7 +1532,6 @@ class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): MiniBatchSparsePCA """ - @_deprecate_positional_args def __init__(self, n_components=None, *, alpha=1, n_iter=1000, fit_algorithm='lars', n_jobs=None, batch_size=3, shuffle=True, dict_init=None, transform_algorithm='omp', diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index 5dd9f13094a89..830e81e9268d5 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -28,7 +28,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils 
import check_random_state from ..utils.extmath import fast_logdet, randomized_svd, squared_norm -from ..utils.validation import check_is_fitted, _deprecate_positional_args +from ..utils.validation import check_is_fitted from ..exceptions import ConvergenceWarning @@ -147,7 +147,6 @@ class FactorAnalysis(TransformerMixin, BaseEstimator): FastICA: Independent component analysis, a latent variable model with non-Gaussian latent variables. """ - @_deprecate_positional_args def __init__(self, n_components=None, *, tol=1e-2, copy=True, max_iter=1000, noise_variance_init=None, svd_method='randomized', diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index a57ddada85694..6c374e6e420f8 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -20,7 +20,6 @@ from ..utils import check_array, as_float_array, check_random_state from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES -from ..utils.validation import _deprecate_positional_args __all__ = ['fastica', 'FastICA'] @@ -147,7 +146,6 @@ def _cube(x, fun_args): return x ** 3, (3 * x ** 2).mean(axis=-1) -@_deprecate_positional_args def fastica(X, n_components=None, *, algorithm="parallel", whiten=True, fun="logcosh", fun_args=None, max_iter=200, tol=1e-04, w_init=None, random_state=None, return_X_mean=False, compute_sources=True, @@ -392,7 +390,6 @@ def my_g(x): pp. 411-430* """ - @_deprecate_positional_args def __init__(self, n_components=None, *, algorithm='parallel', whiten=True, fun='logcosh', fun_args=None, max_iter=200, tol=1e-4, w_init=None, random_state=None): diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index 10a1cceadd65e..486d4a22d8cdb 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -10,7 +10,6 @@ from ._base import _BasePCA from ..utils import gen_batches from ..utils.extmath import svd_flip, _incremental_mean_and_var -from ..utils.validation import _deprecate_positional_args class IncrementalPCA(_BasePCA): @@ -164,7 +163,6 @@ class IncrementalPCA(_BasePCA): SparsePCA TruncatedSVD """ - @_deprecate_positional_args def __init__(self, n_components=None, *, whiten=False, copy=True, batch_size=None): self.n_components = n_components diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 8663193a8383e..1e1cdb1722029 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -16,7 +16,6 @@ from ..base import BaseEstimator, TransformerMixin from ..preprocessing import KernelCenterer from ..metrics.pairwise import pairwise_kernels -from ..utils.validation import _deprecate_positional_args class KernelPCA(TransformerMixin, BaseEstimator): @@ -192,7 +191,6 @@ class KernelPCA(TransformerMixin, BaseEstimator): A randomized algorithm for the decomposition of matrices Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert """ - @_deprecate_positional_args def __init__(self, n_components=None, *, kernel="linear", gamma=None, degree=3, coef0=1, kernel_params=None, alpha=1.0, fit_inverse_transform=False, eigen_solver='auto', diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index e554d299fe478..34432557814c2 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -20,7 +20,6 @@ from ..utils import check_random_state, gen_batches, gen_even_slices from ..utils.validation import check_non_negative from ..utils.validation 
import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed from ._online_lda_fast import (mean_change, _dirichlet_expectation_1d, @@ -293,7 +292,6 @@ class LatentDirichletAllocation(TransformerMixin, BaseEstimator): https://github.com/blei-lab/onlineldavb """ - @_deprecate_positional_args def __init__(self, n_components=10, *, doc_topic_prior=None, topic_word_prior=None, learning_method='batch', learning_decay=.7, learning_offset=10., max_iter=10, diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index b978f1a33d3af..c8239147eb6c4 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -20,7 +20,6 @@ from ..utils import check_random_state, check_array from ..utils.extmath import randomized_svd, safe_sparse_dot, squared_norm from ..utils.validation import check_is_fitted, check_non_negative -from ..utils.validation import _deprecate_positional_args EPSILON = np.finfo(np.float32).eps @@ -850,7 +849,6 @@ def _fit_multiplicative_update(X, W, H, beta_loss='frobenius', return W, H, n_iter -@_deprecate_positional_args def non_negative_factorization(X, W=None, H=None, n_components=None, *, init='warn', update_H=True, solver='cd', beta_loss='frobenius', tol=1e-4, @@ -1200,7 +1198,6 @@ class NMF(TransformerMixin, BaseEstimator): Fevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix factorization with the beta-divergence. Neural Computation, 23(9). """ - @_deprecate_positional_args def __init__(self, n_components=None, *, init='warn', solver='cd', beta_loss='frobenius', tol=1e-4, max_iter=200, random_state=None, alpha=0., l1_ratio=0., verbose=0, diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index eb0a73919021a..765320ccdb5a8 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -25,7 +25,6 @@ from ..utils.extmath import fast_logdet, randomized_svd, svd_flip from ..utils.extmath import stable_cumsum from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args def _assess_dimension(spectrum, rank, n_samples): @@ -328,7 +327,6 @@ class PCA(_BasePCA): >>> print(pca.singular_values_) [6.30061...] 
""" - @_deprecate_positional_args def __init__(self, n_components=None, *, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None): diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index 2348ada255fd4..7f280db3a3af6 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -6,7 +6,6 @@ from ..utils import check_random_state from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..linear_model import ridge_regression from ..base import BaseEstimator, TransformerMixin from ._dict_learning import dict_learning, dict_learning_online @@ -111,7 +110,6 @@ class SparsePCA(TransformerMixin, BaseEstimator): MiniBatchSparsePCA DictionaryLearning """ - @_deprecate_positional_args def __init__(self, n_components=None, *, alpha=1, ridge_alpha=0.01, max_iter=1000, tol=1e-8, method='lars', n_jobs=None, U_init=None, V_init=None, verbose=False, random_state=None): @@ -304,7 +302,6 @@ class MiniBatchSparsePCA(SparsePCA): SparsePCA DictionaryLearning """ - @_deprecate_positional_args def __init__(self, n_components=None, *, alpha=1, ridge_alpha=0.01, n_iter=100, callback=None, batch_size=3, verbose=False, shuffle=True, n_jobs=None, method='lars', random_state=None): diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 1ea6b15c3ebd7..74239567dee48 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -15,7 +15,6 @@ from ..utils._arpack import _init_arpack_v0 from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip from ..utils.sparsefuncs import mean_variance_axis -from ..utils.validation import _deprecate_positional_args from ..utils.validation import check_is_fitted @@ -119,7 +118,6 @@ class TruncatedSVD(TransformerMixin, BaseEstimator): class to data once, then keep the instance around to do transformations. """ - @_deprecate_positional_args def __init__(self, n_components=2, *, algorithm="randomized", n_iter=5, random_state=None, tol=0.): self.algorithm = algorithm diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 2e80f94404175..4d94b19574f53 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -22,7 +22,6 @@ from .utils.multiclass import check_classification_targets from .utils.extmath import softmax from .preprocessing import StandardScaler -from .utils.validation import _deprecate_positional_args __all__ = ['LinearDiscriminantAnalysis', 'QuadraticDiscriminantAnalysis'] @@ -749,7 +748,6 @@ class QuadraticDiscriminantAnalysis(ClassifierMixin, BaseEstimator): -------- LinearDiscriminantAnalysis : Linear Discriminant Analysis. 
""" - @_deprecate_positional_args def __init__(self, *, priors=None, reg_param=0., store_covariance=False, tol=1.0e-4): self.priors = np.asarray(priors) if priors is not None else None diff --git a/sklearn/dummy.py b/sklearn/dummy.py index ad5ab3f24731d..575b38aa7d2a8 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -17,7 +17,6 @@ from .utils.random import _random_choice_csc from .utils.stats import _weighted_percentile from .utils.multiclass import class_distribution -from .utils.validation import _deprecate_positional_args class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator): @@ -94,7 +93,6 @@ class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator): >>> dummy_clf.score(X, y) 0.75 """ - @_deprecate_positional_args def __init__(self, *, strategy="prior", random_state=None, constant=None): self.strategy = strategy @@ -444,7 +442,6 @@ class DummyRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): >>> dummy_regr.score(X, y) 0.0 """ - @_deprecate_positional_args def __init__(self, *, strategy="mean", constant=None, quantile=None): self.strategy = strategy self.constant = constant diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 1ac309f00ad69..a4be68ba5e2d6 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -22,7 +22,7 @@ from ..utils.multiclass import check_classification_targets from ..utils.random import sample_without_replacement from ..utils.validation import has_fit_parameter, check_is_fitted, \ - _check_sample_weight, _deprecate_positional_args + _check_sample_weight from ..utils.fixes import delayed @@ -593,7 +593,6 @@ class BaggingClassifier(ClassifierMixin, BaseBagging): .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine Learning and Knowledge Discovery in Databases, 346-361, 2012. """ - @_deprecate_positional_args def __init__(self, base_estimator=None, n_estimators=10, *, @@ -979,7 +978,6 @@ class BaggingRegressor(RegressorMixin, BaseBagging): .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine Learning and Knowledge Discovery in Databases, 346-361, 2012. """ - @_deprecate_positional_args def __init__(self, base_estimator=None, n_estimators=10, *, diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 8eef1f3429227..5a93acd0c0554 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -64,7 +64,6 @@ class calls the ``fit`` method of each sub-estimator on random samples from ..utils.fixes import _joblib_parallel_args from ..utils.multiclass import check_classification_targets, type_of_target from ..utils.validation import check_is_fitted, _check_sample_weight -from ..utils.validation import _deprecate_positional_args __all__ = ["RandomForestClassifier", @@ -1254,7 +1253,6 @@ class labels (multi-output problem). >>> print(clf.predict([[0, 0, 0, 0]])) [1] """ - @_deprecate_positional_args def __init__(self, n_estimators=100, *, criterion="gini", @@ -1559,7 +1557,6 @@ class RandomForestRegressor(ForestRegressor): >>> print(regr.predict([[0, 0, 0, 0]])) [-8.32987858] """ - @_deprecate_positional_args def __init__(self, n_estimators=100, *, criterion="squared_error", @@ -1876,7 +1873,6 @@ class labels (multi-output problem). >>> clf.predict([[0, 0, 0, 0]]) array([1]) """ - @_deprecate_positional_args def __init__(self, n_estimators=100, *, criterion="gini", @@ -2172,7 +2168,6 @@ class ExtraTreesRegressor(ForestRegressor): >>> reg.score(X_test, y_test) 0.2708... 
""" - @_deprecate_positional_args def __init__(self, n_estimators=100, *, criterion="squared_error", @@ -2390,7 +2385,6 @@ class RandomTreesEmbedding(BaseForest): criterion = "squared_error" max_features = 1 - @_deprecate_positional_args def __init__(self, n_estimators=100, *, max_depth=5, diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 527bbcb559b5f..54e4e510cd9b9 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -54,7 +54,6 @@ from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils.multiclass import check_classification_targets from ..exceptions import NotFittedError -from ..utils.validation import _deprecate_positional_args class VerboseReporter: @@ -1112,7 +1111,6 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): _SUPPORTED_LOSS = ('deviance', 'exponential') - @_deprecate_positional_args def __init__(self, *, loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., @@ -1656,7 +1654,6 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): _SUPPORTED_LOSS = ("squared_error", 'ls', "absolute_error", 'lad', 'huber', 'quantile') - @_deprecate_positional_args def __init__(self, *, loss="squared_error", learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 6d5de978add9b..99eb0d265b100 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -12,8 +12,7 @@ from ...utils import check_random_state, resample from ...utils.validation import (check_is_fitted, check_consistent_length, - _check_sample_weight, - _deprecate_positional_args) + _check_sample_weight) from ...utils.multiclass import check_classification_targets from ...metrics import check_scoring from ...model_selection import train_test_split @@ -1045,7 +1044,6 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): _VALID_LOSSES = ('squared_error', 'least_squares', 'absolute_error', 'least_absolute_deviation', 'poisson') - @_deprecate_positional_args def __init__(self, loss='squared_error', *, learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0., max_bins=255, @@ -1304,7 +1302,6 @@ class HistGradientBoostingClassifier(ClassifierMixin, _VALID_LOSSES = ('binary_crossentropy', 'categorical_crossentropy', 'auto') - @_deprecate_positional_args def __init__(self, loss='auto', *, learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0., max_bins=255, diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 9c3f547f23459..3d2ac0928bd3f 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -16,7 +16,6 @@ ) from ..utils.fixes import _joblib_parallel_args from ..utils.validation import check_is_fitted, _num_samples -from ..utils.validation import _deprecate_positional_args from ..base import OutlierMixin from ._bagging import BaseBagging @@ -181,7 +180,6 @@ class IsolationForest(OutlierMixin, BaseBagging): >>> clf.predict([[0.1], [0], [90]]) array([ 1, 1, -1]) """ - @_deprecate_positional_args def __init__(self, *, n_estimators=100, max_samples="auto", diff --git 
a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 3522b381389d3..db5f5c26cf746 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -32,7 +32,6 @@ from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted from ..utils.validation import column_or_1d -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed @@ -397,7 +396,6 @@ class StackingClassifier(ClassifierMixin, _BaseStacking): 0.9... """ - @_deprecate_positional_args def __init__(self, estimators, final_estimator=None, *, cv=None, stack_method='auto', n_jobs=None, passthrough=False, verbose=0): @@ -647,7 +645,6 @@ class StackingRegressor(RegressorMixin, _BaseStacking): 0.3... """ - @_deprecate_positional_args def __init__(self, estimators, final_estimator=None, *, cv=None, n_jobs=None, passthrough=False, verbose=0): super().__init__( diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index 2072d5c7c5501..2c8db5bfbc633 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -30,7 +30,6 @@ from ..utils.validation import check_is_fitted from ..utils.multiclass import check_classification_targets from ..utils.validation import column_or_1d -from ..utils.validation import _deprecate_positional_args from ..exceptions import NotFittedError from ..utils._estimator_html_repr import _VisualBlock from ..utils.fixes import delayed @@ -242,7 +241,6 @@ class VotingClassifier(ClassifierMixin, _BaseVoting): >>> print(eclf3.transform(X).shape) (6, 6) """ - @_deprecate_positional_args def __init__(self, estimators, *, voting='hard', weights=None, n_jobs=None, flatten_transform=True, verbose=False): super().__init__(estimators=estimators) @@ -451,7 +449,6 @@ class VotingRegressor(RegressorMixin, _BaseVoting): >>> print(er.fit(X, y).predict(X)) [ 3.3 5.7 11.8 19.7 28. 40.3] """ - @_deprecate_positional_args def __init__(self, estimators, *, weights=None, n_jobs=None, verbose=False): super().__init__(estimators=estimators) diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index 92c5e15731d63..1b6689b50fafc 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -41,7 +41,6 @@ from ..utils.validation import _check_sample_weight from ..utils.validation import has_fit_parameter from ..utils.validation import _num_samples -from ..utils.validation import _deprecate_positional_args __all__ = [ 'AdaBoostClassifier', @@ -400,7 +399,6 @@ class AdaBoostClassifier(ClassifierMixin, BaseWeightBoosting): >>> clf.score(X, y) 0.983... """ - @_deprecate_positional_args def __init__(self, base_estimator=None, *, n_estimators=50, @@ -964,7 +962,6 @@ class AdaBoostRegressor(RegressorMixin, BaseWeightBoosting): .. [2] H. Drucker, "Improving Regressors using Boosting Techniques", 1997. 
""" - @_deprecate_positional_args def __init__(self, base_estimator=None, *, n_estimators=50, diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index 44b50dc45a103..a34775575d93a 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -12,7 +12,6 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array, tosequence -from ..utils.validation import _deprecate_positional_args def _tosequence(X): @@ -96,7 +95,6 @@ class DictVectorizer(TransformerMixin, BaseEstimator): sklearn.preprocessing.OrdinalEncoder : Handles nominal/categorical features encoded as columns of arbitrary data types. """ - @_deprecate_positional_args def __init__(self, *, dtype=np.float64, separator="=", sparse=True, sort=True): self.dtype = dtype diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index 57f927649bd6f..9ace92c58c30a 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -7,7 +7,6 @@ import scipy.sparse as sp from ..utils import IS_PYPY -from ..utils.validation import _deprecate_positional_args from ..base import BaseEstimator, TransformerMixin if not IS_PYPY: @@ -89,7 +88,6 @@ class FeatureHasher(TransformerMixin, BaseEstimator): DictVectorizer : Vectorizes string-valued features using a hash table. sklearn.preprocessing.OneHotEncoder : Handles nominal/categorical features. """ - @_deprecate_positional_args def __init__(self, n_features=(2 ** 20), *, input_type="dict", dtype=np.float64, alternate_sign=True): self._validate_params(n_features, input_type) diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index 5cd692fd6aa4f..71b4c1b57c6e8 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -16,7 +16,6 @@ from numpy.lib.stride_tricks import as_strided from ..utils import check_array, check_random_state -from ..utils.validation import _deprecate_positional_args from ..base import BaseEstimator __all__ = ['PatchExtractor', @@ -130,7 +129,6 @@ def _to_graph(n_x, n_y, n_z, mask=None, img=None, return return_as(graph) -@_deprecate_positional_args def img_to_graph(img, *, mask=None, return_as=sparse.coo_matrix, dtype=None): """Graph of the pixel-to-pixel gradient connections @@ -167,7 +165,6 @@ def img_to_graph(img, *, mask=None, return_as=sparse.coo_matrix, dtype=None): return _to_graph(n_x, n_y, n_z, mask, img, return_as, dtype) -@_deprecate_positional_args def grid_to_graph(n_x, n_y, n_z=1, *, mask=None, return_as=sparse.coo_matrix, dtype=int): """Graph of the pixel-to-pixel connections @@ -305,7 +302,6 @@ def _extract_patches(arr, patch_shape=8, extraction_step=1): return patches -@_deprecate_positional_args def extract_patches_2d(image, patch_size, *, max_patches=None, random_state=None): """Reshape a 2D image into a collection of patches @@ -483,7 +479,6 @@ class PatchExtractor(BaseEstimator): >>> print('Patches shape: {}'.format(pe_trans.shape)) Patches shape: (545706, 2, 2) """ - @_deprecate_positional_args def __init__(self, *, patch_size=None, max_patches=None, random_state=None): self.patch_size = patch_size diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index fad0e53ed31ca..00debc059440c 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -33,7 +33,6 @@ from ..utils import _IS_32BIT from ..utils.fixes import _astype_copy_false from 
..exceptions import NotFittedError -from ..utils.validation import _deprecate_positional_args __all__ = ['HashingVectorizer', @@ -679,7 +678,6 @@ class HashingVectorizer(TransformerMixin, _VectorizerMixin, BaseEstimator): CountVectorizer, TfidfVectorizer """ - @_deprecate_positional_args def __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, @@ -1004,7 +1002,6 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): when pickling. This attribute is provided only for introspection and can be safely removed using delattr or set to None before pickling. """ - @_deprecate_positional_args def __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, @@ -1424,7 +1421,6 @@ class TfidfTransformer(TransformerMixin, BaseEstimator): Introduction to Information Retrieval. Cambridge University Press, pp. 118-120. """ - @_deprecate_positional_args def __init__(self, *, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False): self.norm = norm @@ -1733,7 +1729,6 @@ class TfidfVectorizer(CountVectorizer): >>> print(X.shape) (4, 9) """ - @_deprecate_positional_args def __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index 4889f73518fe9..d3603f13be499 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -12,7 +12,6 @@ from ..exceptions import NotFittedError from ..utils.metaestimators import if_delegate_has_method -from ..utils.validation import _deprecate_positional_args def _calculate_threshold(estimator, importances, threshold): @@ -165,7 +164,6 @@ class SelectFromModel(MetaEstimatorMixin, SelectorMixin, BaseEstimator): SequentialFeatureSelector : Sequential cross-validation based feature selection. Does not rely on importance weights. """ - @_deprecate_positional_args def __init__(self, estimator, *, threshold=None, prefit=False, norm_order=1, max_features=None, importance_getter='auto'): diff --git a/sklearn/feature_selection/_mutual_info.py b/sklearn/feature_selection/_mutual_info.py index c5205fc68f370..79f7aea029f89 100644 --- a/sklearn/feature_selection/_mutual_info.py +++ b/sklearn/feature_selection/_mutual_info.py @@ -11,7 +11,6 @@ from ..utils import check_random_state from ..utils.fixes import _astype_copy_false from ..utils.validation import check_array, check_X_y -from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import check_classification_targets @@ -288,7 +287,6 @@ def _estimate_mi(X, y, discrete_features='auto', discrete_target=False, return np.array(mi) -@_deprecate_positional_args def mutual_info_regression(X, y, *, discrete_features='auto', n_neighbors=3, copy=True, random_state=None): """Estimate mutual information for a continuous target variable. @@ -368,7 +366,6 @@ def mutual_info_regression(X, y, *, discrete_features='auto', n_neighbors=3, copy, random_state) -@_deprecate_positional_args def mutual_info_classif(X, y, *, discrete_features='auto', n_neighbors=3, copy=True, random_state=None): """Estimate mutual information for a discrete target variable. 
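The same pattern holds for the module-level helpers touched above (`mutual_info_regression`, `mutual_info_classif`, the `fetch_*` loaders, the `make_*` generators): only the parameters ahead of the `*` remain positional, and callers name everything else. A short usage sketch, illustrative rather than part of the patch:

    import numpy as np
    from sklearn.feature_selection import mutual_info_classif

    rng = np.random.RandomState(0)
    X = rng.rand(40, 3)
    y = rng.randint(0, 2, size=40)

    # X and y precede the `*`; the remaining options are passed by keyword.
    mi = mutual_info_classif(X, y, discrete_features=False, n_neighbors=3)
    print(mi.shape)   # (3,)
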
diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 16519dfba6761..d972ee7c991e9 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -15,7 +15,6 @@ from ..utils.metaestimators import _safe_split from ..utils._tags import _safe_tags from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed from ..base import BaseEstimator from ..base import MetaEstimatorMixin @@ -152,7 +151,6 @@ class RFE(SelectorMixin, MetaEstimatorMixin, BaseEstimator): for cancer classification using support vector machines", Mach. Learn., 46(1-3), 389--422, 2002. """ - @_deprecate_positional_args def __init__(self, estimator, *, n_features_to_select=None, step=1, verbose=0, importance_getter='auto'): self.estimator = estimator @@ -524,7 +522,6 @@ class RFECV(RFE): for cancer classification using support vector machines", Mach. Learn., 46(1-3), 389--422, 2002. """ - @_deprecate_positional_args def __init__(self, estimator, *, step=1, min_features_to_select=1, cv=None, scoring=None, verbose=0, n_jobs=None, importance_getter='auto'): diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index d9db03e479163..989288dbb4ec7 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -17,7 +17,6 @@ safe_mask) from ..utils.extmath import safe_sparse_dot, row_norms from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ._base import SelectorMixin @@ -296,7 +295,6 @@ def r_regression(X, y, *, center=True): return correlation_coefficient -@_deprecate_positional_args def f_regression(X, y, *, center=True): """Univariate linear regression tests returning F-statistic and p-values. @@ -485,7 +483,6 @@ class SelectPercentile(_BaseFilter): GenericUnivariateSelect : Univariate feature selector with configurable mode. """ - @_deprecate_positional_args def __init__(self, score_func=f_classif, *, percentile=10): super().__init__(score_func=score_func) self.percentile = percentile @@ -573,7 +570,6 @@ class SelectKBest(_BaseFilter): GenericUnivariateSelect : Univariate feature selector with configurable mode. """ - @_deprecate_positional_args def __init__(self, score_func=f_classif, *, k=10): super().__init__(score_func=score_func) self.k = k @@ -654,7 +650,6 @@ class SelectFpr(_BaseFilter): GenericUnivariateSelect : Univariate feature selector with configurable mode. """ - @_deprecate_positional_args def __init__(self, score_func=f_classif, *, alpha=5e-2): super().__init__(score_func=score_func) self.alpha = alpha @@ -722,7 +717,6 @@ class SelectFdr(_BaseFilter): GenericUnivariateSelect : Univariate feature selector with configurable mode. """ - @_deprecate_positional_args def __init__(self, score_func=f_classif, *, alpha=5e-2): super().__init__(score_func=score_func) self.alpha = alpha @@ -787,7 +781,6 @@ class SelectFwe(_BaseFilter): GenericUnivariateSelect : Univariate feature selector with configurable mode. """ - @_deprecate_positional_args def __init__(self, score_func=f_classif, *, alpha=5e-2): super().__init__(score_func=score_func) self.alpha = alpha @@ -857,13 +850,12 @@ class GenericUnivariateSelect(_BaseFilter): SelectFwe : Select features based on family-wise error rate. 
""" - _selection_modes = {'percentile': SelectPercentile, - 'k_best': SelectKBest, - 'fpr': SelectFpr, - 'fdr': SelectFdr, - 'fwe': SelectFwe} + _selection_modes: dict = {'percentile': SelectPercentile, + 'k_best': SelectKBest, + 'fpr': SelectFpr, + 'fdr': SelectFdr, + 'fwe': SelectFwe} - @_deprecate_positional_args def __init__(self, score_func=f_classif, *, mode='percentile', param=1e-5): super().__init__(score_func=score_func) self.mode = mode diff --git a/sklearn/gaussian_process/_gpc.py b/sklearn/gaussian_process/_gpc.py index e6fe3eb26df49..d2b418b131c2f 100644 --- a/sklearn/gaussian_process/_gpc.py +++ b/sklearn/gaussian_process/_gpc.py @@ -19,7 +19,6 @@ from ..utils.optimize import _check_optimize_result from ..preprocessing import LabelEncoder from ..multiclass import OneVsRestClassifier, OneVsOneClassifier -from ..utils.validation import _deprecate_positional_args # Values required for approximating the logistic sigmoid by @@ -145,7 +144,6 @@ def optimizer(obj_func, initial_theta, bounds): The log-marginal-likelihood of ``self.kernel_.theta`` """ - @_deprecate_positional_args def __init__(self, kernel=None, *, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=0, max_iter_predict=100, warm_start=False, copy_X_train=True, random_state=None): @@ -595,7 +593,6 @@ def optimizer(obj_func, initial_theta, bounds): .. versionadded:: 0.18 """ - @_deprecate_positional_args def __init__(self, kernel=None, *, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=0, max_iter_predict=100, warm_start=False, copy_X_train=True, random_state=None, diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index 9b1d0ae409526..ae9e5c403fcf2 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -17,7 +17,6 @@ from ..preprocessing._data import _handle_zeros_in_scale from ..utils import check_random_state from ..utils.optimize import _check_optimize_result -from ..utils.validation import _deprecate_positional_args class GaussianProcessRegressor(MultiOutputMixin, @@ -153,7 +152,6 @@ def optimizer(obj_func, initial_theta, bounds): (array([653.0..., 592.1...]), array([316.6..., 316.6...])) """ - @_deprecate_positional_args def __init__(self, kernel=None, *, alpha=1e-10, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=0, normalize_y=False, copy_X_train=True, random_state=None): diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index e345fe44f0895..85303f29c93e9 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -15,7 +15,6 @@ from ..utils.sparsefuncs import _get_median from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES -from ..utils.validation import _deprecate_positional_args from ..utils._mask import _get_mask from ..utils import is_scalar_nan @@ -211,7 +210,6 @@ class SimpleImputer(_BaseImputer): upon :meth:`transform` if strategy is not "constant". 
""" - @_deprecate_positional_args def __init__(self, *, missing_values=np.nan, strategy="mean", fill_value=None, verbose=0, copy=True, add_indicator=False): super().__init__( @@ -626,7 +624,6 @@ class MissingIndicator(TransformerMixin, BaseEstimator): [False, False]]) """ - @_deprecate_positional_args def __init__(self, *, missing_values=np.nan, features="missing-only", sparse="auto", error_on_new=True): self.missing_values = missing_values diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index c4b407fdd66e7..b9cfe0e1a60a0 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -13,7 +13,6 @@ from ..utils import is_scalar_nan from ..utils._mask import _get_mask from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args class KNNImputer(_BaseImputer): @@ -96,7 +95,6 @@ class KNNImputer(_BaseImputer): [5.5, 6. , 5. ], [8. , 8. , 7. ]]) """ - @_deprecate_positional_args def __init__(self, *, missing_values=np.nan, n_neighbors=5, weights="uniform", metric="nan_euclidean", copy=True, add_indicator=False): diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index 0736130f41524..d10cae40302a3 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -22,7 +22,6 @@ from ..utils import _get_column_indices from ..utils.validation import check_is_fitted from ..utils import Bunch -from ..utils.validation import _deprecate_positional_args from ..tree import DecisionTreeRegressor from ..ensemble import RandomForestRegressor from ..exceptions import NotFittedError @@ -203,7 +202,6 @@ def _partial_dependence_brute(est, grid, features, X, response_method): return averaged_predictions, predictions -@_deprecate_positional_args def partial_dependence(estimator, X, features, *, response_method='auto', percentiles=(0.05, 0.95), grid_resolution=100, method='auto', kind='legacy'): diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py index 2a7b6cd23147b..8dadf19434693 100644 --- a/sklearn/inspection/_permutation_importance.py +++ b/sklearn/inspection/_permutation_importance.py @@ -8,7 +8,6 @@ from ..utils import Bunch from ..utils import check_random_state from ..utils import check_array -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed @@ -80,7 +79,6 @@ def _create_importances_bunch(baseline_score, permuted_score): importances=importances) -@_deprecate_positional_args def permutation_importance(estimator, X, y, *, scoring=None, n_repeats=5, n_jobs=None, random_state=None, sample_weight=None): """Permutation importance for feature evaluation [BRE]_. diff --git a/sklearn/inspection/_plot/partial_dependence.py b/sklearn/inspection/_plot/partial_dependence.py index a2ee1886066e2..f170460cf2ab6 100644 --- a/sklearn/inspection/_plot/partial_dependence.py +++ b/sklearn/inspection/_plot/partial_dependence.py @@ -17,7 +17,6 @@ from ...utils.fixes import delayed -@_deprecate_positional_args def plot_partial_dependence( estimator, X, @@ -539,7 +538,6 @@ class PartialDependenceDisplay: partial_dependence : Compute Partial Dependence values. plot_partial_dependence : Plot Partial Dependence. 
""" - @_deprecate_positional_args def __init__( self, pd_results, diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py index b57ce23f8cc52..f4050fd2bc025 100644 --- a/sklearn/isotonic.py +++ b/sklearn/isotonic.py @@ -11,7 +11,7 @@ from .base import BaseEstimator, TransformerMixin, RegressorMixin from .utils import check_array, check_consistent_length -from .utils.validation import _check_sample_weight, _deprecate_positional_args +from .utils.validation import _check_sample_weight from ._isotonic import _inplace_contiguous_isotonic_regression, _make_unique @@ -76,7 +76,6 @@ def check_increasing(x, y): return increasing_bool -@_deprecate_positional_args def isotonic_regression(y, *, sample_weight=None, y_min=None, y_max=None, increasing=True): """Solve the isotonic regression model. @@ -216,7 +215,6 @@ class IsotonicRegression(RegressorMixin, TransformerMixin, BaseEstimator): >>> iso_reg.predict([.1, .2]) array([1.8628..., 3.7256...]) """ - @_deprecate_positional_args def __init__(self, *, y_min=None, y_max=None, increasing=True, out_of_bounds='nan'): self.y_min = y_min diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index ca02aac3e982c..e7020dea0e970 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -24,7 +24,7 @@ from .utils.extmath import safe_sparse_dot from .utils.validation import check_is_fitted from .metrics.pairwise import pairwise_kernels, KERNEL_PARAMS -from .utils.validation import check_non_negative, _deprecate_positional_args +from .utils.validation import check_non_negative class PolynomialCountSketch(BaseEstimator, TransformerMixin): @@ -253,7 +253,6 @@ class RBFSampler(TransformerMixin, BaseEstimator): Benjamin Recht. (https://people.eecs.berkeley.edu/~brecht/papers/08.rah.rec.nips.pdf) """ - @_deprecate_positional_args def __init__(self, *, gamma=1., n_components=100, random_state=None): self.gamma = gamma self.n_components = n_components @@ -369,7 +368,6 @@ class SkewedChi2Sampler(TransformerMixin, BaseEstimator): sklearn.metrics.pairwise.chi2_kernel : The exact chi squared kernel. """ - @_deprecate_positional_args def __init__(self, *, skewedness=1., n_components=100, random_state=None): self.skewedness = skewedness self.n_components = n_components @@ -500,7 +498,6 @@ class AdditiveChi2Sampler(TransformerMixin, BaseEstimator): A. Vedaldi and A. Zisserman, Pattern Analysis and Machine Intelligence, 2011 """ - @_deprecate_positional_args def __init__(self, *, sample_steps=2, sample_interval=None): self.sample_steps = sample_steps self.sample_interval = sample_interval @@ -728,7 +725,6 @@ class Nystroem(TransformerMixin, BaseEstimator): sklearn.metrics.pairwise.kernel_metrics : List of built-in kernels. 
""" - @_deprecate_positional_args def __init__(self, kernel="rbf", *, gamma=None, coef0=None, degree=None, kernel_params=None, n_components=100, random_state=None, n_jobs=None): diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index 8a27ea572b344..e562c22daed2f 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -10,7 +10,6 @@ from .metrics.pairwise import pairwise_kernels from .linear_model._ridge import _solve_cholesky_kernel from .utils.validation import check_is_fitted, _check_sample_weight -from .utils.validation import _deprecate_positional_args from .utils.deprecation import deprecated @@ -113,7 +112,6 @@ class KernelRidge(MultiOutputMixin, RegressorMixin, BaseEstimator): >>> clf.fit(X, y) KernelRidge(alpha=1.0) """ - @_deprecate_positional_args def __init__(self, alpha=1, *, kernel="linear", gamma=None, degree=3, coef0=1, kernel_params=None): self.alpha = alpha diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 808ec9f3b3bb0..09eeced4f3a09 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -31,7 +31,6 @@ from ..preprocessing._data import _is_constant_feature from ..utils import check_array from ..utils.validation import FLOAT_DTYPES -from ..utils.validation import _deprecate_positional_args from ..utils import check_random_state from ..utils.extmath import safe_sparse_dot from ..utils.extmath import _incremental_mean_and_var @@ -595,7 +594,6 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel): >>> reg.predict(np.array([[3, 5]])) array([16.]) """ - @_deprecate_positional_args def __init__(self, *, fit_intercept=True, normalize='deprecated', copy_X=True, n_jobs=None, positive=False): self.fit_intercept = fit_intercept diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py index 2eae8b5c13cee..1d25ac20aa34e 100644 --- a/sklearn/linear_model/_bayes.py +++ b/sklearn/linear_model/_bayes.py @@ -15,7 +15,6 @@ from ..utils.extmath import fast_logdet from scipy.linalg import pinvh from ..utils.validation import _check_sample_weight -from ..utils.validation import _deprecate_positional_args ############################################################################### @@ -159,7 +158,6 @@ class BayesianRidge(RegressorMixin, LinearModel): M. E. Tipping, Sparse Bayesian Learning and the Relevance Vector Machine, Journal of Machine Learning Research, Vol. 1, 2001. """ - @_deprecate_positional_args def __init__(self, *, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6, lambda_1=1.e-6, lambda_2=1.e-6, alpha_init=None, lambda_init=None, compute_score=False, fit_intercept=True, @@ -520,7 +518,6 @@ class ARDRegression(RegressorMixin, LinearModel): which ``self.lambda_ < self.threshold_lambda`` are kept and the rest are discarded. 
""" - @_deprecate_positional_args def __init__(self, *, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6, lambda_1=1.e-6, lambda_2=1.e-6, compute_score=False, threshold_lambda=1.e+4, fit_intercept=True, diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 6a23fedd9902e..1d93a6695b0e0 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -24,7 +24,6 @@ from ..utils.fixes import _astype_copy_false, _joblib_parallel_args from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils.validation import column_or_1d -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed # mypy error: Module 'sklearn.linear_model' has no attribute '_cd_fast' @@ -169,7 +168,6 @@ def _alpha_grid(X, y, Xy=None, l1_ratio=1.0, fit_intercept=True, num=n_alphas)[::-1] -@_deprecate_positional_args def lasso_path(X, y, *, eps=1e-3, n_alphas=100, alphas=None, precompute='auto', Xy=None, copy_X=True, coef_init=None, verbose=False, return_n_iter=False, positive=False, **params): @@ -314,7 +312,6 @@ def lasso_path(X, y, *, eps=1e-3, n_alphas=100, alphas=None, positive=positive, return_n_iter=return_n_iter, **params) -@_deprecate_positional_args def enet_path(X, y, *, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, precompute='auto', Xy=None, copy_X=True, coef_init=None, verbose=False, return_n_iter=False, positive=False, @@ -701,7 +698,6 @@ class ElasticNet(MultiOutputMixin, RegressorMixin, LinearModel): """ path = staticmethod(enet_path) - @_deprecate_positional_args def __init__(self, alpha=1.0, *, l1_ratio=0.5, fit_intercept=True, normalize=False, precompute=False, max_iter=1000, copy_X=True, tol=1e-4, warm_start=False, positive=False, @@ -1026,7 +1022,6 @@ class Lasso(ElasticNet): """ path = staticmethod(enet_path) - @_deprecate_positional_args def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, precompute=False, copy_X=True, max_iter=1000, tol=1e-4, warm_start=False, positive=False, @@ -1512,7 +1507,6 @@ class LassoCV(RegressorMixin, LinearModelCV): """ path = staticmethod(lasso_path) - @_deprecate_positional_args def __init__(self, *, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, normalize=False, precompute='auto', max_iter=1000, tol=1e-4, @@ -1719,7 +1713,6 @@ class ElasticNetCV(RegressorMixin, LinearModelCV): """ path = staticmethod(enet_path) - @_deprecate_positional_args def __init__(self, *, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, normalize=False, precompute='auto', max_iter=1000, tol=1e-4, cv=None, copy_X=True, @@ -1876,7 +1869,6 @@ class MultiTaskElasticNet(Lasso): To avoid unnecessary memory duplication the X and y arguments of the fit method should be directly passed as Fortran-contiguous numpy arrays. """ - @_deprecate_positional_args def __init__(self, alpha=1.0, *, l1_ratio=0.5, fit_intercept=True, normalize=False, copy_X=True, max_iter=1000, tol=1e-4, warm_start=False, random_state=None, selection='cyclic'): @@ -2077,7 +2069,6 @@ class MultiTaskLasso(MultiTaskElasticNet): To avoid unnecessary memory duplication the X and y arguments of the fit method should be directly passed as Fortran-contiguous numpy arrays. 
""" - @_deprecate_positional_args def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, copy_X=True, max_iter=1000, tol=1e-4, warm_start=False, random_state=None, selection='cyclic'): @@ -2260,7 +2251,6 @@ class MultiTaskElasticNetCV(RegressorMixin, LinearModelCV): """ path = staticmethod(enet_path) - @_deprecate_positional_args def __init__(self, *, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, normalize=False, max_iter=1000, tol=1e-4, cv=None, copy_X=True, @@ -2441,7 +2431,6 @@ class MultiTaskLassoCV(RegressorMixin, LinearModelCV): """ path = staticmethod(lasso_path) - @_deprecate_positional_args def __init__(self, *, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, normalize=False, max_iter=1000, tol=1e-4, copy_X=True, diff --git a/sklearn/linear_model/_huber.py b/sklearn/linear_model/_huber.py index 56062fa783eb8..a8ae066d9ff63 100644 --- a/sklearn/linear_model/_huber.py +++ b/sklearn/linear_model/_huber.py @@ -9,7 +9,6 @@ from ._base import LinearModel from ..utils import axis0_safe_slice from ..utils.validation import _check_sample_weight -from ..utils.validation import _deprecate_positional_args from ..utils.extmath import safe_sparse_dot from ..utils.optimize import _check_optimize_result @@ -223,7 +222,6 @@ class HuberRegressor(LinearModel, RegressorMixin, BaseEstimator): .. [2] Art B. Owen (2006), A robust hybrid of lasso and ridge regression. https://statweb.stanford.edu/~owen/reports/hhu.pdf """ - @_deprecate_positional_args def __init__(self, *, epsilon=1.35, max_iter=100, alpha=0.0001, warm_start=False, fit_intercept=True, tol=1e-05): self.epsilon = epsilon diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 55e37ff51fc6a..0932d0bd1aee3 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -24,13 +24,11 @@ from ..utils import check_random_state from ..model_selection import check_cv from ..exceptions import ConvergenceWarning -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed SOLVE_TRIANGULAR_ARGS = {'check_finite': False} -@_deprecate_positional_args def lars_path( X, y, @@ -175,7 +173,6 @@ def lars_path( return_n_iter=return_n_iter, positive=positive) -@_deprecate_positional_args def lars_path_gram( Xy, Gram, @@ -910,7 +907,6 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): method = "lar" positive = False - @_deprecate_positional_args def __init__(self, *, fit_intercept=True, verbose=False, normalize=True, precompute='auto', n_nonzero_coefs=500, eps=np.finfo(float).eps, copy_X=True, fit_path=True, @@ -1172,7 +1168,6 @@ class LassoLars(Lars): """ method = 'lasso' - @_deprecate_positional_args def __init__(self, alpha=1.0, *, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=np.finfo(float).eps, copy_X=True, fit_path=True, @@ -1434,7 +1429,6 @@ class LarsCV(Lars): method = "lar" - @_deprecate_positional_args def __init__(self, *, fit_intercept=True, verbose=False, max_iter=500, normalize=True, precompute='auto', cv=None, max_n_alphas=1000, n_jobs=None, eps=np.finfo(float).eps, @@ -1681,7 +1675,6 @@ class LassoLarsCV(LarsCV): method = 'lasso' - @_deprecate_positional_args def __init__(self, *, fit_intercept=True, verbose=False, max_iter=500, normalize=True, precompute='auto', cv=None, max_n_alphas=1000, n_jobs=None, eps=np.finfo(float).eps, @@ -1820,7 +1813,6 @@ class LassoLarsIC(LassoLars): -------- lars_path, LassoLars, LassoLarsCV """ - 
@_deprecate_positional_args def __init__(self, criterion='aic', *, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=np.finfo(float).eps, copy_X=True, positive=False): diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index be28c5806ede5..c9f1f42f1eeec 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -29,7 +29,6 @@ from ..utils.extmath import row_norms from ..utils.optimize import _newton_cg, _check_optimize_result from ..utils.validation import check_is_fitted, _check_sample_weight -from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import check_classification_targets from ..utils.fixes import _joblib_parallel_args from ..utils.fixes import delayed @@ -1254,7 +1253,6 @@ class LogisticRegression(LinearClassifierMixin, >>> clf.score(X, y) 0.97... """ - @_deprecate_positional_args def __init__(self, penalty='l2', *, dual=False, tol=1e-4, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, @@ -1745,7 +1743,6 @@ class LogisticRegressionCV(LogisticRegression, LogisticRegression """ - @_deprecate_positional_args def __init__(self, *, Cs=10, fit_intercept=True, cv=None, dual=False, penalty='l2', scoring=None, solver='lbfgs', tol=1e-4, max_iter=100, class_weight=None, n_jobs=None, verbose=0, diff --git a/sklearn/linear_model/_omp.py b/sklearn/linear_model/_omp.py index 3f995f0f34318..c362fd4d73469 100644 --- a/sklearn/linear_model/_omp.py +++ b/sklearn/linear_model/_omp.py @@ -16,7 +16,6 @@ from ._base import LinearModel, _pre_fit from ..base import RegressorMixin, MultiOutputMixin from ..utils import as_float_array, check_array -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed from ..model_selection import check_cv @@ -266,7 +265,6 @@ def _gram_omp(Gram, Xy, n_nonzero_coefs, tol_0=None, tol=None, return gamma, indices[:n_active], n_active -@_deprecate_positional_args def orthogonal_mp(X, y, *, n_nonzero_coefs=None, tol=None, precompute=False, copy_X=True, return_path=False, return_n_iter=False): @@ -410,7 +408,6 @@ def orthogonal_mp(X, y, *, n_nonzero_coefs=None, tol=None, precompute=False, return np.squeeze(coef) -@_deprecate_positional_args def orthogonal_mp_gram(Gram, Xy, *, n_nonzero_coefs=None, tol=None, norms_squared=None, copy_Gram=True, copy_Xy=True, return_path=False, @@ -628,7 +625,6 @@ class OrthogonalMatchingPursuit(MultiOutputMixin, RegressorMixin, LinearModel): sklearn.decomposition.sparse_encode OrthogonalMatchingPursuitCV """ - @_deprecate_positional_args def __init__(self, *, n_nonzero_coefs=None, tol=None, fit_intercept=True, normalize=True, precompute='auto'): self.n_nonzero_coefs = n_nonzero_coefs @@ -866,7 +862,6 @@ class OrthogonalMatchingPursuitCV(RegressorMixin, LinearModel): sklearn.decomposition.sparse_encode """ - @_deprecate_positional_args def __init__(self, *, copy=True, fit_intercept=True, normalize=True, max_iter=None, cv=None, n_jobs=None, verbose=False): self.copy = copy diff --git a/sklearn/linear_model/_passive_aggressive.py b/sklearn/linear_model/_passive_aggressive.py index f3fa17ad1325e..678061be3c691 100644 --- a/sklearn/linear_model/_passive_aggressive.py +++ b/sklearn/linear_model/_passive_aggressive.py @@ -1,7 +1,6 @@ # Authors: Rob Zinkov, Mathieu Blondel # License: BSD 3 clause -from ..utils.validation import _deprecate_positional_args from ._stochastic_gradient import BaseSGDClassifier from 
._stochastic_gradient import BaseSGDRegressor from ._stochastic_gradient import DEFAULT_EPSILON @@ -164,7 +163,6 @@ class PassiveAggressiveClassifier(BaseSGDClassifier): K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR (2006) """ - @_deprecate_positional_args def __init__(self, *, C=1.0, fit_intercept=True, max_iter=1000, tol=1e-3, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, shuffle=True, verbose=0, loss="hinge", @@ -391,7 +389,6 @@ class PassiveAggressiveRegressor(BaseSGDRegressor): K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR (2006) """ - @_deprecate_positional_args def __init__(self, *, C=1.0, fit_intercept=True, max_iter=1000, tol=1e-3, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, shuffle=True, verbose=0, diff --git a/sklearn/linear_model/_perceptron.py b/sklearn/linear_model/_perceptron.py index 9d886eb1ca065..b2bb145b904c8 100644 --- a/sklearn/linear_model/_perceptron.py +++ b/sklearn/linear_model/_perceptron.py @@ -1,7 +1,6 @@ # Author: Mathieu Blondel # License: BSD 3 clause -from ..utils.validation import _deprecate_positional_args from ._stochastic_gradient import BaseSGDClassifier @@ -154,7 +153,6 @@ class Perceptron(BaseSGDClassifier): https://en.wikipedia.org/wiki/Perceptron and references therein. """ - @_deprecate_positional_args def __init__(self, *, penalty=None, alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, eta0=1.0, diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 3cde1f1235ec8..f53785cfe0ced 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -12,7 +12,6 @@ from ..utils import check_random_state, check_consistent_length from ..utils.random import sample_without_replacement from ..utils.validation import check_is_fitted, _check_sample_weight -from ..utils.validation import _deprecate_positional_args from ._base import LinearRegression from ..utils.validation import has_fit_parameter from ..exceptions import ConvergenceWarning @@ -211,7 +210,6 @@ class RANSACRegressor(MetaEstimatorMixin, RegressorMixin, .. [2] https://www.sri.com/sites/default/files/publications/ransac-publication.pdf .. 
[3] http://www.bmva.org/bmvc/2009/Papers/Paper355/Paper355.pdf """ # noqa: E501 - @_deprecate_positional_args def __init__(self, base_estimator=None, *, min_samples=None, residual_threshold=None, is_data_valid=None, is_model_valid=None, max_trials=100, max_skips=np.inf, diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 343bc6a170c9b..433e0c4313efc 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -28,7 +28,6 @@ from ..utils import compute_sample_weight from ..utils import column_or_1d from ..utils.validation import _check_sample_weight -from ..utils.validation import _deprecate_positional_args from ..preprocessing import LabelBinarizer from ..model_selection import GridSearchCV from ..metrics import check_scoring @@ -236,7 +235,6 @@ def _get_valid_accept_sparse(is_X_sparse, solver): return ['csr', 'csc', 'coo'] -@_deprecate_positional_args def ridge_regression(X, y, alpha, *, sample_weight=None, solver='auto', max_iter=None, tol=1e-3, verbose=0, random_state=None, return_n_iter=False, return_intercept=False, @@ -521,7 +519,6 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', class _BaseRidge(LinearModel, metaclass=ABCMeta): @abstractmethod - @_deprecate_positional_args def __init__(self, alpha=1.0, *, fit_intercept=True, normalize='deprecated', copy_X=True, max_iter=None, tol=1e-3, solver="auto", random_state=None): @@ -739,7 +736,6 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): >>> clf.fit(X, y) Ridge() """ - @_deprecate_positional_args def __init__(self, alpha=1.0, *, fit_intercept=True, normalize='deprecated', copy_X=True, max_iter=None, tol=1e-3, solver="auto", random_state=None): @@ -901,7 +897,6 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): >>> clf.score(X, y) 0.9595... """ - @_deprecate_positional_args def __init__(self, alpha=1.0, *, fit_intercept=True, normalize='deprecated', copy_X=True, max_iter=None, tol=1e-3, class_weight=None, solver="auto", @@ -1127,7 +1122,6 @@ class _RidgeGCV(LinearModel): http://cbcl.mit.edu/publications/ps/MIT-CSAIL-TR-2007-025.pdf https://www.mit.edu/~9.520/spring07/Classes/rlsslides.pdf """ - @_deprecate_positional_args def __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, normalize='deprecated', scoring=None, copy_X=True, @@ -1601,7 +1595,6 @@ def fit(self, X, y, sample_weight=None): class _BaseRidgeCV(LinearModel): - @_deprecate_positional_args def __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, normalize='deprecated', scoring=None, cv=None, gcv_mode=None, store_cv_values=False, @@ -1936,7 +1929,6 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): a one-versus-all approach. Concretely, this is implemented by taking advantage of the multi-variate response support in Ridge. 
""" - @_deprecate_positional_args def __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, normalize='deprecated', scoring=None, cv=None, class_weight=None, store_cv_values=False): diff --git a/sklearn/linear_model/_sag.py b/sklearn/linear_model/_sag.py index d0bd001081d61..4d76677e83356 100644 --- a/sklearn/linear_model/_sag.py +++ b/sklearn/linear_model/_sag.py @@ -13,7 +13,6 @@ from ..exceptions import ConvergenceWarning from ..utils import check_array from ..utils.validation import _check_sample_weight -from ..utils.validation import _deprecate_positional_args from ..utils.extmath import row_norms @@ -85,7 +84,6 @@ def get_auto_step_size(max_squared_sum, alpha_scaled, loss, fit_intercept, return step -@_deprecate_positional_args def sag_solver(X, y, sample_weight=None, loss='log', alpha=1., beta=0., max_iter=1000, tol=0.001, verbose=0, random_state=None, check_input=True, max_squared_sum=None, diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 92b02155246df..78565178706a8 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -21,7 +21,6 @@ from ..utils.extmath import safe_sparse_dot from ..utils.multiclass import _check_partial_fit_first_call from ..utils.validation import check_is_fitted, _check_sample_weight -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed from ..exceptions import ConvergenceWarning from ..model_selection import StratifiedShuffleSplit, ShuffleSplit @@ -71,7 +70,6 @@ def __call__(self, coef, intercept): class BaseSGD(SparseCoefMixin, BaseEstimator, metaclass=ABCMeta): """Base class for SGD classification and regression.""" - @_deprecate_positional_args def __init__(self, loss, *, penalty='l2', alpha=0.0001, C=1.0, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=0.1, random_state=None, @@ -454,7 +452,6 @@ class BaseSGDClassifier(LinearClassifierMixin, BaseSGD, metaclass=ABCMeta): } @abstractmethod - @_deprecate_positional_args def __init__(self, loss="hinge", *, penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, n_jobs=None, @@ -962,7 +959,6 @@ class SGDClassifier(BaseSGDClassifier): >>> print(clf.predict([[-0.8, -1]])) [1] """ - @_deprecate_positional_args def __init__(self, loss="hinge", *, penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, @@ -1120,7 +1116,6 @@ class BaseSGDRegressor(RegressorMixin, BaseSGD): } @abstractmethod - @_deprecate_positional_args def __init__(self, loss="squared_error", *, penalty="l2", alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, @@ -1565,7 +1560,6 @@ class SGDRegressor(BaseSGDRegressor): Ridge, ElasticNet, Lasso, sklearn.svm.SVR """ - @_deprecate_positional_args def __init__(self, loss="squared_error", *, penalty="l2", alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, diff --git a/sklearn/linear_model/_theil_sen.py b/sklearn/linear_model/_theil_sen.py index 39f3d5c69fb00..4c75613c28a9b 100644 --- a/sklearn/linear_model/_theil_sen.py +++ b/sklearn/linear_model/_theil_sen.py @@ -20,7 +20,6 @@ from ._base import LinearModel from ..base import RegressorMixin from ..utils import check_random_state -from ..utils.validation import 
_deprecate_positional_args from ..utils.fixes import delayed from ..exceptions import ConvergenceWarning @@ -291,7 +290,6 @@ class TheilSenRegressor(RegressorMixin, LinearModel): Xin Dang, Hanxiang Peng, Xueqin Wang and Heping Zhang http://home.olemiss.edu/~xdang/papers/MTSE.pdf """ - @_deprecate_positional_args def __init__(self, *, fit_intercept=True, copy_X=True, max_subpopulation=1e4, n_subsamples=None, max_iter=300, tol=1.e-3, random_state=None, n_jobs=None, verbose=False): diff --git a/sklearn/linear_model/tests/test_omp.py b/sklearn/linear_model/tests/test_omp.py index 1d2eb6a239786..06df7fd349e8b 100644 --- a/sklearn/linear_model/tests/test_omp.py +++ b/sklearn/linear_model/tests/test_omp.py @@ -163,7 +163,7 @@ def test_identical_regressors(): "The requested precision might not have been met." ) with pytest.warns(RuntimeWarning, match=warning_message): - orthogonal_mp(newX, newy, 2) + orthogonal_mp(newX, newy, n_nonzero_coefs=2) def test_swapped_regressors(): diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index d843c3ddd8462..63be19c1c287d 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -7,7 +7,6 @@ from ..base import BaseEstimator, TransformerMixin from ..neighbors import NearestNeighbors, kneighbors_graph from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..utils.graph import graph_shortest_path from ..decomposition import KernelPCA from ..preprocessing import KernelCenterer @@ -123,7 +122,6 @@ class Isomap(TransformerMixin, BaseEstimator): .. [1] Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global geometric framework for nonlinear dimensionality reduction. Science 290 (5500) """ - @_deprecate_positional_args def __init__(self, *, n_neighbors=5, n_components=2, eigen_solver='auto', tol=0, max_iter=None, path_method='auto', neighbors_algorithm='auto', n_jobs=None, metric='minkowski', diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index 7a4e0ace9fccd..0fcd5f543c4d0 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -15,7 +15,6 @@ from ..utils.extmath import stable_cumsum from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES -from ..utils.validation import _deprecate_positional_args from ..neighbors import NearestNeighbors @@ -189,7 +188,6 @@ def null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1E-6, max_iter=100, raise ValueError("Unrecognized eigen_solver '%s'" % eigen_solver) -@_deprecate_positional_args def locally_linear_embedding( X, *, n_neighbors, n_components, reg=1e-3, eigen_solver='auto', tol=1e-6, max_iter=100, method='standard', hessian_tol=1E-4, @@ -636,7 +634,6 @@ class LocallyLinearEmbedding(TransformerMixin, dimensionality reduction via tangent space alignment. Journal of Shanghai Univ. 
8:406 (2004) """ - @_deprecate_positional_args def __init__(self, *, n_neighbors=5, n_components=2, reg=1E-3, eigen_solver='auto', tol=1E-6, max_iter=100, method='standard', hessian_tol=1E-4, modified_tol=1E-12, diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 6a144e3033e8e..d92ab67767fa3 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -14,7 +14,6 @@ from ..metrics import euclidean_distances from ..utils import check_random_state, check_array, check_symmetric from ..isotonic import IsotonicRegression -from ..utils.validation import _deprecate_positional_args from ..utils.deprecation import deprecated from ..utils.fixes import delayed @@ -132,7 +131,6 @@ def _smacof_single(dissimilarities, metric=True, n_components=2, init=None, return X, stress, it + 1 -@_deprecate_positional_args def smacof(dissimilarities, *, metric=True, n_components=2, init=None, n_init=8, n_jobs=None, max_iter=300, verbose=0, eps=1e-3, random_state=None, return_n_iter=False): @@ -372,7 +370,6 @@ class MDS(BaseEstimator): hypothesis" Kruskal, J. Psychometrika, 29, (1964) """ - @_deprecate_positional_args def __init__(self, n_components=2, *, metric=True, n_init=4, max_iter=300, verbose=0, eps=1e-3, n_jobs=None, random_state=None, dissimilarity="euclidean"): diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index 76f52946e8e87..7fd371ee5af2f 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -25,7 +25,6 @@ from ..utils.fixes import lobpcg from ..metrics.pairwise import rbf_kernel from ..neighbors import kneighbors_graph, NearestNeighbors -from ..utils.validation import _deprecate_positional_args from ..utils.deprecation import deprecated @@ -141,7 +140,6 @@ def _set_diag(laplacian, value, norm_laplacian): return laplacian -@_deprecate_positional_args def spectral_embedding(adjacency, *, n_components=8, eigen_solver=None, random_state=None, eigen_tol=0.0, norm_laplacian=True, drop_first=True): @@ -456,7 +454,6 @@ class SpectralEmbedding(BaseEstimator): Jianbo Shi, Jitendra Malik http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324 """ - @_deprecate_positional_args def __init__(self, n_components=2, *, affinity="nearest_neighbors", gamma=None, random_state=None, eigen_solver=None, n_neighbors=None, n_jobs=None): diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index 682fdc095d3bf..8e42d48f4ef07 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -20,7 +20,6 @@ from ..utils import check_random_state from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils.validation import check_non_negative -from ..utils.validation import _deprecate_positional_args from ..decomposition import PCA from ..metrics.pairwise import pairwise_distances # mypy error: Module 'sklearn.manifold' has no attribute '_utils' @@ -401,7 +400,6 @@ def _gradient_descent(objective, p0, it, n_iter, return p, error, i -@_deprecate_positional_args def trustworthiness(X, X_embedded, *, n_neighbors=5, metric='euclidean'): r"""Expresses to what extent the local structure is retained. 
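A usage sketch for ``trustworthiness`` (illustrative, not from the patch): the data arguments stay positional while ``n_neighbors`` and ``metric`` are keyword-only. Assumes scikit-learn 1.0+ and random toy data:

    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.manifold import trustworthiness

    X = np.random.RandomState(0).rand(30, 5)
    X_embedded = PCA(n_components=2).fit_transform(X)

    # Options after the ``*`` are spelled out by name.
    score = trustworthiness(X, X_embedded, n_neighbors=5, metric='euclidean')
    print(round(score, 3))  # close to 1.0 when local structure is preserved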
@@ -670,7 +668,6 @@ class TSNE(BaseEstimator): # Control the number of iterations between progress checks _N_ITER_CHECK = 50 - @_deprecate_positional_args def __init__(self, n_components=2, *, perplexity=30.0, early_exaggeration=12.0, learning_rate="warn", n_iter=1000, n_iter_without_progress=300, min_grad_norm=1e-7, diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index a68e17656a73b..434fd89f5bbd9 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -37,7 +37,6 @@ from ..utils.multiclass import unique_labels from ..utils.multiclass import type_of_target from ..utils.validation import _num_samples -from ..utils.validation import _deprecate_positional_args from ..utils.sparsefuncs import count_nonzero from ..exceptions import UndefinedMetricWarning @@ -137,7 +136,6 @@ def _weighted_sum(sample_score, sample_weight, normalize=False): return sample_score.sum() -@_deprecate_positional_args def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None): """Accuracy classification score. @@ -210,7 +208,6 @@ def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None): return _weighted_sum(score, sample_weight, normalize) -@_deprecate_positional_args def confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, normalize=None): """Compute confusion matrix to evaluate the accuracy of a classification. @@ -366,7 +363,6 @@ def confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, return cm -@_deprecate_positional_args def multilabel_confusion_matrix(y_true, y_pred, *, sample_weight=None, labels=None, samplewise=False): """Compute a confusion matrix for each class or sample. @@ -568,7 +564,6 @@ def multilabel_confusion_matrix(y_true, y_pred, *, sample_weight=None, return np.array([tn, fp, fn, tp]).T.reshape(-1, 2, 2) -@_deprecate_positional_args def cohen_kappa_score(y1, y2, *, labels=None, weights=None, sample_weight=None): r"""Cohen's kappa: a statistic that measures inter-annotator agreement. @@ -650,7 +645,6 @@ class labels [2]_. return 1 - k -@_deprecate_positional_args def jaccard_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division="warn"): """Jaccard similarity coefficient score. @@ -796,7 +790,6 @@ def jaccard_score(y_true, y_pred, *, labels=None, pos_label=1, return np.average(jaccard, weights=weights) -@_deprecate_positional_args def matthews_corrcoef(y_true, y_pred, *, sample_weight=None): """Compute the Matthews correlation coefficient (MCC). @@ -886,7 +879,6 @@ def matthews_corrcoef(y_true, y_pred, *, sample_weight=None): return mcc -@_deprecate_positional_args def zero_one_loss(y_true, y_pred, *, normalize=True, sample_weight=None): """Zero-one classification loss. @@ -957,7 +949,6 @@ def zero_one_loss(y_true, y_pred, *, normalize=True, sample_weight=None): return n_samples - score -@_deprecate_positional_args def f1_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division="warn"): """Compute the F1 score, also known as balanced F-score or F-measure. @@ -1082,7 +1073,6 @@ def f1_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', zero_division=zero_division) -@_deprecate_positional_args def fbeta_score(y_true, y_pred, *, beta, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division="warn"): """Compute the F-beta score. 
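As an illustrative aside: in ``fbeta_score`` the ``beta`` parameter sits after the bare ``*`` and has no default, so it must always be named, which is exactly the style the updated tests further below switch to. Sketch, assuming scikit-learn 1.0+:

    from sklearn.metrics import fbeta_score

    y_true = [0, 1, 1, 0, 1, 1]
    y_pred = [0, 1, 0, 0, 1, 1]

    score = fbeta_score(y_true, y_pred, beta=0.5)  # beta < 1 favours precision
    print(round(score, 3))

    # fbeta_score(y_true, y_pred, 0.5)  # TypeError: beta is keyword-only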
@@ -1310,7 +1300,6 @@ def _check_set_wise_labels(y_true, y_pred, average, labels, pos_label): return labels -@_deprecate_positional_args def precision_recall_fscore_support(y_true, y_pred, *, beta=1.0, labels=None, pos_label=1, average=None, warn_for=('precision', 'recall', @@ -1551,7 +1540,6 @@ def precision_recall_fscore_support(y_true, y_pred, *, beta=1.0, labels=None, return precision, recall, f_score, true_sum -@_deprecate_positional_args def precision_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division="warn"): @@ -1671,7 +1659,6 @@ def precision_score(y_true, y_pred, *, labels=None, pos_label=1, return p -@_deprecate_positional_args def recall_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division="warn"): """Compute the recall. @@ -1789,7 +1776,6 @@ def recall_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', return r -@_deprecate_positional_args def balanced_accuracy_score(y_true, y_pred, *, sample_weight=None, adjusted=False): """Compute the balanced accuracy. @@ -1870,7 +1856,6 @@ def balanced_accuracy_score(y_true, y_pred, *, sample_weight=None, return score -@_deprecate_positional_args def classification_report(y_true, y_pred, *, labels=None, target_names=None, sample_weight=None, digits=2, output_dict=False, zero_division="warn"): @@ -2072,7 +2057,6 @@ class 2 1.00 0.67 0.80 3 return report -@_deprecate_positional_args def hamming_loss(y_true, y_pred, *, sample_weight=None): """Compute the average Hamming loss. @@ -2164,7 +2148,6 @@ def hamming_loss(y_true, y_pred, *, sample_weight=None): raise ValueError("{0} is not supported".format(y_type)) -@_deprecate_positional_args def log_loss(y_true, y_pred, *, eps=1e-15, normalize=True, sample_weight=None, labels=None): r"""Log loss, aka logistic loss or cross-entropy loss. @@ -2293,7 +2276,6 @@ def log_loss(y_true, y_pred, *, eps=1e-15, normalize=True, sample_weight=None, return _weighted_sum(loss, sample_weight, normalize) -@_deprecate_positional_args def hinge_loss(y_true, pred_decision, *, labels=None, sample_weight=None): """Average hinge loss (non-regularized). @@ -2433,7 +2415,6 @@ def hinge_loss(y_true, pred_decision, *, labels=None, sample_weight=None): return np.average(losses, weights=sample_weight) -@_deprecate_positional_args def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): """Compute the Brier score loss. diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index 9fcecec775e6e..dd941a7e28e43 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -6,7 +6,6 @@ from ...utils import check_matplotlib_support from ...utils import deprecated from ...utils.multiclass import unique_labels -from ...utils.validation import _deprecate_positional_args from ...base import is_classifier @@ -72,12 +71,10 @@ class ConfusionMatrixDisplay: ... display_labels=clf.classes_) >>> disp.plot() # doctest: +SKIP """ - @_deprecate_positional_args def __init__(self, confusion_matrix, *, display_labels=None): self.confusion_matrix = confusion_matrix self.display_labels = display_labels - @_deprecate_positional_args def plot(self, *, include_values=True, cmap='viridis', xticks_rotation='horizontal', values_format=None, ax=None, colorbar=True): @@ -435,7 +432,6 @@ def from_predictions( "ConfusionMatrixDisplay.from_predictions or " "ConfusionMatrixDisplay.from_estimator." 
) -@_deprecate_positional_args def plot_confusion_matrix(estimator, X, y_true, *, labels=None, sample_weight=None, normalize=None, display_labels=None, include_values=True, diff --git a/sklearn/metrics/_plot/precision_recall_curve.py b/sklearn/metrics/_plot/precision_recall_curve.py index dcc20bbce25a7..f144c19e53e38 100644 --- a/sklearn/metrics/_plot/precision_recall_curve.py +++ b/sklearn/metrics/_plot/precision_recall_curve.py @@ -4,7 +4,6 @@ from .. import precision_recall_curve from ...utils import check_matplotlib_support -from ...utils.validation import _deprecate_positional_args class PrecisionRecallDisplay: @@ -71,7 +70,6 @@ class PrecisionRecallDisplay: >>> disp = PrecisionRecallDisplay(precision=precision, recall=recall) >>> disp.plot() # doctest: +SKIP """ - @_deprecate_positional_args def __init__(self, precision, recall, *, average_precision=None, estimator_name=None, pos_label=None): self.estimator_name = estimator_name @@ -80,7 +78,6 @@ def __init__(self, precision, recall, *, self.average_precision = average_precision self.pos_label = pos_label - @_deprecate_positional_args def plot(self, ax=None, *, name=None, **kwargs): """Plot visualization. @@ -140,7 +137,6 @@ def plot(self, ax=None, *, name=None, **kwargs): return self -@_deprecate_positional_args def plot_precision_recall_curve(estimator, X, y, *, sample_weight=None, response_method="auto", name=None, ax=None, pos_label=None, **kwargs): diff --git a/sklearn/metrics/_plot/roc_curve.py b/sklearn/metrics/_plot/roc_curve.py index 308ae4f4bf85d..35fde6ae031b8 100644 --- a/sklearn/metrics/_plot/roc_curve.py +++ b/sklearn/metrics/_plot/roc_curve.py @@ -4,7 +4,6 @@ from .. import roc_curve from ...utils import check_matplotlib_support -from ...utils.validation import _deprecate_positional_args class RocCurveDisplay: @@ -67,7 +66,6 @@ class RocCurveDisplay: >>> display.plot() # doctest: +SKIP >>> plt.show() # doctest: +SKIP """ - @_deprecate_positional_args def __init__(self, *, fpr, tpr, roc_auc=None, estimator_name=None, pos_label=None): self.estimator_name = estimator_name @@ -76,7 +74,6 @@ def __init__(self, *, fpr, tpr, self.roc_auc = roc_auc self.pos_label = pos_label - @_deprecate_positional_args def plot(self, ax=None, *, name=None, **kwargs): """Plot visualization @@ -132,7 +129,6 @@ def plot(self, ax=None, *, name=None, **kwargs): return self -@_deprecate_positional_args def plot_roc_curve(estimator, X, y, *, sample_weight=None, drop_intermediate=True, response_method="auto", name=None, ax=None, pos_label=None, **kwargs): diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index 8c458ac81e529..8482b9b87aedb 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -32,7 +32,6 @@ from ..utils.multiclass import type_of_target from ..utils.extmath import stable_cumsum from ..utils.sparsefuncs import count_nonzero -from ..utils.validation import _deprecate_positional_args from ..exceptions import UndefinedMetricWarning from ..preprocessing import label_binarize from ..utils._encode import _encode, _unique @@ -107,7 +106,6 @@ def auc(x, y): return area -@_deprecate_positional_args def average_precision_score(y_true, y_score, *, average="macro", pos_label=1, sample_weight=None): """Compute average precision (AP) from prediction scores. 
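Another illustrative aside (toy data, not from the patch): the ranking metrics follow the same pattern, with ``y_true`` and the scores positional and everything else keyword-only. A sketch for ``average_precision_score``, assuming scikit-learn 1.0+:

    import numpy as np
    from sklearn.metrics import average_precision_score

    y_true = np.array([0, 0, 1, 1])
    y_score = np.array([0.1, 0.4, 0.35, 0.8])

    ap = average_precision_score(y_true, y_score, pos_label=1)  # options by name
    print(round(ap, 2))  # ~0.83 for this toy example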
@@ -350,7 +348,6 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None, max_fpr=None): return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area)) -@_deprecate_positional_args def roc_auc_score(y_true, y_score, *, average="macro", sample_weight=None, max_fpr=None, multi_class="raise", labels=None): """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) @@ -737,7 +734,6 @@ def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): return fps, tps, y_score[threshold_idxs] -@_deprecate_positional_args def precision_recall_curve(y_true, probas_pred, *, pos_label=None, sample_weight=None): """Compute precision-recall pairs for different probability thresholds. @@ -832,7 +828,6 @@ def precision_recall_curve(y_true, probas_pred, *, pos_label=None, return np.r_[precision[sl], 1], np.r_[recall[sl], 0], thresholds[sl] -@_deprecate_positional_args def roc_curve(y_true, y_score, *, pos_label=None, sample_weight=None, drop_intermediate=True): """Compute Receiver operating characteristic (ROC). @@ -965,7 +960,6 @@ def roc_curve(y_true, y_score, *, pos_label=None, sample_weight=None, return fpr, tpr, thresholds -@_deprecate_positional_args def label_ranking_average_precision_score(y_true, y_score, *, sample_weight=None): """Compute ranking-based average precision. @@ -1055,7 +1049,6 @@ def label_ranking_average_precision_score(y_true, y_score, *, return out -@_deprecate_positional_args def coverage_error(y_true, y_score, *, sample_weight=None): """Coverage error measure. @@ -1115,7 +1108,6 @@ def coverage_error(y_true, y_score, *, sample_weight=None): return np.average(coverage, weights=sample_weight) -@_deprecate_positional_args def label_ranking_loss(y_true, y_score, *, sample_weight=None): """Compute Ranking loss measure. @@ -1318,7 +1310,6 @@ def _check_dcg_target_type(y_true): supported_fmt, y_type)) -@_deprecate_positional_args def dcg_score(y_true, y_score, *, k=None, log_base=2, sample_weight=None, ignore_ties=False): """Compute Discounted Cumulative Gain. @@ -1475,7 +1466,6 @@ def _ndcg_sample_scores(y_true, y_score, k=None, ignore_ties=False): return gain -@_deprecate_positional_args def ndcg_score(y_true, y_score, *, k=None, sample_weight=None, ignore_ties=False): """Compute Normalized Discounted Cumulative Gain. 
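Similarly, as a sketch outside the patch: ``ndcg_score`` takes its truncation depth ``k`` by keyword only. Assumes scikit-learn 1.0+ and made-up relevance scores:

    import numpy as np
    from sklearn.metrics import ndcg_score

    true_relevance = np.asarray([[10, 0, 0, 1, 5]])
    scores = np.asarray([[0.1, 0.2, 0.3, 4.0, 70.0]])

    # Ranking quality of ``scores`` against ``true_relevance``, truncated at 4.
    print(round(ndcg_score(true_relevance, scores, k=4), 3))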
diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index c2a0e7f7f033b..ba3edab2f61cb 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -30,7 +30,6 @@ from ..utils.validation import (check_array, check_consistent_length, _num_samples) from ..utils.validation import column_or_1d -from ..utils.validation import _deprecate_positional_args from ..utils.validation import _check_sample_weight from ..utils.stats import _weighted_percentile from ..exceptions import UndefinedMetricWarning @@ -123,7 +122,6 @@ def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric"): return y_type, y_true, y_pred, multioutput -@_deprecate_positional_args def mean_absolute_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average'): @@ -354,7 +352,6 @@ def mean_absolute_percentage_error(y_true, y_pred, return np.average(output_errors, weights=multioutput) -@_deprecate_positional_args def mean_squared_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average', squared=True): @@ -434,7 +431,6 @@ def mean_squared_error(y_true, y_pred, *, return np.average(output_errors, weights=multioutput) -@_deprecate_positional_args def mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average'): @@ -501,7 +497,6 @@ def mean_squared_log_error(y_true, y_pred, *, multioutput=multioutput) -@_deprecate_positional_args def median_absolute_error(y_true, y_pred, *, multioutput='uniform_average', sample_weight=None): """Median absolute error regression loss. @@ -575,7 +570,6 @@ def median_absolute_error(y_true, y_pred, *, multioutput='uniform_average', return np.average(output_errors, weights=multioutput) -@_deprecate_positional_args def explained_variance_score(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average'): @@ -667,7 +661,6 @@ def explained_variance_score(y_true, y_pred, *, return np.average(output_scores, weights=avg_weights) -@_deprecate_positional_args def r2_score(y_true, y_pred, *, sample_weight=None, multioutput="uniform_average"): """:math:`R^2` (coefficient of determination) regression score function. @@ -839,7 +832,6 @@ def max_error(y_true, y_pred): return np.max(np.abs(y_true - y_pred)) -@_deprecate_positional_args def mean_tweedie_deviance(y_true, y_pred, *, sample_weight=None, power=0): """Mean Tweedie deviance regression loss. @@ -904,7 +896,6 @@ def mean_tweedie_deviance(y_true, y_pred, *, sample_weight=None, power=0): return np.average(dev, weights=sample_weight) -@_deprecate_positional_args def mean_poisson_deviance(y_true, y_pred, *, sample_weight=None): """Mean Poisson deviance regression loss. @@ -942,7 +933,6 @@ def mean_poisson_deviance(y_true, y_pred, *, sample_weight=None): ) -@_deprecate_positional_args def mean_gamma_deviance(y_true, y_pred, *, sample_weight=None): """Mean Gamma deviance regression loss. diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 39c4523f9bde6..63427b01d7fc2 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -43,7 +43,6 @@ from .cluster import fowlkes_mallows_score from ..utils.multiclass import type_of_target -from ..utils.validation import _deprecate_positional_args from ..base import is_regressor @@ -397,7 +396,6 @@ def _passthrough_scorer(estimator, *args, **kwargs): return estimator.score(*args, **kwargs) -@_deprecate_positional_args def check_scoring(estimator, scoring=None, *, allow_none=False): """Determine scorer from user options. 
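A further aside: ``check_scoring`` above, and ``make_scorer`` in the next hunk, keep only the estimator or metric positional; their own options must be named, while extra keyword arguments to ``make_scorer`` are forwarded to the metric. Sketch, assuming scikit-learn 1.0+:

    from sklearn.metrics import check_scoring, fbeta_score, make_scorer
    from sklearn.tree import DecisionTreeClassifier

    ftwo_scorer = make_scorer(fbeta_score, beta=2)  # beta=2 forwarded to fbeta_score
    scorer = check_scoring(DecisionTreeClassifier(), scoring=ftwo_scorer)

Such a scorer can then be passed as ``scoring=ftwo_scorer`` to model-selection tools like ``GridSearchCV``.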
@@ -534,7 +532,6 @@ def _check_multimetric_scoring(estimator, scoring): return scorers -@_deprecate_positional_args def make_scorer(score_func, *, greater_is_better=True, needs_proba=False, needs_threshold=False, **kwargs): """Make a scorer from a performance metric or loss function. diff --git a/sklearn/metrics/cluster/_bicluster.py b/sklearn/metrics/cluster/_bicluster.py index e267b44cee229..b58cc8ac77805 100644 --- a/sklearn/metrics/cluster/_bicluster.py +++ b/sklearn/metrics/cluster/_bicluster.py @@ -2,7 +2,6 @@ from scipy.optimize import linear_sum_assignment from ...utils.validation import check_consistent_length, check_array -from ...utils.validation import _deprecate_positional_args __all__ = ["consensus_score"] @@ -45,7 +44,6 @@ def _pairwise_similarity(a, b, similarity): return result -@_deprecate_positional_args def consensus_score(a, b, *, similarity="jaccard"): """The similarity of two sets of biclusters. diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 19d1552518db4..ccc8077a3aab9 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -25,7 +25,6 @@ from ._expected_mutual_info_fast import expected_mutual_information from ...utils.fixes import _astype_copy_false from ...utils.multiclass import type_of_target -from ...utils.validation import _deprecate_positional_args from ...utils.validation import check_array, check_consistent_length @@ -84,7 +83,6 @@ def _generalized_average(U, V, average_method): "'arithmetic', or 'max'") -@_deprecate_positional_args def contingency_matrix(labels_true, labels_pred, *, eps=None, sparse=False, dtype=np.int64): """Build a contingency matrix describing the relationship between labels. @@ -390,7 +388,6 @@ def adjusted_rand_score(labels_true, labels_pred): (tp + fp) * (fp + tn)) -@_deprecate_positional_args def homogeneity_completeness_v_measure(labels_true, labels_pred, *, beta=1.0): """Compute the homogeneity and completeness and V-Measure scores at once. @@ -611,7 +608,6 @@ def completeness_score(labels_true, labels_pred): return homogeneity_completeness_v_measure(labels_true, labels_pred)[1] -@_deprecate_positional_args def v_measure_score(labels_true, labels_pred, *, beta=1.0): """V-measure cluster labeling given a ground truth. @@ -711,7 +707,6 @@ def v_measure_score(labels_true, labels_pred, *, beta=1.0): beta=beta)[2] -@_deprecate_positional_args def mutual_info_score(labels_true, labels_pred, *, contingency=None): """Mutual Information between two clusterings. @@ -799,7 +794,6 @@ def mutual_info_score(labels_true, labels_pred, *, contingency=None): return np.clip(mi.sum(), 0.0, None) -@_deprecate_positional_args def adjusted_mutual_info_score(labels_true, labels_pred, *, average_method='arithmetic'): """Adjusted Mutual Information between two clusterings. @@ -920,7 +914,6 @@ def adjusted_mutual_info_score(labels_true, labels_pred, *, return ami -@_deprecate_positional_args def normalized_mutual_info_score(labels_true, labels_pred, *, average_method='arithmetic'): """Normalized Mutual Information between two clusterings. @@ -1021,7 +1014,6 @@ def normalized_mutual_info_score(labels_true, labels_pred, *, return nmi -@_deprecate_positional_args def fowlkes_mallows_score(labels_true, labels_pred, *, sparse=False): """Measure the similarity of two clusterings of a set of points. 
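One more illustrative sketch (not from the patch): the clustering metrics above now require ``average_method`` by name, which is precisely what the test updates in the hunks that follow switch to. Assumes scikit-learn 1.0+:

    from sklearn.metrics import (adjusted_mutual_info_score,
                                 normalized_mutual_info_score)

    labels_a = [0, 0, 1, 1]
    labels_b = [1, 1, 0, 0]  # same partition, label ids permuted

    print(adjusted_mutual_info_score(labels_a, labels_b,
                                     average_method="arithmetic"))  # 1.0
    print(normalized_mutual_info_score(labels_a, labels_b,
                                       average_method="max"))       # 1.0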
diff --git a/sklearn/metrics/cluster/_unsupervised.py b/sklearn/metrics/cluster/_unsupervised.py index c597277a55b31..2b94557626486 100644 --- a/sklearn/metrics/cluster/_unsupervised.py +++ b/sklearn/metrics/cluster/_unsupervised.py @@ -16,7 +16,6 @@ from ..pairwise import pairwise_distances_chunked from ..pairwise import pairwise_distances from ...preprocessing import LabelEncoder -from ...utils.validation import _deprecate_positional_args def check_number_of_labels(n_labels, n_samples): @@ -35,7 +34,6 @@ def check_number_of_labels(n_labels, n_samples): "to n_samples - 1 (inclusive)" % n_labels) -@_deprecate_positional_args def silhouette_score(X, labels, *, metric='euclidean', sample_size=None, random_state=None, **kwds): """Compute the mean Silhouette Coefficient of all samples. @@ -149,7 +147,6 @@ def _silhouette_reduce(D_chunk, start, labels, label_freqs): return intra_clust_dists, inter_clust_dists -@_deprecate_positional_args def silhouette_samples(X, labels, *, metric='euclidean', **kwds): """Compute the Silhouette Coefficient for each sample. diff --git a/sklearn/metrics/cluster/tests/test_common.py b/sklearn/metrics/cluster/tests/test_common.py index a0d87ad4baa61..48c7c24218d83 100644 --- a/sklearn/metrics/cluster/tests/test_common.py +++ b/sklearn/metrics/cluster/tests/test_common.py @@ -1,4 +1,5 @@ from functools import partial +from itertools import chain import pytest import numpy as np @@ -128,7 +129,7 @@ def test_normalized_output(metric_name): # 0.22 AMI and NMI changes @pytest.mark.filterwarnings('ignore::FutureWarning') @pytest.mark.parametrize( - "metric_name", dict(SUPERVISED_METRICS, **UNSUPERVISED_METRICS) + "metric_name", chain(SUPERVISED_METRICS, UNSUPERVISED_METRICS) ) def test_permute_labels(metric_name): # All clustering metrics do not change score due to permutations of labels @@ -151,7 +152,7 @@ def test_permute_labels(metric_name): # 0.22 AMI and NMI changes @pytest.mark.filterwarnings('ignore::FutureWarning') @pytest.mark.parametrize( - "metric_name", dict(SUPERVISED_METRICS, **UNSUPERVISED_METRICS) + "metric_name", chain(SUPERVISED_METRICS, UNSUPERVISED_METRICS) ) # For all clustering metrics Input parameters can be both # in the form of arrays lists, positive, negative or string diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index c6d3d2f808843..c4e0149224d2d 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -80,17 +80,20 @@ def test_perfect_matches(): means = {"min", "geometric", "arithmetic", "max"} for score_func in score_funcs_with_changing_means: for mean in means: - assert score_func([], [], mean) == pytest.approx(1.0) - assert score_func([0], [1], mean) == pytest.approx(1.0) - assert score_func([0, 0, 0], [0, 0, 0], mean) == pytest.approx(1.0) - assert score_func( - [0, 1, 0], [42, 7, 42], mean) == pytest.approx(1.0) - assert score_func( - [0., 1., 0.], [42., 7., 42.], mean) == pytest.approx(1.0) - assert score_func( - [0., 1., 2.], [42., 7., 2.], mean) == pytest.approx(1.0) - assert score_func( - [0, 1, 2], [42, 7, 2], mean) == pytest.approx(1.0) + assert score_func([], [], + average_method=mean) == pytest.approx(1.0) + assert score_func([0], [1], + average_method=mean) == pytest.approx(1.0) + assert score_func([0, 0, 0], [0, 0, 0], + average_method=mean) == pytest.approx(1.0) + assert score_func([0, 1, 0], [42, 7, 42], + average_method=mean) == pytest.approx(1.0) + assert score_func([0., 1., 0.], [42., 7., 42.], 
+ average_method=mean) == pytest.approx(1.0) + assert score_func([0., 1., 2.], [42., 7., 2.], + average_method=mean) == pytest.approx(1.0) + assert score_func([0, 1, 2], [42, 7, 2], + average_method=mean) == pytest.approx(1.0) def test_homogeneous_but_not_complete_labeling(): @@ -296,9 +299,11 @@ def test_exactly_zero_info_score(): labels_a, labels_b) == pytest.approx(0.0) for method in ["min", "geometric", "arithmetic", "max"]: assert adjusted_mutual_info_score( - labels_a, labels_b, method) == pytest.approx(0.0) + labels_a, labels_b, + average_method=method) == pytest.approx(0.0) assert normalized_mutual_info_score( - labels_a, labels_b, method) == pytest.approx(0.0) + labels_a, labels_b, + average_method=method) == pytest.approx(0.0) def test_v_measure_and_mutual_information(seed=36): diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 45eb256d59f67..c9e9f60d8aaf3 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -28,7 +28,6 @@ from ..utils.extmath import row_norms, safe_sparse_dot from ..preprocessing import normalize from ..utils._mask import _get_mask -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed from ..utils.fixes import sp_version, parse_version @@ -61,7 +60,6 @@ def _return_float_dtype(X, Y): return X, Y, dtype -@_deprecate_positional_args def check_pairwise_arrays(X, Y, *, precomputed=False, dtype=None, accept_sparse='csr', force_all_finite=True, copy=False): @@ -199,7 +197,6 @@ def check_paired_arrays(X, Y): # Pairwise distances -@_deprecate_positional_args def euclidean_distances(X, Y=None, *, Y_norm_squared=None, squared=False, X_norm_squared=None): """ @@ -352,7 +349,6 @@ def _euclidean_distances(X, Y, X_norm_squared=None, Y_norm_squared=None, return distances if squared else np.sqrt(distances, out=distances) -@_deprecate_positional_args def nan_euclidean_distances(X, Y=None, *, squared=False, missing_values=np.nan, copy=True): """Calculate the euclidean distances in the presence of missing values. @@ -543,7 +539,6 @@ def _argmin_min_reduce(dist, start): return indices, values -@_deprecate_positional_args def pairwise_distances_argmin_min(X, Y, *, axis=1, metric="euclidean", metric_kwargs=None): """Compute minimum distances between one point and a set of points. @@ -630,7 +625,6 @@ def pairwise_distances_argmin_min(X, Y, *, axis=1, metric="euclidean", return indices, values -@_deprecate_positional_args def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs=None): """Compute minimum distances between one point and a set of points. @@ -752,7 +746,6 @@ def haversine_distances(X, Y=None): return DistanceMetric.get_metric('haversine').pairwise(X, Y) -@_deprecate_positional_args def manhattan_distances(X, Y=None, *, sum_over_features=True): """Compute the L1 distances between the vectors in X and Y. @@ -949,7 +942,6 @@ def paired_cosine_distances(X, Y): 'cityblock': paired_manhattan_distances} -@_deprecate_positional_args def paired_distances(X, Y, *, metric="euclidean", **kwds): """ Computes the paired distances between X and Y. 
@@ -1499,7 +1491,6 @@ def _precompute_metric_params(X, Y, metric=None, **kwds): return {} -@_deprecate_positional_args def pairwise_distances_chunked(X, Y=None, *, reduce_func=None, metric='euclidean', n_jobs=None, working_memory=None, **kwds): @@ -1664,7 +1655,6 @@ def pairwise_distances_chunked(X, Y=None, *, reduce_func=None, yield D_chunk -@_deprecate_positional_args def pairwise_distances(X, Y=None, metric="euclidean", *, n_jobs=None, force_all_finite=True, **kwds): """Compute the distance matrix from a vector array X and optional Y. @@ -1887,7 +1877,6 @@ def kernel_metrics(): } -@_deprecate_positional_args def pairwise_kernels(X, Y=None, metric="linear", *, filter_params=False, n_jobs=None, **kwds): """Compute the kernel between arrays X and optional array Y. diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 7b634e88f2275..feed701f6cead 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -246,13 +246,13 @@ def test_precision_recall_f_binary_single_class(): assert 1. == precision_score([1, 1], [1, 1]) assert 1. == recall_score([1, 1], [1, 1]) assert 1. == f1_score([1, 1], [1, 1]) - assert 1. == fbeta_score([1, 1], [1, 1], 0) + assert 1. == fbeta_score([1, 1], [1, 1], beta=0) assert 0. == precision_score([-1, -1], [-1, -1]) assert 0. == recall_score([-1, -1], [-1, -1]) assert 0. == f1_score([-1, -1], [-1, -1]) - assert 0. == fbeta_score([-1, -1], [-1, -1], float('inf')) - assert fbeta_score([-1, -1], [-1, -1], float('inf')) == pytest.approx( + assert 0. == fbeta_score([-1, -1], [-1, -1], beta=float('inf')) + assert fbeta_score([-1, -1], [-1, -1], beta=float('inf')) == pytest.approx( fbeta_score([-1, -1], [-1, -1], beta=1e5)) diff --git a/sklearn/mixture/_bayesian_mixture.py b/sklearn/mixture/_bayesian_mixture.py index bd1954ddc15c8..b733c91baf99e 100644 --- a/sklearn/mixture/_bayesian_mixture.py +++ b/sklearn/mixture/_bayesian_mixture.py @@ -15,7 +15,6 @@ from ._gaussian_mixture import _estimate_gaussian_parameters from ._gaussian_mixture import _estimate_log_gaussian_prob from ..utils import check_array -from ..utils.validation import _deprecate_positional_args def _log_dirichlet_norm(dirichlet_concentration): @@ -325,7 +324,6 @@ class BayesianGaussianMixture(BaseMixture): inference for Dirichlet process mixtures". Bayesian analysis 1.1 `_ """ - @_deprecate_positional_args def __init__(self, *, n_components=1, covariance_type='full', tol=1e-3, reg_covar=1e-6, max_iter=100, n_init=1, init_params='kmeans', weight_concentration_prior_type='dirichlet_process', diff --git a/sklearn/mixture/_gaussian_mixture.py b/sklearn/mixture/_gaussian_mixture.py index 4bb14f9ca5bd7..777141be4feb8 100644 --- a/sklearn/mixture/_gaussian_mixture.py +++ b/sklearn/mixture/_gaussian_mixture.py @@ -11,7 +11,6 @@ from ._base import BaseMixture, _check_shape from ..utils import check_array from ..utils.extmath import row_norms -from ..utils.validation import _deprecate_positional_args ############################################################################### @@ -604,7 +603,6 @@ class GaussianMixture(BaseMixture): BayesianGaussianMixture : Gaussian mixture model fit with a variational inference. 
""" - @_deprecate_positional_args def __init__(self, n_components=1, *, covariance_type='full', tol=1e-3, reg_covar=1e-6, max_iter=100, n_init=1, init_params='kmeans', weights_init=None, means_init=None, precisions_init=None, diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index d4444ce09dcb5..07ad3d7dbafe5 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -37,7 +37,6 @@ from ..utils.random import sample_without_replacement from ..utils._tags import _safe_tags from ..utils.validation import indexable, check_is_fitted, _check_fit_params -from ..utils.validation import _deprecate_positional_args from ..utils.metaestimators import if_delegate_has_method from ..utils.fixes import delayed from ..metrics._scorer import _check_multimetric_scoring @@ -239,7 +238,6 @@ class ParameterSampler: ... {'b': 1.038159, 'a': 2}] True """ - @_deprecate_positional_args def __init__(self, param_distributions, n_iter, *, random_state=None): if not isinstance(param_distributions, (Mapping, Iterable)): raise TypeError('Parameter distribution is not a dict or ' @@ -340,7 +338,6 @@ class BaseSearchCV(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta): """ @abstractmethod - @_deprecate_positional_args def __init__(self, estimator, *, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score=np.nan, @@ -648,7 +645,6 @@ def _check_refit_for_multimetric(self, scores): and not callable(self.refit)): raise ValueError(multimetric_refit_msg) - @_deprecate_positional_args def fit(self, X, y=None, *, groups=None, **fit_params): """Run fit with all sets of parameters. @@ -1206,7 +1202,6 @@ class GridSearchCV(BaseSearchCV): """ _required_parameters = ["estimator", "param_grid"] - @_deprecate_positional_args def __init__(self, estimator, param_grid, *, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', @@ -1541,7 +1536,6 @@ class RandomizedSearchCV(BaseSearchCV): """ _required_parameters = ["estimator", "param_distributions"] - @_deprecate_positional_args def __init__(self, estimator, param_distributions, *, n_iter=10, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 13edbeef071f5..5eaeb5df5be8e 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -27,7 +27,6 @@ from ..utils import _approximate_mode from ..utils.validation import _num_samples, column_or_1d from ..utils.validation import check_array -from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import type_of_target from ..base import _pprint @@ -272,7 +271,6 @@ class _BaseKFold(BaseCrossValidator, metaclass=ABCMeta): """Base class for KFold, GroupKFold, and StratifiedKFold""" @abstractmethod - @_deprecate_positional_args def __init__(self, n_splits, *, shuffle, random_state): if not isinstance(n_splits, numbers.Integral): raise ValueError('The number of folds must be of Integral type. ' @@ -426,7 +424,6 @@ class KFold(_BaseKFold): RepeatedKFold : Repeats K-Fold n times. """ - @_deprecate_positional_args def __init__(self, n_splits=5, *, shuffle=False, random_state=None): super().__init__(n_splits=n_splits, shuffle=shuffle, @@ -635,7 +632,6 @@ class StratifiedKFold(_BaseKFold): -------- RepeatedStratifiedKFold : Repeats Stratified K-Fold n times. 
""" - @_deprecate_positional_args def __init__(self, n_splits=5, *, shuffle=False, random_state=None): super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state) @@ -1009,7 +1005,6 @@ class TimeSeriesSplit(_BaseKFold): with a test set of size ``n_samples//(n_splits + 1)`` by default, where ``n_samples`` is the number of samples. """ - @_deprecate_positional_args def __init__(self, n_splits=5, *, @@ -1339,7 +1334,6 @@ class _RepeatedSplits(metaclass=ABCMeta): Constructor parameters for cv. Must not contain random_state and shuffle. """ - @_deprecate_positional_args def __init__(self, cv, *, n_repeats=10, random_state=None, **cvargs): if not isinstance(n_repeats, numbers.Integral): raise ValueError("Number of repetitions must be of Integral type.") @@ -1467,7 +1461,6 @@ class RepeatedKFold(_RepeatedSplits): -------- RepeatedStratifiedKFold : Repeats Stratified K-Fold n times. """ - @_deprecate_positional_args def __init__(self, *, n_splits=5, n_repeats=10, random_state=None): super().__init__( KFold, n_repeats=n_repeats, @@ -1523,7 +1516,6 @@ class RepeatedStratifiedKFold(_RepeatedSplits): -------- RepeatedKFold : Repeats K-Fold n times. """ - @_deprecate_positional_args def __init__(self, *, n_splits=5, n_repeats=10, random_state=None): super().__init__( StratifiedKFold, n_repeats=n_repeats, random_state=random_state, @@ -1532,7 +1524,6 @@ def __init__(self, *, n_splits=5, n_repeats=10, random_state=None): class BaseShuffleSplit(metaclass=ABCMeta): """Base class for ShuffleSplit and StratifiedShuffleSplit""" - @_deprecate_positional_args def __init__(self, n_splits=10, *, test_size=None, train_size=None, random_state=None): self.n_splits = n_splits @@ -1666,7 +1657,6 @@ class ShuffleSplit(BaseShuffleSplit): TRAIN: [3 4 1] TEST: [5 2] TRAIN: [3 5 1] TEST: [2 4] """ - @_deprecate_positional_args def __init__(self, n_splits=10, *, test_size=None, train_size=None, random_state=None): super().__init__( @@ -1757,7 +1747,6 @@ class GroupShuffleSplit(ShuffleSplit): TRAIN: [2 3 4 5 6 7] TEST: [0 1] TRAIN: [0 1 5 6 7] TEST: [2 3 4] ''' - @_deprecate_positional_args def __init__(self, n_splits=5, *, test_size=None, train_size=None, random_state=None): super().__init__( @@ -1873,7 +1862,6 @@ class StratifiedShuffleSplit(BaseShuffleSplit): TRAIN: [4 1 0] TEST: [2 3 5] TRAIN: [0 5 1] TEST: [3 4 2] """ - @_deprecate_positional_args def __init__(self, n_splits=10, *, test_size=None, train_size=None, random_state=None): super().__init__( @@ -2206,7 +2194,6 @@ def split(self, X=None, y=None, groups=None): yield train, test -@_deprecate_positional_args def check_cv(cv=5, y=None, *, classifier=False): """Input checker utility for building a cross-validator diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 9765303a30b8d..e473db977bb30 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -24,7 +24,6 @@ from ..utils import indexable, check_random_state, _safe_indexing from ..utils.validation import _check_fit_params from ..utils.validation import _num_samples -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed from ..utils.metaestimators import _safe_split from ..metrics import check_scoring @@ -38,7 +37,6 @@ 'permutation_test_score', 'learning_curve', 'validation_curve'] -@_deprecate_positional_args def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', 
return_train_score=False, @@ -317,7 +315,6 @@ def _normalize_score_results(scores, scaler_score_key='score'): return {scaler_score_key: scores} -@_deprecate_positional_args def cross_val_score(estimator, X, y=None, *, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', error_score=np.nan): @@ -722,7 +719,6 @@ def _score(estimator, X_test, y_test, scorer, error_score="raise"): return scores -@_deprecate_positional_args def cross_val_predict(estimator, X, y=None, *, groups=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', method='predict'): @@ -1059,7 +1055,6 @@ def _check_is_permutation(indices, n_samples): return True -@_deprecate_positional_args def permutation_test_score(estimator, X, y, *, groups=None, cv=None, n_permutations=100, n_jobs=None, random_state=0, verbose=0, scoring=None, fit_params=None): @@ -1224,7 +1219,6 @@ def _shuffle(y, groups, random_state): return _safe_indexing(y, indices) -@_deprecate_positional_args def learning_curve(estimator, X, y, *, groups=None, train_sizes=np.linspace(0.1, 1.0, 5), cv=None, scoring=None, exploit_incremental_learning=False, @@ -1534,7 +1528,6 @@ def _incremental_fit_estimator(estimator, X, y, classes, train, test, return np.array(ret).T -@_deprecate_positional_args def validation_curve(estimator, X, y, *, param_name, param_range, groups=None, cv=None, scoring=None, n_jobs=None, pre_dispatch="all", verbose=0, error_score=np.nan, fit_params=None): diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index c66d8e1836ac9..98d173f141d96 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -697,15 +697,15 @@ def test_stratified_shuffle_split_init(): y = np.asarray([0, 1, 1, 1, 2, 2, 2]) # Check that error is raised if there is a class with only one sample with pytest.raises(ValueError): - next(StratifiedShuffleSplit(3, 0.2).split(X, y)) + next(StratifiedShuffleSplit(3, test_size=0.2).split(X, y)) # Check that error is raised if the test set size is smaller than n_classes with pytest.raises(ValueError): - next(StratifiedShuffleSplit(3, 2).split(X, y)) + next(StratifiedShuffleSplit(3, test_size=2).split(X, y)) # Check that error is raised if the train set size is smaller than # n_classes with pytest.raises(ValueError): - next(StratifiedShuffleSplit(3, 3, 2).split(X, y)) + next(StratifiedShuffleSplit(3, test_size=3, train_size=2).split(X, y)) X = np.arange(9) y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2]) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index da29fdd4daf11..d75556bf60ab4 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -51,7 +51,6 @@ from .utils.validation import _num_samples from .utils.validation import check_is_fitted from .utils.validation import check_X_y, check_array -from .utils.validation import _deprecate_positional_args from .utils.multiclass import (_check_partial_fit_first_call, check_classification_targets, _ovr_decision_function) @@ -245,7 +244,6 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin, sklearn.preprocessing.MultiLabelBinarizer : Transform iterable of iterables to binary indicator matrix. 
""" - @_deprecate_positional_args def __init__(self, estimator, *, n_jobs=None): self.estimator = estimator self.n_jobs = n_jobs @@ -609,7 +607,6 @@ class OneVsOneClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): >>> clf.predict(X_test[:10]) array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1]) """ - @_deprecate_positional_args def __init__(self, estimator, *, n_jobs=None): self.estimator = estimator self.n_jobs = n_jobs @@ -867,7 +864,6 @@ class OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): Hastie T., Tibshirani R., Friedman J., page 606 (second-edition) 2008. """ - @_deprecate_positional_args def __init__(self, estimator, *, code_size=1.5, random_state=None, n_jobs=None): self.estimator = estimator diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 9b64d28f41eb8..e78683fea3835 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -25,7 +25,7 @@ from .utils import check_array, check_X_y, check_random_state from .utils.metaestimators import if_delegate_has_method from .utils.validation import (check_is_fitted, has_fit_parameter, - _check_fit_params, _deprecate_positional_args) + _check_fit_params) from .utils.multiclass import check_classification_targets from .utils.fixes import delayed @@ -65,7 +65,6 @@ class _MultiOutputEstimator(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta): @abstractmethod - @_deprecate_positional_args def __init__(self, estimator, *, n_jobs=None): self.estimator = estimator self.n_jobs = n_jobs @@ -260,7 +259,6 @@ class MultiOutputRegressor(RegressorMixin, _MultiOutputEstimator): >>> clf.predict(X[[0]]) array([[176..., 35..., 57...]]) """ - @_deprecate_positional_args def __init__(self, estimator, *, n_jobs=None): super().__init__(estimator, n_jobs=n_jobs) @@ -339,7 +337,6 @@ class MultiOutputClassifier(ClassifierMixin, _MultiOutputEstimator): >>> clf.predict(X[-2:]) array([[1, 1, 0], [1, 1, 1]]) """ - @_deprecate_positional_args def __init__(self, estimator, *, n_jobs=None): super().__init__(estimator, n_jobs=n_jobs) @@ -440,7 +437,6 @@ def _more_tags(self): class _BaseChain(BaseEstimator, metaclass=ABCMeta): - @_deprecate_positional_args def __init__(self, base_estimator, *, order=None, cv=None, random_state=None): self.base_estimator = base_estimator diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 70f5993f98b1a..7e936ac3a0c8e 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -32,7 +32,6 @@ from .utils.multiclass import _check_partial_fit_first_call from .utils.validation import check_is_fitted, check_non_negative from .utils.validation import _check_sample_weight -from .utils.validation import _deprecate_positional_args __all__ = ['BernoulliNB', 'GaussianNB', 'MultinomialNB', 'ComplementNB', @@ -189,7 +188,6 @@ class labels known to the classifier [1] """ - @_deprecate_positional_args def __init__(self, *, priors=None, var_smoothing=1e-9): self.priors = priors self.var_smoothing = var_smoothing @@ -795,7 +793,6 @@ class MultinomialNB(_BaseDiscreteNB): https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html """ - @_deprecate_positional_args def __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None): self.alpha = alpha self.fit_prior = fit_prior @@ -920,7 +917,6 @@ class ComplementNB(_BaseDiscreteNB): https://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf """ - @_deprecate_positional_args def __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None, norm=False): self.alpha = alpha @@ -1047,7 +1043,6 @@ class 
BernoulliNB(_BaseDiscreteNB): naive Bayes -- Which naive Bayes? 3rd Conf. on Email and Anti-Spam (CEAS). """ - @_deprecate_positional_args def __init__(self, *, alpha=1.0, binarize=.0, fit_prior=True, class_prior=None): self.alpha = alpha @@ -1182,7 +1177,6 @@ class CategoricalNB(_BaseDiscreteNB): [3] """ - @_deprecate_positional_args def __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None, min_categories=None): self.alpha = alpha diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 83078e9f77ba9..29ab582c15ab9 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -17,7 +17,6 @@ from ._base import _check_weights, _get_weights from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin from ..base import ClassifierMixin -from ..utils.validation import _deprecate_positional_args class KNeighborsClassifier(KNeighborsMixin, @@ -144,7 +143,6 @@ class KNeighborsClassifier(KNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ - @_deprecate_positional_args def __init__(self, n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None): @@ -404,7 +402,6 @@ class RadiusNeighborsClassifier(RadiusNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ - @_deprecate_positional_args def __init__(self, radius=1.0, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', outlier_label=None, metric_params=None, n_jobs=None, diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index a23b45399f05c..7676d42d62c18 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -8,7 +8,7 @@ from ._base import NeighborsBase from ._unsupervised import NearestNeighbors from ..base import TransformerMixin -from ..utils.validation import check_is_fitted, _deprecate_positional_args +from ..utils.validation import check_is_fitted def _check_params(X, metric, p, metric_params): @@ -36,7 +36,6 @@ def _query_include_self(X, include_self, mode): return X -@_deprecate_positional_args def kneighbors_graph(X, n_neighbors, *, mode='connectivity', metric='minkowski', p=2, metric_params=None, include_self=False, n_jobs=None): @@ -113,7 +112,6 @@ def kneighbors_graph(X, n_neighbors, *, mode='connectivity', return X.kneighbors_graph(X=query, n_neighbors=n_neighbors, mode=mode) -@_deprecate_positional_args def radius_neighbors_graph(X, radius, *, mode='connectivity', metric='minkowski', p=2, metric_params=None, include_self=False, n_jobs=None): @@ -300,7 +298,6 @@ class KNeighborsTransformer(KNeighborsMixin, ... KNeighborsTransformer(n_neighbors=5, mode='distance'), ... Isomap(neighbors_algorithm='precomputed')) """ - @_deprecate_positional_args def __init__(self, *, mode='distance', n_neighbors=5, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=1): @@ -483,7 +480,6 @@ class RadiusNeighborsTransformer(RadiusNeighborsMixin, ... RadiusNeighborsTransformer(radius=42.0, mode='distance'), ... 
DBSCAN(min_samples=30, metric='precomputed')) """ - @_deprecate_positional_args def __init__(self, *, mode='distance', radius=1., algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=1): diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 5a5ad55d3261c..816b023e0f23e 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -9,7 +9,6 @@ from ..base import BaseEstimator from ..utils import check_random_state from ..utils.validation import _check_sample_weight, check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..utils.extmath import row_norms from ._ball_tree import BallTree, DTYPE @@ -94,7 +93,6 @@ class KernelDensity(BaseEstimator): >>> log_density array([-1.52955942, -1.51462041, -1.60244657]) """ - @_deprecate_positional_args def __init__(self, *, bandwidth=1.0, algorithm='auto', kernel='gaussian', metric="euclidean", atol=0, rtol=0, breadth_first=True, leaf_size=40, metric_params=None): diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py index 29bf1a5e73f91..941b9de781f9a 100644 --- a/sklearn/neighbors/_lof.py +++ b/sklearn/neighbors/_lof.py @@ -10,7 +10,6 @@ from ..base import OutlierMixin from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..utils import check_array __all__ = ["LocalOutlierFactor"] @@ -177,7 +176,6 @@ class LocalOutlierFactor(KNeighborsMixin, .. [1] Breunig, M. M., Kriegel, H. P., Ng, R. T., & Sander, J. (2000, May). LOF: identifying density-based local outliers. In ACM sigmod record. """ - @_deprecate_positional_args def __init__(self, n_neighbors=20, *, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, contamination="auto", novelty=False, n_jobs=None): diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index a4ef02b687d97..5951b66ea7dbf 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -23,7 +23,6 @@ from ..utils.multiclass import check_classification_targets from ..utils.random import check_random_state from ..utils.validation import check_is_fitted, check_array, check_scalar -from ..utils.validation import _deprecate_positional_args from ..exceptions import ConvergenceWarning @@ -162,7 +161,6 @@ class NeighborhoodComponentsAnalysis(TransformerMixin, BaseEstimator): """ - @_deprecate_positional_args def __init__(self, n_components=None, *, init='auto', warm_start=False, max_iter=50, tol=1e-5, callback=None, verbose=0, random_state=None): diff --git a/sklearn/neighbors/_nearest_centroid.py b/sklearn/neighbors/_nearest_centroid.py index 0c726cdc0a62c..c5f6a612b0395 100644 --- a/sklearn/neighbors/_nearest_centroid.py +++ b/sklearn/neighbors/_nearest_centroid.py @@ -16,7 +16,6 @@ from ..metrics.pairwise import pairwise_distances from ..preprocessing import LabelEncoder from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..utils.sparsefuncs import csc_median_axis_0 from ..utils.multiclass import check_classification_targets @@ -86,7 +85,6 @@ class NearestCentroid(ClassifierMixin, BaseEstimator): """ - @_deprecate_positional_args def __init__(self, metric='euclidean', *, shrink_threshold=None): self.metric = metric self.shrink_threshold = shrink_threshold diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 62d6cf33575e4..96beb1ee022af 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -17,7 +17,6 @@ from 
._base import _get_weights, _check_weights from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin from ..base import RegressorMixin -from ..utils.validation import _deprecate_positional_args from ..utils.deprecation import deprecated @@ -143,7 +142,6 @@ class KNeighborsRegressor(KNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ - @_deprecate_positional_args def __init__(self, n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None): @@ -342,7 +340,6 @@ class RadiusNeighborsRegressor(RadiusNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ - @_deprecate_positional_args def __init__(self, radius=1.0, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None): diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index a6af48d9ed341..0f14c56e8bac2 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -2,7 +2,6 @@ from ._base import NeighborsBase from ._base import KNeighborsMixin from ._base import RadiusNeighborsMixin -from ..utils.validation import _deprecate_positional_args class NearestNeighbors(KNeighborsMixin, @@ -111,7 +110,6 @@ class NearestNeighbors(KNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm """ - @_deprecate_positional_args def __init__(self, *, n_neighbors=5, radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=None): diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index e349dfd844f96..72120ad369275 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -25,7 +25,7 @@ from ..utils import column_or_1d from ..exceptions import ConvergenceWarning from ..utils.extmath import safe_sparse_dot -from ..utils.validation import check_is_fitted, _deprecate_positional_args +from ..utils.validation import check_is_fitted from ..utils.multiclass import _check_partial_fit_first_call, unique_labels from ..utils.multiclass import type_of_target from ..utils.optimize import _check_optimize_result @@ -943,7 +943,6 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): Kingma, Diederik, and Jimmy Ba. "Adam: A method for stochastic optimization." arXiv preprint arXiv:1412.6980 (2014). """ - @_deprecate_positional_args def __init__(self, hidden_layer_sizes=(100,), activation="relu", *, solver='adam', alpha=0.0001, batch_size='auto', learning_rate="constant", @@ -1366,7 +1365,6 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): Kingma, Diederik, and Jimmy Ba. "Adam: A method for stochastic optimization." arXiv preprint arXiv:1412.6980 (2014). 
""" - @_deprecate_positional_args def __init__(self, hidden_layer_sizes=(100,), activation="relu", *, solver='adam', alpha=0.0001, batch_size='auto', learning_rate="constant", diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index 7aa64c503bb21..b69a2c496a2c9 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -20,7 +20,7 @@ from ..utils import gen_even_slices from ..utils.extmath import safe_sparse_dot from ..utils.extmath import log_logistic -from ..utils.validation import check_is_fitted, _deprecate_positional_args +from ..utils.validation import check_is_fitted class BernoulliRBM(TransformerMixin, BaseEstimator): @@ -106,7 +106,6 @@ class BernoulliRBM(TransformerMixin, BaseEstimator): Approximations to the Likelihood Gradient. International Conference on Machine Learning (ICML) 2008 """ - @_deprecate_positional_args def __init__(self, n_components=256, *, learning_rate=0.1, batch_size=10, n_iter=10, verbose=0, random_state=None): self.n_components = n_components diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 024bfe4f1dd38..e2ff6806ff3da 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -26,7 +26,6 @@ from .utils.deprecation import deprecated from .utils._tags import _safe_tags from .utils.validation import check_memory -from .utils.validation import _deprecate_positional_args from .utils.fixes import delayed from .utils.metaestimators import _BaseComposition @@ -110,7 +109,6 @@ class Pipeline(_BaseComposition): # BaseEstimator interface _required_parameters = ['steps'] - @_deprecate_positional_args def __init__(self, steps, *, memory=None, verbose=False): self.steps = steps self.memory = memory @@ -846,7 +844,6 @@ class FeatureUnion(TransformerMixin, _BaseComposition): """ _required_parameters = ["transformer_list"] - @_deprecate_positional_args def __init__(self, transformer_list, *, n_jobs=None, transformer_weights=None, verbose=False): self.transformer_list = transformer_list diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index befd3e61b96fc..393693fc87d2d 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -28,7 +28,7 @@ min_max_axis) from ..utils.validation import (check_is_fitted, check_random_state, _check_sample_weight, - FLOAT_DTYPES, _deprecate_positional_args) + FLOAT_DTYPES) from ._encoders import OneHotEncoder @@ -106,7 +106,6 @@ def _handle_zeros_in_scale(scale, copy=True, constant_mask=None): return scale -@_deprecate_positional_args def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True): """Standardize a dataset along any axis. @@ -344,7 +343,6 @@ class MinMaxScaler(TransformerMixin, BaseEstimator): `. """ - @_deprecate_positional_args def __init__(self, feature_range=(0, 1), *, copy=True, clip=False): self.feature_range = feature_range self.copy = copy @@ -492,7 +490,6 @@ def _more_tags(self): return {'allow_nan': True} -@_deprecate_positional_args def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True): """Transform features by scaling each feature to a given range. @@ -707,7 +704,6 @@ class StandardScaler(TransformerMixin, BaseEstimator): `. """ # noqa - @_deprecate_positional_args def __init__(self, *, copy=True, with_mean=True, with_std=True): self.with_mean = with_mean self.with_std = with_std @@ -1026,7 +1022,6 @@ class MaxAbsScaler(TransformerMixin, BaseEstimator): `. 
""" - @_deprecate_positional_args def __init__(self, *, copy=True): self.copy = copy @@ -1161,7 +1156,6 @@ def _more_tags(self): return {'allow_nan': True} -@_deprecate_positional_args def maxabs_scale(X, *, axis=0, copy=True): """Scale each feature to the [-1, 1] range without breaking the sparsity. @@ -1337,7 +1331,6 @@ class RobustScaler(TransformerMixin, BaseEstimator): https://en.wikipedia.org/wiki/Median https://en.wikipedia.org/wiki/Interquartile_range """ - @_deprecate_positional_args def __init__(self, *, with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True, unit_variance=False): self.with_centering = with_centering @@ -1471,7 +1464,6 @@ def _more_tags(self): return {'allow_nan': True} -@_deprecate_positional_args def robust_scale(X, *, axis=0, with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True, unit_variance=False): """Standardize a dataset along any axis @@ -1579,7 +1571,6 @@ def robust_scale(X, *, axis=0, with_centering=True, with_scaling=True, return X -@_deprecate_positional_args def normalize(X, norm='l2', *, axis=1, copy=True, return_norm=False): """Scale input vectors individually to unit norm (vector length). @@ -1738,7 +1729,6 @@ class Normalizer(TransformerMixin, BaseEstimator): normalize : Equivalent function without the estimator API. """ - @_deprecate_positional_args def __init__(self, norm='l2', *, copy=True): self.norm = norm self.copy = copy @@ -1790,7 +1780,6 @@ def _more_tags(self): return {'stateless': True} -@_deprecate_positional_args def binarize(X, *, threshold=0.0, copy=True): """Boolean thresholding of array-like or scipy.sparse matrix. @@ -1894,7 +1883,6 @@ class Binarizer(TransformerMixin, BaseEstimator): binarize : Equivalent function without the estimator API. """ - @_deprecate_positional_args def __init__(self, *, threshold=0.0, copy=True): self.threshold = threshold self.copy = copy @@ -2241,7 +2229,6 @@ class QuantileTransformer(TransformerMixin, BaseEstimator): `. """ - @_deprecate_positional_args def __init__(self, *, n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False, subsample=int(1e5), random_state=None, copy=True): @@ -2560,7 +2547,6 @@ def _more_tags(self): return {'allow_nan': True} -@_deprecate_positional_args def quantile_transform(X, *, axis=0, n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False, @@ -2779,7 +2765,6 @@ class PowerTransformer(TransformerMixin, BaseEstimator): .. [2] G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the Royal Statistical Society B, 26, 211-252 (1964). 
""" - @_deprecate_positional_args def __init__(self, method='yeo-johnson', *, standardize=True, copy=True): self.method = method self.standardize = standardize @@ -3057,7 +3042,6 @@ def _more_tags(self): return {'allow_nan': True} -@_deprecate_positional_args def power_transform(X, method='yeo-johnson', *, standardize=True, copy=True): """ Power transforms are a family of parametric, monotonic transformations diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 9ce95a97544a5..d7565ff2fb4b3 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -15,7 +15,6 @@ from ..base import BaseEstimator, TransformerMixin from ..utils.validation import check_array from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args class KBinsDiscretizer(TransformerMixin, BaseEstimator): @@ -125,7 +124,6 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): """ - @_deprecate_positional_args def __init__(self, n_bins=5, *, encode='onehot', strategy='quantile', dtype=None): self.n_bins = n_bins diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index ba1d48df175ee..385b4ed83d3eb 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -10,7 +10,6 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array, is_scalar_nan from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..utils._mask import _get_mask from ..utils._encode import _encode, _check_unknown, _unique @@ -330,7 +329,6 @@ class OneHotEncoder(_BaseEncoder): [1., 0., 1., 0.]]) """ - @_deprecate_positional_args def __init__(self, *, categories='auto', drop=None, sparse=True, dtype=np.float64, handle_unknown='error'): self.categories = categories @@ -741,7 +739,6 @@ class OrdinalEncoder(_BaseEncoder): ['Female', 2]], dtype=object) """ - @_deprecate_positional_args def __init__(self, *, categories='auto', dtype=np.float64, handle_unknown='error', unknown_value=None): self.categories = categories diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index ca176aeb87a10..25975add1baf2 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -2,7 +2,6 @@ from ..base import BaseEstimator, TransformerMixin from ..utils.validation import _allclose_dense_sparse -from ..utils.validation import _deprecate_positional_args def _identity(X): @@ -84,7 +83,6 @@ class FunctionTransformer(TransformerMixin, BaseEstimator): [1.0986..., 1.3862...]]) """ - @_deprecate_positional_args def __init__(self, func=None, inverse_func=None, *, validate=False, accept_sparse=False, check_inverse=True, kw_args=None, inv_kw_args=None): diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index 2b43dfffe716d..d07b7997ad36a 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -21,7 +21,6 @@ from ..utils.validation import check_array from ..utils.validation import check_is_fitted from ..utils.validation import _num_samples -from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import unique_labels from ..utils.multiclass import type_of_target from ..utils._encode import _encode, _unique @@ -257,7 +256,6 @@ class LabelBinarizer(TransformerMixin, BaseEstimator): scheme. 
""" - @_deprecate_positional_args def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False): if neg_label >= pos_label: raise ValueError("neg_label={0} must be strictly less than " @@ -406,7 +404,6 @@ def _more_tags(self): return {'X_types': ['1dlabels']} -@_deprecate_positional_args def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False): """Binarize labels in a one-vs-all fashion. @@ -720,7 +717,6 @@ class MultiLabelBinarizer(TransformerMixin, BaseEstimator): scheme. """ - @_deprecate_positional_args def __init__(self, *, classes=None, sparse_output=False): self.classes = classes self.sparse_output = sparse_output diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index d1ec49d7539bf..44ac0d2175c4c 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -13,8 +13,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array from ..utils.fixes import linspace -from ..utils.validation import (check_is_fitted, FLOAT_DTYPES, - _deprecate_positional_args) +from ..utils.validation import check_is_fitted, FLOAT_DTYPES from ._csr_polynomial_expansion import _csr_polynomial_expansion @@ -99,7 +98,6 @@ class PolynomialFeatures(TransformerMixin, BaseEstimator): See :ref:`examples/linear_model/plot_polynomial_interpolation.py ` """ - @_deprecate_positional_args def __init__(self, degree=2, *, interaction_only=False, include_bias=True, order='C'): self.degree = degree diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 8e968088e8141..06e4839e50eca 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -39,7 +39,6 @@ from .utils.extmath import safe_sparse_dot from .utils.random import sample_without_replacement from .utils.validation import check_is_fitted -from .utils.validation import _deprecate_positional_args from .exceptions import DataDimensionalityWarning @@ -48,7 +47,6 @@ "johnson_lindenstrauss_min_dim"] -@_deprecate_positional_args def johnson_lindenstrauss_min_dim(n_samples, *, eps=0.1): """Find a 'safe' number of components to randomly project to. @@ -477,7 +475,6 @@ class GaussianRandomProjection(BaseRandomProjection): SparseRandomProjection """ - @_deprecate_positional_args def __init__(self, n_components='auto', *, eps=0.1, random_state=None): super().__init__( n_components=n_components, @@ -618,7 +615,6 @@ class SparseRandomProjection(BaseRandomProjection): https://users.soe.ucsc.edu/~optas/papers/jl.pdf """ - @_deprecate_positional_args def __init__(self, n_components='auto', *, density='auto', eps=0.1, dense_output=False, random_state=None): super().__init__( diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index 8ba99b9603e05..e89dfab9310ab 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -68,7 +68,6 @@ from ..utils.extmath import safe_sparse_dot from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..exceptions import ConvergenceWarning @@ -106,7 +105,6 @@ class BaseLabelPropagation(ClassifierMixin, BaseEstimator, metaclass=ABCMeta): for more details. 
""" - @_deprecate_positional_args def __init__(self, kernel='rbf', *, gamma=20, n_neighbors=7, alpha=1, max_iter=30, tol=1e-3, n_jobs=None): @@ -382,7 +380,6 @@ class LabelPropagation(BaseLabelPropagation): _variant = 'propagation' - @_deprecate_positional_args def __init__(self, kernel='rbf', *, gamma=20, n_neighbors=7, max_iter=1000, tol=1e-3, n_jobs=None): super().__init__(kernel=kernel, gamma=gamma, @@ -496,7 +493,6 @@ class LabelSpreading(BaseLabelPropagation): _variant = 'spreading' - @_deprecate_positional_args def __init__(self, kernel='rbf', *, gamma=20, n_neighbors=7, alpha=0.2, max_iter=30, tol=1e-3, n_jobs=None): diff --git a/sklearn/svm/_bounds.py b/sklearn/svm/_bounds.py index b35728041f6cf..97cbd6d5be355 100644 --- a/sklearn/svm/_bounds.py +++ b/sklearn/svm/_bounds.py @@ -6,11 +6,9 @@ from ..preprocessing import LabelBinarizer from ..utils.validation import check_consistent_length, check_array -from ..utils.validation import _deprecate_positional_args from ..utils.extmath import safe_sparse_dot -@_deprecate_positional_args def l1_min_c(X, y, *, loss='squared_hinge', fit_intercept=True, intercept_scaling=1.0): """ diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index b151f5267da50..050855c25c06a 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -5,7 +5,6 @@ from ..linear_model._base import LinearClassifierMixin, SparseCoefMixin, \ LinearModel from ..utils.validation import _num_samples -from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import check_classification_targets @@ -178,7 +177,6 @@ class LinearSVC(LinearClassifierMixin, >>> print(clf.predict([[0, 0, 0, 0]])) [1] """ - @_deprecate_positional_args def __init__(self, penalty='l2', loss='squared_hinge', *, dual=True, tol=1e-4, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, @@ -373,7 +371,6 @@ class LinearSVR(RegressorMixin, LinearModel): various loss functions and regularization regimes. 
""" - @_deprecate_positional_args def __init__(self, *, epsilon=0.0, tol=1e-4, C=1.0, loss='epsilon_insensitive', fit_intercept=True, intercept_scaling=1., dual=True, verbose=0, @@ -645,7 +642,6 @@ class SVC(BaseSVC): _impl = 'c_svc' - @_deprecate_positional_args def __init__(self, *, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=1e-3, cache_size=200, class_weight=None, @@ -866,7 +862,6 @@ class NuSVC(BaseSVC): _impl = 'nu_svc' - @_deprecate_positional_args def __init__(self, *, nu=0.5, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=1e-3, cache_size=200, class_weight=None, verbose=False, max_iter=-1, @@ -1033,7 +1028,6 @@ class SVR(RegressorMixin, BaseLibSVM): _impl = 'epsilon_svr' - @_deprecate_positional_args def __init__(self, *, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=1e-3, C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1): @@ -1186,7 +1180,6 @@ class NuSVR(RegressorMixin, BaseLibSVM): _impl = 'nu_svr' - @_deprecate_positional_args def __init__(self, *, nu=0.5, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, tol=1e-3, cache_size=200, verbose=False, max_iter=-1): @@ -1325,7 +1318,6 @@ class OneClassSVM(OutlierMixin, BaseLibSVM): _impl = 'one_class' - @_deprecate_positional_args def __init__(self, *, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=1e-3, nu=0.5, shrinking=True, cache_size=200, verbose=False, max_iter=-1): diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index de5aebfa8a6e3..a79a850f3b7c7 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -36,7 +36,6 @@ from ..utils import compute_sample_weight from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ._criterion import Criterion from ._splitter import Splitter @@ -89,7 +88,6 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): """ @abstractmethod - @_deprecate_positional_args def __init__(self, *, criterion, splitter, @@ -851,7 +849,6 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): array([ 1. , 0.93..., 0.86..., 0.93..., 0.93..., 0.93..., 0.93..., 1. , 0.93..., 1. ]) """ - @_deprecate_positional_args def __init__(self, *, criterion="gini", splitter="best", @@ -1212,7 +1209,6 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): array([-0.39..., -0.46..., 0.02..., 0.06..., -0.50..., 0.16..., 0.11..., -0.73..., -0.30..., -0.00...]) """ - @_deprecate_positional_args def __init__(self, *, criterion="squared_error", splitter="best", @@ -1525,7 +1521,6 @@ class ExtraTreeClassifier(DecisionTreeClassifier): >>> cls.score(X_test, y_test) 0.8947... """ - @_deprecate_positional_args def __init__(self, *, criterion="gini", splitter="random", @@ -1756,7 +1751,6 @@ class ExtraTreeRegressor(DecisionTreeRegressor): >>> reg.score(X_test, y_test) 0.33... """ - @_deprecate_positional_args def __init__(self, *, criterion="squared_error", splitter="random", diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index 17680db2b855d..a9763128c3a7e 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -17,7 +17,6 @@ import numpy as np from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..base import is_classifier from . 
import _criterion @@ -76,7 +75,6 @@ def __repr__(self): SENTINEL = Sentinel() -@_deprecate_positional_args def plot_tree(decision_tree, *, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, impurity=True, node_ids=False, proportion=False, rounded=False, precision=3, @@ -648,7 +646,6 @@ def recurse(self, node, tree, ax, scale_x, scale_y, height, depth=0): ax.annotate("\n (...) \n", xy_parent, xy, **kwargs) -@_deprecate_positional_args def export_graphviz(decision_tree, out_file=None, *, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, leaves_parallel=False, impurity=True, @@ -804,7 +801,6 @@ def compute_depth_(current_node, current_depth, return max(depths) -@_deprecate_positional_args def export_text(decision_tree, *, feature_names=None, max_depth=10, spacing=3, decimals=2, show_weights=False): """Build a text report showing the rules of a decision tree. diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index a6e30a9941756..4cb6f85fb5d0a 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -63,7 +63,7 @@ "ExtraTreeRegressor": ExtraTreeRegressor, } -ALL_TREES = dict() +ALL_TREES: dict = dict() ALL_TREES.update(CLF_TREES) ALL_TREES.update(REG_TREES) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 972d56f66d900..c1f7c2e641502 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -30,8 +30,7 @@ assert_all_finite, check_random_state, column_or_1d, check_array, check_consistent_length, check_X_y, indexable, - check_symmetric, check_scalar, - _deprecate_positional_args) + check_symmetric, check_scalar) from .. import get_config @@ -632,7 +631,6 @@ def shuffle(*arrays, random_state=None, n_samples=None): random_state=random_state) -@_deprecate_positional_args def safe_sqr(X, *, copy=True): """Element wise squaring of array-likes and sparse matrices. @@ -672,7 +670,6 @@ def _chunk_generator(gen, chunksize): return -@_deprecate_positional_args def gen_batches(n, batch_size, *, min_batch_size=0): """Generator to create slices containing batch_size elements, from 0 to n. @@ -726,7 +723,6 @@ def gen_batches(n, batch_size, *, min_batch_size=0): yield slice(start, n) -@_deprecate_positional_args def gen_even_slices(n, n_packs, *, n_samples=None): """Generator to create n_packs slices going up to n. @@ -914,7 +910,6 @@ def _print_elapsed_time(source, message=None): timeit.default_timer() - start)) -@_deprecate_positional_args def get_chunk_n_rows(row_bytes, *, max_n_rows=None, working_memory=None): """Calculates how many rows can be processed within working_memory. diff --git a/sklearn/utils/class_weight.py b/sklearn/utils/class_weight.py index 47d1dd25860dd..0daebccd51322 100644 --- a/sklearn/utils/class_weight.py +++ b/sklearn/utils/class_weight.py @@ -4,10 +4,7 @@ import numpy as np -from .validation import _deprecate_positional_args - -@_deprecate_positional_args def compute_class_weight(class_weight, *, classes, y): """Estimate class weights for unbalanced datasets. @@ -72,7 +69,6 @@ def compute_class_weight(class_weight, *, classes, y): return weight -@_deprecate_positional_args def compute_sample_weight(class_weight, y, *, indices=None): """Estimate sample weights by class for unbalanced datasets. 
diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index c72c54bd1aa4d..13d24486cbc79 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -21,7 +21,6 @@ from .fixes import np_version, parse_version from .sparsefuncs_fast import csr_row_norms from .validation import check_array -from .validation import _deprecate_positional_args def squared_norm(x): @@ -116,7 +115,6 @@ def density(w, **kwargs): return d -@_deprecate_positional_args def safe_sparse_dot(a, b, *, dense_output=False): """Dot product that handle the sparse matrix case correctly. @@ -158,7 +156,6 @@ def safe_sparse_dot(a, b, *, dense_output=False): return ret -@_deprecate_positional_args def randomized_range_finder(A, *, size, n_iter, power_iteration_normalizer='auto', random_state=None): @@ -243,7 +240,6 @@ def randomized_range_finder(A, *, size, n_iter, return Q -@_deprecate_positional_args def randomized_svd(M, n_components, *, n_oversamples=10, n_iter='auto', power_iteration_normalizer='auto', transpose='auto', flip_sign=True, random_state='warn'): @@ -409,7 +405,6 @@ def randomized_svd(M, n_components, *, n_oversamples=10, n_iter='auto', return U[:, :n_components], s[:n_components], Vt[:n_components, :] -@_deprecate_positional_args def _randomized_eigsh(M, n_components, *, n_oversamples=10, n_iter='auto', power_iteration_normalizer='auto', selection='module', random_state=None): @@ -555,7 +550,6 @@ def _randomized_eigsh(M, n_components, *, n_oversamples=10, n_iter='auto', return eigvals, eigvecs -@_deprecate_positional_args def weighted_mode(a, w, *, axis=0): """Returns an array of the weighted modal (most common) value in a. diff --git a/sklearn/utils/graph.py b/sklearn/utils/graph.py index b98fd6ac0baa0..8d5d6782b46f4 100644 --- a/sklearn/utils/graph.py +++ b/sklearn/utils/graph.py @@ -13,13 +13,11 @@ from scipy import sparse from .graph_shortest_path import graph_shortest_path # noqa -from .validation import _deprecate_positional_args ############################################################################### # Path and connected component analysis. # Code adapted from networkx -@_deprecate_positional_args def single_source_shortest_path_length(graph, source, *, cutoff=None): """Return the shortest path length from source to all reachable nodes. diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py index fcd7a3f3fe54e..3f85fc39e3053 100644 --- a/sklearn/utils/sparsefuncs.py +++ b/sklearn/utils/sparsefuncs.py @@ -5,7 +5,6 @@ # License: BSD 3 clause import scipy.sparse as sp import numpy as np -from .validation import _deprecate_positional_args from .sparsefuncs_fast import ( csr_mean_variance_axis0 as _csr_mean_var_axis0, @@ -120,7 +119,6 @@ def mean_variance_axis(X, axis, weights=None, return_sum_weights=False): _raise_typeerror(X) -@_deprecate_positional_args def incr_mean_variance_axis(X, *, axis, last_mean, last_var, last_n, weights=None): """Compute incremental mean and variance along an axis on a CSR or diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 536d585caa8b7..acfc8f5d10db2 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -32,7 +32,7 @@ FLOAT_DTYPES = (np.float64, np.float32, np.float16) -def _deprecate_positional_args(func=None, *, version="1.0 (renaming of 0.25)"): +def _deprecate_positional_args(func=None, *, version="1.1 (renaming of 0.26)"): """Decorator for methods that issues warnings for positional arguments. 
Using the keyword-only argument syntax in pep 3102, arguments after the @@ -42,7 +42,7 @@ def _deprecate_positional_args(func=None, *, version="1.0 (renaming of 0.25)"): ---------- func : callable, default=None Function to check arguments on. - version : callable, default="1.0 (renaming of 0.25)" + version : callable, default="1.1 (renaming of 0.26)" The version when positional arguments will result in error. """ def _inner_deprecate_positional_args(f): @@ -111,7 +111,6 @@ def _assert_all_finite(X, allow_nan=False, msg_dtype=None): raise ValueError("Input contains NaN") -@_deprecate_positional_args def assert_all_finite(X, *, allow_nan=False): """Throw a ValueError if X contains NaN or infinity. @@ -124,7 +123,6 @@ def assert_all_finite(X, *, allow_nan=False): _assert_all_finite(X.data if sp.issparse(X) else X, allow_nan) -@_deprecate_positional_args def as_float_array(X, *, copy=True, force_all_finite=True): """Converts an array-like to an array of floats. @@ -458,7 +456,6 @@ def _ensure_no_complex_data(array): "{}\n".format(array)) -@_deprecate_positional_args def check_array(array, accept_sparse=False, *, accept_large_sparse=True, dtype="numeric", order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_samples=1, @@ -761,7 +758,6 @@ def _check_large_sparse(X, accept_large_sparse=False): % indices_datatype) -@_deprecate_positional_args def check_X_y(X, y, accept_sparse=False, *, accept_large_sparse=True, dtype="numeric", order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=False, @@ -890,7 +886,6 @@ def check_X_y(X, y, accept_sparse=False, *, accept_large_sparse=True, return X, y -@_deprecate_positional_args def column_or_1d(y, *, warn=False): """ Ravel column or 1d numpy array, else raises an error. @@ -971,7 +966,6 @@ def has_fit_parameter(estimator, parameter): return parameter in signature(estimator.fit).parameters -@_deprecate_positional_args def check_symmetric(array, *, tol=1E-10, raise_warning=True, raise_exception=False): """Make sure that array is 2D, square and symmetric. @@ -1031,7 +1025,6 @@ def check_symmetric(array, *, tol=1E-10, raise_warning=True, return array -@_deprecate_positional_args def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all): """Perform is_fitted validation for estimator. From 5073d692f04dea88d595252a6cc0382509b6947d Mon Sep 17 00:00:00 2001 From: groceryheist Date: Fri, 14 May 2021 15:40:16 -0700 Subject: [PATCH 398/478] DOC Clarify wording in ensemble.rst (#20094) --- doc/modules/ensemble.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 21610228b9b37..91fc892f79d0a 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -761,12 +761,12 @@ the parameter ``loss``: * Classification - * Binomial deviance (``'deviance'``): The negative binomial - log-likelihood loss function for binary classification (provides + * Binomial deviance (``'deviance'``): The binomial + negative log-likelihood loss function for binary classification (provides probability estimates). The initial model is given by the log odds-ratio. - * Multinomial deviance (``'deviance'``): The negative multinomial - log-likelihood loss function for multi-class classification with + * Multinomial deviance (``'deviance'``): The multinomial + negative log-likelihood loss function for multi-class classification with ``n_classes`` mutually exclusive classes. It provides probability estimates. 
The initial model is given by the prior probability of each class. At each iteration ``n_classes`` From d73822f84f2832dcc25f0ff58769f60871a78025 Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Sun, 16 May 2021 06:49:42 -0700 Subject: [PATCH 399/478] DOC Add notes about the location of function body. (#20095) --- sklearn/svm/src/libsvm/svm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/svm/src/libsvm/svm.h b/sklearn/svm/src/libsvm/svm.h index 0e509c61c37ed..a1634119858f1 100644 --- a/sklearn/svm/src/libsvm/svm.h +++ b/sklearn/svm/src/libsvm/svm.h @@ -118,7 +118,7 @@ struct svm_csr_model /* 0 if svm_model is created by svm_train */ }; - +/* svm_ functions are defined by libsvm_template.cpp from generic versions in svm.cpp */ struct svm_model *svm_train(const struct svm_problem *prob, const struct svm_parameter *param, int *status, BlasFunctions *blas_functions); void svm_cross_validation(const struct svm_problem *prob, const struct svm_parameter *param, int nr_fold, double *target, BlasFunctions *blas_functions); @@ -145,6 +145,7 @@ void svm_set_print_string_function(void (*print_func)(const char *)); /* sparse version */ +/* svm_csr_ functions are defined by libsvm_template.cpp from generic versions in svm.cpp */ struct svm_csr_model *svm_csr_train(const struct svm_csr_problem *prob, const struct svm_parameter *param, int *status, BlasFunctions *blas_functions); void svm_csr_cross_validation(const struct svm_csr_problem *prob, const struct svm_parameter *param, int nr_fold, double *target, BlasFunctions *blas_functions); From 29e21a33ad61ecc840c816bfd0a29921e2b64940 Mon Sep 17 00:00:00 2001 From: tom1092 <33375092+tom1092@users.noreply.github.com> Date: Mon, 17 May 2021 10:47:09 +0200 Subject: [PATCH 400/478] DOC Update hinge loss function in SVM (#20077) Co-authored-by: Guillaume Lemaitre --- doc/modules/svm.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst index 57d2cfb3cb7a7..fcf1d3e23976b 100644 --- a/doc/modules/svm.rst +++ b/doc/modules/svm.rst @@ -677,7 +677,7 @@ The primal problem can be equivalently formulated as .. math:: - \min_ {w, b} \frac{1}{2} w^T w + C \sum_{i=1}\max(0, y_i (w^T \phi(x_i) + b)), + \min_ {w, b} \frac{1}{2} w^T w + C \sum_{i=1}\max(0, 1 - y_i (w^T \phi(x_i) + b)), where we make use of the `hinge loss `_. This is the form that is From 053d2d1af477d9dc17e69162b9f2298c0fda5905 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Mon, 17 May 2021 04:52:34 -0400 Subject: [PATCH 401/478] CI Uses minimum version for doc-min-dependencies (#20057) --- .circleci/config.yml | 6 ++++++ build_tools/circle/build_doc.sh | 6 +++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index bc4acd8a35fcb..b407e8b15dd38 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -16,6 +16,9 @@ jobs: - SCIKIT_IMAGE_VERSION: 'min' - SPHINX_VERSION: 'min' - PANDAS_VERSION: 'min' + - SPHINX_GALLERY_VERSION: 'min' + - NUMPYDOC_VERSION: 'min' + - SPHINX_PROMPT_VERSION: 'min' steps: - checkout - run: ./build_tools/circle/checkout_merge_commit.sh @@ -57,6 +60,9 @@ jobs: - SCIKIT_IMAGE_VERSION: 'latest' - SPHINX_VERSION: 'min' - PANDAS_VERSION: 'latest' + - SPHINX_GALLERY_VERSION: 'latest' + - NUMPYDOC_VERSION: 'latest' + - SPHINX_PROMPT_VERSION: 'latest' steps: - checkout - run: ./build_tools/circle/checkout_merge_commit.sh diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index 37afb1841d368..563d09fc0b7bd 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -177,9 +177,9 @@ conda create -n $CONDA_ENV_NAME --yes --quiet \ joblib memory_profiler packaging seaborn pillow pytest coverage source activate testenv -pip install sphinx-gallery -pip install numpydoc -pip install sphinx-prompt +pip install "$(get_dep sphinx-gallery $SPHINX_GALLERY_VERSION)" +pip install "$(get_dep numpydoc $NUMPYDOC_VERSION)" +pip install "$(get_dep sphinx-prompt $SPHINX_PROMPT_VERSION)" # Set parallelism to 3 to overlap IO bound tasks with CPU bound tasks on CI # workers with 2 cores when building the compiled extensions of scikit-learn. From 40b45e6b7c65e08311a13cc7a8528c3988e3b405 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 18 May 2021 16:09:36 +0200 Subject: [PATCH 402/478] [MRG] CI Push Scipy minimum version to 1.1.0. Remove Python 3.6 from builds. (#20069) * Push scipy min version to 1.0.0 * Update all ubuntu images to 20.04 focal. * Add ubuntu images 18.04 bionic and scipy fron conda-forge. * Fix conditions. * Pin python 3.6 for ubuntu bionic. * Change pipeline name. * Change matrix element name. * Keep python 3.9 from system not conda in Ubuntu 20.04. * Remove python directive when unnecessary. * Cleanup. * Downgrade to python 3.6 as scipy 1.0.0 is incompatible with 3.8. * Fix comment. * Fix comment. * Pin pytest again as we are forced to use 3.6. * Move to conda installer for 32bit linux. * Install miniconda for ubuntu 32bit. * Install wget for ubuntu 32bit. * Revert 32bit OS to ubuntu bionic 18.04. * Install scipy from pip in 32bit system. * Fix doctest failures. * Revert example rendering. * Relax pytest version in ubuntu install. * Skip failing tests. * Put comment at the right place. * Remove python3.6. Ubuntu32 still needs to be adapted. * Push numpy and scipy min versions for compatibility with 3.7. * Push matplotlib min version for compatibility with 3.7. Install numpy via pip in 32bit linux. * Install numpy before scipy in Linux 32bit. * Pass numpy version to linux32. * Test 32bit architecture on debian buster (still exists for 32bit with python 3.7). * Install matplotlib from distribution. * Syntax error... * Stick to the numpy debian version to avoid Expected 124 from C header, got 112 from PyObject error. * Clean comments. * Revert skip in doctest to check with new dependencies. * Rename distrib. * Skip again... * Fix test on check_array. * Remove comment and fix lint at the same time. * Clean import. 
* Increase atol in test_derivatives to make the test pass in py37_conda_openblas environment. * Avoid sparse matrix dependent on scipy version. * Skip docstring test for pandas versions less then 1.1.0. * Fix lint error. * Empty commit to force checks. * Add minimal dependencies in changelog. * Update to python 3.7 CircleCI and Travis builds. * Move to debian buster for python3.7 dependencies. * Fix the container tag. * Lower the minimal pandas version for compatibility with python 3.7. --- .circleci/config.yml | 10 +-- .travis.yml | 10 +-- azure-pipelines.yml | 66 +++++++++++-------- build_tools/azure/install.sh | 4 +- build_tools/azure/posix-32.yml | 4 +- build_tools/azure/test_script.sh | 2 +- doc/conftest.py | 5 ++ doc/modules/sgd.rst | 2 +- .../supervised_learning.rst | 2 +- doc/whats_new/v1.0.rst | 6 ++ pyproject.toml | 2 +- sklearn/_min_dependencies.py | 11 ++-- sklearn/decomposition/_truncated_svd.py | 15 +++-- .../tests/test_loss.py | 2 +- sklearn/utils/tests/test_validation.py | 11 +--- 15 files changed, 80 insertions(+), 72 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index b407e8b15dd38..f4ee4e4cf1dfb 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -3,12 +3,12 @@ version: 2 jobs: doc-min-dependencies: docker: - - image: circleci/python:3.7.3-stretch + - image: circleci/python:3.7.7-buster environment: - OMP_NUM_THREADS: 2 - MKL_NUM_THREADS: 2 - CONDA_ENV_NAME: testenv - - PYTHON_VERSION: 3.6 + - PYTHON_VERSION: 3.7 - NUMPY_VERSION: 'min' - SCIPY_VERSION: 'min' - MATPLOTLIB_VERSION: 'min' @@ -47,7 +47,7 @@ jobs: doc: docker: - - image: circleci/python:3.7.3-stretch + - image: circleci/python:3.7.7-buster environment: - OMP_NUM_THREADS: 2 - MKL_NUM_THREADS: 2 @@ -96,7 +96,7 @@ jobs: lint: docker: - - image: circleci/python:3.6 + - image: circleci/python:3.7 steps: - checkout - run: ./build_tools/circle/checkout_merge_commit.sh @@ -130,7 +130,7 @@ jobs: deploy: docker: - - image: circleci/python:3.6 + - image: circleci/python:3.7 steps: - checkout - run: ./build_tools/circle/checkout_merge_commit.sh diff --git a/.travis.yml b/.travis.yml index 1e6ed78d28ac2..09f05b57eecfa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -40,19 +40,11 @@ jobs: - CPU_COUNT=4 # Linux environments to build the scikit-learn wheels for the ARM64 - # architecture and Python 3.6 and newer. This is used both at release time + # architecture and Python 3.7 and newer. This is used both at release time # with the manual trigger in the commit message in the release branch and as # a scheduled task to build the weekly dev build on the main branch. The # weekly frequency is meant to avoid depleting the Travis CI credits too # fast. 
- - python: 3.6 - os: linux - arch: arm64 - if: type = cron or commit_message =~ /\[cd build\]/ - env: - - BUILD_WHEEL=true - - CIBW_BUILD=cp36-manylinux_aarch64 - - python: 3.7 os: linux arch: arm64 diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 412de99f5e57d..31baf41ff4cb1 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -11,7 +11,7 @@ jobs: - job: git_commit displayName: Get Git Commit pool: - vmImage: ubuntu-18.04 + vmImage: ubuntu-20.04 steps: - bash: | set -ex @@ -38,7 +38,7 @@ jobs: ) displayName: Linting pool: - vmImage: ubuntu-18.04 + vmImage: ubuntu-20.04 steps: - task: UsePythonVersion@0 inputs: @@ -57,7 +57,7 @@ jobs: - template: build_tools/azure/posix.yml parameters: name: Linux_Nightly - vmImage: ubuntu-18.04 + vmImage: ubuntu-20.04 dependsOn: [git_commit, linting] condition: | and( @@ -83,7 +83,7 @@ jobs: - template: build_tools/azure/posix.yml parameters: name: Linux_Nightly_ICC - vmImage: ubuntu-18.04 + vmImage: ubuntu-20.04 dependsOn: [git_commit, linting] condition: | and( @@ -105,7 +105,7 @@ jobs: - template: build_tools/azure/posix.yml parameters: name: Linux_Runs - vmImage: ubuntu-18.04 + vmImage: ubuntu-20.04 dependsOn: [git_commit] condition: | and( @@ -119,10 +119,30 @@ jobs: BLAS: 'mkl' COVERAGE: 'true' +# Check compilation with Ubuntu bionic 18.04 LTS and scipy from conda-forge - template: build_tools/azure/posix.yml parameters: - name: Linux + name: Ubuntu_Bionic vmImage: ubuntu-18.04 + dependsOn: [git_commit, linting] + condition: | + and( + succeeded(), + not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')), + ne(variables['Build.Reason'], 'Schedule') + ) + matrix: + py37_conda: + DISTRIB: 'conda' + PYTHON_VERSION: '3.7' + BLAS: 'openblas' + COVERAGE: 'false' + BUILD_WITH_ICC: 'false' + +- template: build_tools/azure/posix.yml + parameters: + name: Linux + vmImage: ubuntu-20.04 dependsOn: [linting, git_commit] condition: | and( @@ -132,32 +152,23 @@ jobs: ) matrix: # Linux environment to test that scikit-learn can be built against - # versions of numpy, scipy with ATLAS that comes with Ubuntu Bionic 18.04 - # i.e. numpy 1.13.3 and scipy 0.19 - py36_ubuntu_atlas: + # versions of numpy, scipy with ATLAS that comes with Ubuntu Focal 20.04 + # i.e. numpy 1.17.4 and scipy 1.3.3 + ubuntu_atlas: DISTRIB: 'ubuntu' - PYTHON_VERSION: '3.6' JOBLIB_VERSION: 'min' PANDAS_VERSION: 'none' THREADPOOLCTL_VERSION: 'min' - PYTEST_VERSION: 'min' - PYTEST_XDIST_VERSION: 'none' COVERAGE: 'false' - # Linux + Python 3.6 build with OpenBLAS and without SITE_JOBLIB - py36_conda_openblas: + # Linux + Python 3.7 build with OpenBLAS and without SITE_JOBLIB + py37_conda_openblas: DISTRIB: 'conda' - PYTHON_VERSION: '3.6' + PYTHON_VERSION: '3.7' BLAS: 'openblas' NUMPY_VERSION: 'min' SCIPY_VERSION: 'min' MATPLOTLIB_VERSION: 'min' - # latest version of joblib available in conda for Python 3.6 - JOBLIB_VERSION: '0.13.2' THREADPOOLCTL_VERSION: '2.0.0' - # temporary pin pytest due to unknown failure with pytest 5.4 and - # python 3.6 - PYTEST_VERSION: 'min' - PYTEST_XDIST_VERSION: 'none' # Linux environment to test the latest available dependencies and MKL. # It runs tests requiring lightgbm, pandas and PyAMG. 
pylatest_pip_openblas_pandas: @@ -171,7 +182,7 @@ jobs: - template: build_tools/azure/posix-32.yml parameters: name: Linux32 - vmImage: ubuntu-18.04 + vmImage: ubuntu-20.04 dependsOn: [linting, git_commit] condition: | and( @@ -180,14 +191,11 @@ jobs: ne(variables['Build.Reason'], 'Schedule') ) matrix: - py36_ubuntu_atlas_32bit: - DISTRIB: 'ubuntu-32' - PYTHON_VERSION: '3.6' + debian_atlas_32bit: + DISTRIB: 'debian-32' JOBLIB_VERSION: 'min' # disable pytest xdist due to unknown bug with 32-bit container PYTEST_XDIST_VERSION: 'none' - # temporary pin pytest due to unknown failure with pytest 5.4 and - # python 3.6 PYTEST_VERSION: 'min' THREADPOOLCTL_VERSION: 'min' @@ -231,6 +239,6 @@ jobs: PYTHON_ARCH: '64' PYTEST_VERSION: '*' COVERAGE: 'true' - py36_pip_openblas_32bit: - PYTHON_VERSION: '3.6' + py37_pip_openblas_32bit: + PYTHON_VERSION: '3.7' PYTHON_ARCH: '32' diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index d2711d6bd610e..048ffe300ee2a 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -70,9 +70,9 @@ elif [[ "$DISTRIB" == "ubuntu" ]]; then python -m pip install $(get_dep cython $CYTHON_VERSION) \ $(get_dep joblib $JOBLIB_VERSION) -elif [[ "$DISTRIB" == "ubuntu-32" ]]; then +elif [[ "$DISTRIB" == "debian-32" ]]; then apt-get update - apt-get install -y python3-dev python3-scipy python3-matplotlib libatlas3-base libatlas-base-dev python3-virtualenv python3-pandas ccache + apt-get install -y python3-dev python3-numpy python3-scipy python3-matplotlib libatlas3-base libatlas-base-dev python3-virtualenv python3-pandas ccache python3 -m virtualenv --system-site-packages --python=python3 $VIRTUALENV source $VIRTUALENV/bin/activate diff --git a/build_tools/azure/posix-32.yml b/build_tools/azure/posix-32.yml index 5e4689a2505e5..039236a70fbe5 100644 --- a/build_tools/azure/posix-32.yml +++ b/build_tools/azure/posix-32.yml @@ -45,7 +45,7 @@ jobs: -w /io --detach --name skcontainer - -e DISTRIB=ubuntu-32 + -e DISTRIB=debian-32 -e TEST_DIR=/temp_dir -e JUNITXML=$JUNITXML -e VIRTUALENV=testvenv @@ -63,7 +63,7 @@ jobs: -e OMP_NUM_THREADS=$OMP_NUM_THREADS -e OPENBLAS_NUM_THREADS=$OPENBLAS_NUM_THREADS -e SKLEARN_SKIP_NETWORK_TESTS=$SKLEARN_SKIP_NETWORK_TESTS - i386/ubuntu:18.04 + i386/debian:10.9 sleep 1000000 displayName: 'Start container' - script: > diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index 858d691b38216..6e05d7d858e52 100755 --- a/build_tools/azure/test_script.sh +++ b/build_tools/azure/test_script.sh @@ -4,7 +4,7 @@ set -e if [[ "$DISTRIB" =~ ^conda.* ]]; then source activate $VIRTUALENV -elif [[ "$DISTRIB" == "ubuntu" ]] || [[ "$DISTRIB" == "ubuntu-32" ]]; then +elif [[ "$DISTRIB" == "ubuntu" ]] || [[ "$DISTRIB" == "debian-32" ]]; then source $VIRTUALENV/bin/activate fi diff --git a/doc/conftest.py b/doc/conftest.py index 5468184bf5509..a2770e5d36a10 100644 --- a/doc/conftest.py +++ b/doc/conftest.py @@ -7,6 +7,7 @@ from sklearn.utils import IS_PYPY from sklearn.utils._testing import SkipTest from sklearn.utils._testing import check_skip_network +from sklearn.utils.fixes import parse_version from sklearn.datasets import get_data_home from sklearn.datasets._base import _pkl_filepath from sklearn.datasets._twenty_newsgroups import CACHE_NAME @@ -80,6 +81,10 @@ def setup_grid_search(): def setup_preprocessing(): try: import pandas # noqa + if parse_version(pandas.__version__) < parse_version('1.1.0'): + raise SkipTest( + "Skipping preprocessing.rst, pandas version < 1.1.0" + ) except ImportError: 
raise SkipTest("Skipping preprocessing.rst, pandas not installed") diff --git a/doc/modules/sgd.rst b/doc/modules/sgd.rst index 0a1d8407e64ae..0b618289b84ec 100644 --- a/doc/modules/sgd.rst +++ b/doc/modules/sgd.rst @@ -130,7 +130,7 @@ Using ``loss="log"`` or ``loss="modified_huber"`` enables the :math:`P(y|x)` per sample :math:`x`:: >>> clf = SGDClassifier(loss="log", max_iter=5).fit(X, y) - >>> clf.predict_proba([[1., 1.]]) + >>> clf.predict_proba([[1., 1.]]) # doctest: +SKIP array([[0.00..., 0.99...]]) The concrete penalty can be set via the ``penalty`` parameter. diff --git a/doc/tutorial/statistical_inference/supervised_learning.rst b/doc/tutorial/statistical_inference/supervised_learning.rst index 3d87830fa0b26..e326b614472de 100644 --- a/doc/tutorial/statistical_inference/supervised_learning.rst +++ b/doc/tutorial/statistical_inference/supervised_learning.rst @@ -173,7 +173,7 @@ Linear models: :math:`y = X\beta + \epsilon` >>> regr = linear_model.LinearRegression() >>> regr.fit(diabetes_X_train, diabetes_y_train) LinearRegression() - >>> print(regr.coef_) + >>> print(regr.coef_) # doctest: +SKIP [ 0.30349955 -237.63931533 510.53060544 327.73698041 -814.13170937 492.81458798 102.84845219 184.60648906 743.51961675 76.09517222] diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index f94e7001fdc97..87b0441bade5f 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -12,6 +12,12 @@ Version 1.0.0 .. include:: changelog_legend.inc +Minimal dependencies +-------------------- + +Version 1.0.0 of scikit-learn requires python 3.7+, numpy 1.14.5+ and +scipy 1.1.0+. Optional minimal dependency is matplotlib 2.2.2+. + Enforcing keyword-only arguments -------------------------------- diff --git a/pyproject.toml b/pyproject.toml index c55c68b3182b8..84468f65341da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,5 +11,5 @@ requires = [ # see: https://github.com/scipy/oldest-supported-numpy/blob/master/setup.cfg "oldest-supported-numpy", - "scipy>=0.19.1", + "scipy>=1.1.0", ] diff --git a/sklearn/_min_dependencies.py b/sklearn/_min_dependencies.py index 56d44586cdc6d..aa01b7fdfa352 100644 --- a/sklearn/_min_dependencies.py +++ b/sklearn/_min_dependencies.py @@ -5,12 +5,11 @@ # numpy scipy and cython should by in sync with pyproject.toml if platform.python_implementation() == 'PyPy': - SCIPY_MIN_VERSION = '1.1.0' NUMPY_MIN_VERSION = '1.19.0' else: - SCIPY_MIN_VERSION = '0.19.1' - NUMPY_MIN_VERSION = '1.13.3' + NUMPY_MIN_VERSION = '1.14.5' +SCIPY_MIN_VERSION = '1.1.0' JOBLIB_MIN_VERSION = '0.11' THREADPOOLCTL_MIN_VERSION = '2.0.0' PYTEST_MIN_VERSION = '5.0.1' @@ -26,9 +25,9 @@ 'joblib': (JOBLIB_MIN_VERSION, 'install'), 'threadpoolctl': (THREADPOOLCTL_MIN_VERSION, 'install'), 'cython': (CYTHON_MIN_VERSION, 'build'), - 'matplotlib': ('2.1.1', 'benchmark, docs, examples, tests'), - 'scikit-image': ('0.13', 'docs, examples, tests'), - 'pandas': ('0.25.0', 'benchmark, docs, examples, tests'), + 'matplotlib': ('2.2.2', 'benchmark, docs, examples, tests'), + 'scikit-image': ('0.14', 'docs, examples, tests'), + 'pandas': ('0.23.4', 'benchmark, docs, examples, tests'), 'seaborn': ('0.9.0', 'docs, examples'), 'memory_profiler': ('0.57.0', 'benchmark, docs'), 'pytest': (PYTEST_MIN_VERSION, 'tests'), diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 74239567dee48..7aa36c59da00e 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -87,18 +87,21 @@ class 
TruncatedSVD(TransformerMixin, BaseEstimator): Examples -------- >>> from sklearn.decomposition import TruncatedSVD - >>> from scipy.sparse import random as sparse_random - >>> X = sparse_random(100, 100, density=0.01, format='csr', - ... random_state=42) + >>> from scipy.sparse import csr_matrix + >>> import numpy as np + >>> np.random.seed(0) + >>> X_dense = np.random.rand(100, 100) + >>> X_dense[:, 2 * np.arange(50)] = 0 + >>> X = csr_matrix(X_dense) >>> svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42) >>> svd.fit(X) TruncatedSVD(n_components=5, n_iter=7, random_state=42) >>> print(svd.explained_variance_ratio_) - [0.0646... 0.0633... 0.0639... 0.0535... 0.0406...] + [0.0157... 0.0512... 0.0499... 0.0479... 0.0453...] >>> print(svd.explained_variance_ratio_.sum()) - 0.286... + 0.2102... >>> print(svd.singular_values_) - [1.553... 1.512... 1.510... 1.370... 1.199...] + [35.2410... 4.5981... 4.5420... 4.4486... 4.3288...] See Also -------- diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 345e72c642668..9f4294a101700 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -98,7 +98,7 @@ def fprime2(x: np.ndarray) -> np.ndarray: optimum = optimum.ravel() assert_allclose(loss.inverse_link_function(optimum), y_true) assert_allclose(func(optimum), 0, atol=1e-14) - assert_allclose(get_gradients(y_true, optimum), 0, atol=1e-7) + assert_allclose(get_gradients(y_true, optimum), 0, atol=1e-6) @pytest.mark.parametrize('loss, n_classes, prediction_dim', [ diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 66f7d9ae77687..c244d6f6caffc 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -24,7 +24,7 @@ from sklearn.utils import check_X_y from sklearn.utils import deprecated from sklearn.utils._mocking import MockDataFrame -from sklearn.utils.fixes import np_version, parse_version +from sklearn.utils.fixes import parse_version from sklearn.utils.estimator_checks import _NotAnArray from sklearn.random_projection import _sparse_random_matrix from sklearn.linear_model import ARDRegression @@ -49,7 +49,6 @@ _num_features, FLOAT_DTYPES) from sklearn.utils.validation import _check_fit_params -from sklearn.utils.fixes import parse_version import sklearn @@ -345,7 +344,7 @@ def test_check_array(): assert isinstance(result, np.ndarray) -# TODO: Check for error in 1.1 when implicit conversation is removed +# TODO: Check for error in 1.1 when implicit conversion is removed @pytest.mark.parametrize("X", [ [['1', '2'], ['3', '4']], np.array([['1', '2'], ['3', '4']], dtype='U'), @@ -368,14 +367,10 @@ def test_check_array_numeric_warns(X): [['11', '12'], ['13', 'xx']], np.array([['11', '12'], ['13', 'xx']], dtype='U'), np.array([['11', '12'], ['13', 'xx']], dtype='S'), - [[b'a', b'b'], [b'c', b'd']], - np.array([[b'a', b'b'], [b'c', b'd']], dtype='V1') + [[b'a', b'b'], [b'c', b'd']] ]) def test_check_array_dtype_numeric_errors(X): """Error when string-ike array can not be converted""" - if (np_version < parse_version("1.14") - and hasattr(X, "dtype") and X.dtype.kind == "V"): - pytest.skip("old numpy would convert V dtype into float silently") expected_warn_msg = "Unable to convert array of bytes/strings" with pytest.raises(ValueError, match=expected_warn_msg): check_array(X, dtype="numeric") From ca6caa28ab92cbf75a3cc2a411d2a225abd9a4ce 
Mon Sep 17 00:00:00 2001 From: Ashvith Shetty Date: Wed, 19 May 2021 00:39:11 +0530 Subject: [PATCH 403/478] TST Removed the estimators from the IGNORED list in test_fit_docstring_attributes (#20103) --- sklearn/tests/test_docstring_parameters.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 099c27341927e..cc10f11fcd574 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -288,13 +288,6 @@ def test_fit_docstring_attributes(name, Estimator): with ignore_warnings(category=FutureWarning): assert hasattr(est, attr.name) - IGNORED = {'Birch', 'LarsCV', 'Lasso', - 'OrthogonalMatchingPursuit'} - - if Estimator.__name__ in IGNORED: - pytest.xfail( - reason="Estimator has too many undocumented attributes.") - fit_attr = [k for k in est.__dict__.keys() if k.endswith('_') and not k.startswith('_')] fit_attr_names = [attr.name for attr in attributes] From 094992b7a784491abd1ec0ce011e9f7956f09397 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=20Dupr=C3=A9=20la=20Tour?= Date: Tue, 18 May 2021 15:28:03 -0700 Subject: [PATCH 404/478] DOC fix new line alignment --- doc/related_projects.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/related_projects.rst b/doc/related_projects.rst index fb02ea8beaf0d..5d50196000e44 100644 --- a/doc/related_projects.rst +++ b/doc/related_projects.rst @@ -96,7 +96,7 @@ enhance the functionality of scikit-learn's estimators. cross-validated parameter search using any of these strategies. - `sklearn-deap `_ Use evolutionary - algorithms instead of gridsearch in scikit-learn. + algorithms instead of gridsearch in scikit-learn. **Model export for production** From 2a43ed2bfc4614e449c1dadb87ce95a21e7e3457 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Wed, 19 May 2021 12:20:25 +0200 Subject: [PATCH 405/478] CI Fix min dependencies for scikit-image (#20108) --- build_tools/circle/build_doc.sh | 2 +- sklearn/_min_dependencies.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index 563d09fc0b7bd..3935b9a8deaa8 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -172,11 +172,11 @@ conda create -n $CONDA_ENV_NAME --yes --quiet \ "$(get_dep cython $CYTHON_VERSION)" \ "$(get_dep matplotlib $MATPLOTLIB_VERSION)" \ "$(get_dep sphinx $SPHINX_VERSION)" \ - "$(get_dep scikit-image $SCIKIT_IMAGE_VERSION)" \ "$(get_dep pandas $PANDAS_VERSION)" \ joblib memory_profiler packaging seaborn pillow pytest coverage source activate testenv +pip install "$(get_dep scikit-image $SCIKIT_IMAGE_VERSION)" pip install "$(get_dep sphinx-gallery $SPHINX_GALLERY_VERSION)" pip install "$(get_dep numpydoc $NUMPYDOC_VERSION)" pip install "$(get_dep sphinx-prompt $SPHINX_PROMPT_VERSION)" diff --git a/sklearn/_min_dependencies.py b/sklearn/_min_dependencies.py index aa01b7fdfa352..d878a04eb4523 100644 --- a/sklearn/_min_dependencies.py +++ b/sklearn/_min_dependencies.py @@ -26,7 +26,7 @@ 'threadpoolctl': (THREADPOOLCTL_MIN_VERSION, 'install'), 'cython': (CYTHON_MIN_VERSION, 'build'), 'matplotlib': ('2.2.2', 'benchmark, docs, examples, tests'), - 'scikit-image': ('0.14', 'docs, examples, tests'), + 'scikit-image': ('0.14.5', 'docs, examples, tests'), 'pandas': ('0.23.4', 'benchmark, docs, examples, tests'), 'seaborn': ('0.9.0', 'docs, examples'), 'memory_profiler': ('0.57.0', 'benchmark, docs'), From 
3014fcfcd0253ccfd7831bf85a36b763189a6417 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Carlos=20Alfaro=20Jim=C3=A9nez?= Date: Wed, 19 May 2021 15:29:30 +0200 Subject: [PATCH 406/478] MNT Move parameter validation from `__init__` to `fit` in `neighbors` module (#20072) --- doc/whats_new/v1.0.rst | 6 +++++ sklearn/neighbors/_base.py | 10 ++++---- sklearn/neighbors/_classification.py | 8 +++++-- sklearn/neighbors/_regression.py | 8 +++++-- sklearn/neighbors/tests/test_neighbors.py | 28 +++++++++++++++-------- 5 files changed, 40 insertions(+), 20 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 87b0441bade5f..34e9f0670ba81 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -415,6 +415,12 @@ Changelog - |FIX| :class:`neighbors.DistanceMetric` subclasses now support readonly memory-mapped datasets. :pr:`19883` by `Julien Jerphanion `. +- |FIX| :class:`neighbors.NearestNeighbors`, :class:`neighbors.KNeighborsClassifier`, + :class:`neighbors.RadiusNeighborsClassifier`, :class:`neighbors.KNeighborsRegressor` + and :class:`neighbors.RadiusNeighborsRegressor` does not validate `weights` in + `__init__` and validates `weights` in `fit` instead. :pr:`20072` by + :user:`Juan Carlos Alfaro Jiménez `. + :mod:`sklearn.pipeline` ....................... diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 9a222762ec615..c6438165aba1a 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -58,14 +58,13 @@ def _check_weights(weights): """Check to make sure weights are valid""" - if weights in (None, 'uniform', 'distance'): - return weights - elif callable(weights): - return weights - else: + if (weights not in (None, 'uniform', 'distance') and + not callable(weights)): raise ValueError("weights not recognized: should be 'uniform', " "'distance', or a callable function") + return weights + def _get_weights(dist, weights): """Get the weights from an array of distances and a parameter ``weights`` @@ -312,7 +311,6 @@ def __init__(self, n_neighbors=None, radius=None, self.metric_params = metric_params self.p = p self.n_jobs = n_jobs - self._check_algorithm_metric() def _check_algorithm_metric(self): if self.algorithm not in ['auto', 'brute', diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 29ab582c15ab9..1fd1fb01c9762 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -152,7 +152,7 @@ def __init__(self, n_neighbors=5, *, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs) - self.weights = _check_weights(weights) + self.weights = weights def fit(self, X, y): """Fit the k-nearest neighbors classifier from the training dataset. @@ -172,6 +172,8 @@ def fit(self, X, y): self : KNeighborsClassifier The fitted k-nearest neighbors classifier. """ + self.weights = _check_weights(self.weights) + return self._fit(X, y) def predict(self, X): @@ -412,7 +414,7 @@ def __init__(self, radius=1.0, *, weights='uniform', leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs) - self.weights = _check_weights(weights) + self.weights = weights self.outlier_label = outlier_label def fit(self, X, y): @@ -433,6 +435,8 @@ def fit(self, X, y): self : RadiusNeighborsClassifier The fitted radius neighbors classifier. 
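(A minimal sketch of the behaviour described in the changelog entry above, not part of the patch itself; the estimator, the ``weights='blah'`` value and the error message come from the diff, while the toy data is illustrative. Validation of ``weights`` now happens in ``fit`` rather than in ``__init__``.)

    import numpy as np
    from sklearn.neighbors import KNeighborsClassifier

    X = np.array([[0.0], [1.0], [2.0], [3.0]])
    y = np.array([0, 0, 1, 1])

    clf = KNeighborsClassifier(weights="blah")  # no error at construction time
    try:
        clf.fit(X, y)                           # _check_weights runs here now
    except ValueError as exc:
        print(exc)  # weights not recognized: should be 'uniform', 'distance', or a callable function
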
""" + self.weights = _check_weights(self.weights) + self._fit(X, y) classes_ = self.classes_ diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 96beb1ee022af..be60abcc64cb5 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -150,7 +150,7 @@ def __init__(self, n_neighbors=5, *, weights='uniform', algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs) - self.weights = _check_weights(weights) + self.weights = weights def _more_tags(self): # For cross-validation routines to split data correctly @@ -183,6 +183,8 @@ def fit(self, X, y): self : KNeighborsRegressor The fitted k-nearest neighbors regressor. """ + self.weights = _check_weights(self.weights) + return self._fit(X, y) def predict(self, X): @@ -349,7 +351,7 @@ def __init__(self, radius=1.0, *, weights='uniform', leaf_size=leaf_size, p=p, metric=metric, metric_params=metric_params, n_jobs=n_jobs) - self.weights = _check_weights(weights) + self.weights = weights def fit(self, X, y): """Fit the radius neighbors regressor from the training dataset. @@ -369,6 +371,8 @@ def fit(self, X, y): self : RadiusNeighborsRegressor The fitted radius neighbors regressor. """ + self.weights = _check_weights(self.weights) + return self._fit(X, y) def predict(self, X): diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 5df7a6419b0b5..555687b7ea74a 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1167,24 +1167,28 @@ def test_radius_neighbors_graph_sparse(seed=36): def test_neighbors_badargs(): # Test bad argument values: these should all raise ValueErrors - with pytest.raises(ValueError): - neighbors.NearestNeighbors(algorithm='blah') - X = rng.random_sample((10, 2)) Xsparse = csr_matrix(X) X3 = rng.random_sample((10, 3)) y = np.ones(10) + est = neighbors.NearestNeighbors(algorithm='blah') + with pytest.raises(ValueError): + est.fit(X) + for cls in (neighbors.KNeighborsClassifier, neighbors.RadiusNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.RadiusNeighborsRegressor): + est = cls(weights='blah') with pytest.raises(ValueError): - cls(weights='blah') + est.fit(X, y) + est = cls(p=-1) with pytest.raises(ValueError): - cls(p=-1) + est.fit(X, y) + est = cls(algorithm='blah') with pytest.raises(ValueError): - cls(algorithm='blah') + est.fit(X, y) nbrs = cls(algorithm='ball_tree', metric='haversine') with pytest.raises(ValueError): @@ -1253,10 +1257,11 @@ def test_neighbors_metrics(n_samples=20, n_features=3, # KD tree doesn't support all metrics if (algorithm == 'kd_tree' and metric not in neighbors.KDTree.valid_metrics): + est = neighbors.NearestNeighbors(algorithm=algorithm, + metric=metric, + metric_params=metric_params) with pytest.raises(ValueError): - neighbors.NearestNeighbors(algorithm=algorithm, - metric=metric, - metric_params=metric_params) + est.fit(X) continue neigh = neighbors.NearestNeighbors(n_neighbors=n_neighbors, algorithm=algorithm, @@ -1359,8 +1364,11 @@ def test_valid_brute_metric_for_auto_algorithm(): def test_metric_params_interface(): + X = rng.rand(5, 5) + y = rng.randint(0, 2, 5) + est = neighbors.KNeighborsClassifier(metric_params={'p': 3}) with pytest.warns(SyntaxWarning): - neighbors.KNeighborsClassifier(metric_params={'p': 3}) + est.fit(X, y) def test_predict_sparse_ball_kd_tree(): From 1ac047d29a43bd1556d5c90e40376340a08bc3a6 Mon Sep 17 00:00:00 2001 From: Eleni Markou Date: Wed, 19 
May 2021 16:34:22 +0300 Subject: [PATCH 407/478] DOC Replace broken link in clustering.rst (#20102) --- doc/modules/clustering.rst | 2 +- sklearn/metrics/cluster/_supervised.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 7f9fe2a7bd12e..0245c48920f11 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1653,7 +1653,7 @@ Drawbacks * E. B. Fowkles and C. L. Mallows, 1983. "A method for comparing two hierarchical clusterings". Journal of the American Statistical Association. - http://wildfire.stat.ucla.edu/pdflibrary/fowlkes.pdf + https://www.tandfonline.com/doi/abs/10.1080/01621459.1983.10478008 * `Wikipedia entry for the Fowlkes-Mallows Index `_ diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index ccc8077a3aab9..7814e7ba50e1c 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -1076,7 +1076,7 @@ def fowlkes_mallows_score(labels_true, labels_pred, *, sparse=False): .. [1] `E. B. Fowkles and C. L. Mallows, 1983. "A method for comparing two hierarchical clusterings". Journal of the American Statistical Association - `_ + `_ .. [2] `Wikipedia entry for the Fowlkes-Mallows Index `_ From 1b6a651296787bcfa850f443a85308f62dffdf47 Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Thu, 20 May 2021 01:12:57 +0100 Subject: [PATCH 408/478] TST Changes assert to pytest style in decomposition, datasets, covariance, compose (#20104) Co-authored-by: Alihan Zihna --- .../compose/tests/test_column_transformer.py | 81 +++++---- .../tests/test_robust_covariance.py | 14 +- sklearn/datasets/tests/test_openml.py | 160 +++++++++--------- .../datasets/tests/test_samples_generator.py | 29 ++-- sklearn/decomposition/tests/test_nmf.py | 92 ++++++---- 5 files changed, 206 insertions(+), 170 deletions(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 9278d67296ec5..b672885dad645 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -9,7 +9,6 @@ import pytest from numpy.testing import assert_allclose -from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_allclose_dense_sparse from sklearn.utils._testing import assert_almost_equal @@ -540,14 +539,17 @@ def test_column_transformer_error_msg_1D(): X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T col_trans = ColumnTransformer([('trans', StandardScaler(), 0)]) - assert_raise_message(ValueError, "1D data passed to a transformer", - col_trans.fit, X_array) - assert_raise_message(ValueError, "1D data passed to a transformer", - col_trans.fit_transform, X_array) + msg = '1D data passed to a transformer' + with pytest.raises(ValueError, match=msg): + col_trans.fit(X_array) + + with pytest.raises(ValueError, match=msg): + col_trans.fit_transform(X_array) col_trans = ColumnTransformer([('trans', TransRaise(), 0)]) for func in [col_trans.fit, col_trans.fit_transform]: - assert_raise_message(ValueError, "specific message", func, X_array) + with pytest.raises(ValueError, match="specific message"): + func(X_array) def test_2D_transformer_output(): @@ -556,11 +558,13 @@ def test_2D_transformer_output(): # if one transformer is dropped, test that name is still correct ct = ColumnTransformer([('trans1', 'drop', 0), ('trans2', TransNo2D(), 1)]) - 
assert_raise_message(ValueError, "the 'trans2' transformer should be 2D", - ct.fit_transform, X_array) + + msg = "the 'trans2' transformer should be 2D" + with pytest.raises(ValueError, match=msg): + ct.fit_transform(X_array) # because fit is also doing transform, this raises already on fit - assert_raise_message(ValueError, "the 'trans2' transformer should be 2D", - ct.fit, X_array) + with pytest.raises(ValueError, match=msg): + ct.fit(X_array) def test_2D_transformer_output_pandas(): @@ -571,11 +575,12 @@ def test_2D_transformer_output_pandas(): # if one transformer is dropped, test that name is still correct ct = ColumnTransformer([('trans1', TransNo2D(), 'col1')]) - assert_raise_message(ValueError, "the 'trans1' transformer should be 2D", - ct.fit_transform, X_df) + msg = "the 'trans1' transformer should be 2D" + with pytest.raises(ValueError, match=msg): + ct.fit_transform(X_df) # because fit is also doing transform, this raises already on fit - assert_raise_message(ValueError, "the 'trans1' transformer should be 2D", - ct.fit, X_df) + with pytest.raises(ValueError, match=msg): + ct.fit(X_df) @pytest.mark.parametrize("remainder", ['drop', 'passthrough']) @@ -585,14 +590,14 @@ def test_column_transformer_invalid_columns(remainder): # general invalid for col in [1.5, ['string', 1], slice(1, 's'), np.array([1.])]: ct = ColumnTransformer([('trans', Trans(), col)], remainder=remainder) - assert_raise_message(ValueError, "No valid specification", - ct.fit, X_array) + with pytest.raises(ValueError, match="No valid specification"): + ct.fit(X_array) # invalid for arrays for col in ['string', ['string', 'other'], slice('a', 'b')]: ct = ColumnTransformer([('trans', Trans(), col)], remainder=remainder) - assert_raise_message(ValueError, "Specifying the columns", - ct.fit, X_array) + with pytest.raises(ValueError, match="Specifying the columns"): + ct.fit(X_array) # transformed n_features does not match fitted n_features col = [0, 1] @@ -621,9 +626,9 @@ def predict(self, X): X_array = np.array([[0, 1, 2], [2, 4, 6]]).T ct = ColumnTransformer([('trans', NoTrans(), [0])]) - assert_raise_message(TypeError, - "All estimators should implement fit and transform", - ct.fit, X_array) + msg = "All estimators should implement fit and transform" + with pytest.raises(TypeError, match=msg): + ct.fit(X_array) def test_make_column_transformer(): @@ -659,13 +664,13 @@ def test_make_column_transformer_kwargs(): assert ct.remainder == 'drop' assert ct.sparse_threshold == 0.5 # invalid keyword parameters should raise an error message - assert_raise_message( - TypeError, + msg = re.escape( "make_column_transformer() got an unexpected " - "keyword argument 'transformer_weights'", - make_column_transformer, (scaler, 'first'), (norm, ['second']), - transformer_weights={'pca': 10, 'Transf': 1} + "keyword argument 'transformer_weights'" ) + with pytest.raises(TypeError, match=msg): + make_column_transformer((scaler, 'first'), (norm, ['second']), + transformer_weights={'pca': 10, 'Transf': 1}) def test_make_column_transformer_remainder_transformer(): @@ -893,10 +898,11 @@ def test_column_transformer_special_strings(): for val in [None, 'other']: ct = ColumnTransformer( [('trans1', Trans(), [0]), ('trans2', None, [1])]) - assert_raise_message(TypeError, "All estimators should implement", - ct.fit_transform, X_array) - assert_raise_message(TypeError, "All estimators should implement", - ct.fit, X_array) + msg = "All estimators should implement" + with pytest.raises(TypeError, match=msg): + ct.fit_transform(X_array) + with 
pytest.raises(TypeError, match=msg): + ct.fit(X_array) def test_column_transformer_remainder(): @@ -946,14 +952,15 @@ def test_column_transformer_remainder(): # error on invalid arg ct = ColumnTransformer([('trans1', Trans(), [0])], remainder=1) - assert_raise_message( - ValueError, - "remainder keyword needs to be one of \'drop\', \'passthrough\', " - "or estimator.", ct.fit, X_array) - assert_raise_message( - ValueError, + msg = ( "remainder keyword needs to be one of \'drop\', \'passthrough\', " - "or estimator.", ct.fit_transform, X_array) + "or estimator." + ) + with pytest.raises(ValueError, match=msg): + ct.fit(X_array) + + with pytest.raises(ValueError, match=msg): + ct.fit_transform(X_array) # check default for make_column_transformer ct = make_column_transformer((Trans(), [0])) diff --git a/sklearn/covariance/tests/test_robust_covariance.py b/sklearn/covariance/tests/test_robust_covariance.py index 01f32563710aa..1a6a1508170e7 100644 --- a/sklearn/covariance/tests/test_robust_covariance.py +++ b/sklearn/covariance/tests/test_robust_covariance.py @@ -10,7 +10,6 @@ import pytest from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_raise_message from sklearn import datasets from sklearn.covariance import empirical_covariance, MinCovDet @@ -43,15 +42,17 @@ def test_mcd(): def test_fast_mcd_on_invalid_input(): X = np.arange(100) - assert_raise_message(ValueError, 'Expected 2D array, got 1D array instead', - fast_mcd, X) + msg = 'Expected 2D array, got 1D array instead' + with pytest.raises(ValueError, match=msg): + fast_mcd(X) def test_mcd_class_on_invalid_input(): X = np.arange(100) mcd = MinCovDet() - assert_raise_message(ValueError, 'Expected 2D array, got 1D array instead', - mcd.fit, X) + msg = 'Expected 2D array, got 1D array instead' + with pytest.raises(ValueError, match=msg): + mcd.fit(X) def launch_mcd_on_dataset(n_samples, n_features, n_outliers, tol_loc, tol_cov, @@ -133,7 +134,8 @@ def test_mcd_support_covariance_is_zero(): msg = ('The covariance matrix of the support data is equal to 0, try to ' 'increase support_fraction') for X in [X_1, X_2]: - assert_raise_message(ValueError, msg, MinCovDet().fit, X) + with pytest.raises(ValueError, match=msg): + MinCovDet().fit(X) def test_mcd_increasing_det_warning(): diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index dac0762eb2160..663d2ae3088ed 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -22,8 +22,6 @@ _get_local_path, _retry_with_clean_cache, _feature_to_dtype) -from sklearn.utils._testing import (assert_warns_message, - assert_raise_message) from sklearn.utils import is_scalar_nan from sklearn.utils._testing import assert_allclose, assert_array_equal from urllib.error import HTTPError @@ -888,21 +886,20 @@ def test_fetch_openml_australian(monkeypatch, gzip_response): expected_features = 14 expected_missing = 0 _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) - assert_warns_message( - UserWarning, - "Version 1 of dataset Australian is inactive,", - _fetch_dataset_from_openml, - **{'data_id': data_id, 'data_name': data_name, - 'data_version': data_version, - 'target_column': target_column, - 'expected_observations': expected_observations, - 'expected_features': expected_features, - 'expected_missing': expected_missing, - 'expect_sparse': True, - 'expected_data_dtype': np.float64, - 'expected_target_dtype': object, - 'compare_default_target': False} # numpy 
specific check - ) + msg = "Version 1 of dataset Australian is inactive," + with pytest.warns(UserWarning, match=msg): + _fetch_dataset_from_openml( + **{'data_id': data_id, 'data_name': data_name, + 'data_version': data_version, + 'target_column': target_column, + 'expected_observations': expected_observations, + 'expected_features': expected_features, + 'expected_missing': expected_missing, + 'expect_sparse': True, + 'expected_data_dtype': np.float64, + 'expected_target_dtype': object, + 'compare_default_target': False} # numpy specific check + ) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1095,14 +1092,14 @@ def test_fetch_openml_inactive(monkeypatch, gzip_response): # fetch inactive dataset by id data_id = 40675 _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) - glas2 = assert_warns_message( - UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml, - data_id=data_id, cache=False, as_frame=False) + msg = "Version 1 of dataset glass2 is inactive," + with pytest.warns(UserWarning, match=msg): + glas2 = fetch_openml(data_id=data_id, cache=False, as_frame=False) # fetch inactive dataset by name and version assert glas2.data.shape == (163, 9) - glas2_by_version = assert_warns_message( - UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml, - data_id=None, name="glass2", version=1, cache=False, as_frame=False) + with pytest.warns(UserWarning, match=msg): + glas2_by_version = fetch_openml(data_id=None, name='glass2', + cache=False, version=1, as_frame=False) assert int(glas2_by_version.details['id']) == data_id @@ -1112,8 +1109,9 @@ def test_fetch_nonexiting(monkeypatch, gzip_response): data_id = 40675 _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) # Note that we only want to search by name (not data id) - assert_raise_message(ValueError, "No active dataset glass2 found", - fetch_openml, name='glass2', cache=False) + msg = "No active dataset glass2 found" + with pytest.raises(ValueError, match=msg): + fetch_openml(name='glass2', cache=False) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1122,10 +1120,10 @@ def test_raises_illegal_multitarget(monkeypatch, gzip_response): targets = ['sepalwidth', 'class'] _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) # Note that we only want to search by name (not data id) - assert_raise_message(ValueError, - "Can only handle homogeneous multi-target datasets,", - fetch_openml, data_id=data_id, - target_column=targets, cache=False) + msg = "Can only handle homogeneous multi-target datasets," + with pytest.raises(ValueError, match=msg): + fetch_openml(data_id=data_id, target_column=targets, + cache=False) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1135,23 +1133,27 @@ def test_warn_ignore_attribute(monkeypatch, gzip_response): expected_ignore_msg = "target_column={} has flag is_ignore." 
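# (Aside, not part of this patch or of the test file: a self-contained
# reminder of the idiom used throughout this conversion. ``pytest.raises`` and
# ``pytest.warns`` treat ``match`` as a regular expression that is searched in
# the message, which is why literal messages containing metacharacters such as
# '(' are wrapped in ``re.escape`` elsewhere in the diff. The message below is
# borrowed from the NMF tests further down; the raised ValueError is
# illustrative.)
import re
import pytest

msg = "got (n_components=1.5)"   # '(' and ')' would otherwise be regex groups
with pytest.raises(ValueError, match=re.escape(msg)):
    raise ValueError("Invalid parameter: got (n_components=1.5)")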
_monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) # single column test - assert_warns_message(UserWarning, expected_row_id_msg.format('MouseID'), - fetch_openml, data_id=data_id, - target_column='MouseID', - cache=False, as_frame=False) - assert_warns_message(UserWarning, expected_ignore_msg.format('Genotype'), - fetch_openml, data_id=data_id, - target_column='Genotype', - cache=False, as_frame=False) + target_col = 'MouseID' + msg = expected_row_id_msg.format(target_col) + with pytest.warns(UserWarning, match=msg): + fetch_openml(data_id=data_id, target_column=target_col, + cache=False, as_frame=False) + target_col = 'Genotype' + msg = expected_ignore_msg.format(target_col) + with pytest.warns(UserWarning, match=msg): + fetch_openml(data_id=data_id, target_column=target_col, + cache=False, as_frame=False) # multi column test - assert_warns_message(UserWarning, expected_row_id_msg.format('MouseID'), - fetch_openml, data_id=data_id, - target_column=['MouseID', 'class'], - cache=False, as_frame=False) - assert_warns_message(UserWarning, expected_ignore_msg.format('Genotype'), - fetch_openml, data_id=data_id, - target_column=['Genotype', 'class'], - cache=False, as_frame=False) + target_col = 'MouseID' + msg = expected_row_id_msg.format(target_col) + with pytest.warns(UserWarning, match=msg): + fetch_openml(data_id=data_id, target_column=[target_col, 'class'], + cache=False, as_frame=False) + target_col = 'Genotype' + msg = expected_ignore_msg.format(target_col) + with pytest.warns(UserWarning, match=msg): + fetch_openml(data_id=data_id, target_column=[target_col, 'class'], + cache=False, as_frame=False) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1159,73 +1161,77 @@ def test_string_attribute_without_dataframe(monkeypatch, gzip_response): data_id = 40945 _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) # single column test - assert_raise_message(ValueError, - ('STRING attributes are not supported for ' - 'array representation. Try as_frame=True'), - fetch_openml, data_id=data_id, cache=False, - as_frame=False) + msg = ( + 'STRING attributes are not supported for ' + 'array representation. Try as_frame=True' + ) + with pytest.raises(ValueError, match=msg): + fetch_openml(data_id=data_id, cache=False, as_frame=False) @pytest.mark.parametrize('gzip_response', [True, False]) def test_dataset_with_openml_error(monkeypatch, gzip_response): data_id = 1 _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) - assert_warns_message( - UserWarning, + msg = ( "OpenML registered a problem with the dataset. It might be unusable. " - "Error:", - fetch_openml, data_id=data_id, cache=False, as_frame=False + "Error:" ) + with pytest.warns(UserWarning, match=msg): + fetch_openml(data_id=data_id, cache=False, as_frame=False) @pytest.mark.parametrize('gzip_response', [True, False]) def test_dataset_with_openml_warning(monkeypatch, gzip_response): data_id = 3 _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) - assert_warns_message( - UserWarning, + msg = ( "OpenML raised a warning on the dataset. It might be unusable. 
" - "Warning:", - fetch_openml, data_id=data_id, cache=False, as_frame=False + "Warning:" ) + with pytest.warns(UserWarning, match=msg): + fetch_openml(data_id=data_id, cache=False, as_frame=False) @pytest.mark.parametrize('gzip_response', [True, False]) def test_illegal_column(monkeypatch, gzip_response): data_id = 61 _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) - assert_raise_message(KeyError, "Could not find target_column=", - fetch_openml, data_id=data_id, - target_column='undefined', cache=False) + msg = "Could not find target_column=" + with pytest.raises(KeyError, match=msg): + fetch_openml(data_id=data_id, target_column='undefined', cache=False) - assert_raise_message(KeyError, "Could not find target_column=", - fetch_openml, data_id=data_id, - target_column=['undefined', 'class'], - cache=False) + with pytest.raises(KeyError, match=msg): + fetch_openml(data_id=data_id, target_column=['undefined', 'class'], + cache=False) @pytest.mark.parametrize('gzip_response', [True, False]) def test_fetch_openml_raises_missing_values_target(monkeypatch, gzip_response): data_id = 2 _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) - assert_raise_message(ValueError, "Target column ", - fetch_openml, data_id=data_id, target_column='family') + msg = 'Target column ' + with pytest.raises(ValueError, match=msg): + fetch_openml(data_id=data_id, target_column='family') def test_fetch_openml_raises_illegal_argument(): - assert_raise_message(ValueError, "Dataset data_id=", - fetch_openml, data_id=-1, name="name") + msg = 'Dataset data_id=' + with pytest.raises(ValueError, match=msg): + fetch_openml(data_id=-1, name="name") - assert_raise_message(ValueError, "Dataset data_id=", - fetch_openml, data_id=-1, name=None, - version="version") + with pytest.raises(ValueError, match=msg): + fetch_openml(data_id=-1, name=None, version="version") - assert_raise_message(ValueError, "Dataset data_id=", - fetch_openml, data_id=-1, name="name", - version="version") + with pytest.raises(ValueError, match=msg): + fetch_openml(data_id=-1, name="name", version="version") - assert_raise_message(ValueError, "Neither name nor data_id are provided. " - "Please provide name or data_id.", fetch_openml) + msg = ( + "Neither name nor data_id are provided. " + "Please provide name or data_id." + ) + with pytest.raises(ValueError, match=msg): + fetch_openml() @pytest.mark.parametrize('gzip_response', [True, False]) diff --git a/sklearn/datasets/tests/test_samples_generator.py b/sklearn/datasets/tests/test_samples_generator.py index fcdb1222bd116..df8989b69f59c 100644 --- a/sklearn/datasets/tests/test_samples_generator.py +++ b/sklearn/datasets/tests/test_samples_generator.py @@ -1,4 +1,5 @@ +import re from collections import defaultdict from functools import partial @@ -9,7 +10,6 @@ from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_raise_message from sklearn.datasets import make_classification from sklearn.datasets import make_multilabel_classification @@ -337,21 +337,22 @@ def test_make_blobs_error(): n_samples = [20, 20, 20] centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]]) cluster_stds = np.array([0.05, 0.2, 0.4]) - wrong_centers_msg = ("Length of `n_samples` not consistent " - "with number of centers. 
Got n_samples = {} " - "and centers = {}".format(n_samples, centers[:-1])) - assert_raise_message(ValueError, wrong_centers_msg, - make_blobs, n_samples, centers=centers[:-1]) - wrong_std_msg = ("Length of `clusters_std` not consistent with " - "number of centers. Got centers = {} " - "and cluster_std = {}".format(centers, cluster_stds[:-1])) - assert_raise_message(ValueError, wrong_std_msg, - make_blobs, n_samples, - centers=centers, cluster_std=cluster_stds[:-1]) + wrong_centers_msg = re.escape( + "Length of `n_samples` not consistent with number of centers. " + f"Got n_samples = {n_samples} and centers = {centers[:-1]}" + ) + with pytest.raises(ValueError, match=wrong_centers_msg): + make_blobs(n_samples, centers=centers[:-1]) + wrong_std_msg = re.escape( + "Length of `clusters_std` not consistent with number of centers. " + f"Got centers = {centers} and cluster_std = {cluster_stds[:-1]}" + ) + with pytest.raises(ValueError, match=wrong_std_msg): + make_blobs(n_samples, centers=centers, cluster_std=cluster_stds[:-1]) wrong_type_msg = ("Parameter `centers` must be array-like. " "Got {!r} instead".format(3)) - assert_raise_message(ValueError, wrong_type_msg, - make_blobs, n_samples, centers=3) + with pytest.raises(ValueError, match=wrong_type_msg): + make_blobs(n_samples, centers=3) def test_make_friedman1(): diff --git a/sklearn/decomposition/tests/test_nmf.py b/sklearn/decomposition/tests/test_nmf.py index 88c1ba406ad99..8bf0feb0b630d 100644 --- a/sklearn/decomposition/tests/test_nmf.py +++ b/sklearn/decomposition/tests/test_nmf.py @@ -1,3 +1,5 @@ +import re + import numpy as np import scipy.sparse as sp @@ -8,7 +10,6 @@ import pytest -from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_almost_equal @@ -45,34 +46,43 @@ def test_parameter_checking(): # FIXME : should be removed in 1.1 init = 'nndsvda' msg = "Invalid solver parameter: got 'spam' instead of one of" - assert_raise_message(ValueError, msg, NMF(solver=name, init=init).fit, A) + with pytest.raises(ValueError, match=msg): + NMF(solver=name, init=init).fit(A) msg = "Invalid init parameter: got 'spam' instead of one of" - assert_raise_message(ValueError, msg, NMF(init=name).fit, A) + with pytest.raises(ValueError, match=msg): + NMF(init=name).fit(A) msg = "Invalid regularization parameter: got 'spam' instead of one of" - assert_raise_message(ValueError, msg, NMF(regularization=name, - init=init).fit, A) + with pytest.raises(ValueError, match=msg): + NMF(regularization=name, init=init).fit(A) msg = "Invalid beta_loss parameter: got 'spam' instead of one" - assert_raise_message(ValueError, msg, NMF(solver='mu', init=init, - beta_loss=name).fit, A) - msg = "Invalid beta_loss parameter: solver 'cd' does not handle " - msg += "beta_loss = 1.0" - assert_raise_message(ValueError, msg, NMF(solver='cd', init=init, - beta_loss=1.0).fit, A) + with pytest.raises(ValueError, match=msg): + NMF(solver='mu', init=init, beta_loss=name).fit(A) + msg = ( + "Invalid beta_loss parameter: solver 'cd' does not handle " + "beta_loss = 1.0" + ) + with pytest.raises(ValueError, match=msg): + NMF(solver='cd', init=init, beta_loss=1.0).fit(A) msg = "Negative values in data passed to" - assert_raise_message(ValueError, msg, NMF(init=init).fit, -A) - assert_raise_message(ValueError, msg, nmf._initialize_nmf, -A, - 2, 'nndsvd') + with pytest.raises(ValueError, match=msg): + NMF(init=init).fit(-A) + 
with pytest.raises(ValueError, match=msg): + nmf._initialize_nmf(-A, 2, 'nndsvd') clf = NMF(2, tol=0.1, init=init).fit(A) - assert_raise_message(ValueError, msg, clf.transform, -A) + with pytest.raises(ValueError, match=msg): + clf.transform(-A) for init in ['nndsvd', 'nndsvda', 'nndsvdar']: - msg = ("init = '{}' can only be used when " - "n_components <= min(n_samples, n_features)" - .format(init)) - assert_raise_message(ValueError, msg, NMF(3, init=init).fit, A) - assert_raise_message(ValueError, msg, nmf._initialize_nmf, A, - 3, init) + msg = re.escape( + "init = '{}' can only be used when " + "n_components <= min(n_samples, n_features)" + .format(init) + ) + with pytest.raises(ValueError, match=msg): + NMF(3, init=init).fit(A) + with pytest.raises(ValueError, match=msg): + nmf._initialize_nmf(A, 3, init) def test_initialize_close(): @@ -257,21 +267,30 @@ def test_non_negative_factorization_checking(): A = np.ones((2, 2)) # Test parameters checking is public function nnmf = non_negative_factorization - msg = ("Number of components must be a positive integer; " - "got (n_components=1.5)") - assert_raise_message(ValueError, msg, nnmf, A, A, A, 1.5, init='random') - msg = ("Number of components must be a positive integer; " - "got (n_components='2')") - assert_raise_message(ValueError, msg, nnmf, A, A, A, '2', init='random') - msg = "Negative values in data passed to NMF (input H)" - assert_raise_message(ValueError, msg, nnmf, A, A, -A, 2, init='custom') - msg = "Negative values in data passed to NMF (input W)" - assert_raise_message(ValueError, msg, nnmf, A, -A, A, 2, init='custom') - msg = "Array passed to NMF (input H) is full of zeros" - assert_raise_message(ValueError, msg, nnmf, A, A, 0 * A, 2, init='custom') + msg = re.escape( + "Number of components must be a positive integer; " + "got (n_components=1.5)" + ) + with pytest.raises(ValueError, match=msg): + nnmf(A, A, A, 1.5, init='random') + msg = re.escape( + "Number of components must be a positive integer; " + "got (n_components='2')" + ) + with pytest.raises(ValueError, match=msg): + nnmf(A, A, A, '2', init='random') + msg = re.escape("Negative values in data passed to NMF (input H)") + with pytest.raises(ValueError, match=msg): + nnmf(A, A, -A, 2, init='custom') + msg = re.escape("Negative values in data passed to NMF (input W)") + with pytest.raises(ValueError, match=msg): + nnmf(A, -A, A, 2, init='custom') + msg = re.escape("Array passed to NMF (input H) is full of zeros") + with pytest.raises(ValueError, match=msg): + nnmf(A, A, 0 * A, 2, init='custom') msg = "Invalid regularization parameter: got 'spam' instead of one of" - assert_raise_message(ValueError, msg, nnmf, A, A, 0 * A, 2, init='custom', - regularization='spam') + with pytest.raises(ValueError, match=msg): + nnmf(A, A, 0 * A, 2, init='custom', regularization='spam') def _beta_divergence_dense(X, W, H, beta): @@ -425,7 +444,8 @@ def _assert_nmf_no_nan(X, beta_loss): msg = "When beta_loss <= 0 and X contains zeros, the solver may diverge." 
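# (Aside, not part of this patch: why the guard exercised below exists. For
# beta_loss == 0 the beta-divergence reduces to the Itakura-Saito divergence,
# whose log(x / y) term is infinite when an entry x of X is exactly zero, and
# for beta_loss < 0 the x**beta_loss term blows up as well, so the solver
# rejects zeros unless the data is shifted, as the test does with X + 1e-9.
# The strictly positive toy matrix below is illustrative.)
import numpy as np
from sklearn.decomposition import NMF

rng = np.random.RandomState(0)
X_pos = np.abs(rng.randn(6, 5)) + 1e-9   # strictly positive data
NMF(n_components=2, solver="mu", beta_loss="itakura-saito",
    init="random", max_iter=300, random_state=0).fit(X_pos)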
for beta_loss in (-0.6, 0.): - assert_raise_message(ValueError, msg, _assert_nmf_no_nan, X, beta_loss) + with pytest.raises(ValueError, match=msg): + _assert_nmf_no_nan(X, beta_loss) _assert_nmf_no_nan(X + 1e-9, beta_loss) for beta_loss in (0.2, 1., 1.2, 2., 2.5): From c67518350f91072f9d37ed09c5ef7edf555b6cf6 Mon Sep 17 00:00:00 2001 From: yoch Date: Thu, 20 May 2021 17:27:08 +0300 Subject: [PATCH 409/478] DOC use reshape instead of manually reshaping in plot_color_quantization (#19960) --- examples/cluster/plot_color_quantization.py | 24 ++++++++------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/examples/cluster/plot_color_quantization.py b/examples/cluster/plot_color_quantization.py index ccc45eff73306..384e58f75e328 100644 --- a/examples/cluster/plot_color_quantization.py +++ b/examples/cluster/plot_color_quantization.py @@ -50,36 +50,30 @@ print("Fitting model on a small sub-sample of the data") t0 = time() -image_array_sample = shuffle(image_array, random_state=0)[:1000] +image_array_sample = shuffle(image_array, random_state=0, n_samples=1_000) kmeans = KMeans(n_clusters=n_colors, random_state=0).fit(image_array_sample) -print("done in %0.3fs." % (time() - t0)) +print(f"done in {time() - t0:0.3f}s.") # Get labels for all points print("Predicting color indices on the full image (k-means)") t0 = time() labels = kmeans.predict(image_array) -print("done in %0.3fs." % (time() - t0)) +print(f"done in {time() - t0:0.3f}s.") -codebook_random = shuffle(image_array, random_state=0)[:n_colors] +codebook_random = shuffle(image_array, random_state=0, n_samples=n_colors) print("Predicting color indices on the full image (random)") t0 = time() labels_random = pairwise_distances_argmin(codebook_random, image_array, axis=0) -print("done in %0.3fs." % (time() - t0)) +print(f"done in {time() - t0:0.3f}s.") def recreate_image(codebook, labels, w, h): """Recreate the (compressed) image from the code book & labels""" - d = codebook.shape[1] - image = np.zeros((w, h, d)) - label_idx = 0 - for i in range(w): - for j in range(h): - image[i][j] = codebook[labels[label_idx]] - label_idx += 1 - return image + return codebook[labels].reshape(w, h, -1) + # Display all results, alongside original image plt.figure(1) @@ -91,12 +85,12 @@ def recreate_image(codebook, labels, w, h): plt.figure(2) plt.clf() plt.axis('off') -plt.title('Quantized image (64 colors, K-Means)') +plt.title(f'Quantized image ({n_colors} colors, K-Means)') plt.imshow(recreate_image(kmeans.cluster_centers_, labels, w, h)) plt.figure(3) plt.clf() plt.axis('off') -plt.title('Quantized image (64 colors, Random)') +plt.title(f'Quantized image ({n_colors} colors, Random)') plt.imshow(recreate_image(codebook_random, labels_random, w, h)) plt.show() From e8e719dc8acfb58446f0d1fa92e5f9ef7dd1ad0c Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Fri, 21 May 2021 14:41:15 +0200 Subject: [PATCH 410/478] [DOC] Update roadmap. (#20116) --- doc/roadmap.rst | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/doc/roadmap.rst b/doc/roadmap.rst index 30c9f58339502..2bead90522739 100644 --- a/doc/roadmap.rst +++ b/doc/roadmap.rst @@ -70,16 +70,16 @@ the document up to date as we work on these issues. 
* document current handling * column reordering issue :issue:`7242` - * avoiding unnecessary conversion to ndarray :issue:`12147` + * avoiding unnecessary conversion to ndarray |ss| :issue:`12147` |se| * returning DataFrames from transformers :issue:`5523` - * getting DataFrames from dataset loaders :issue:`10733`, + * getting DataFrames from dataset loaders |ss| :issue:`10733` |se|, |ss| :issue:`13902` |se| - * Sparse currently not considered :issue:`12800` + * Sparse currently not considered |ss| :issue:`12800` |se| #. Improved handling of categorical features * Tree-based models should be able to handle both continuous and categorical - features :issue:`12866` and :issue:`15550`. + features :issue:`12866` and |ss| :issue:`15550` |se|. * |ss| In dataset loaders :issue:`13902` |se| * As generic transformers to be used with ColumnTransforms (e.g. ordinal encoding supervised by correlation with target variable) :issue:`5853`, @@ -89,7 +89,7 @@ the document up to date as we work on these issues. #. Improved handling of missing data * Making sure meta-estimators are lenient towards missing data, - :issue:`15319` + |ss| :issue:`15319` |se| * Non-trivial imputers |ss| :issue:`11977`, :issue:`12852` |se| * Learners directly handling missing data |ss| :issue:`13911` |se| * An amputation sample generator to make parts of a dataset go missing @@ -125,19 +125,20 @@ the document up to date as we work on these issues. components * More flexible estimator checks that do not select by estimator name - :issue:`6599` :issue:`6715` - * Example of how to develop an estimator or a meta-estimator, :issue:`14582` + |ss| :issue:`6599` |se| :issue:`6715` + * Example of how to develop an estimator or a meta-estimator, + |ss| :issue:`14582` |se| * More self-sufficient running of scikit-learn-contrib or a similar resource #. Support resampling and sample reduction * Allow subsampling of majority classes (in a pipeline?) :issue:`3855` - * Implement random forests with resampling :issue:`8732` + * Implement random forests with resampling :issue:`13227` #. Better interfaces for interactive development - * |ss| __repr__ |se| and HTML visualisations of estimators - |ss| :issue:`6323` |se| and :pr:`14180`. + * |ss| __repr__ and HTML visualisations of estimators + :issue:`6323` and :pr:`14180` |se|. * Include plotting tools, not just as examples. :issue:`9173` #. Improved tools for model diagnostics and basic inference @@ -249,7 +250,7 @@ Subpackage-specific goals * perhaps we want to be able to get back more than multiple metrics * the handling of random states in CV splitters is a poor design and contradicts the validation of similar parameters in estimators, - :issue:`15177` + `SLEP011 `_ * exploit warm-starting and path algorithms so the benefits of `EstimatorCV` objects can be accessed via `GridSearchCV` and used in Pipelines. :issue:`1626` From 36a4dcafedbcbb112e1d96fd04e73ba922523bae Mon Sep 17 00:00:00 2001 From: Bryan Chen Date: Fri, 21 May 2021 09:59:40 -0400 Subject: [PATCH 411/478] DOC Add Evalml to scikit-learn related projects (#20109) --- doc/related_projects.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/related_projects.rst b/doc/related_projects.rst index 5d50196000e44..0cef93f0fd196 100644 --- a/doc/related_projects.rst +++ b/doc/related_projects.rst @@ -58,6 +58,12 @@ enhance the functionality of scikit-learn's estimators. it can stream minibatches, use data checkpoints, build funky pipelines, and serialize models with custom per-step savers. 
+- `EvalML `_ + EvalML is an AutoML library which builds, optimizes, and evaluates + machine learning pipelines using domain-specific objective functions. + It incorporates multiple modeling libraries under one API, and + the objects that EvalML creates use an sklearn-compatible API. + **Experimentation frameworks** - `Sacred `_ Tool to help you configure, From 5081c2fcd28863dfaa28e4633b39a1c2a2906e3c Mon Sep 17 00:00:00 2001 From: Nate Parsons <4307001+thehomebrewnerd@users.noreply.github.com> Date: Sat, 22 May 2021 08:03:10 -0500 Subject: [PATCH 412/478] DOC Update Featuretools link in Related Project Page (#20120) --- doc/related_projects.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/related_projects.rst b/doc/related_projects.rst index 0cef93f0fd196..033d53ddb94ee 100644 --- a/doc/related_projects.rst +++ b/doc/related_projects.rst @@ -46,7 +46,7 @@ enhance the functionality of scikit-learn's estimators. preprocessors as well as the estimators. Works as a drop-in replacement for a scikit-learn estimator. -- `Featuretools `_ +- `Featuretools `_ A framework to perform automated feature engineering. It can be used for transforming temporal and relational datasets into feature matrices for machine learning. From aa898de885ed4861a03e4f79b28f92f70914643d Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Sat, 22 May 2021 19:55:22 +0100 Subject: [PATCH 413/478] TST Changes assert to pytest style in svm, manifold, linear_model, feature_extraction, decomposition (#19999) Changed the assert_raises, assert_raise_message, assert_warns in the following files: * test_factor_analysis.py * test_text.py * test_bayes.py * test_ransac.py * test_sag.py * test_locally_linear.py * test_bounds.py * test_sparse.py * test_svm.py --- .../tests/test_factor_analysis.py | 12 +++--- sklearn/feature_extraction/tests/test_text.py | 37 +++++++++++-------- sklearn/linear_model/tests/test_bayes.py | 4 +- sklearn/linear_model/tests/test_ransac.py | 13 ++++--- sklearn/linear_model/tests/test_sag.py | 26 +++++++------ sklearn/manifold/tests/test_locally_linear.py | 7 ++-- sklearn/svm/tests/test_bounds.py | 7 ++-- sklearn/svm/tests/test_sparse.py | 8 ++-- sklearn/svm/tests/test_svm.py | 27 +++++++++----- 9 files changed, 80 insertions(+), 61 deletions(-) diff --git a/sklearn/decomposition/tests/test_factor_analysis.py b/sklearn/decomposition/tests/test_factor_analysis.py index f889e49ea4a3a..45d4de948039d 100644 --- a/sklearn/decomposition/tests/test_factor_analysis.py +++ b/sklearn/decomposition/tests/test_factor_analysis.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from sklearn.utils._testing import assert_warns -from sklearn.utils._testing import assert_raises from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.exceptions import ConvergenceWarning @@ -69,14 +67,16 @@ def test_factor_analysis(): with pytest.raises(ValueError): fa.fit(X[:, :2]) - f = lambda x, y: np.abs(getattr(x, y)) # sign will not be equal + def f(x, y): + return np.abs(getattr(x, y)) # sign will not be equal fa1, fa2 = fas for attr in ['loglike_', 'components_', 'noise_variance_']: assert_almost_equal(f(fa1, attr), f(fa2, attr)) fa1.max_iter = 1 fa1.verbose = True - assert_warns(ConvergenceWarning, fa1.fit, X) + with pytest.warns(ConvergenceWarning): + fa1.fit(X) # Test get_covariance and get_precision with n_components == n_features # with n_components < n_features and with n_components == 0 @@ -101,8 +101,8 @@ def test_factor_analysis(): 
assert not np.allclose(results[rot1], results[rot2]) assert np.allclose(projections[rot1], projections[rot2], atol=3) - assert_raises(ValueError, - FactorAnalysis(rotation='not_implemented').fit_transform, X) + with pytest.raises(ValueError): + FactorAnalysis(rotation="not_implemented").fit_transform(X) # test against R's psych::principal with rotate="varimax" # (i.e., the values below stem from rotating the components in R) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 767b04ddb5d95..0033ae84948ac 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -29,7 +29,6 @@ from numpy.testing import assert_array_equal from sklearn.utils import IS_PYPY from sklearn.utils._testing import (assert_almost_equal, - assert_raise_message, fails_if_pypy, assert_allclose_dense_sparse, skip_if_32bit) @@ -1093,7 +1092,8 @@ def func(): hv = HashingVectorizer() hv.fit_transform(['hello world', np.nan, 'hello hello']) - assert_raise_message(exception, message, func) + with pytest.raises(exception, match=message): + func() def test_tfidfvectorizer_binary(): @@ -1127,11 +1127,16 @@ def test_vectorizer_string_object_as_input(Vectorizer): message = ("Iterable over raw text documents expected, " "string object received.") vec = Vectorizer() - assert_raise_message( - ValueError, message, vec.fit_transform, "hello world!") - assert_raise_message(ValueError, message, vec.fit, "hello world!") + + with pytest.raises(ValueError, match=message): + vec.fit_transform("hello world!") + + with pytest.raises(ValueError, match=message): + vec.fit("hello world!") vec.fit(["some text", "some other text"]) - assert_raise_message(ValueError, message, vec.transform, "hello world!") + + with pytest.raises(ValueError, match=message): + vec.transform("hello world!") @pytest.mark.parametrize("X_dtype", [np.float32, np.float64]) @@ -1186,20 +1191,22 @@ def test_vectorizers_invalid_ngram_range(vec): # vectorizers could be initialized with invalid ngram range # test for raising error message invalid_range = vec.ngram_range - message = ("Invalid value for ngram_range=%s " - "lower boundary larger than the upper boundary." - % str(invalid_range)) + message = re.escape( + f"Invalid value for ngram_range={invalid_range} " + "lower boundary larger than the upper boundary." 
+ ) if isinstance(vec, HashingVectorizer) and IS_PYPY: pytest.xfail(reason='HashingVectorizer is not supported on PyPy') - assert_raise_message( - ValueError, message, vec.fit, ["good news everyone"]) - assert_raise_message( - ValueError, message, vec.fit_transform, ["good news everyone"]) + with pytest.raises(ValueError, match=message): + vec.fit(['good news everyone']) + + with pytest.raises(ValueError, match=message): + vec.fit_transform(['good news everyone']) if isinstance(vec, HashingVectorizer): - assert_raise_message( - ValueError, message, vec.transform, ["good news everyone"]) + with pytest.raises(ValueError, match=message): + vec.transform(['good news everyone']) def _check_stop_words_consistency(estimator): diff --git a/sklearn/linear_model/tests/test_bayes.py b/sklearn/linear_model/tests/test_bayes.py index a22a0243cdcb7..fab87c5adf007 100644 --- a/sklearn/linear_model/tests/test_bayes.py +++ b/sklearn/linear_model/tests/test_bayes.py @@ -13,7 +13,6 @@ from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_less -from sklearn.utils._testing import assert_raise_message from sklearn.utils import check_random_state from sklearn.linear_model import BayesianRidge, ARDRegression from sklearn.linear_model import Ridge @@ -29,7 +28,8 @@ def test_n_iter(): y = np.array([1, 2, 6, 8, 10]) clf = BayesianRidge(n_iter=0) msg = "n_iter should be greater than or equal to 1." - assert_raise_message(ValueError, msg, clf.fit, X, y) + with pytest.raises(ValueError, match=msg): + clf.fit(X, y) def test_bayesian_ridge_scores(): diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index 071a67efcf28f..da7167c0feb2a 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -6,7 +6,6 @@ from numpy.testing import assert_array_equal from sklearn.utils import check_random_state -from sklearn.utils._testing import assert_raises_regexp from sklearn.utils._testing import assert_allclose from sklearn.datasets import make_regression from sklearn.linear_model import LinearRegression, RANSACRegressor @@ -159,7 +158,8 @@ def test_ransac_resid_thresh_no_inliers(): max_trials=5) msg = ("RANSAC could not find a valid consensus set") - assert_raises_regexp(ValueError, msg, ransac_estimator.fit, X, y) + with pytest.raises(ValueError, match=msg): + ransac_estimator.fit(X, y) assert ransac_estimator.n_skips_no_inliers_ == 5 assert ransac_estimator.n_skips_invalid_data_ == 0 assert ransac_estimator.n_skips_invalid_model_ == 0 @@ -175,7 +175,8 @@ def is_data_valid(X, y): max_trials=5) msg = ("RANSAC could not find a valid consensus set") - assert_raises_regexp(ValueError, msg, ransac_estimator.fit, X, y) + with pytest.raises(ValueError, match=msg): + ransac_estimator.fit(X, y) assert ransac_estimator.n_skips_no_inliers_ == 0 assert ransac_estimator.n_skips_invalid_data_ == 5 assert ransac_estimator.n_skips_invalid_model_ == 0 @@ -191,7 +192,8 @@ def is_model_valid(estimator, X, y): max_trials=5) msg = ("RANSAC could not find a valid consensus set") - assert_raises_regexp(ValueError, msg, ransac_estimator.fit, X, y) + with pytest.raises(ValueError, match=msg): + ransac_estimator.fit(X, y) assert ransac_estimator.n_skips_no_inliers_ == 0 assert ransac_estimator.n_skips_invalid_data_ == 0 assert ransac_estimator.n_skips_invalid_model_ == 5 @@ -208,7 +210,8 @@ def is_data_valid(X, y): max_skips=3) msg = ("RANSAC skipped more 
iterations than `max_skips`") - assert_raises_regexp(ValueError, msg, ransac_estimator.fit, X, y) + with pytest.raises(ValueError, match=msg): + ransac_estimator.fit(X, y) assert ransac_estimator.n_skips_no_inliers_ == 0 assert ransac_estimator.n_skips_invalid_data_ == 4 assert ransac_estimator.n_skips_invalid_model_ == 0 diff --git a/sklearn/linear_model/tests/test_sag.py b/sklearn/linear_model/tests/test_sag.py index 22432185cc09b..62a7175271bd8 100644 --- a/sklearn/linear_model/tests/test_sag.py +++ b/sklearn/linear_model/tests/test_sag.py @@ -4,6 +4,7 @@ # License: BSD 3 clause import math +import re import pytest import numpy as np import scipy.sparse as sp @@ -19,7 +20,6 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose -from sklearn.utils._testing import assert_raise_message from sklearn.utils import compute_class_weight from sklearn.utils import check_random_state from sklearn.preprocessing import LabelEncoder, LabelBinarizer @@ -449,8 +449,8 @@ def test_get_auto_step_size(): assert_almost_equal(step_size_log, step_size_log_, decimal=4) msg = 'Unknown loss function for SAG solver, got wrong instead of' - assert_raise_message(ValueError, msg, get_auto_step_size, - max_squared_sum_, alpha, "wrong", fit_intercept) + with pytest.raises(ValueError, match=msg): + get_auto_step_size(max_squared_sum_, alpha, "wrong", fit_intercept) @pytest.mark.parametrize("seed", range(3)) # locally tested with 1000 seeds @@ -737,11 +737,9 @@ def test_classifier_single_class(): X = [[1, 2], [3, 4]] y = [1, 1] - assert_raise_message(ValueError, - "This solver needs samples of at least 2 classes " - "in the data", - LogisticRegression(solver='sag').fit, - X, y) + msg = "This solver needs samples of at least 2 classes in the data" + with pytest.raises(ValueError, match=msg): + LogisticRegression(solver='sag').fit(X, y) def test_step_size_alpha_error(): @@ -749,15 +747,19 @@ def test_step_size_alpha_error(): y = [1, -1] fit_intercept = False alpha = 1. - msg = ("Current sag implementation does not handle the case" - " step_size * alpha_scaled == 1") + msg = re.escape( + "Current sag implementation does not handle the case" + " step_size * alpha_scaled == 1" + ) clf1 = LogisticRegression(solver='sag', C=1. 
/ alpha, fit_intercept=fit_intercept) - assert_raise_message(ZeroDivisionError, msg, clf1.fit, X, y) + with pytest.raises(ZeroDivisionError, match=msg): + clf1.fit(X, y) clf2 = Ridge(fit_intercept=fit_intercept, solver='sag', alpha=alpha) - assert_raise_message(ZeroDivisionError, msg, clf2.fit, X, y) + with pytest.raises(ZeroDivisionError, match=msg): + clf2.fit(X, y) def test_multinomial_loss(): diff --git a/sklearn/manifold/tests/test_locally_linear.py b/sklearn/manifold/tests/test_locally_linear.py index 952da3ef41163..dc5df2f8896aa 100644 --- a/sklearn/manifold/tests/test_locally_linear.py +++ b/sklearn/manifold/tests/test_locally_linear.py @@ -8,7 +8,6 @@ from sklearn import neighbors, manifold from sklearn.manifold._locally_linear import barycenter_kneighbors_graph from sklearn.utils._testing import ignore_warnings -from sklearn.utils._testing import assert_raise_message eigen_solvers = ['dense', 'arpack'] @@ -106,11 +105,13 @@ def test_lle_init_parameters(): clf = manifold.LocallyLinearEmbedding(eigen_solver="error") msg = "unrecognized eigen_solver 'error'" - assert_raise_message(ValueError, msg, clf.fit, X) + with pytest.raises(ValueError, match=msg): + clf.fit(X) clf = manifold.LocallyLinearEmbedding(method="error") msg = "unrecognized method 'error'" - assert_raise_message(ValueError, msg, clf.fit, X) + with pytest.raises(ValueError, match=msg): + clf.fit(X) def test_pipeline(): diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 423d5ed7a7fba..70e6152d7fdea 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -9,8 +9,6 @@ from sklearn.linear_model import LogisticRegression from sklearn.svm._newrand import set_seed_wrap, bounded_rand_int_wrap -from sklearn.utils._testing import assert_raise_message - dense_X = [[-1, 0], [0, 1], [1, 1], [1, 1]] sparse_X = sp.csr_matrix(dense_X) @@ -38,8 +36,9 @@ def test_l1_min_c(loss, X_label, Y_label, intercept_label): def test_l1_min_c_l2_loss(): # loss='l2' should raise ValueError - assert_raise_message(ValueError, "loss type not in", - l1_min_c, dense_X, Y1, loss="l2") + msg = 'loss type not in' + with pytest.raises(ValueError, match=msg): + l1_min_c(dense_X, Y1, loss="l2") def check_l1_min_c(X, y, loss, fit_intercept=True, intercept_scaling=None): diff --git a/sklearn/svm/tests/test_sparse.py b/sklearn/svm/tests/test_sparse.py index bb935e55e1912..5e1196fa84faf 100644 --- a/sklearn/svm/tests/test_sparse.py +++ b/sklearn/svm/tests/test_sparse.py @@ -9,8 +9,7 @@ from sklearn.svm.tests import test_svm from sklearn.exceptions import ConvergenceWarning from sklearn.utils.extmath import safe_sparse_dot -from sklearn.utils._testing import (assert_raise_message, ignore_warnings, - skip_if_32bit) +from sklearn.utils._testing import ignore_warnings, skip_if_32bit # test sample 1 @@ -69,7 +68,8 @@ def check_svm_model_equal(dense_svm, sparse_svm, X_train, y_train, X_test): sparse_svm.predict_proba(X_test), 4) msg = "cannot use sparse input in 'SVC' trained on dense data" if sparse.isspmatrix(X_test): - assert_raise_message(ValueError, msg, dense_svm.predict, X_test) + with pytest.raises(ValueError, match=msg): + dense_svm.predict(X_test) @skip_if_32bit @@ -148,7 +148,7 @@ def test_svc_iris(): for k in ('linear', 'poly', 'rbf'): sp_clf = svm.SVC(kernel=k).fit(iris.data, iris.target) clf = svm.SVC(kernel=k).fit(iris.data.toarray(), - iris.target) + iris.target) assert_array_almost_equal(clf.support_vectors_, sp_clf.support_vectors_.toarray()) diff --git a/sklearn/svm/tests/test_svm.py 
b/sklearn/svm/tests/test_svm.py index 3fe57ad1b8375..97411c8c3c81b 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -19,7 +19,6 @@ from sklearn.metrics import f1_score from sklearn.metrics.pairwise import rbf_kernel from sklearn.utils import check_random_state -from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import ignore_warnings from sklearn.utils.validation import _num_samples from sklearn.utils import shuffle @@ -123,7 +122,8 @@ def test_precomputed(): # same as before, but using a callable function instead of the kernel # matrix. kernel is just a linear kernel - kfunc = lambda x, y: np.dot(x, y.T) + def kfunc(x, y): + return np.dot(x, y.T) clf = svm.SVC(kernel=kfunc) clf.fit(np.array(X), Y) pred = clf.predict(T) @@ -739,13 +739,16 @@ def test_linear_svx_uppercase_loss_penality_raises_error(): X, y = [[0.0], [1.0]], [0, 1] - assert_raise_message(ValueError, "loss='SQuared_hinge' is not supported", - svm.LinearSVC(loss="SQuared_hinge").fit, X, y) + msg = "loss='SQuared_hinge' is not supported" + with pytest.raises(ValueError, match=msg): + svm.LinearSVC(loss="SQuared_hinge").fit(X, y) - assert_raise_message(ValueError, - ("The combination of penalty='L2'" - " and loss='squared_hinge' is not supported"), - svm.LinearSVC(penalty="L2").fit, X, y) + msg = ( + "The combination of penalty='L2'" + " and loss='squared_hinge' is not supported" + ) + with pytest.raises(ValueError, match=msg): + svm.LinearSVC(penalty="L2").fit(X, y) def test_linearsvc(): @@ -1043,10 +1046,12 @@ def test_linear_svc_intercept_scaling(): for i in [-1, 0]: lsvc = svm.LinearSVC(intercept_scaling=i) + msg = ('Intercept scaling is %r but needs to be greater than 0.' ' To disable fitting an intercept,' ' set fit_intercept=False.' % lsvc.intercept_scaling) - assert_raise_message(ValueError, msg, lsvc.fit, X, Y) + with pytest.raises(ValueError, match=msg): + lsvc.fit(X, Y) def test_lsvc_intercept_scaling_zero(): @@ -1076,7 +1081,9 @@ def test_hasattr_predict_proba(): G.probability = True assert hasattr(G, 'predict_proba') msg = "predict_proba is not available when fitted with probability=False" - assert_raise_message(NotFittedError, msg, G.predict_proba, iris.data) + + with pytest.raises(NotFittedError, match=msg): + G.predict_proba(iris.data) def test_decision_function_shape_two_class(): From 5b7136f04068e7dcdf5ae8ec4aa729107ee905c0 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Mon, 24 May 2021 13:46:45 -0400 Subject: [PATCH 414/478] MNT Update license year to 2021 (#20126) --- COPYING | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/COPYING b/COPYING index 558c4c1245615..62bab0b0b5961 100644 --- a/COPYING +++ b/COPYING @@ -1,6 +1,6 @@ BSD 3-Clause License -Copyright (c) 2007-2020 The scikit-learn developers. +Copyright (c) 2007-2021 The scikit-learn developers. All rights reserved. 
Redistribution and use in source and binary forms, with or without From 88be3c1357f98b3f11e6fa1bf20e0ff249b6362e Mon Sep 17 00:00:00 2001 From: tliu68 <54865879+tliu68@users.noreply.github.com> Date: Tue, 25 May 2021 14:23:07 +0800 Subject: [PATCH 415/478] Fix GaussianMixture UnboundLocalError (#20030) --- doc/whats_new/v1.0.rst | 9 ++++ sklearn/mixture/_base.py | 6 +-- .../mixture/tests/test_gaussian_mixture.py | 45 +++++++++++++++++++ 3 files changed, 57 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 34e9f0670ba81..4e7ade1083921 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -377,6 +377,15 @@ Changelog :pr:`18328` by :user:`Albert Villanova del Moral ` and :user:`Alonso Silva Allende `. +:mod:`sklearn.mixture` +.............................. + +- |Fix| Ensure that the best parameters are set appropriately + in the case of divergency for :class:`mixture.GaussianMixture` and + :class:`mixture.BayesianGaussianMixture`. + :pr:`20030` by :user:`Tingshan Liu ` and + :user:`Benjamin Pedigo `. + :mod:`sklearn.model_selection` .............................. diff --git a/sklearn/mixture/_base.py b/sklearn/mixture/_base.py index 6acb6c2e09292..d3414c33eb5d0 100644 --- a/sklearn/mixture/_base.py +++ b/sklearn/mixture/_base.py @@ -203,7 +203,7 @@ def fit_predict(self, X, y=None): do_init = not(self.warm_start and hasattr(self, 'converged_')) n_init = self.n_init if do_init else 1 - max_lower_bound = -np.infty + max_lower_bound = -np.inf self.converged_ = False random_state = check_random_state(self.random_state) @@ -215,7 +215,7 @@ def fit_predict(self, X, y=None): if do_init: self._initialize_parameters(X, random_state) - lower_bound = (-np.infty if do_init else self.lower_bound_) + lower_bound = (-np.inf if do_init else self.lower_bound_) for n_iter in range(1, self.max_iter + 1): prev_lower_bound = lower_bound @@ -234,7 +234,7 @@ def fit_predict(self, X, y=None): self._print_verbose_msg_init_end(lower_bound) - if lower_bound > max_lower_bound: + if lower_bound > max_lower_bound or max_lower_bound == -np.inf: max_lower_bound = lower_bound best_params = self._get_parameters() best_n_iter = n_iter diff --git a/sklearn/mixture/tests/test_gaussian_mixture.py b/sklearn/mixture/tests/test_gaussian_mixture.py index 2d8dc81e54275..c8e85823260cd 100644 --- a/sklearn/mixture/tests/test_gaussian_mixture.py +++ b/sklearn/mixture/tests/test_gaussian_mixture.py @@ -1040,3 +1040,48 @@ def test_init(): max_iter=1, random_state=random_state).fit(X) assert gmm2.lower_bound_ >= gmm1.lower_bound_ + + +def test_gaussian_mixture_setting_best_params(): + """`GaussianMixture`'s best_parameters, `n_iter_` and `lower_bound_` + must be set appropriately in the case of divergence. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/18216 + """ + rnd = np.random.RandomState(0) + n_samples = 30 + X = rnd.uniform(size=(n_samples, 3)) + + # following initialization parameters were found to lead to divergence + means_init = np.array([ + [0.670637869618158, 0.21038256107384043, 0.12892629765485303], + [0.09394051075844147, 0.5759464955561779, 0.929296197576212], + [0.5033230372781258, 0.9569852381759425, 0.08654043447295741], + [0.18578301420435747, 0.5531158970919143, 0.19388943970532435], + [0.4548589928173794, 0.35182513658825276, 0.568146063202464], + [0.609279894978321, 0.7929063819678847, 0.9620097270828052], + ]) + precisions_init = np.array([999999.999604483, 999999.9990869573, + 553.7603944542167, 204.78596008931834, + 15.867423501783637, 85.4595728389735]) + weights_init = [0.03333333333333341, 0.03333333333333341, + 0.06666666666666674, 0.06666666666666674, + 0.7000000000000001, 0.10000000000000007] + + gmm = GaussianMixture(covariance_type="spherical", reg_covar=0, + means_init=means_init, weights_init=weights_init, + random_state=rnd, n_components=len(weights_init), + precisions_init=precisions_init) + # ensure that no error is thrown during fit + gmm.fit(X) + + # check that the fit did not converge + assert not gmm.converged_ + + # check that parameters are set for gmm + for attr in [ + "weights_", "means_", "covariances_", "precisions_cholesky_", + "n_iter_", "lower_bound_", + ]: + assert hasattr(gmm, attr) From c1cc67dd06d31a9b110377afe0c94b0cd50848d5 Mon Sep 17 00:00:00 2001 From: David Dale Date: Tue, 25 May 2021 15:02:37 +0300 Subject: [PATCH 416/478] FEA Add QuantileRegressor estimator (#9978) Co-authored-by: David Dale Co-authored-by: Christian Lorentzen --- doc/modules/classes.rst | 1 + doc/modules/linear_model.rst | 77 +++++ doc/whats_new/v1.0.rst | 5 + .../linear_model/plot_quantile_regression.py | 110 +++++++ sklearn/linear_model/__init__.py | 2 + sklearn/linear_model/_quantile.py | 280 ++++++++++++++++++ sklearn/linear_model/tests/test_quantile.py | 254 ++++++++++++++++ 7 files changed, 729 insertions(+) create mode 100644 examples/linear_model/plot_quantile_regression.py create mode 100644 sklearn/linear_model/_quantile.py create mode 100644 sklearn/linear_model/tests/test_quantile.py diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 5462e06f81214..cdeb6f0523422 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -839,6 +839,7 @@ Any estimator using the Huber loss would also be robust to outliers, e.g. :template: class.rst linear_model.HuberRegressor + linear_model.QuantileRegressor linear_model.RANSACRegressor linear_model.TheilSenRegressor diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index f1f376dc641c9..7fc14693c198d 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -1423,6 +1423,83 @@ Note that this estimator is different from the R implementation of Robust Regres squares implementation with weights given to each sample on the basis of how much the residual is greater than a certain threshold. +.. _quantile_regression: + +Quantile Regression +=================== + +Quantile regression estimates the median or other quantiles of :math:`y` +conditional on :math:`X`, while ordinary least squares (OLS) estimates the +conditional mean. + +As a linear model, the :class:`QuantileRegressor` gives linear predictions +:math:`\hat{y}(w, X) = Xw` for the :math:`q`-th quantile, :math:`q \in (0, 1)`. 
+The weights or coefficients :math:`w` are then found by the following +minimization problem: + +.. math:: + \min_{w} {\frac{1}{n_{\text{samples}}} + \sum_i PB_q(y_i - X_i w) + \alpha ||w||_1}. + +This consists of the pinball loss (also known as linear loss), +see also :class:`~sklearn.metrics.mean_pinball_loss`, + +.. math:: + PB_q(t) = q \max(t, 0) + (1 - q) \max(-t, 0) = + \begin{cases} + q t, & t > 0, \\ + 0, & t = 0, \\ + (1-q) t, & t < 0 + \end{cases} + +and the L1 penalty controlled by parameter ``alpha``, similar to +:class:`Lasso`. + +As the pinball loss is only linear in the residuals, quantile regression is +much more robust to outliers than squared error based estimation of the mean. +Somewhat in between is the :class:`HuberRegressor`. + +Quantile regression may be useful if one is interested in predicting an +interval instead of point prediction. Sometimes, prediction intervals are +calculated based on the assumption that prediction error is distributed +normally with zero mean and constant variance. Quantile regression provides +sensible prediction intervals even for errors with non-constant (but +predictable) variance or non-normal distribution. + +.. figure:: /auto_examples/linear_model/images/sphx_glr_plot_quantile_regression_001.png + :target: ../auto_examples/linear_model/plot_quantile_regression.html + :align: center + :scale: 50% + +Based on minimizing the pinball loss, conditional quantiles can also be +estimated by models other than linear models. For example, +:class:`~sklearn.ensemble.GradientBoostingRegressor` can predict conditional +quantiles if its parameter ``loss`` is set to ``"quantile"`` and parameter +``alpha`` is set to the quantile that should be predicted. See the example in +:ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py`. + +Most implementations of quantile regression are based on linear programming +problem. The current implementation is based on +:func:`scipy.optimize.linprog`. + +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_linear_model_plot_quantile_regression.py` + +.. topic:: References: + + * Koenker, R., & Bassett Jr, G. (1978). `Regression quantiles. + `_ + Econometrica: journal of the Econometric Society, 33-50. + + * Portnoy, S., & Koenker, R. (1997). The Gaussian hare and the Laplacian + tortoise: computability of squared-error versus absolute-error estimators. + Statistical Science, 12, 279-300. https://doi.org/10.1214/ss/1030037960 + + * Koenker, R. (2005). Quantile Regression. + Cambridge University Press. https://doi.org/10.1017/CBO9780511754098 + + .. _polynomial_regression: Polynomial regression: extending linear models with basis functions diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 4e7ade1083921..29a4bce98ecb0 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -282,6 +282,11 @@ Changelog :mod:`sklearn.linear_model` ........................... +- |Feature| Added :class:`linear_model.QuantileRegressor` which implements + linear quantile regression with L1 penalty. + :pr:`9978` by :user:`David Dale ` and + :user:`Christian Lorentzen `. + - |Feature| The new :class:`linear_model.SGDOneClassSVM` provides an SGD implementation of the linear One-Class SVM. 
Combined with kernel approximation techniques, this implementation approximates the solution of diff --git a/examples/linear_model/plot_quantile_regression.py b/examples/linear_model/plot_quantile_regression.py new file mode 100644 index 0000000000000..8af7785cc6733 --- /dev/null +++ b/examples/linear_model/plot_quantile_regression.py @@ -0,0 +1,110 @@ +""" +=================== +Quantile regression +=================== +This example illustrates how quantile regression can predict non-trivial +conditional quantiles. + +The left figure shows the case when the error distribution is normal, +but has non-constant variance, i.e. with heteroscedasticity. + +The right figure shows an example of an asymmetric error distribution, +namely the Pareto distribution. +""" +print(__doc__) +# Authors: David Dale +# Christian Lorentzen +# License: BSD 3 clause +import numpy as np +import matplotlib.pyplot as plt + +from sklearn.linear_model import QuantileRegressor, LinearRegression +from sklearn.metrics import mean_absolute_error, mean_squared_error +from sklearn.model_selection import cross_val_score + + +def plot_points_highlighted(x, y, model_low, model_high, ax): + """Plot points with highlighting.""" + mask = y <= model_low.predict(X) + ax.scatter(x[mask], y[mask], c="k", marker="x") + mask = y > model_high.predict(X) + ax.scatter(x[mask], y[mask], c="k", marker="x") + mask = (y > model_low.predict(X)) & (y <= model_high.predict(X)) + ax.scatter(x[mask], y[mask], c="k") + + +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5), sharey=True) + +rng = np.random.RandomState(42) +x = np.linspace(0, 10, 100) +X = x[:, np.newaxis] +y = 10 + 0.5 * x + rng.normal(loc=0, scale=0.5 + 0.5 * x, size=x.shape[0]) +y_mean = 10 + 0.5 * x +ax1.plot(x, y_mean, "k--") + +quantiles = [0.05, 0.5, 0.95] +models = [] +for quantile in quantiles: + qr = QuantileRegressor(quantile=quantile, alpha=0) + qr.fit(X, y) + ax1.plot(x, qr.predict(X)) + models.append(qr) + +plot_points_highlighted(x, y, models[0], models[2], ax1) +ax1.set_xlabel("x") +ax1.set_ylabel("y") +ax1.set_title("Quantiles of heteroscedastic Normal distributed target") +ax1.legend(["true mean"] + quantiles) + + +a = 5 +y = 10 + 0.5 * x + 10 * (rng.pareto(a, size=x.shape[0]) - 1 / (a - 1)) +ax2.plot(x, y_mean, "k--") + +models = [] +for quantile in quantiles: + qr = QuantileRegressor(quantile=quantile, alpha=0) + qr.fit(X, y) + ax2.plot([0, 10], qr.predict([[0], [10]])) + models.append(qr) + +plot_points_highlighted(x, y, models[0], models[2], ax2) +ax2.set_xlabel("x") +ax2.set_ylabel("y") +ax2.set_title("Quantiles of asymmetric Pareto distributed target") +ax2.legend(["true mean"] + quantiles, loc="lower right") +ax2.yaxis.set_tick_params(labelbottom=True) + +plt.show() + +# %% +# Note that both targets have the same mean value, indicated by the dashed +# black line. As the Normal distribution is symmetric, mean and median are +# identical and the predicted 0.5 quantile almost hits the true mean. +# In the Pareto case, the difference between predicted median and true mean +# is evident. We also marked the points below the 0.05 and above 0.95 +# predicted quantiles by small crosses. You might count them and consider +# that we have 100 samples in total. +# +# The second part of the example shows that LinearRegression minimizes MSE +# in order to predict the mean, while QuantileRegressor with `quantile=0.5` +# minimizes MAE in order to predict the median. Both do their own job well. 
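# A small numerical aside (a minimal illustrative sketch; the toy sample below
# is made up and is not taken from the patch above): the constant that
# minimizes the squared error of a sample is its mean, while the constant that
# minimizes the absolute error is its median, which is the point the
# comparison code that follows illustrates on real predictions.
import numpy as np

toy_sample = np.array([1.0, 2.0, 3.0, 4.0, 100.0])
grid = np.linspace(toy_sample.min(), toy_sample.max(), num=10_000)
# brute-force search for the constant prediction minimizing each loss
mse_minimizer = grid[np.argmin([np.mean((toy_sample - c) ** 2) for c in grid])]
mae_minimizer = grid[np.argmin([np.mean(np.abs(toy_sample - c)) for c in grid])]
print(f"MSE minimizer ~ {mse_minimizer:.2f} (mean = {toy_sample.mean():.2f})")
print(f"MAE minimizer ~ {mae_minimizer:.2f} (median = {np.median(toy_sample):.2f})")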
+ +models = [LinearRegression(), QuantileRegressor(alpha=0)] +names = ["OLS", "Quantile"] + +print("# In-sample performance") +for model_name, model in zip(names, models): + print(model_name + ":") + model.fit(X, y) + mae = mean_absolute_error(model.predict(X), y) + rmse = np.sqrt(mean_squared_error(model.predict(X), y)) + print(f"MAE = {mae:.4} RMSE = {rmse:.4}") +print("\n# Cross-validated performance") +for model_name, model in zip(names, models): + print(model_name + ":") + mae = -cross_val_score(model, X, y, cv=3, + scoring="neg_mean_absolute_error").mean() + rmse = np.sqrt(-cross_val_score(model, X, y, cv=3, + scoring="neg_mean_squared_error").mean()) + print(f"MAE = {mae:.4} RMSE = {rmse:.4}") diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index f715e30795961..02e8cafaa7b88 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -28,6 +28,7 @@ from ._passive_aggressive import PassiveAggressiveRegressor from ._perceptron import Perceptron +from ._quantile import QuantileRegressor from ._ransac import RANSACRegressor from ._theil_sen import TheilSenRegressor @@ -59,6 +60,7 @@ 'PassiveAggressiveClassifier', 'PassiveAggressiveRegressor', 'Perceptron', + 'QuantileRegressor', 'Ridge', 'RidgeCV', 'RidgeClassifier', diff --git a/sklearn/linear_model/_quantile.py b/sklearn/linear_model/_quantile.py new file mode 100644 index 0000000000000..bf8fea4552c9d --- /dev/null +++ b/sklearn/linear_model/_quantile.py @@ -0,0 +1,280 @@ +# Authors: David Dale +# Christian Lorentzen +# License: BSD 3 clause +import warnings + +import numpy as np +from scipy.optimize import linprog + +from ..base import BaseEstimator, RegressorMixin +from ._base import LinearModel +from ..exceptions import ConvergenceWarning +from ..utils.validation import _check_sample_weight +from ..utils.fixes import sp_version, parse_version + + +class QuantileRegressor(LinearModel, RegressorMixin, BaseEstimator): + """Linear regression model that predicts conditional quantiles. + + The linear :class:`QuantileRegressor` optimizes the pinball loss for a + desired `quantile` and is robust to outliers. + + This model uses an L1 regularization like + :class:`~sklearn.linear_model.Lasso`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + quantile : float, default=0.5 + The quantile that the model tries to predict. It must be strictly + between 0 and 1. If 0.5 (default), the model predicts the 50% + quantile, i.e. the median. + + alpha : float, default=1.0 + Regularization constant that multiplies the L1 penalty term. + + fit_intercept : bool, default=True + Whether or not to fit the intercept. + + solver : {'highs-ds', 'highs-ipm', 'highs', 'interior-point', \ + 'revised simplex'}, default='interior-point' + Method used by :func:`scipy.optimize.linprog` to solve the linear + programming formulation. Note that the highs methods are recommended + for usage with `scipy>=1.6.0` because they are the fastest ones. + + solver_options : dict, default=None + Additional parameters passed to :func:`scipy.optimize.linprog` as + options. If `None` and if `solver='interior-point'`, then + `{"lstsq": True}` is passed to :func:`scipy.optimize.linprog` for the + sake of stability. + + Attributes + ---------- + coef_ : array of shape (n_features,) + Estimated coefficients for the features. + + intercept_ : float + The intercept of the model, aka bias term. + + n_iter_ : int + The actual number of iterations performed by the solver. 
+ + See Also + -------- + Lasso : The Lasso is a linear model that estimates sparse coefficients + with l1 regularization. + HuberRegressor : Linear regression model that is robust to outliers. + + Examples + -------- + >>> from sklearn.linear_model import QuantileRegressor + >>> import numpy as np + >>> n_samples, n_features = 10, 2 + >>> rng = np.random.RandomState(0) + >>> y = rng.randn(n_samples) + >>> X = rng.randn(n_samples, n_features) + >>> reg = QuantileRegressor(quantile=0.8).fit(X, y) + >>> np.mean(y <= reg.predict(X)) + 0.8 + """ + + def __init__( + self, + *, + quantile=0.5, + alpha=1.0, + fit_intercept=True, + solver="interior-point", + solver_options=None, + ): + self.quantile = quantile + self.alpha = alpha + self.fit_intercept = fit_intercept + self.solver = solver + self.solver_options = solver_options + + def fit(self, X, y, sample_weight=None): + """Fit the model according to the given training data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + self : object + Returns self. + """ + X, y = self._validate_data( + X, y, accept_sparse=False, y_numeric=True, multi_output=False + ) + sample_weight = _check_sample_weight(sample_weight, X) + + n_features = X.shape[1] + n_params = n_features + + if self.fit_intercept: + n_params += 1 + # Note that centering y and X with _preprocess_data does not work + # for quantile regression. + + # The objective is defined as 1/n * sum(pinball loss) + alpha * L1. + # So we rescale the penalty term, which is equivalent. + if self.alpha >= 0: + alpha = np.sum(sample_weight) * self.alpha + else: + raise ValueError( + f"Penalty alpha must be a non-negative number, " + f"got {self.alpha}" + ) + + if self.quantile >= 1.0 or self.quantile <= 0.0: + raise ValueError( + f"Quantile should be strictly between 0.0 and 1.0, got " + f"{self.quantile}" + ) + + if not isinstance(self.fit_intercept, bool): + raise ValueError( + f"The argument fit_intercept must be bool, " + f"got {self.fit_intercept}" + ) + + if self.solver not in ( + "highs-ds", + "highs-ipm", + "highs", + "interior-point", + "revised simplex", + ): + raise ValueError( + f"Invalid value for argument solver, got {self.solver}" + ) + elif self.solver == "revised simplex" and sp_version < parse_version( + "1.3.0" + ): + raise ValueError( + f"Solver 'revised simplex' is only available " + f"with scipy>=1.3.0, got {sp_version}" + ) + elif self.solver in ( + "highs-ds", + "highs-ipm", + "highs", + ) and sp_version < parse_version("1.6.0"): + raise ValueError( + f"Solver {self.solver} is only available " + f"with scipy>=1.6.0, got {sp_version}" + ) + + if self.solver_options is not None and not isinstance( + self.solver_options, dict + ): + raise ValueError( + f"Invalid value for argument solver_options, " + f"must be None or a dictionary, got " + f"{self.solver_options}" + ) + + # make default solver more stable + if self.solver_options is None and self.solver == "interior-point": + solver_options = {"lstsq": True} + else: + solver_options = self.solver_options + + # Use linear programming formulation of quantile regression + # min_x c x + # A_eq x = b_eq + # 0 <= x + # x = (s0, s, t0, t, u, v) = slack variables + # intercept = s0 + t0 + # coef = s + t + # c = (alpha * 1_p, alpha * 1_p, quantile * 1_n, (1-quantile) * 1_n) + # residual = y - X@coef - intercept = u - v + # A_eq = (1_n, 
X, -1_n, -X, diag(1_n), -diag(1_n)) + # b_eq = y + # p = n_features + fit_intercept + # n = n_samples + # 1_n = vector of length n with entries equal one + # see https://stats.stackexchange.com/questions/384909/ + # + # Filtering out zero samples weights from the beginning makes life + # easier for the linprog solver. + mask = sample_weight != 0 + n_mask = int(np.sum(mask)) # use n_mask instead of n_samples + c = np.concatenate( + [ + np.full(2 * n_params, fill_value=alpha), + sample_weight[mask] * self.quantile, + sample_weight[mask] * (1 - self.quantile), + ] + ) + if self.fit_intercept: + # do not penalize the intercept + c[0] = 0 + c[n_params] = 0 + + A_eq = np.concatenate( + [ + np.ones((n_mask, 1)), + X[mask], + -np.ones((n_mask, 1)), + -X[mask], + np.eye(n_mask), + -np.eye(n_mask), + ], + axis=1, + ) + else: + A_eq = np.concatenate( + [X[mask], -X[mask], np.eye(n_mask), -np.eye(n_mask)], axis=1 + ) + + b_eq = y[mask] + + result = linprog( + c=c, + A_eq=A_eq, + b_eq=b_eq, + method=self.solver, + options=solver_options, + ) + solution = result.x + if not result.success: + failure = { + 1: "Iteration limit reached.", + 2: "Problem appears to be infeasible.", + 3: "Problem appears to be unbounded.", + 4: "Numerical difficulties encountered.", + } + warnings.warn( + f"Linear programming for QuantileRegressor did not succeed.\n" + f"Status is {result.status}: " + + failure.setdefault(result.status, "unknown reason") + "\n" + + "Result message of linprog:\n" + result.message, + ConvergenceWarning + ) + + # positive slack - negative slack + # solution is an array with (params_pos, params_neg, u, v) + params = solution[:n_params] - solution[n_params:2 * n_params] + + self.n_iter_ = result.nit + + if self.fit_intercept: + self.coef_ = params[1:] + self.intercept_ = params[0] + else: + self.coef_ = params + self.intercept_ = 0.0 + return self diff --git a/sklearn/linear_model/tests/test_quantile.py b/sklearn/linear_model/tests/test_quantile.py new file mode 100644 index 0000000000000..6118889f4d1b6 --- /dev/null +++ b/sklearn/linear_model/tests/test_quantile.py @@ -0,0 +1,254 @@ +# Authors: David Dale +# Christian Lorentzen +# License: BSD 3 clause + +import numpy as np +import pytest +from pytest import approx +from scipy.optimize import minimize + +from sklearn.datasets import make_regression +from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model import HuberRegressor, QuantileRegressor +from sklearn.metrics import mean_pinball_loss +from sklearn.utils._testing import assert_allclose +from sklearn.utils.fixes import parse_version, sp_version + + +@pytest.fixture +def X_y_data(): + X, y = make_regression(n_samples=10, n_features=1, random_state=0, noise=1) + return X, y + + +@pytest.mark.parametrize( + "params, err_msg", + [ + ({"quantile": 2}, "Quantile should be strictly between 0.0 and 1.0"), + ({"quantile": 1}, "Quantile should be strictly between 0.0 and 1.0"), + ({"quantile": 0}, "Quantile should be strictly between 0.0 and 1.0"), + ({"quantile": -1}, "Quantile should be strictly between 0.0 and 1.0"), + ({"alpha": -1.5}, "Penalty alpha must be a non-negative number"), + ({"fit_intercept": "blah"}, "The argument fit_intercept must be bool"), + ({"fit_intercept": 0}, "The argument fit_intercept must be bool"), + ({"solver": "blah"}, "Invalid value for argument solver"), + ( + {"solver_options": "blah"}, + "Invalid value for argument solver_options", + ), + ], +) +def test_init_parameters_validation(X_y_data, params, err_msg): + """Test that invalid init parameters 
raise errors.""" + X, y = X_y_data + with pytest.raises(ValueError, match=err_msg): + QuantileRegressor(**params).fit(X, y) + + +@pytest.mark.parametrize("solver", ("highs-ds", "highs-ipm", "highs")) +@pytest.mark.skipif(sp_version >= parse_version('1.6.0'), + reason="Solvers are available as of scipy 1.6.0") +def test_too_new_solver_methods_raise_error(X_y_data, solver): + """Test that highs solver raises for scipy<1.6.0.""" + X, y = X_y_data + with pytest.raises(ValueError, match="scipy>=1.6.0"): + QuantileRegressor(solver=solver).fit(X, y) + + +@pytest.mark.parametrize( + "quantile, alpha, intercept, coef", + [ + # for 50% quantile w/o regularization, any slope in [1, 10] is okay + [0.5, 0, 1, None], + # if positive error costs more, the slope is maximal + [0.51, 0, 1, 10], + # if negative error costs more, the slope is minimal + [0.49, 0, 1, 1], + # for a small lasso penalty, the slope is also minimal + [0.5, 0.01, 1, 1], + # for a large lasso penalty, the model predicts the constant median + [0.5, 100, 2, 0], + ], +) +def test_quantile_toy_example(quantile, alpha, intercept, coef): + # test how different parameters affect a small intuitive example + X = [[0], [1], [1]] + y = [1, 2, 11] + model = QuantileRegressor(quantile=quantile, alpha=alpha).fit(X, y) + assert_allclose(model.intercept_, intercept, atol=1e-2) + if coef is not None: + assert_allclose(model.coef_[0], coef, atol=1e-2) + if alpha < 100: + assert model.coef_[0] >= 1 + assert model.coef_[0] <= 10 + + +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_quantile_equals_huber_for_low_epsilon(fit_intercept): + X, y = make_regression( + n_samples=100, n_features=20, random_state=0, noise=1.0 + ) + alpha = 1e-4 + huber = HuberRegressor( + epsilon=1 + 1e-4, alpha=alpha, fit_intercept=fit_intercept + ).fit(X, y) + quant = QuantileRegressor(alpha=alpha, fit_intercept=fit_intercept).fit( + X, y + ) + assert_allclose(huber.coef_, quant.coef_, atol=1e-1) + if fit_intercept: + assert huber.intercept_ == approx(quant.intercept_, abs=1e-1) + # check that we still predict fraction + assert np.mean(y < quant.predict(X)) == approx(0.5, abs=1e-1) + + +@pytest.mark.parametrize("q", [0.5, 0.9, 0.05]) +def test_quantile_estimates_calibration(q): + # Test that model estimates percentage of points below the prediction + X, y = make_regression( + n_samples=1000, n_features=20, random_state=0, noise=1.0 + ) + quant = QuantileRegressor( + quantile=q, + alpha=0, + solver_options={"lstsq": False}, + ).fit(X, y) + assert np.mean(y < quant.predict(X)) == approx(q, abs=1e-2) + + +def test_quantile_sample_weight(): + # test that with unequal sample weights we still estimate weighted fraction + n = 1000 + X, y = make_regression( + n_samples=n, n_features=5, random_state=0, noise=10.0 + ) + weight = np.ones(n) + # when we increase weight of upper observations, + # estimate of quantile should go up + weight[y > y.mean()] = 100 + quant = QuantileRegressor( + quantile=0.5, + alpha=1e-8, + solver_options={"lstsq": False} + ) + quant.fit(X, y, sample_weight=weight) + fraction_below = np.mean(y < quant.predict(X)) + assert fraction_below > 0.5 + weighted_fraction_below = np.average(y < quant.predict(X), weights=weight) + assert weighted_fraction_below == approx(0.5, abs=3e-2) + + +@pytest.mark.parametrize("quantile", [0.2, 0.5, 0.8]) +def test_asymmetric_error(quantile): + """Test quantile regression for asymmetric distributed targets.""" + n_samples = 1000 + rng = np.random.RandomState(42) + # take care that X @ coef + intercept > 0 + X = 
np.concatenate( + ( + np.abs(rng.randn(n_samples)[:, None]), + -rng.randint(2, size=(n_samples, 1)), + ), + axis=1, + ) + intercept = 1.23 + coef = np.array([0.5, -2]) + # For an exponential distribution with rate lambda, e.g. exp(-lambda * x), + # the quantile at level q is: + # quantile(q) = - log(1 - q) / lambda + # scale = 1/lambda = -quantile(q) / log(1-q) + y = rng.exponential( + scale=-(X @ coef + intercept) / np.log(1 - quantile), size=n_samples + ) + model = QuantileRegressor( + quantile=quantile, + alpha=0, + solver="interior-point", + solver_options={"tol": 1e-5}, + ).fit(X, y) + assert model.intercept_ == approx(intercept, rel=0.2) + assert_allclose(model.coef_, coef, rtol=0.6) + assert_allclose(np.mean(model.predict(X) > y), quantile) + + # Now compare to Nelder-Mead optimization with L1 penalty + alpha = 0.01 + model.set_params(alpha=alpha).fit(X, y) + model_coef = np.r_[model.intercept_, model.coef_] + + def func(coef): + loss = mean_pinball_loss(y, X @ coef[1:] + coef[0], alpha=quantile) + L1 = np.sum(np.abs(coef[1:])) + return loss + alpha * L1 + + res = minimize( + fun=func, + x0=[1, 0, -1], + method="Nelder-Mead", + tol=1e-12, + options={"maxiter": 2000}, + ) + + assert func(model_coef) == approx(func(res.x), rel=1e-3) + assert_allclose(model.intercept_, res.x[0], rtol=1e-3) + assert_allclose(model.coef_, res.x[1:], rtol=1e-3) + assert_allclose(np.mean(model.predict(X) > y), quantile, rtol=8e-3) + + +@pytest.mark.parametrize("quantile", [0.2, 0.5, 0.8]) +def test_equivariance(quantile): + """Test equivariace of quantile regression. + + See Koenker (2005) Quantile Regression, Chapter 2.2.3. + """ + rng = np.random.RandomState(42) + n_samples, n_features = 100, 5 + X, y = make_regression( + n_samples=n_samples, + n_features=n_features, + n_informative=n_features, + noise=0, + random_state=rng, + shuffle=False, + ) + # make y asymmetric + y += rng.exponential(scale=100, size=y.shape) + params = dict(alpha=0, solver_options={"lstsq": True, "tol": 1e-10}) + model1 = QuantileRegressor(quantile=quantile, **params).fit(X, y) + + # coef(q; a*y, X) = a * coef(q; y, X) + a = 2.5 + model2 = QuantileRegressor(quantile=quantile, **params).fit(X, a * y) + assert model2.intercept_ == approx(a * model1.intercept_, rel=1e-5) + assert_allclose(model2.coef_, a * model1.coef_, rtol=1e-5) + + # coef(1-q; -a*y, X) = -a * coef(q; y, X) + model2 = QuantileRegressor(quantile=1 - quantile, **params).fit(X, -a * y) + assert model2.intercept_ == approx(-a * model1.intercept_, rel=1e-5) + assert_allclose(model2.coef_, -a * model1.coef_, rtol=1e-5) + + # coef(q; y + X @ g, X) = coef(q; y, X) + g + g_intercept, g_coef = rng.randn(), rng.randn(n_features) + model2 = QuantileRegressor(quantile=quantile, **params) + model2.fit(X, y + X @ g_coef + g_intercept) + assert model2.intercept_ == approx(model1.intercept_ + g_intercept) + assert_allclose(model2.coef_, model1.coef_ + g_coef, rtol=1e-6) + + # coef(q; y, X @ A) = A^-1 @ coef(q; y, X) + A = rng.randn(n_features, n_features) + model2 = QuantileRegressor(quantile=quantile, **params) + model2.fit(X @ A, y) + assert model2.intercept_ == approx(model1.intercept_, rel=1e-5) + assert_allclose(model2.coef_, np.linalg.solve(A, model1.coef_), rtol=1e-5) + + +def test_linprog_failure(): + """Test that linprog fails.""" + X = np.linspace(0, 10, num=10).reshape(-1, 1) + y = np.linspace(0, 10, num=10) + reg = QuantileRegressor( + alpha=0, solver="interior-point", solver_options={"maxiter": 1} + ) + + msg = "Linear programming for QuantileRegressor did not succeed." 
+ with pytest.warns(ConvergenceWarning, match=msg): + reg.fit(X, y) From 6ec090f3935e1c7cf3d836ddb47f9db8502a98aa Mon Sep 17 00:00:00 2001 From: Eleni Markou Date: Wed, 26 May 2021 16:27:04 +0300 Subject: [PATCH 417/478] DOC fix broken links in faq.rst and glossary.rst (#20122) --- doc/faq.rst | 1 - doc/glossary.rst | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/faq.rst b/doc/faq.rst index 4038106bc93d7..43ef246594de1 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -20,7 +20,6 @@ sy-kit learn. sci stands for science! Why scikit? ------------ There are multiple scikits, which are scientific toolboxes built around SciPy. -You can find a list at ``_. Apart from scikit-learn, another popular one is `scikit-image `_. How can I contribute to scikit-learn? diff --git a/doc/glossary.rst b/doc/glossary.rst index a43eda4a79b67..ba924387bc5eb 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -171,7 +171,7 @@ General Concepts one-hot encode categorical features. See also :ref:`preprocessing_categorical_features` and the `categorical-encoding - `_ + `_ package for tools related to encoding categorical features. clone From 7c212a2966a942d22da8935f1068e49a45ebc340 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 16:18:15 +0200 Subject: [PATCH 418/478] EXA improve the example for QuantileRegressor (#20133) --- .../linear_model/plot_quantile_regression.py | 352 ++++++++++++++---- 1 file changed, 281 insertions(+), 71 deletions(-) diff --git a/examples/linear_model/plot_quantile_regression.py b/examples/linear_model/plot_quantile_regression.py index 8af7785cc6733..404d7a314d553 100644 --- a/examples/linear_model/plot_quantile_regression.py +++ b/examples/linear_model/plot_quantile_regression.py @@ -2,6 +2,7 @@ =================== Quantile regression =================== + This example illustrates how quantile regression can predict non-trivial conditional quantiles. @@ -11,100 +12,309 @@ The right figure shows an example of an asymmetric error distribution, namely the Pareto distribution. """ + print(__doc__) + # Authors: David Dale # Christian Lorentzen +# Guillaume Lemaitre # License: BSD 3 clause + +# %% +# Dataset generation +# ------------------ +# +# To illustrate the behaviour of quantile regression, we will generate two +# synthetic datasets. The true generative random processess for both datasets +# will be composed by the same expected value with a linear relationship with a +# single feature `x`. import numpy as np + +rng = np.random.RandomState(42) +x = np.linspace(start=0, stop=10, num=100) +X = x[:, np.newaxis] +y_true_mean = 10 + 0.5 * x + +# %% +# We will create two subsequent problems by changing the distribution of the +# target `y` while keeping the same expected value: +# +# - in the first case, a heteroscedastic Normal noise is added; +# - in the second case, an asymmetric Pareto noise is added. +y_normal = y_true_mean + rng.normal( + loc=0, scale=0.5 + 0.5 * x, size=x.shape[0] +) +a = 5 +y_pareto = y_true_mean + 10 * (rng.pareto(a, size=x.shape[0]) - 1 / (a - 1)) + +# %% +# Let's first visualize the datasets as well as the distribution of the +# residuals `y - mean(y)`. 
 import matplotlib.pyplot as plt
 
-from sklearn.linear_model import QuantileRegressor, LinearRegression
-from sklearn.metrics import mean_absolute_error, mean_squared_error
-from sklearn.model_selection import cross_val_score
+_, axs = plt.subplots(
+    nrows=2, ncols=2, figsize=(15, 11), sharex="row", sharey="row"
+)
+axs[0, 0].plot(x, y_true_mean, label="True mean")
+axs[0, 0].scatter(
+    x, y_normal, color="black", alpha=0.5, label="Observations"
+)
+axs[1, 0].hist(y_true_mean - y_normal, edgecolor="black")
 
-def plot_points_highlighted(x, y, model_low, model_high, ax):
-    """Plot points with highlighting."""
-    mask = y <= model_low.predict(X)
-    ax.scatter(x[mask], y[mask], c="k", marker="x")
-    mask = y > model_high.predict(X)
-    ax.scatter(x[mask], y[mask], c="k", marker="x")
-    mask = (y > model_low.predict(X)) & (y <= model_high.predict(X))
-    ax.scatter(x[mask], y[mask], c="k")
+axs[0, 1].plot(x, y_true_mean, label="True mean")
+axs[0, 1].scatter(
+    x, y_pareto, color="black", alpha=0.5, label="Observations"
+)
+axs[1, 1].hist(y_true_mean - y_pareto, edgecolor="black")
 
-fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
+axs[0, 0].set_title("Dataset with heteroscedastic Normal distributed targets")
+axs[0, 1].set_title("Dataset with asymmetric Pareto distributed target")
+axs[1, 0].set_title(
+    "Residuals distribution for heteroscedastic Normal distributed targets"
+)
+axs[1, 1].set_title(
+    "Residuals distribution for asymmetric Pareto distributed target"
+)
+axs[0, 0].legend()
+axs[0, 1].legend()
+axs[0, 0].set_ylabel("y")
+axs[1, 0].set_ylabel("Counts")
+axs[0, 1].set_xlabel("x")
+axs[0, 0].set_xlabel("x")
+axs[1, 0].set_xlabel("Residuals")
+_ = axs[1, 1].set_xlabel("Residuals")
 
-rng = np.random.RandomState(42)
-x = np.linspace(0, 10, 100)
-X = x[:, np.newaxis]
-y = 10 + 0.5 * x + rng.normal(loc=0, scale=0.5 + 0.5 * x, size=x.shape[0])
-y_mean = 10 + 0.5 * x
-ax1.plot(x, y_mean, "k--")
+# %%
+# With the heteroscedastic Normal distributed target, we observe that the
+# variance of the noise is increasing when the value of the feature `x` is
+# increasing.
+#
+# With the asymmetric Pareto distributed target, we observe that the positive
+# residuals are bounded.
+#
+# These types of noisy targets make the estimation via
+# :class:`~sklearn.linear_model.LinearRegression` less efficient, i.e. we need
+# more data to get stable results and, in addition, large outliers can have a
+# huge impact on the fitted coefficients. (Stated otherwise: in a setting with
+# constant variance, ordinary least squares estimators converge much faster to
+# the *true* coefficients with increasing sample size.)
+#
+# In this asymmetric setting, the median or different quantiles give additional
+# insights. On top of that, median estimation is much more robust to outliers
+# and heavy-tailed distributions. But note that extreme quantiles are estimated
+# by very few data points. The 95% quantile is more or less estimated from the
+# 5% largest values and is thus also a bit sensitive to outliers.
+#
+# In the remainder of this tutorial, we will show how
+# :class:`~sklearn.linear_model.QuantileRegressor` can be used in practice and
+# give some intuition about the properties of the fitted models. Finally,
+# we will compare both :class:`~sklearn.linear_model.QuantileRegressor`
+# and :class:`~sklearn.linear_model.LinearRegression`.
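# A minimal sketch (not from the patch itself) of the pinball loss that
# quantile regression minimizes: under-predictions are weighted by the target
# quantile `q` and over-predictions by `1 - q`. It should agree with
# `sklearn.metrics.mean_pinball_loss`, which the quantile regression tests
# earlier in this patch series rely on.
import numpy as np

def pinball_loss(y_true, y_pred, q):
    # max(q * diff, (q - 1) * diff) equals q * diff for under-predictions
    # (diff > 0) and (1 - q) * |diff| for over-predictions (diff < 0)
    diff = y_true - y_pred
    return np.mean(np.maximum(q * diff, (q - 1) * diff))

# With q=0.5 this is half of the mean absolute error, which is why the median
# regressor is later compared against MAE in this example.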
+# +# Fitting a `QuantileRegressor` +# ----------------------------- +# +# In this section, we want to estimate the conditional median as well as +# a low and high quantile fixed at 5% and 95%, respectively. Thus, we will get +# three linear models, one for each quantile. +# +# We will use the quantiles at 5% and 95% to find the outliers in the training +# sample beyond the central 90% interval. +from sklearn.linear_model import QuantileRegressor quantiles = [0.05, 0.5, 0.95] -models = [] +predictions = {} +out_bounds_predictions = np.zeros_like(y_true_mean, dtype=np.bool_) for quantile in quantiles: qr = QuantileRegressor(quantile=quantile, alpha=0) - qr.fit(X, y) - ax1.plot(x, qr.predict(X)) - models.append(qr) + y_pred = qr.fit(X, y_normal).predict(X) + predictions[quantile] = y_pred -plot_points_highlighted(x, y, models[0], models[2], ax1) -ax1.set_xlabel("x") -ax1.set_ylabel("y") -ax1.set_title("Quantiles of heteroscedastic Normal distributed target") -ax1.legend(["true mean"] + quantiles) + if quantile == min(quantiles): + out_bounds_predictions = np.logical_or( + out_bounds_predictions, y_pred >= y_normal + ) + elif quantile == max(quantiles): + out_bounds_predictions = np.logical_or( + out_bounds_predictions, y_pred <= y_normal + ) +# %% +# Now, we can plot the three linear models and the distinguished samples that +# are within the central 90% interval from samples that are outside this +# interval. +plt.plot(X, y_true_mean, color="black", linestyle="dashed", label="True mean") -a = 5 -y = 10 + 0.5 * x + 10 * (rng.pareto(a, size=x.shape[0]) - 1 / (a - 1)) -ax2.plot(x, y_mean, "k--") +for quantile, y_pred in predictions.items(): + plt.plot(X, y_pred, label=f"Quantile: {quantile}") + +plt.scatter( + x[out_bounds_predictions], + y_normal[out_bounds_predictions], + color="black", + marker="+", + alpha=0.5, + label="Outside interval", +) +plt.scatter( + x[~out_bounds_predictions], + y_normal[~out_bounds_predictions], + color="black", + alpha=0.5, + label="Inside interval", +) -models = [] +plt.legend() +plt.xlabel("x") +plt.ylabel("y") +_ = plt.title("Quantiles of heteroscedastic Normal distributed target") + +# %% +# Since the noise is still Normally distributed, in particular is symmetric, +# the true conditional mean and the true conditional median coincide. Indeed, +# we see that the estimated median almost hits the true mean. We observe the +# effect of having an increasing noise variance on the 5% and 95% quantiles: +# the slopes of those quantiles are very different and the interval between +# them becomes wider with increasing `x`. +# +# To get an additional intuition regarding the meaning of the 5% and 95% +# quantiles estimators, one can count the number of samples above and below the +# predicted quantiles (represented by a cross on the above plot), considering +# that we have a total of 100 samples. +# +# We can repeat the same experiment using the asymmetric Pareto distributed +# target. 
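# Illustrative sketch (not from the patch itself) of the sample counting
# described a few lines up, reusing the `predictions` and `y_normal` objects
# defined at this point of the example (still the Normal-noise case here).
import numpy as np

for q, y_pred_q in predictions.items():
    # fraction of observations at or below the fitted q-quantile line;
    # it should be close to q itself
    print(q, np.mean(y_normal <= y_pred_q))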
+quantiles = [0.05, 0.5, 0.95] +predictions = {} +out_bounds_predictions = np.zeros_like(y_true_mean, dtype=np.bool_) for quantile in quantiles: qr = QuantileRegressor(quantile=quantile, alpha=0) - qr.fit(X, y) - ax2.plot([0, 10], qr.predict([[0], [10]])) - models.append(qr) + y_pred = qr.fit(X, y_pareto).predict(X) + predictions[quantile] = y_pred + + if quantile == min(quantiles): + out_bounds_predictions = np.logical_or( + out_bounds_predictions, y_pred >= y_pareto + ) + elif quantile == max(quantiles): + out_bounds_predictions = np.logical_or( + out_bounds_predictions, y_pred <= y_pareto + ) + +# %% +plt.plot(X, y_true_mean, color="black", linestyle="dashed", label="True mean") + +for quantile, y_pred in predictions.items(): + plt.plot(X, y_pred, label=f"Quantile: {quantile}") + +plt.scatter( + x[out_bounds_predictions], + y_pareto[out_bounds_predictions], + color="black", + marker="+", + alpha=0.5, + label="Outside interval", +) +plt.scatter( + x[~out_bounds_predictions], + y_pareto[~out_bounds_predictions], + color="black", + alpha=0.5, + label="Inside interval", +) + +plt.legend() +plt.xlabel("x") +plt.ylabel("y") +_ = plt.title("Quantiles of asymmetric Pareto distributed target") -plot_points_highlighted(x, y, models[0], models[2], ax2) -ax2.set_xlabel("x") -ax2.set_ylabel("y") -ax2.set_title("Quantiles of asymmetric Pareto distributed target") -ax2.legend(["true mean"] + quantiles, loc="lower right") -ax2.yaxis.set_tick_params(labelbottom=True) -plt.show() +# %% +# Due to the asymmetry of the distribution of the noise, we observe that the +# true mean and estimated conditional median are different. We also observe +# that each quantile model has different parameters to better fit the desired +# quantile. Note that ideally, all quantiles would be parallel in this case, +# which would become more visible with more data points or less extreme +# quantiles, e.g. 10% and 90%. +# +# Comparing `QuantileRegressor` and `LinearRegression` +# ---------------------------------------------------- +# +# In this section, we will linger on the difference regarding the error that +# :class:`~sklearn.linear_model.QuantileRegressor` and +# :class:`~sklearn.linear_model.LinearRegression` are minimizing. +# +# Indeed, :class:`~sklearn.linear_model.LinearRegression` is a least squares +# approach minimizing the mean squared error (MSE) between the training and +# predicted targets. In contrast, +# :class:`~sklearn.linear_model.QuantileRegressor` with `quantile=0.5` +# minimizes the mean absolute error (MAE) instead. +# +# Let's first compute the training errors of such models in terms of mean +# squared error and mean absolute error. We will use the asymmetric Pareto +# distributed target to make it more interesting as mean and median are not +# equal. 
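# A tiny standalone check (not from the patch itself) of the claim above that
# the mean minimizes the squared error while the median minimizes the absolute
# error, using an asymmetric toy sample and a grid of constant predictions.
import numpy as np

toy = np.array([1.0, 2.0, 3.0, 4.0, 15.0])
grid = np.linspace(0, 15, 1501)
best_mse = grid[np.argmin([np.mean((toy - c) ** 2) for c in grid])]
best_mae = grid[np.argmin([np.mean(np.abs(toy - c)) for c in grid])]
print(best_mse, toy.mean())      # both approximately 5.0
print(best_mae, np.median(toy))  # both approximately 3.0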
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import mean_absolute_error
+from sklearn.metrics import mean_squared_error
+
+linear_regression = LinearRegression()
+quantile_regression = QuantileRegressor(quantile=0.5, alpha=0)
+
+y_pred_lr = linear_regression.fit(X, y_pareto).predict(X)
+y_pred_qr = quantile_regression.fit(X, y_pareto).predict(X)
+
+print(
+    f"""Training error (in-sample performance)
+    {linear_regression.__class__.__name__}:
+    MAE = {mean_absolute_error(y_pareto, y_pred_lr):.3f}
+    MSE = {mean_squared_error(y_pareto, y_pred_lr):.3f}
+    {quantile_regression.__class__.__name__}:
+    MAE = {mean_absolute_error(y_pareto, y_pred_qr):.3f}
+    MSE = {mean_squared_error(y_pareto, y_pred_qr):.3f}
+    """
+)
+
+# %%
+# On the training set, we see that MAE is lower for
+# :class:`~sklearn.linear_model.QuantileRegressor` than
+# :class:`~sklearn.linear_model.LinearRegression`. In contrast to that, MSE is
+# lower for :class:`~sklearn.linear_model.LinearRegression` than
+# :class:`~sklearn.linear_model.QuantileRegressor`. These results confirm that
+# MAE is the loss minimized by :class:`~sklearn.linear_model.QuantileRegressor`
+# while MSE is the loss minimized by
+# :class:`~sklearn.linear_model.LinearRegression`.
+#
+# We can make a similar evaluation but looking at the test error obtained by
+# cross-validation.
+from sklearn.model_selection import cross_validate
+
+cv_results_lr = cross_validate(
+    linear_regression,
+    X,
+    y_pareto,
+    cv=3,
+    scoring=["neg_mean_absolute_error", "neg_mean_squared_error"],
+)
+cv_results_qr = cross_validate(
+    quantile_regression,
+    X,
+    y_pareto,
+    cv=3,
+    scoring=["neg_mean_absolute_error", "neg_mean_squared_error"],
+)
+print(
+    f"""Test error (cross-validated performance)
+    {linear_regression.__class__.__name__}:
+    MAE = {-cv_results_lr["test_neg_mean_absolute_error"].mean():.3f}
+    MSE = {-cv_results_lr["test_neg_mean_squared_error"].mean():.3f}
+    {quantile_regression.__class__.__name__}:
+    MAE = {-cv_results_qr["test_neg_mean_absolute_error"].mean():.3f}
+    MSE = {-cv_results_qr["test_neg_mean_squared_error"].mean():.3f}
+    """
+)
 
 # %%
-# Note that both targets have the same mean value, indicated by the dashed
-# black line. As the Normal distribution is symmetric, mean and median are
-# identical and the predicted 0.5 quantile almost hits the true mean.
-# In the Pareto case, the difference between predicted median and true mean
-# is evident. We also marked the points below the 0.05 and above 0.95
-# predicted quantiles by small crosses. You might count them and consider
-# that we have 100 samples in total.
-#
-# The second part of the example shows that LinearRegression minimizes MSE
-# in order to predict the mean, while QuantileRegressor with `quantile=0.5`
-# minimizes MAE in order to predict the median. Both do their own job well.
- -models = [LinearRegression(), QuantileRegressor(alpha=0)] -names = ["OLS", "Quantile"] - -print("# In-sample performance") -for model_name, model in zip(names, models): - print(model_name + ":") - model.fit(X, y) - mae = mean_absolute_error(model.predict(X), y) - rmse = np.sqrt(mean_squared_error(model.predict(X), y)) - print(f"MAE = {mae:.4} RMSE = {rmse:.4}") -print("\n# Cross-validated performance") -for model_name, model in zip(names, models): - print(model_name + ":") - mae = -cross_val_score(model, X, y, cv=3, - scoring="neg_mean_absolute_error").mean() - rmse = np.sqrt(-cross_val_score(model, X, y, cv=3, - scoring="neg_mean_squared_error").mean()) - print(f"MAE = {mae:.4} RMSE = {rmse:.4}") +# We reach similar conclusions on the out-of-sample evaluation. From 7c873713df056a9554dd545b0d5f0be93630219b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 17:45:01 +0200 Subject: [PATCH 419/478] DOC change figure in user guide of quantile regression --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 7fc14693c198d..4b76c35245d36 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -1466,7 +1466,7 @@ normally with zero mean and constant variance. Quantile regression provides sensible prediction intervals even for errors with non-constant (but predictable) variance or non-normal distribution. -.. figure:: /auto_examples/linear_model/images/sphx_glr_plot_quantile_regression_001.png +.. figure:: /auto_examples/linear_model/images/sphx_glr_plot_quantile_regression_002.png :target: ../auto_examples/linear_model/plot_quantile_regression.html :align: center :scale: 50% From 3c72fe50513d886e454ec7f64cdba7f44f2fbd95 Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Thu, 27 May 2021 07:44:12 +0100 Subject: [PATCH 420/478] TST Changes assert_raises to raises in sklearn/utils/test_estimator_checks.py (#20138) Co-authored-by: Alihan Zihna --- sklearn/utils/tests/test_estimator_checks.py | 154 +++++++++---------- 1 file changed, 77 insertions(+), 77 deletions(-) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 4792f50f2baef..301ba2ffd6776 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -12,8 +12,7 @@ from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.utils import deprecated from sklearn.utils._testing import ( - assert_raises, - assert_raises_regex, + raises, assert_warns, ignore_warnings, MinimalClassifier, @@ -413,7 +412,8 @@ def test_not_an_array_array_function(): raise SkipTest("array_function protocol not supported in numpy <1.17") not_array = _NotAnArray(np.ones(10)) msg = "Don't want to call array_function sum!" 
- assert_raises_regex(TypeError, msg, np.sum, not_array) + with raises(TypeError, match=msg): + np.sum(not_array) # always returns True assert np.may_share_memory(not_array, None) @@ -437,92 +437,93 @@ def test_check_estimator(): # check that we have a set_params and can clone msg = "Passing a class was deprecated" - assert_raises_regex(TypeError, msg, check_estimator, object) + with raises(TypeError, match=msg): + check_estimator(object) msg = ( "Parameter 'p' of estimator 'HasMutableParameters' is of type " "object which is not allowed" ) # check that the "default_constructible" test checks for mutable parameters check_estimator(HasImmutableParameters()) # should pass - assert_raises_regex( - AssertionError, msg, check_estimator, HasMutableParameters() - ) + with raises(AssertionError, match=msg): + check_estimator(HasMutableParameters()) # check that values returned by get_params match set_params msg = "get_params result does not match what was passed to set_params" - assert_raises_regex(AssertionError, msg, check_estimator, - ModifiesValueInsteadOfRaisingError()) + with raises(AssertionError, match=msg): + check_estimator(ModifiesValueInsteadOfRaisingError()) assert_warns(UserWarning, check_estimator, RaisesErrorInSetParams()) - assert_raises_regex(AssertionError, msg, check_estimator, - ModifiesAnotherValue()) + with raises(AssertionError, match=msg): + check_estimator(ModifiesAnotherValue()) # check that we have a fit method msg = "object has no attribute 'fit'" - assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator()) + with raises(AttributeError, match=msg): + check_estimator(BaseEstimator()) # check that fit does input validation msg = "Did not raise" - assert_raises_regex(AssertionError, msg, check_estimator, - BaseBadClassifier()) + with raises(AssertionError, match=msg): + check_estimator(BaseBadClassifier()) # check that sample_weights in fit accepts pandas.Series type try: from pandas import Series # noqa msg = ("Estimator NoSampleWeightPandasSeriesType raises error if " "'sample_weight' parameter is of type pandas.Series") - assert_raises_regex( - ValueError, msg, check_estimator, NoSampleWeightPandasSeriesType()) + with raises(ValueError, match=msg): + check_estimator(NoSampleWeightPandasSeriesType()) except ImportError: pass # check that predict does input validation (doesn't accept dicts in input) msg = "Estimator doesn't check for NaN and inf in predict" - assert_raises_regex(AssertionError, msg, check_estimator, - NoCheckinPredict()) + with raises(AssertionError, match=msg): + check_estimator(NoCheckinPredict()) # check that estimator state does not change # at transform/predict/predict_proba time msg = 'Estimator changes __dict__ during predict' - assert_raises_regex(AssertionError, msg, check_estimator, ChangesDict()) + with raises(AssertionError, match=msg): + check_estimator(ChangesDict()) # check that `fit` only changes attribures that # are private (start with an _ or end with a _). msg = ('Estimator ChangesWrongAttribute should not change or mutate ' 'the parameter wrong_attribute from 0 to 1 during fit.') - assert_raises_regex(AssertionError, msg, - check_estimator, ChangesWrongAttribute()) + with raises(AssertionError, match=msg): + check_estimator(ChangesWrongAttribute()) check_estimator(ChangesUnderscoreAttribute()) # check that `fit` doesn't add any public attribute msg = (r'Estimator adds public attribute\(s\) during the fit method.' 
' Estimators are only allowed to add private attributes' ' either started with _ or ended' ' with _ but wrong_attribute added') - assert_raises_regex(AssertionError, msg, - check_estimator, SetsWrongAttribute()) + with raises(AssertionError, match=msg): + check_estimator(SetsWrongAttribute()) # check for sample order invariance name = NotInvariantSampleOrder.__name__ method = 'predict' msg = ("{method} of {name} is not invariant when applied to a dataset" "with different sample order.").format(method=method, name=name) - assert_raises_regex(AssertionError, msg, - check_estimator, NotInvariantSampleOrder()) + with raises(AssertionError, match=msg): + check_estimator(NotInvariantSampleOrder()) # check for invariant method name = NotInvariantPredict.__name__ method = 'predict' msg = ("{method} of {name} is not invariant when applied " "to a subset.").format(method=method, name=name) - assert_raises_regex(AssertionError, msg, - check_estimator, NotInvariantPredict()) + with raises(AssertionError, match=msg): + check_estimator(NotInvariantPredict()) # check for sparse matrix input handling name = NoSparseClassifier.__name__ msg = "Estimator %s doesn't seem to fail gracefully on sparse data" % name - assert_raises_regex( - AssertionError, msg, check_estimator, NoSparseClassifier() - ) + with raises(AssertionError, match=msg): + check_estimator(NoSparseClassifier()) # Large indices test on bad estimator msg = ('Estimator LargeSparseNotSupportedClassifier doesn\'t seem to ' r'support \S{3}_64 matrix, and is not failing gracefully.*') - assert_raises_regex(AssertionError, msg, check_estimator, - LargeSparseNotSupportedClassifier()) + with raises(AssertionError, match=msg): + check_estimator(LargeSparseNotSupportedClassifier()) # does error on binary_only untagged estimator msg = 'Only 2 classes are supported' - assert_raises_regex(ValueError, msg, check_estimator, - UntaggedBinaryClassifier()) + with raises(ValueError, match=msg): + check_estimator(UntaggedBinaryClassifier()) # non-regression test for estimators transforming to sparse data check_estimator(SparseTransformer()) @@ -537,8 +538,8 @@ def test_check_estimator(): # Check regressor with requires_positive_y estimator tag msg = 'negative y values not supported!' 
- assert_raises_regex(ValueError, msg, check_estimator, - RequiresPositiveYRegressor()) + with raises(ValueError, match=msg): + check_estimator(RequiresPositiveYRegressor()) # Does not raise error on classifier with poor_score tag check_estimator(PoorScoreLogisticRegression()) @@ -547,7 +548,8 @@ def test_check_estimator(): def test_check_outlier_corruption(): # should raise AssertionError decision = np.array([0., 1., 1.5, 2.]) - assert_raises(AssertionError, check_outlier_corruption, 1, 2, decision) + with raises(AssertionError): + check_outlier_corruption(1, 2, decision) # should pass decision = np.array([0., 1., 1., 2.]) check_outlier_corruption(1, 2, decision) @@ -555,8 +557,8 @@ def test_check_outlier_corruption(): def test_check_estimator_transformer_no_mixin(): # check that TransformerMixin is not required for transformer tests to run - assert_raises_regex(AttributeError, '.*fit_transform.*', - check_estimator, BadTransformerWithoutMixin()) + with raises(AttributeError, '.*fit_transform.*'): + check_estimator(BadTransformerWithoutMixin()) def test_check_estimator_clones(): @@ -593,8 +595,8 @@ def test_check_estimators_unfitted(): # check that a ValueError/AttributeError is raised when calling predict # on an unfitted estimator msg = "Did not raise" - assert_raises_regex(AssertionError, msg, check_estimators_unfitted, - "estimator", NoSparseClassifier()) + with raises(AssertionError, match=msg): + check_estimators_unfitted("estimator", NoSparseClassifier()) # check that CorrectNotFittedError inherit from either ValueError # or AttributeError @@ -610,19 +612,22 @@ class NonConformantEstimatorNoParamSet(BaseEstimator): def __init__(self, you_should_set_this_=None): pass - assert_raises_regex(AssertionError, - "Estimator estimator_name should not set any" - " attribute apart from parameters during init." - r" Found attributes \['you_should_not_set_this_'\].", - check_no_attributes_set_in_init, - 'estimator_name', - NonConformantEstimatorPrivateSet()) - assert_raises_regex(AttributeError, - "Estimator estimator_name should store all " - "parameters as an attribute during init.", - check_no_attributes_set_in_init, - 'estimator_name', - NonConformantEstimatorNoParamSet()) + msg = ( + "Estimator estimator_name should not set any" + " attribute apart from parameters during init." + r" Found attributes \['you_should_not_set_this_'\]." 
+ ) + with raises(AssertionError, match=msg): + check_no_attributes_set_in_init('estimator_name', + NonConformantEstimatorPrivateSet()) + + msg = ( + "Estimator estimator_name should store all parameters as an attribute" + " during init" + ) + with raises(AttributeError, match=msg): + check_no_attributes_set_in_init('estimator_name', + NonConformantEstimatorNoParamSet()) def test_check_estimator_pairwise(): @@ -639,32 +644,24 @@ def test_check_estimator_pairwise(): def test_check_classifier_data_not_an_array(): - assert_raises_regex(AssertionError, - 'Not equal to tolerance', - check_classifier_data_not_an_array, - 'estimator_name', - EstimatorInconsistentForPandas()) + with raises(AssertionError, match='Not equal to tolerance'): + check_classifier_data_not_an_array('estimator_name', + EstimatorInconsistentForPandas()) def test_check_regressor_data_not_an_array(): - assert_raises_regex(AssertionError, - 'Not equal to tolerance', - check_regressor_data_not_an_array, - 'estimator_name', - EstimatorInconsistentForPandas()) + with raises(AssertionError, match='Not equal to tolerance'): + check_regressor_data_not_an_array('estimator_name', + EstimatorInconsistentForPandas()) def test_check_estimator_get_tags_default_keys(): estimator = EstimatorMissingDefaultTags() err_msg = (r"EstimatorMissingDefaultTags._get_tags\(\) is missing entries" r" for the following default tags: {'allow_nan'}") - assert_raises_regex( - AssertionError, - err_msg, - check_estimator_get_tags_default_keys, - estimator.__class__.__name__, - estimator, - ) + with raises(AssertionError, match=err_msg): + check_estimator_get_tags_default_keys(estimator.__class__.__name__, + estimator) # noop check when _get_tags is not available estimator = MinimalTransformer() @@ -688,12 +685,15 @@ def run_tests_without_pytest(): def test_check_class_weight_balanced_linear_classifier(): # check that ill-computed balanced weights raises an exception - assert_raises_regex(AssertionError, - "Classifier estimator_name is not computing" - " class_weight=balanced properly.", - check_class_weight_balanced_linear_classifier, - 'estimator_name', - BadBalancedWeightsClassifier) + msg = ( + "Classifier estimator_name is not computing class_weight=balanced " + "properly" + ) + with raises(AssertionError, match=msg): + check_class_weight_balanced_linear_classifier( + 'estimator_name', + BadBalancedWeightsClassifier + ) def test_all_estimators_all_public(): From 07880f0ad3c6716772603559f8e1d07c01dc0929 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Thu, 27 May 2021 11:29:22 +0200 Subject: [PATCH 421/478] DOC Update minimal versions for dependencies (#20143) --- README.rst | 11 ++++++----- doc/install.rst | 3 ++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index ebc4339b2ab58..3c685fa4af13e 100644 --- a/README.rst +++ b/README.rst @@ -26,13 +26,13 @@ .. |DOI| image:: https://zenodo.org/badge/21369/scikit-learn/scikit-learn.svg .. _DOI: https://zenodo.org/badge/latestdoi/21369/scikit-learn/scikit-learn -.. |PythonMinVersion| replace:: 3.6 -.. |NumPyMinVersion| replace:: 1.13.3 -.. |SciPyMinVersion| replace:: 0.19.1 +.. |PythonMinVersion| replace:: 3.7 +.. |NumPyMinVersion| replace:: 1.14.5 +.. |SciPyMinVersion| replace:: 1.1.0 .. |JoblibMinVersion| replace:: 0.11 .. |ThreadpoolctlMinVersion| replace:: 2.0.0 -.. |MatplotlibMinVersion| replace:: 2.1.1 -.. |Scikit-ImageMinVersion| replace:: 0.13 +.. |MatplotlibMinVersion| replace:: 2.2.2 +.. |Scikit-ImageMinVersion| replace:: 0.14.5 .. 
|PandasMinVersion| replace:: 0.25.0 .. |SeabornMinVersion| replace:: 0.9.0 .. |PytestMinVersion| replace:: 5.0.1 @@ -70,6 +70,7 @@ scikit-learn requires: **Scikit-learn 0.20 was the last version to support Python 2.7 and Python 3.4.** scikit-learn 0.23 and later require Python 3.6 or newer. +scikit-learn 1.0 and later require Python 3.7 or newer. Scikit-learn plotting capabilities (i.e., functions start with ``plot_`` and classes end with "Display") require Matplotlib (>= |MatplotlibMinVersion|). diff --git a/doc/install.rst b/doc/install.rst index 7912cc4dc4df6..d0b0f50e78f90 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -142,7 +142,8 @@ purpose. Scikit-learn 0.20 was the last version to support Python 2.7 and Python 3.4. Scikit-learn 0.21 supported Python 3.5-3.7. Scikit-learn 0.22 supported Python 3.5-3.8. - Scikit-learn now requires Python 3.6 or newer. + Scikit-learn 0.23 - 0.24 require Python 3.6 or newer. + Scikit-learn 1.0 and later requires Python 3.7 or newer. .. note:: From 99472deef6b87197049d173657ccb7939b938f3e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 27 May 2021 12:45:06 +0200 Subject: [PATCH 422/478] MAINT silence spurious mypy error (#20147) --- sklearn/metrics/tests/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 66df47a778b38..63e37f5590959 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -821,7 +821,7 @@ def test_regression_thresholded_inf_nan_input(metric, y_true, y_score): # Add an additional case for classification only # non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/6809 - [([np.nan, 1, 2], [1, 2, 3])] + [([np.nan, 1, 2], [1, 2, 3])] # type: ignore ) def test_classification_inf_nan_input(metric, y_true, y_score): """check that classification metrics raise a message mentioning the From 67f6a5c6d9ac2a7051d6009237a59462faffd04e Mon Sep 17 00:00:00 2001 From: naozin555 <37050583+naozin555@users.noreply.github.com> Date: Thu, 27 May 2021 20:44:21 +0900 Subject: [PATCH 423/478] Add missing link to user guide in PolynomialFeatures API documentation (#20146) --- sklearn/preprocessing/_polynomial.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 44ac0d2175c4c..ac4703dbb4cb2 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -30,6 +30,8 @@ class PolynomialFeatures(TransformerMixin, BaseEstimator): For example, if an input sample is two dimensional and of the form [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2]. + Read more in the :ref:`User Guide `. + Parameters ---------- degree : int, default=2 From aa86c83b8e31df0367b299e85c88828f48eb1940 Mon Sep 17 00:00:00 2001 From: Venkatachalam N Date: Thu, 27 May 2021 18:58:24 +0530 Subject: [PATCH 424/478] ENH Allowing sparse inputs for prediction in AffinityPropagation (#20117) --- doc/whats_new/v1.0.rst | 5 +++++ sklearn/cluster/_affinity_propagation.py | 2 +- .../tests/test_affinity_propagation.py | 19 +++++++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 29a4bce98ecb0..7255fe82ff628 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -150,6 +150,11 @@ Changelog - |Efficiency| :class:`cluster.MiniBatchKMeans` is now faster in multicore settings. 
:pr:`17622` by :user:`Jérémie du Boisberranger `. +- |Enhancement| The `predict` and `fit_predict` methods of + :class:`cluster.AffinityPropagation` now accept sparse data type for input + data. + :pr:`20117` by :user:`Venkatachalam Natchiappan ` + - |Fix| Fixed a bug in :class:`cluster.MiniBatchKMeans` where the sample weights were partially ignored when the input is sparse. :pr:`17622` by :user:`Jérémie du Boisberranger `. diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index ccae0b7538b58..59620ab31f63d 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -436,7 +436,7 @@ def predict(self, X): Cluster labels. """ check_is_fitted(self) - X = self._validate_data(X, reset=False) + X = self._validate_data(X, reset=False, accept_sparse='csr') if not hasattr(self, "cluster_centers_"): raise ValueError("Predict method is not supported when " "affinity='precomputed'.") diff --git a/sklearn/cluster/tests/test_affinity_propagation.py b/sklearn/cluster/tests/test_affinity_propagation.py index ae2806bf38e59..a42a8112782a5 100644 --- a/sklearn/cluster/tests/test_affinity_propagation.py +++ b/sklearn/cluster/tests/test_affinity_propagation.py @@ -238,6 +238,25 @@ def test_affinity_propagation_float32(): assert_array_equal(afp.labels_, expected) +def test_sparse_input_for_predict(): + # Test to make sure sparse inputs are accepted for predict + # (non-regression test for issue #20049) + af = AffinityPropagation(affinity="euclidean", random_state=42) + af.fit(X) + labels = af.predict(csr_matrix((2, 2))) + assert_array_equal(labels, (2, 2)) + + +def test_sparse_input_for_fit_predict(): + # Test to make sure sparse inputs are accepted for fit_predict + # (non-regression test for issue #20049) + af = AffinityPropagation(affinity="euclidean", random_state=42) + rng = np.random.RandomState(42) + X = csr_matrix(rng.randint(0, 2, size=(5, 5))) + labels = af.fit_predict(X) + assert_array_equal(labels, (0, 1, 1, 2, 3)) + + # TODO: Remove in 1.1 def test_affinity_propagation_pairwise_is_deprecated(): afp = AffinityPropagation(affinity='precomputed') From 495ff48a7cc9a2dc913fd1a7018f2ea36205655e Mon Sep 17 00:00:00 2001 From: kobaski Date: Thu, 27 May 2021 22:43:13 +0900 Subject: [PATCH 425/478] [MRG] resolve ambiguity of the nested cross-val example (#20148) * resolve ambiguity of the nested cross-val example * Update examples/model_selection/plot_nested_cross_validation_iris.py make it more explict Co-authored-by: Olivier Grisel Co-authored-by: Olivier Grisel --- examples/model_selection/plot_nested_cross_validation_iris.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/model_selection/plot_nested_cross_validation_iris.py b/examples/model_selection/plot_nested_cross_validation_iris.py index d6aea44e6c546..a2c53841bc4da 100644 --- a/examples/model_selection/plot_nested_cross_validation_iris.py +++ b/examples/model_selection/plot_nested_cross_validation_iris.py @@ -80,11 +80,12 @@ outer_cv = KFold(n_splits=4, shuffle=True, random_state=i) # Non_nested parameter search and scoring - clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=inner_cv) + clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=outer_cv) clf.fit(X_iris, y_iris) non_nested_scores[i] = clf.best_score_ # Nested CV with parameter optimization + clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=inner_cv) nested_score = cross_val_score(clf, X=X_iris, y=y_iris, cv=outer_cv) nested_scores[i] = 
nested_score.mean() From c9d223ccc58e2569b8e67f1d0217dd57a93ec07f Mon Sep 17 00:00:00 2001 From: jnboehm Date: Thu, 27 May 2021 16:00:56 +0200 Subject: [PATCH 426/478] [MRG] Expand documentation of random_state for spectral methods (#17314) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jan Niklas Böhm Co-authored-by: Thomas J. Fan Co-authored-by: Olivier Grisel --- sklearn/cluster/_spectral.py | 34 +++++++++++++++++-------- sklearn/manifold/_spectral_embedding.py | 32 +++++++++++++++++------ 2 files changed, 48 insertions(+), 18 deletions(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index a1371b925595d..cda6dac64ee54 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -197,11 +197,18 @@ def spectral_clustering(affinity, *, n_clusters=8, n_components=None, used. random_state : int, RandomState instance, default=None - A pseudo random number generator used for the initialization of the - lobpcg eigenvectors decomposition when eigen_solver == 'amg' and by - the K-Means initialization. Use an int to make the randomness - deterministic. - See :term:`Glossary `. + A pseudo random number generator used for the initialization + of the lobpcg eigenvectors decomposition when `eigen_solver == + 'amg'`, and for the K-Means initialization. Use an int to make + the results deterministic across calls (See + :term:`Glossary `). + + .. note:: + When using `eigen_solver == 'amg'`, + it is necessary to also fix the global numpy seed with + `np.random.seed(int)` to get deterministic results. See + https://github.com/pyamg/pyamg/issues/139 for further + information. n_init : int, default=10 Number of time the k-means algorithm will be run with different @@ -322,11 +329,18 @@ class SpectralClustering(ClusterMixin, BaseEstimator): Number of eigenvectors to use for the spectral embedding random_state : int, RandomState instance, default=None - A pseudo random number generator used for the initialization of the - lobpcg eigenvectors decomposition when ``eigen_solver='amg'`` and by - the K-Means initialization. Use an int to make the randomness - deterministic. - See :term:`Glossary `. + A pseudo random number generator used for the initialization + of the lobpcg eigenvectors decomposition when `eigen_solver == + 'amg'`, and for the K-Means initialization. Use an int to make + the results deterministic across calls (See + :term:`Glossary `). + + .. note:: + When using `eigen_solver == 'amg'`, + it is necessary to also fix the global numpy seed with + `np.random.seed(int)` to get deterministic results. See + https://github.com/pyamg/pyamg/issues/139 for further + information. n_init : int, default=10 Number of time the k-means algorithm will be run with different diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index 7fd371ee5af2f..49e64401b6c00 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -178,10 +178,18 @@ def spectral_embedding(adjacency, *, n_components=8, eigen_solver=None, used. random_state : int, RandomState instance or None, default=None - Determines the random number generator used for the initialization of - the lobpcg eigenvectors decomposition when ``solver`` == 'amg'. Pass - an int for reproducible results across multiple function calls. - See :term: `Glossary `. 
+ A pseudo random number generator used for the initialization + of the lobpcg eigen vectors decomposition when `eigen_solver == + 'amg'`, and for the K-Means initialization. Use an int to make + the results deterministic across calls (See + :term:`Glossary `). + + .. note:: + When using `eigen_solver == 'amg'`, + it is necessary to also fix the global numpy seed with + `np.random.seed(int)` to get deterministic results. See + https://github.com/pyamg/pyamg/issues/139 for further + information. eigen_tol : float, default=0.0 Stopping criterion for eigendecomposition of the Laplacian matrix @@ -396,10 +404,18 @@ class SpectralEmbedding(BaseEstimator): 1/n_features. random_state : int, RandomState instance or None, default=None - Determines the random number generator used for the initialization of - the lobpcg eigenvectors when ``solver`` == 'amg'. Pass an int for - reproducible results across multiple function calls. - See :term: `Glossary `. + A pseudo random number generator used for the initialization + of the lobpcg eigen vectors decomposition when `eigen_solver == + 'amg'`, and for the K-Means initialization. Use an int to make + the results deterministic across calls (See + :term:`Glossary `). + + .. note:: + When using `eigen_solver == 'amg'`, + it is necessary to also fix the global numpy seed with + `np.random.seed(int)` to get deterministic results. See + https://github.com/pyamg/pyamg/issues/139 for further + information. eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None The eigenvalue decomposition strategy to use. AMG requires pyamg From bc91f01e541de34adf084b07d5154db15cab9b58 Mon Sep 17 00:00:00 2001 From: michalkrawczyk Date: Fri, 28 May 2021 12:25:55 +0200 Subject: [PATCH 427/478] DOC improve penalty/solver/muticlass support in LogisticRegression* (#19855) Co-authored-by: Guillaume Lemaitre --- sklearn/linear_model/_logistic.py | 114 ++++++++++++++++++++---------- 1 file changed, 77 insertions(+), 37 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index c9f1f42f1eeec..abca6bb30e71f 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -1041,10 +1041,17 @@ class LogisticRegression(LinearClassifierMixin, Parameters ---------- penalty : {'l1', 'l2', 'elasticnet', 'none'}, default='l2' - Used to specify the norm used in the penalization. The 'newton-cg', - 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is - only supported by the 'saga' solver. If 'none' (not supported by the - liblinear solver), no regularization is applied. + Specify the norm of the penalty: + + - `'none'`: no penalty is added; + - `'l2'`: add a L2 penalty term and it is the default choice; + - `'l1'`: add a L1 penalty term; + - `'elasticnet'`: both L1 and L2 penalty terms are added. + + .. warning:: + Some penalties may not work with some solvers. See the parameter + `solver` below, to know the compatibility between the penalty and + solver. .. versionadded:: 0.19 l1 penalty with SAGA solver (allowing 'multinomial' + L1) @@ -1100,21 +1107,38 @@ class LogisticRegression(LinearClassifierMixin, solver : {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, \ default='lbfgs' - Algorithm to use in the optimization problem. - - - For small datasets, 'liblinear' is a good choice, whereas 'sag' and - 'saga' are faster for large ones. - - For multiclass problems, only 'newton-cg', 'sag', 'saga' and 'lbfgs' - handle multinomial loss; 'liblinear' is limited to one-versus-rest - schemes. 
- - 'newton-cg', 'lbfgs', 'sag' and 'saga' handle L2 or no penalty - - 'liblinear' and 'saga' also handle L1 penalty - - 'saga' also supports 'elasticnet' penalty - - 'liblinear' does not support setting ``penalty='none'`` - - Note that 'sag' and 'saga' fast convergence is only guaranteed on - features with approximately the same scale. You can - preprocess the data with a scaler from sklearn.preprocessing. + Algorithm to use in the optimization problem. Default is 'lbfgs'. + To choose a solver, you might want to consider the following aspects: + + - For small datasets, 'liblinear' is a good choice, whereas 'sag' + and 'saga' are faster for large ones; + - For multiclass problems, only 'newton-cg', 'sag', 'saga' and + 'lbfgs' handle multinomial loss; + - 'liblinear' is limited to one-versus-rest schemes. + + .. warning:: + The choice of the algorithm depends on the penalty chosen: + Supported penalties by solver: + + - 'newton-cg' - ['l2', 'none'] + - 'lbfgs' - ['l2', 'none'] + - 'liblinear' - ['l1', 'l2'] + - 'sag' - ['l2', 'none'] + - 'saga' - ['elasticnet', 'l1', 'l2', 'none'] + + .. note:: + 'sag' and 'saga' fast convergence is only guaranteed on + features with approximately the same scale. You can + preprocess the data with a scaler from :mod:`sklearn.preprocessing`. + + .. seealso:: + Refer to the User Guide for more information regarding + :class:`LogisticRegression` and more specifically the + `Table `_ + summarazing solver/penalty supports. + .. versionadded:: 0.17 Stochastic Average Gradient descent solver. @@ -1549,9 +1573,16 @@ class LogisticRegressionCV(LogisticRegression, n_samples > n_features. penalty : {'l1', 'l2', 'elasticnet'}, default='l2' - Used to specify the norm used in the penalization. The 'newton-cg', - 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is - only supported by the 'saga' solver. + Specify the norm of the penalty: + + - `'l2'`: add a L2 penalty term (used by default); + - `'l1'`: add a L1 penalty term; + - `'elasticnet'`: both L1 and L2 penalty terms are added. + + .. warning:: + Some penalties may not work with some solvers. See the parameter + `solver` below, to know the compatibility between the penalty and + solver. scoring : str or callable, default=None A string (see model evaluation documentation) or @@ -1563,21 +1594,30 @@ class LogisticRegressionCV(LogisticRegression, solver : {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, \ default='lbfgs' - Algorithm to use in the optimization problem. - - - For small datasets, 'liblinear' is a good choice, whereas 'sag' and - 'saga' are faster for large ones. - - For multiclass problems, only 'newton-cg', 'sag', 'saga' and 'lbfgs' - handle multinomial loss; 'liblinear' is limited to one-versus-rest - schemes. - - 'newton-cg', 'lbfgs' and 'sag' only handle L2 penalty, whereas - 'liblinear' and 'saga' handle L1 penalty. - - 'liblinear' might be slower in LogisticRegressionCV because it does - not handle warm-starting. - - Note that 'sag' and 'saga' fast convergence is only guaranteed on - features with approximately the same scale. You can preprocess the data - with a scaler from sklearn.preprocessing. + Algorithm to use in the optimization problem. Default is 'lbfgs'. 
+ To choose a solver, you might want to consider the following aspects: + + - For small datasets, 'liblinear' is a good choice, whereas 'sag' + and 'saga' are faster for large ones; + - For multiclass problems, only 'newton-cg', 'sag', 'saga' and + 'lbfgs' handle multinomial loss; + - 'liblinear' might be slower in :class:`LogisticRegressionCV` + because it does not handle warm-starting. 'liblinear' is + limited to one-versus-rest schemes. + + .. warning:: + The choice of the algorithm depends on the penalty chosen: + + - 'newton-cg' - ['l2'] + - 'lbfgs' - ['l2'] + - 'liblinear' - ['l1', 'l2'] + - 'sag' - ['l2'] + - 'saga' - ['elasticnet', 'l1', 'l2'] + + .. note:: + 'sag' and 'saga' fast convergence is only guaranteed on features + with approximately the same scale. You can preprocess the data with + a scaler from :mod:`sklearn.preprocessing`. .. versionadded:: 0.17 Stochastic Average Gradient descent solver. From 9406b3d2a4715fc71005194d30e0256c897453a0 Mon Sep 17 00:00:00 2001 From: naozin555 <37050583+naozin555@users.noreply.github.com> Date: Fri, 28 May 2021 20:53:10 +0900 Subject: [PATCH 428/478] TST Removed assert_warns_message from feature_selection/tests (#20158) --- .../feature_selection/tests/test_feature_select.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index 852c8228b2a76..b5e289cee9a00 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -12,9 +12,7 @@ from sklearn.utils._testing import assert_almost_equal, _convert_container from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_warns from sklearn.utils._testing import ignore_warnings -from sklearn.utils._testing import assert_warns_message from sklearn.utils import safe_mask from sklearn.datasets import make_classification, make_regression @@ -271,8 +269,8 @@ def test_select_kbest_zero(): support = univariate_filter.get_support() gtruth = np.zeros(10, dtype=bool) assert_array_equal(support, gtruth) - X_selected = assert_warns_message(UserWarning, 'No features were selected', - univariate_filter.transform, X) + with pytest.warns(UserWarning, match="No features were selected"): + X_selected = univariate_filter.transform(X) assert X_selected.shape == (20, 0) @@ -620,7 +618,8 @@ def test_f_classif_constant_feature(): X, y = make_classification(n_samples=10, n_features=5) X[:, 0] = 2.0 - assert_warns(UserWarning, f_classif, X, y) + with pytest.warns(UserWarning): + f_classif(X, y) def test_no_feature_selected(): @@ -639,8 +638,8 @@ def test_no_feature_selected(): ] for selector in strict_selectors: assert_array_equal(selector.get_support(), np.zeros(10)) - X_selected = assert_warns_message( - UserWarning, 'No features were selected', selector.transform, X) + with pytest.warns(UserWarning, match="No features were selected"): + X_selected = selector.transform(X) assert X_selected.shape == (40, 0) From 3a64fecd1f1d30a17998b254f94613adee48a930 Mon Sep 17 00:00:00 2001 From: Whidou Date: Fri, 28 May 2021 13:59:21 +0200 Subject: [PATCH 429/478] DOC Improve the description of california_housing (#20160) Co-authored-by: Whidou --- sklearn/datasets/descr/california_housing.rst | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/sklearn/datasets/descr/california_housing.rst 
b/sklearn/datasets/descr/california_housing.rst
index 9ab3b679b68f5..494803a125d12 100644
--- a/sklearn/datasets/descr/california_housing.rst
+++ b/sklearn/datasets/descr/california_housing.rst
@@ -10,27 +10,33 @@ California Housing dataset
     :Number of Attributes: 8 numeric, predictive attributes and the target
 
     :Attribute Information:
-        - MedInc        median income in block
-        - HouseAge      median house age in block
-        - AveRooms      average number of rooms
-        - AveBedrms     average number of bedrooms
-        - Population    block population
-        - AveOccup      average house occupancy
-        - Latitude      house block latitude
-        - Longitude     house block longitude
+        - MedInc        median income in block group
+        - HouseAge      median house age in block group
+        - AveRooms      average number of rooms per household
+        - AveBedrms     average number of bedrooms per household
+        - Population    block group population
+        - AveOccup      average number of household members
+        - Latitude      block group latitude
+        - Longitude     block group longitude
 
     :Missing Attribute Values: None
 
 This dataset was obtained from the StatLib repository.
-http://lib.stat.cmu.edu/datasets/
+https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html
 
-The target variable is the median house value for California districts.
+The target variable is the median house value for California districts,
+expressed in hundreds of thousands of dollars ($100,000).
 
 This dataset was derived from the 1990 U.S. census, using one row per census
 block group. A block group is the smallest geographical unit for which the U.S.
 Census Bureau publishes sample data (a block group typically has a population
 of 600 to 3,000 people).
 
+A household is a group of people residing within a home. Since the average
+number of rooms and bedrooms in this dataset are provided per household, these
+columns may take surprisingly large values for block groups with few households
+and many empty houses, such as vacation resorts.
+
 It can be downloaded/loaded using the
 :func:`sklearn.datasets.fetch_california_housing` function.
""" +# %% print(__doc__) -from time import time -import matplotlib.pyplot as plt - +# %% +# Loading the data and model fitting +# ---------------------------------- +# First, we load the olivetti faces dataset and limit the dataset to contain +# only the first five classes. Then we train a random forest on the dataset +# and evaluate the impurity-based feature importance. One drawback of this +# method is that it cannot be evaluated on a separate test set. For this +# example, we are interested in representing the information learned from +# the full dataset. Also, we'll set the number of cores to use for the tasks. from sklearn.datasets import fetch_olivetti_faces -from sklearn.ensemble import ExtraTreesClassifier -# Number of cores to use to perform parallel fitting of the forest model -n_jobs = 1 +# %% +# We select the number of cores to use to perform parallel fitting of +# the forest model. `-1` means use all available cores. +n_jobs = -1 +# %% # Load the faces dataset data = fetch_olivetti_faces() X, y = data.data, data.target -mask = y < 5 # Limit to 5 classes +# %% +# Limit the dataset to 5 classes. +mask = y < 5 X = X[mask] y = y[mask] -# Build a forest and compute the pixel importances -print("Fitting ExtraTreesClassifier on faces data with %d cores..." % n_jobs) -t0 = time() -forest = ExtraTreesClassifier(n_estimators=1000, - max_features=128, - n_jobs=n_jobs, - random_state=0) +# %% +# A random forest classifier will be fitted to compute the feature importances. +from sklearn.ensemble import RandomForestClassifier + +forest = RandomForestClassifier( + n_estimators=750, n_jobs=n_jobs, random_state=42) forest.fit(X, y) -print("done in %0.3fs" % (time() - t0)) + +# %% +# Feature importance based on mean decrease in impurity (MDI) +# ----------------------------------------------------------- +# Feature importances are provided by the fitted attribute +# `feature_importances_` and they are computed as the mean and standard +# deviation of accumulation of the impurity decrease within each tree. +# +# .. warning:: +# Impurity-based feature importances can be misleading for high cardinality +# features (many unique values). See :ref:`permutation_importance` as +# an alternative. +import time +import matplotlib.pyplot as plt + +start_time = time.time() +img_shape = data.images[0].shape importances = forest.feature_importances_ -importances = importances.reshape(data.images[0].shape) +elapsed_time = time.time() - start_time -# Plot pixel importances -plt.matshow(importances, cmap=plt.cm.hot) -plt.title("Pixel importances with forests of trees") +print(f"Elapsed time to compute the importances: " + f"{elapsed_time:.3f} seconds") +imp_reshaped = importances.reshape(img_shape) +plt.matshow(imp_reshaped, cmap=plt.cm.hot) +plt.title("Pixel importances using impurity values") +plt.colorbar() plt.show() + +# %% +# Can you still recognize a face? + +# %% +# The limitations of MDI is not a problem for this dataset because: +# +# 1. All features are (ordered) numeric and will thus not suffer the +# cardinality bias +# 2. We are only interested to represent knowledge of the forest acquired +# on the training set. +# +# If these two conditions are not met, it is recommended to instead use +# the :func:`~sklearn.inspection.permutation_importance`. 
From eea26e7e81bc4120ed00d8bb39f58100747cecdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Mon, 31 May 2021 11:37:14 +0200 Subject: [PATCH 431/478] MNT Clean deprecations for 1.0 | pairwise_distances (#19325) * cln deprecations pairwise_distances * cln match --- sklearn/metrics/pairwise.py | 20 +++------- sklearn/metrics/tests/test_pairwise.py | 51 ++++++++++---------------- 2 files changed, 25 insertions(+), 46 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index c9e9f60d8aaf3..5257f1bc6b95f 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1468,25 +1468,17 @@ def _precompute_metric_params(X, Y, metric=None, **kwds): if X is Y: V = np.var(X, axis=0, ddof=1, dtype=dtype) else: - warnings.warn( - "from version 1.0 (renaming of 0.25), pairwise_distances for " - "metric='seuclidean' will require V to be specified if Y is " - "passed.", - FutureWarning - ) - V = np.var(np.vstack([X, Y]), axis=0, ddof=1, dtype=dtype) + raise ValueError( + "The 'V' parameter is required for the seuclidean metric " + "when Y is passed.") return {'V': V} if metric == "mahalanobis" and 'VI' not in kwds: if X is Y: VI = np.linalg.inv(np.cov(X.T)).T else: - warnings.warn( - "from version 1.0 (renaming of 0.25), pairwise_distances for " - "metric='mahalanobis' will require VI to be specified if Y " - "is passed.", - FutureWarning - ) - VI = np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T + raise ValueError( + "The 'VI' parameter is required for the mahalanobis metric " + "when Y is passed.") return {'VI': VI} return {} diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 1ff62af04c05f..fba887d63b084 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -1323,44 +1323,31 @@ def test_check_preserve_type(): @pytest.mark.parametrize("metric", ["seuclidean", "mahalanobis"]) @pytest.mark.parametrize("dist_function", [pairwise_distances, pairwise_distances_chunked]) -@pytest.mark.parametrize("y_is_x", [True, False], ids=["Y is X", "Y is not X"]) -def test_pairwise_distances_data_derived_params(n_jobs, metric, dist_function, - y_is_x): +def test_pairwise_distances_data_derived_params(n_jobs, metric, dist_function): # check that pairwise_distances give the same result in sequential and # parallel, when metric has data-derived parameters. 
with config_context(working_memory=0.1): # to have more than 1 chunk rng = np.random.RandomState(0) X = rng.random_sample((100, 10)) - if y_is_x: - Y = X - expected_dist_default_params = squareform(pdist(X, metric=metric)) - if metric == "seuclidean": - params = {'V': np.var(X, axis=0, ddof=1)} - else: - params = {'VI': np.linalg.inv(np.cov(X.T)).T} - else: - Y = rng.random_sample((100, 10)) - expected_dist_default_params = cdist(X, Y, metric=metric) - if metric == "seuclidean": - params = {'V': np.var(np.vstack([X, Y]), axis=0, ddof=1)} - else: - params = {'VI': np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T} - - expected_dist_explicit_params = cdist(X, Y, metric=metric, **params) - # TODO: Remove warn_checker in 1.0 - if y_is_x: - warn_checker = pytest.warns(None) - else: - warn_checker = pytest.warns(FutureWarning, - match="to be specified if Y is passed") - with warn_checker: - dist = np.vstack(tuple(dist_function(X, Y, - metric=metric, - n_jobs=n_jobs))) - - assert_allclose(dist, expected_dist_explicit_params) - assert_allclose(dist, expected_dist_default_params) + expected_dist = squareform(pdist(X, metric=metric)) + dist = np.vstack(tuple(dist_function(X, metric=metric, n_jobs=n_jobs))) + + assert_allclose(dist, expected_dist) + + +@pytest.mark.parametrize("metric", ["seuclidean", "mahalanobis"]) +def test_pairwise_distances_data_derived_params_error(metric): + # check that pairwise_distances raises an error when Y is passed but + # metric has data-derived params that are not provided by the user. + rng = np.random.RandomState(0) + X = rng.random_sample((100, 10)) + Y = rng.random_sample((100, 10)) + + with pytest.raises(ValueError, + match=fr"The '(V|VI)' parameter is required for the " + fr"{metric} metric"): + pairwise_distances(X, Y, metric=metric) @pytest.mark.parametrize( From 1c36b49eb266d72d4211cd29c9a645e690925538 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 31 May 2021 14:49:14 +0200 Subject: [PATCH 432/478] MNT avoid pandas deprecation warning in test_validation.py (#20171) --- sklearn/utils/tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index c244d6f6caffc..ae2d5181f35a6 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -456,7 +456,7 @@ def test_check_array_pandas_dtype_casting(): # check that we handle pandas dtypes in a semi-reasonable way # this is actually tricky because we can't really know that this # should be integer ahead of converting it. - cat_df = pd.DataFrame([pd.Categorical([1, 2, 3])]) + cat_df = pd.DataFrame({"cat_col": pd.Categorical([1, 2, 3])}) assert (check_array(cat_df).dtype == np.int64) assert (check_array(cat_df, dtype=FLOAT_DTYPES).dtype == np.float64) From c8753d4174be948aefa2edfbe0f2e17a6b2bccb3 Mon Sep 17 00:00:00 2001 From: Takeshi Oura Date: Mon, 31 May 2021 23:33:03 +0900 Subject: [PATCH 433/478] ENH Preserving dtype for numpy.float32 in Least Angle Regression (#20155) --- doc/whats_new/v1.0.rst | 4 ++ sklearn/linear_model/_least_angle.py | 21 ++++++-- .../linear_model/tests/test_least_angle.py | 53 ++++++++++++++++++- 3 files changed, 72 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 7255fe82ff628..525f3439860ef 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -344,6 +344,10 @@ Changelog is now faster. This is especially noticeable on large sparse input. 
:pr:`19734` by :user:`Fred Robinson `. +- |Enhancement| `fit` method preserves dtype for numpy.float32 in + :class:`Lars`, :class:`LassoLars`, :class:`LassoLars`, :class:`LarsCV` and + :class:`LassoLarsCV`. :pr:`20155` by :user:`Takeshi Oura `. + :mod:`sklearn.manifold` ....................... diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 0932d0bd1aee3..3485344b99e02 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -476,12 +476,23 @@ def _lars_path_solver( max_features = min(max_iter, n_features) + dtypes = set(a.dtype for a in (X, y, Xy, Gram) if a is not None) + if len(dtypes) == 1: + # use the precision level of input data if it is consistent + return_dtype = next(iter(dtypes)) + else: + # fallback to double precision otherwise + return_dtype = np.float64 + if return_path: - coefs = np.zeros((max_features + 1, n_features)) - alphas = np.zeros(max_features + 1) + coefs = np.zeros((max_features + 1, n_features), dtype=return_dtype) + alphas = np.zeros(max_features + 1, dtype=return_dtype) else: - coef, prev_coef = np.zeros(n_features), np.zeros(n_features) - alpha, prev_alpha = np.array([0.]), np.array([0.]) # better ideas? + coef, prev_coef = (np.zeros(n_features, dtype=return_dtype), + np.zeros(n_features, dtype=return_dtype)) + alpha, prev_alpha = (np.array([0.], dtype=return_dtype), + np.array([0.], dtype=return_dtype)) + # above better ideas? n_iter, n_active = 0, 0 active, indices = list(), np.arange(n_features) @@ -948,7 +959,7 @@ def _fit(self, X, y, max_iter, alpha, fit_path, Xy=None): self.alphas_ = [] self.n_iter_ = [] - self.coef_ = np.empty((n_targets, n_features)) + self.coef_ = np.empty((n_targets, n_features), dtype=X.dtype) if fit_path: self.active_ = [] diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index 4321c39b45e92..656b7e3fef718 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -14,7 +14,7 @@ from sklearn import linear_model, datasets from sklearn.linear_model._least_angle import _lars_path_residues from sklearn.linear_model import LassoLarsIC, lars_path -from sklearn.linear_model import Lars, LassoLars +from sklearn.linear_model import Lars, LassoLars, LarsCV, LassoLarsCV # TODO: use another dataset that has multiple drops diabetes = datasets.load_diabetes() @@ -777,3 +777,54 @@ def test_copy_X_with_auto_gram(): linear_model.lars_path(X, y, Gram='auto', copy_X=True, method='lasso') # X did not change assert_allclose(X, X_before) + + +@pytest.mark.parametrize("LARS, has_coef_path, args", + ((Lars, True, {}), + (LassoLars, True, {}), + (LassoLarsIC, False, {}), + (LarsCV, True, {}), + # max_iter=5 is for avoiding ConvergenceWarning + (LassoLarsCV, True, {"max_iter": 5}))) +@pytest.mark.parametrize("dtype", (np.float32, np.float64)) +def test_lars_dtype_match(LARS, has_coef_path, args, dtype): + # The test ensures that the fit method preserves input dtype + rng = np.random.RandomState(0) + X = rng.rand(6, 6).astype(dtype) + y = rng.rand(6).astype(dtype) + + model = LARS(**args) + model.fit(X, y) + assert model.coef_.dtype == dtype + if has_coef_path: + assert model.coef_path_.dtype == dtype + assert model.intercept_.dtype == dtype + + +@pytest.mark.parametrize("LARS, has_coef_path, args", + ((Lars, True, {}), + (LassoLars, True, {}), + (LassoLarsIC, False, {}), + (LarsCV, True, {}), + # max_iter=5 is for avoiding ConvergenceWarning + (LassoLarsCV, 
True, {"max_iter": 5}))) +def test_lars_numeric_consistency(LARS, has_coef_path, args): + # The test ensures numerical consistency between trained coefficients + # of float32 and float64. + rtol = 1e-5 + atol = 1e-5 + + rng = np.random.RandomState(0) + X_64 = rng.rand(6, 6) + y_64 = rng.rand(6) + + model_64 = LARS(**args).fit(X_64, y_64) + model_32 = LARS(**args).fit(X_64.astype(np.float32), + y_64.astype(np.float32)) + + assert_allclose(model_64.coef_, model_32.coef_, rtol=rtol, atol=atol) + if has_coef_path: + assert_allclose(model_64.coef_path_, model_32.coef_path_, + rtol=rtol, atol=atol) + assert_allclose(model_64.intercept_, model_32.intercept_, + rtol=rtol, atol=atol) From 7bb3e22b3c454a59619a56c314be04b4b303e09a Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Mon, 31 May 2021 20:36:23 +0100 Subject: [PATCH 434/478] TST change load_boston in test_base to make_* (#20174) Co-authored-by: maikia Co-authored-by: Alihan Zihna --- sklearn/tests/test_base.py | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index c91419bf10a0e..3556f2fa20219 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -294,26 +294,24 @@ def test_set_params_updates_valid_params(): assert gscv.estimator.C == 42.0 -def test_score_sample_weight(): - +@pytest.mark.parametrize("tree,dataset", [ + (DecisionTreeClassifier(max_depth=2, random_state=0), + datasets.make_classification(random_state=0)), + (DecisionTreeRegressor(max_depth=2, random_state=0), + datasets.make_regression(random_state=0)), +]) +def test_score_sample_weight(tree, dataset): rng = np.random.RandomState(0) - - # test both ClassifierMixin and RegressorMixin - estimators = [DecisionTreeClassifier(max_depth=2), - DecisionTreeRegressor(max_depth=2)] - sets = [datasets.load_iris(), - datasets.load_boston()] - - for est, ds in zip(estimators, sets): - est.fit(ds.data, ds.target) - # generate random sample weights - sample_weight = rng.randint(1, 10, size=len(ds.target)) - # check that the score with and without sample weights are different - assert (est.score(ds.data, ds.target) != - est.score(ds.data, ds.target, - sample_weight=sample_weight)), ( - "Unweighted and weighted scores " - "are unexpectedly equal") + # check that the score with and without sample weights are different + X, y = dataset + + tree.fit(X, y) + # generate random sample weights + sample_weight = rng.randint(1, 10, size=len(y)) + score_unweighted = tree.score(X, y) + score_weighted = tree.score(X, y, sample_weight=sample_weight) + msg = "Unweighted and weighted scores are unexpectedly equal" + assert score_unweighted != score_weighted, msg def test_clone_pandas_dataframe(): From 56f4b836275c49fb40b5642f7f4c69da009c6e93 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 1 Jun 2021 10:33:08 +0200 Subject: [PATCH 435/478] Fix number of splines in legend of example plot (#20142) --- examples/linear_model/plot_polynomial_interpolation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/linear_model/plot_polynomial_interpolation.py b/examples/linear_model/plot_polynomial_interpolation.py index 34972b9522c68..c6cd1f9d591bd 100644 --- a/examples/linear_model/plot_polynomial_interpolation.py +++ b/examples/linear_model/plot_polynomial_interpolation.py @@ -128,7 +128,7 @@ def f(x): splt = SplineTransformer(n_knots=4, degree=3).fit(X_train) axes[1].plot(x_plot, splt.transform(X_plot)) -axes[1].legend(axes[1].lines, [f"spline 
{n}" for n in range(4)]) +axes[1].legend(axes[1].lines, [f"spline {n}" for n in range(6)]) axes[1].set_title("SplineTransformer") # plot knots of spline @@ -138,7 +138,7 @@ def f(x): # %% # In the left plot, we recognize the lines corresponding to simple monomials -# from ``x**0`` to ``x**3``. In the right figure, we see the four B-spline +# from ``x**0`` to ``x**3``. In the right figure, we see the six B-spline # basis functions of ``degree=3`` and also the four knot positions that were # chosen during ``fit``. Note that there are ``degree`` number of additional # knots each to the left and to the right of the fitted interval. These are From 337f47abc6e74b91cfbda7709c641176719f979d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 1 Jun 2021 10:44:22 +0200 Subject: [PATCH 436/478] DOC fix hyperlink for some users --- doc/whats_new/v1.0.rst | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 525f3439860ef..de2449d32ed5f 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -164,7 +164,8 @@ Changelog :pr:`17622` by :user:`Jérémie du Boisberranger `. - |FIX| :class:`cluster.AgglomerativeClustering` now supports readonly - memory-mapped datasets. :pr:`19883` by `Julien Jerphanion `. + memory-mapped datasets. + :pr:`19883` by :user:`Julien Jerphanion `. - |API| :class:`cluster.Birch` attributes, `fit_` and `partial_fit_`, are deprecated and will be removed in 1.2. :pr:`19297` by `Thomas Fan`_. @@ -274,8 +275,8 @@ Changelog - |Feature| :func:`feature_selection.r_regression` computes Pearson's R correlation coefficients between the features and the target. - :pr:`17169` by `Dmytro Lituiev ` - and `Julien Jerphanion `. + :pr:`17169` by :user:`Dmytro Lituiev ` + and :user:`Julien Jerphanion `. :mod:`sklearn.inspection` ......................... @@ -407,8 +408,8 @@ Changelog :class:`model_selection.StratifiedKFold` and `model_selection.GroupKFold`, providing an ability to split data preserving the distribution of classes in each split while keeping each group within a single split. - :pr:`18649` by `Leandro Hermida ` and - `Rodion Martynov `. + :pr:`18649` by :user:`Leandro Hermida ` and + :user:`Rodion Martynov `. :mod:`sklearn.naive_bayes` .......................... @@ -436,7 +437,7 @@ Changelog :user:`Julien Jerphanion `. - |FIX| :class:`neighbors.DistanceMetric` subclasses now support readonly - memory-mapped datasets. :pr:`19883` by `Julien Jerphanion `. + memory-mapped datasets. :pr:`19883` by :user:`Julien Jerphanion `. - |FIX| :class:`neighbors.NearestNeighbors`, :class:`neighbors.KNeighborsClassifier`, :class:`neighbors.RadiusNeighborsClassifier`, :class:`neighbors.KNeighborsRegressor` From 1e24ea2b0df0cd828cd5dcbc6ee8e00ef0642c52 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 1 Jun 2021 11:01:32 +0200 Subject: [PATCH 437/478] DOC fix missing hyperlink in whats new --- doc/whats_new/v1.0.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index de2449d32ed5f..6ecc421bafd48 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -405,9 +405,10 @@ Changelog .............................. - |Feature| added :class:`model_selection.StratifiedGroupKFold`, that combines - :class:`model_selection.StratifiedKFold` and `model_selection.GroupKFold`, - providing an ability to split data preserving the distribution of classes in - each split while keeping each group within a single split. 
+ :class:`model_selection.StratifiedKFold` and + :class:`model_selection.GroupKFold`, providing an ability to split data + preserving the distribution of classes in each split while keeping each + group within a single split. :pr:`18649` by :user:`Leandro Hermida ` and :user:`Rodion Martynov `. From 8bc36080d9855d29e1fcbc86da46a9e89e86c046 Mon Sep 17 00:00:00 2001 From: KurumeYuta <84881778+KurumeYuta@users.noreply.github.com> Date: Tue, 1 Jun 2021 18:38:16 +0900 Subject: [PATCH 438/478] [MRG] Fix Sparse PCA optimization task #19775 (#20153) --- doc/modules/decomposition.rst | 14 ++++++++----- sklearn/decomposition/_dict_learning.py | 26 ++++++++++++++++++------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst index fd51f60d8bfc6..0939318050d5c 100644 --- a/doc/modules/decomposition.rst +++ b/doc/modules/decomposition.rst @@ -230,12 +230,14 @@ problem solved is a PCA problem (dictionary learning) with an .. math:: (U^*, V^*) = \underset{U, V}{\operatorname{arg\,min\,}} & \frac{1}{2} - ||X-UV||_2^2+\alpha||V||_1 \\ + ||X-UV||_{\text{Fro}}^2+\alpha||V||_{1,1} \\ \text{subject to } & ||U_k||_2 = 1 \text{ for all } 0 \leq k < n_{components} - -The sparsity-inducing :math:`\ell_1` norm also prevents learning +:math:`||.||_{\text{Fro}}` stands for the Frobenius norm and :math:`||.||_{1,1}` +stands for the entry-wise matrix norm which is the sum of the absolute values +of all the entries in the matrix. +The sparsity-inducing :math:`||.||_{1,1}` matrix norm also prevents learning components from noise when few training samples are available. The degree of penalization (and thus sparsity) can be adjusted through the hyperparameter ``alpha``. Small values lead to a gently regularized @@ -510,7 +512,7 @@ dictionary fixed, and then updating the dictionary to best fit the sparse code. .. math:: (U^*, V^*) = \underset{U, V}{\operatorname{arg\,min\,}} & \frac{1}{2} - ||X-UV||_2^2+\alpha||U||_1 \\ + ||X-UV||_{\text{Fro}}^2+\alpha||U||_{1,1} \\ \text{subject to } & ||V_k||_2 = 1 \text{ for all } 0 \leq k < n_{\mathrm{atoms}} @@ -525,7 +527,9 @@ dictionary fixed, and then updating the dictionary to best fit the sparse code. .. centered:: |pca_img2| |dict_img2| - +:math:`||.||_{\text{Fro}}` stands for the Frobenius norm and :math:`||.||_{1,1}` +stands for the entry-wise matrix norm which is the sum of the absolute values +of all the entries in the matrix. After using such a procedure to fit the dictionary, the transform is simply a sparse coding step that shares the same implementation with all dictionary learning objects (see :ref:`SparseCoder`). diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 1c48542a1c9ec..80b64570b3401 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -436,11 +436,13 @@ def dict_learning(X, n_components, *, alpha, max_iter=100, tol=1e-8, Finds the best dictionary and the corresponding sparse code for approximating the data matrix X by solving:: - (U^*, V^*) = argmin 0.5 || X - U V ||_2^2 + alpha * || U ||_1 + (U^*, V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1 (U,V) with || V_k ||_2 = 1 for all 0 <= k < n_components - where V is the dictionary and U is the sparse code. + where V is the dictionary and U is the sparse code. ||.||_Fro stands for + the Frobenius norm and ||.||_1,1 stands for the entry-wise matrix norm + which is the sum of the absolute values of all the entries in the matrix. 
Read more in the :ref:`User Guide `. @@ -637,12 +639,14 @@ def dict_learning_online(X, n_components=2, *, alpha=1, n_iter=100, Finds the best dictionary and the corresponding sparse code for approximating the data matrix X by solving:: - (U^*, V^*) = argmin 0.5 || X - U V ||_2^2 + alpha * || U ||_1 + (U^*, V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1 (U,V) with || V_k ||_2 = 1 for all 0 <= k < n_components - where V is the dictionary and U is the sparse code. This is - accomplished by repeatedly iterating over mini-batches by slicing + where V is the dictionary and U is the sparse code. ||.||_Fro stands for + the Frobenius norm and ||.||_1,1 stands for the entry-wise matrix norm + which is the sum of the absolute values of all the entries in the matrix. + This is accomplished by repeatedly iterating over mini-batches by slicing the input data. Read more in the :ref:`User Guide `. @@ -1137,10 +1141,14 @@ class DictionaryLearning(_BaseSparseCoding, BaseEstimator): Solves the optimization problem:: - (U^*,V^*) = argmin 0.5 || X - U V ||_2^2 + alpha * || U ||_1 + (U^*,V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1 (U,V) with || V_k ||_2 = 1 for all 0 <= k < n_components + ||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for + the entry-wise matrix norm which is the sum of the absolute values + of all the entries in the matrix. + Read more in the :ref:`User Guide `. Parameters @@ -1367,10 +1375,14 @@ class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): Solves the optimization problem:: - (U^*,V^*) = argmin 0.5 || X - U V ||_2^2 + alpha * || U ||_1 + (U^*,V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1 (U,V) with || V_k ||_2 = 1 for all 0 <= k < n_components + ||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for + the entry-wise matrix norm which is the sum of the absolute values + of all the entries in the matrix. + Read more in the :ref:`User Guide `. Parameters From 777ac15e67ff4c5ff1be89e3db6f1c385a1b415e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Tue, 1 Jun 2021 19:14:04 +0200 Subject: [PATCH 439/478] MNT add n_features_in_ through the feature_extraction module (#20180) --- sklearn/feature_extraction/tests/test_text.py | 9 --------- sklearn/feature_extraction/text.py | 15 ++++++++------- sklearn/tests/test_common.py | 1 - sklearn/utils/estimator_checks.py | 3 ++- 4 files changed, 10 insertions(+), 18 deletions(-) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 0033ae84948ac..324d4f0875854 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -504,15 +504,6 @@ def test_vectorizer(): with pytest.raises(ValueError): t3.transform(counts_train) - # test idf transform with incompatible n_features - X = [[1, 1, 5], - [1, 1, 0]] - t3.fit(X) - X_incompt = [[1, 3], - [1, 3]] - with pytest.raises(ValueError): - t3.transform(X_incompt) - # L1-normalized term frequencies sum to one assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 00debc059440c..fed5c16ffbb54 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1386,6 +1386,11 @@ class TfidfTransformer(TransformerMixin, BaseEstimator): .. 
versionadded:: 0.20 + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 1.0 + Examples -------- >>> from sklearn.feature_extraction.text import TfidfTransformer @@ -1436,7 +1441,7 @@ def fit(self, X, y=None): X : sparse matrix of shape n_samples, n_features) A matrix of term/token counts. """ - X = check_array(X, accept_sparse=('csr', 'csc')) + X = self._validate_data(X, accept_sparse=('csr', 'csc')) if not sp.issparse(X): X = sp.csr_matrix(X) dtype = X.dtype if X.dtype in FLOAT_DTYPES else np.float64 @@ -1476,7 +1481,8 @@ def transform(self, X, copy=True): ------- vectors : sparse matrix of shape (n_samples, n_features) """ - X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES, copy=copy) + X = self._validate_data(X, accept_sparse='csr', + dtype=FLOAT_DTYPES, copy=copy, reset=False) if not sp.issparse(X): X = sp.csr_matrix(X, dtype=np.float64) @@ -1493,11 +1499,6 @@ def transform(self, X, copy=True): check_is_fitted(self, attributes=["idf_"], msg='idf vector is not fitted') - expected_n_features = self._idf_diag.shape[0] - if n_features != expected_n_features: - raise ValueError("Input has n_features=%d while the model" - " has been trained with n_features=%d" % ( - n_features, expected_n_features)) # *= doesn't work X = X * self._idf_diag diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 8ec4125547722..6fd57c9e8d4fc 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -262,7 +262,6 @@ def test_search_cv(estimator, check, request): # check_classifiers_train would need to be updated with the error message N_FEATURES_IN_AFTER_FIT_MODULES_TO_IGNORE = { 'compose', - 'feature_extraction', 'model_selection', 'multiclass', 'multioutput', diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index f0c0383a7bfe8..2058c8308ec29 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -3121,7 +3121,8 @@ def check_n_features_in_after_fitting(name, estimator_orig): # Make sure that n_features_in are checked after fitting tags = _safe_tags(estimator_orig) - if "2darray" not in tags["X_types"] or tags["no_validation"]: + if ("2darray" not in tags["X_types"] and "sparse" not in tags["X_types"] or + tags["no_validation"]): return rng = np.random.RandomState(0) From c09be6ab8cf5366daea4a59ffe33cd437f58d4a7 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 1 Jun 2021 13:41:00 -0400 Subject: [PATCH 440/478] TST Fixes test and mis-matched pandas version (#20149) --- README.rst | 2 +- doc/whats_new/v1.0.rst | 2 +- sklearn/_min_dependencies.py | 4 ++-- sklearn/tests/test_min_dependencies_readme.py | 4 +++- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index 3c685fa4af13e..cf625bcd0f30d 100644 --- a/README.rst +++ b/README.rst @@ -27,7 +27,7 @@ .. _DOI: https://zenodo.org/badge/latestdoi/21369/scikit-learn/scikit-learn .. |PythonMinVersion| replace:: 3.7 -.. |NumPyMinVersion| replace:: 1.14.5 +.. |NumPyMinVersion| replace:: 1.14.6 .. |SciPyMinVersion| replace:: 1.1.0 .. |JoblibMinVersion| replace:: 0.11 .. 
|ThreadpoolctlMinVersion| replace:: 2.0.0 diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 6ecc421bafd48..930f99bda4cbb 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -15,7 +15,7 @@ Version 1.0.0 Minimal dependencies -------------------- -Version 1.0.0 of scikit-learn requires python 3.7+, numpy 1.14.5+ and +Version 1.0.0 of scikit-learn requires python 3.7+, numpy 1.14.6+ and scipy 1.1.0+. Optional minimal dependency is matplotlib 2.2.2+. Enforcing keyword-only arguments diff --git a/sklearn/_min_dependencies.py b/sklearn/_min_dependencies.py index d878a04eb4523..6a6ff13c479d1 100644 --- a/sklearn/_min_dependencies.py +++ b/sklearn/_min_dependencies.py @@ -7,7 +7,7 @@ if platform.python_implementation() == 'PyPy': NUMPY_MIN_VERSION = '1.19.0' else: - NUMPY_MIN_VERSION = '1.14.5' + NUMPY_MIN_VERSION = '1.14.6' SCIPY_MIN_VERSION = '1.1.0' JOBLIB_MIN_VERSION = '0.11' @@ -27,7 +27,7 @@ 'cython': (CYTHON_MIN_VERSION, 'build'), 'matplotlib': ('2.2.2', 'benchmark, docs, examples, tests'), 'scikit-image': ('0.14.5', 'docs, examples, tests'), - 'pandas': ('0.23.4', 'benchmark, docs, examples, tests'), + 'pandas': ('0.25.0', 'benchmark, docs, examples, tests'), 'seaborn': ('0.9.0', 'docs, examples'), 'memory_profiler': ('0.57.0', 'benchmark, docs'), 'pytest': (PYTEST_MIN_VERSION, 'tests'), diff --git a/sklearn/tests/test_min_dependencies_readme.py b/sklearn/tests/test_min_dependencies_readme.py index f3958a88b6158..45825a18092a1 100644 --- a/sklearn/tests/test_min_dependencies_readme.py +++ b/sklearn/tests/test_min_dependencies_readme.py @@ -37,9 +37,11 @@ def test_min_dependencies_readme(): continue package, version = matched.group(2), matched.group(5) + package = package.lower() if package in dependent_packages: version = parse_version(version) min_version = parse_version(dependent_packages[package][0]) - assert version == min_version + assert version == min_version, (f"{package} has a mismatched " + "version") From 952b10fd549c4562fb47fb4e357b3e4b375f4fb5 Mon Sep 17 00:00:00 2001 From: Jonathan Schneider Date: Tue, 1 Jun 2021 19:45:16 +0200 Subject: [PATCH 441/478] [MRG] DOC Improve documentation of Latent Dirichlet Allocation (#20181) Rename `max_iters` to public name `max_doc_update_iter` to prevent confusion between max_iter and max_iters Improve parameter documentation --- sklearn/decomposition/_lda.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 34432557814c2..75b123a118338 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -29,7 +29,7 @@ def _update_doc_distribution(X, exp_topic_word_distr, doc_topic_prior, - max_iters, + max_doc_update_iter, mean_change_tol, cal_sstats, random_state): """E-step: update document-topic distribution. @@ -45,7 +45,7 @@ def _update_doc_distribution(X, exp_topic_word_distr, doc_topic_prior, doc_topic_prior : float Prior of document topic distribution `theta`. - max_iters : int + max_doc_update_iter : int Max number of iterations for updating document topic distribution in the E-step. 
@@ -105,7 +105,7 @@ def _update_doc_distribution(X, exp_topic_word_distr, doc_topic_prior, exp_topic_word_d = exp_topic_word_distr[:, ids] # Iterate between `doc_topic_d` and `norm_phi` until convergence - for _ in range(0, max_iters): + for _ in range(0, max_doc_update_iter): last_d = doc_topic_d # The optimal phi_{dwk} is proportional to @@ -187,7 +187,9 @@ class LatentDirichletAllocation(TransformerMixin, BaseEstimator): called tau_0. max_iter : int, default=10 - The maximum number of iterations. + The maximum number of passes over the training data (aka epochs). + It only impacts the behavior in the :meth:`fit` method, and not the + :meth:`partial_fit` method. batch_size : int, default=128 Number of documents to use in each EM iteration. Only used in online From 5ff8632201d1faae8e6e0bc464d07b84c6df2578 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 1 Jun 2021 21:09:37 +0200 Subject: [PATCH 442/478] MNT add path as a static abstract method to LinearModelCV (#19970) --- sklearn/linear_model/_coordinate_descent.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 1d93a6695b0e0..da50a3a817a38 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -1176,6 +1176,11 @@ def _get_estimator(self): def _is_multitask(self): """Bool indicating if class is meant for multidimensional target.""" + @staticmethod + @abstractmethod + def path(X, y, **kwargs): + """Compute path with coordinate descent.""" + def fit(self, X, y): """Fit linear model with coordinate descent. From 07a0cf37c9c1efa1ae7fddc9a33d0ee8798e635e Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 1 Jun 2021 16:06:28 -0400 Subject: [PATCH 443/478] MNT Finish removing python 3.6 (#20185) --- README.rst | 4 ++-- doc/developers/advanced_installation.rst | 2 +- setup.py | 9 ++++----- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/README.rst b/README.rst index cf625bcd0f30d..b5ee90a304eff 100644 --- a/README.rst +++ b/README.rst @@ -17,8 +17,8 @@ .. |Nightly wheels| image:: https://github.com/scikit-learn/scikit-learn/workflows/Wheel%20builder/badge.svg?event=schedule .. _`Nightly wheels`: https://github.com/scikit-learn/scikit-learn/actions?query=workflow%3A%22Wheel+builder%22+event%3Aschedule -.. |PythonVersion| image:: https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9-blue -.. _PythonVersion: https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9-blue +.. |PythonVersion| image:: https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9-blue +.. _PythonVersion: https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9-blue .. |PyPi| image:: https://badge.fury.io/py/scikit-learn.svg .. 
_PyPi: https://badge.fury.io/py/scikit-learn diff --git a/doc/developers/advanced_installation.rst b/doc/developers/advanced_installation.rst index 489f6447d57c8..c1dec51723861 100644 --- a/doc/developers/advanced_installation.rst +++ b/doc/developers/advanced_installation.rst @@ -94,7 +94,7 @@ Runtime dependencies Scikit-learn requires the following dependencies both at build time and at runtime: -- Python (>= 3.6), +- Python (>= 3.7), - NumPy (>= |NumpyMinVersion|), - SciPy (>= |ScipyMinVersion|), - Joblib (>= |JoblibMinVersion|), diff --git a/setup.py b/setup.py index 9758f62de1301..91602bafca408 100755 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ try: import builtins except ImportError: - # Python 2 compat: just to be able to declare that Python >=3.6 is needed. + # Python 2 compat: just to be able to declare that Python >=3.7 is needed. import __builtin__ as builtins # This is a bit (!) hackish: we are setting a global variable so that the @@ -145,7 +145,7 @@ def build_extensions(self): except ImportError: # Numpy should not be a dependency just to be able to introspect - # that python 3.6 is required. + # that python 3.7 is required. pass @@ -251,7 +251,6 @@ def setup_package(): 'Operating System :: Unix', 'Operating System :: MacOS', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', @@ -261,7 +260,7 @@ def setup_package(): 'Implementation :: PyPy') ], cmdclass=cmdclass, - python_requires=">=3.6", + python_requires=">=3.7", install_requires=min_deps.tag_to_packages['install'], package_data={'': ['*.pxd']}, **extra_setuptools_args) @@ -280,7 +279,7 @@ def setup_package(): else: if sys.version_info < (3, 6): raise RuntimeError( - "Scikit-learn requires Python 3.6 or later. The current" + "Scikit-learn requires Python 3.7 or later. The current" " Python version is %s installed in %s." % (platform.python_version(), sys.executable)) From 6850c04186b88e88e9c8cd6eb673721af806e3da Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 1 Jun 2021 17:22:47 -0400 Subject: [PATCH 444/478] API Deprecates support for np.matrix in check_array (#20165) * API Deprecates support for np.matrix in check_array * DOC Adds whats new * ENH Adds link to numpy.matrix --- doc/whats_new/v1.0.rst | 10 ++++++++++ sklearn/utils/tests/test_validation.py | 19 +++++++++++++++++++ sklearn/utils/validation.py | 8 ++++++++ 3 files changed, 37 insertions(+) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 930f99bda4cbb..fac578bcb1b03 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -123,6 +123,9 @@ Changelog - For :class:`tree.ExtraTreeRegressor`, `criterion="mae"` is deprecated, use `"absolute_error"` instead. +- |API| `np.matrix` usage is deprecated in 1.0 and will raise a `TypeError` in + 1.2. :pr:`20165` by `Thomas Fan`_. + :mod:`sklearn.base` ................... @@ -512,6 +515,13 @@ Changelog precision of the computed variance was very poor when the real variance is exactly zero. :pr:`19766` by :user:`Jérémie du Boisberranger `. +:mod:`sklearn.validation` +......................... + +- |Fix| Support for `np.matrix` is deprecated in + :func:`~sklearn.utils.check_array` in 1.0 and will raise a `TypeError` in + 1.2. :pr:`20165` by `Thomas Fan`_. 
+ Code and Documentation Contributors ----------------------------------- diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index ae2d5181f35a6..b3e28d7deeeef 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -57,6 +57,9 @@ from sklearn.utils._testing import TempMemmap +# TODO: Remove np.matrix usage in 1.2 +@pytest.mark.filterwarnings( + "ignore:np.matrix usage is deprecated in 1.0:FutureWarning") @pytest.mark.filterwarnings( "ignore:the matrix subclass:PendingDeprecationWarning") def test_as_float_array(): @@ -115,6 +118,9 @@ def test_as_float_array_nan(X): assert_allclose_dense_sparse(X_converted, X) +# TODO: Remove np.matrix usage in 1.2 +@pytest.mark.filterwarnings( + "ignore:np.matrix usage is deprecated in 1.0:FutureWarning") @pytest.mark.filterwarnings( "ignore:the matrix subclass:PendingDeprecationWarning") def test_np_matrix(): @@ -1379,3 +1385,16 @@ def test_num_features_errors_scalars(X): ) with pytest.raises(TypeError, match=msg): _num_features(X) + + +# TODO: Remove in 1.2 +@pytest.mark.filterwarnings( + "ignore:the matrix subclass:PendingDeprecationWarning") +def test_check_array_deprecated_matrix(): + """Test that matrix support is deprecated in 1.0.""" + + X = np.matrix(np.arange(5)) + msg = ("np.matrix usage is deprecated in 1.0 and will raise a TypeError " + "in 1.2. Please convert to a numpy array with np.asarray.") + with pytest.warns(FutureWarning, match=msg): + check_array(X) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index acfc8f5d10db2..b7af987d60c83 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -543,6 +543,14 @@ def check_array(array, accept_sparse=False, *, accept_large_sparse=True, array_converted : object The converted and validated array. """ + if isinstance(array, np.matrix): + warnings.warn( + "np.matrix usage is deprecated in 1.0 and will raise a TypeError " + "in 1.2. Please convert to a numpy array with np.asarray. 
For " + "more information see: " + "https://numpy.org/doc/stable/reference/generated/numpy.matrix.html", # noqa + FutureWarning) + # store reference to original array to check if copy is needed when # function returns array_orig = array From 5d25ce13ae0fa8f1f9e02d046d1820b6dcfd6155 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Wed, 2 Jun 2021 17:41:41 +0200 Subject: [PATCH 445/478] TST enable test docstring params for feature extraction module (#20188) --- sklearn/tests/test_docstring_parameters.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index cc10f11fcd574..bceaa21801872 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -183,7 +183,6 @@ def _construct_searchcv_instance(SearchCV): 'discriminant_analysis', 'dummy', 'ensemble', - 'feature_extraction', 'feature_selection', 'gaussian_process', 'impute', From e203750cf085d229c1755873d7208b6813de0443 Mon Sep 17 00:00:00 2001 From: tsuga <2888173+tsuga@users.noreply.github.com> Date: Thu, 3 Jun 2021 10:54:41 +0900 Subject: [PATCH 446/478] DOC fix a reference in sklearn.ensemble.GradientBoostingRegressor (#20198) --- sklearn/ensemble/_gb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 54e4e510cd9b9..78fee588ecf4e 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -1612,7 +1612,7 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): HistGradientBoostingRegressor : Histogram-based Gradient Boosting Classification Tree. sklearn.tree.DecisionTreeRegressor : A decision tree regressor. - sklearn.tree.RandomForestRegressor : A random forest regressor. + sklearn.ensemble.RandomForestRegressor : A random forest regressor. Notes ----- From 64bafa313ef7afcbed74bbb0189da48ccf8e2230 Mon Sep 17 00:00:00 2001 From: Conner Shen Date: Thu, 3 Jun 2021 06:15:30 -0400 Subject: [PATCH 447/478] FIX mcc zero divsion (#19977) --- sklearn/metrics/_classification.py | 5 ++- sklearn/metrics/tests/test_classification.py | 35 ++++++++------------ sklearn/utils/_testing.py | 24 -------------- 3 files changed, 15 insertions(+), 49 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 434fd89f5bbd9..ada2af3f111e2 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -871,12 +871,11 @@ def matthews_corrcoef(y_true, y_pred, *, sample_weight=None): cov_ytyp = n_correct * n_samples - np.dot(t_sum, p_sum) cov_ypyp = n_samples ** 2 - np.dot(p_sum, p_sum) cov_ytyt = n_samples ** 2 - np.dot(t_sum, t_sum) - mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) - if np.isnan(mcc): + if cov_ypyp * cov_ytyt == 0: return 0. 
else: - return mcc + return cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) def zero_one_loss(y_true, y_pred, *, normalize=True, sample_weight=None): diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index feed701f6cead..df352a8031948 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -20,7 +20,6 @@ from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose -from sklearn.utils._testing import assert_warns_div0 from sklearn.utils._testing import assert_no_warnings from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import ignore_warnings @@ -622,7 +621,6 @@ def test_cohen_kappa(): weights="quadratic"), 0.9541, decimal=4) -@ignore_warnings def test_matthews_corrcoef_nan(): assert matthews_corrcoef([0], [1]) == 0.0 assert matthews_corrcoef([0, 0], [0, 1]) == 0.0 @@ -684,17 +682,11 @@ def test_matthews_corrcoef(): assert_almost_equal(matthews_corrcoef(y_true, y_true_inv2), -1) # For the zero vector case, the corrcoef cannot be calculated and should - # result in a RuntimeWarning - mcc = assert_warns_div0(matthews_corrcoef, [0, 0, 0, 0], [0, 0, 0, 0]) - - # But will output 0 - assert_almost_equal(mcc, 0.) + # output 0 + assert_almost_equal(matthews_corrcoef([0, 0, 0, 0], [0, 0, 0, 0]), 0.) # And also for any other vector with 0 variance - mcc = assert_warns_div0(matthews_corrcoef, y_true, ['a'] * len(y_true)) - - # But will output 0 - assert_almost_equal(mcc, 0.) + assert_almost_equal(matthews_corrcoef(y_true, ['a'] * len(y_true)), 0.) # These two vectors have 0 correlation and hence mcc should be 0 y_1 = [1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1] @@ -731,12 +723,15 @@ def test_matthews_corrcoef_multiclass(): assert_almost_equal(matthews_corrcoef(y_true, y_pred_min), -12 / np.sqrt(24 * 16)) - # Zero variance will result in an mcc of zero and a Runtime Warning + # Zero variance will result in an mcc of zero y_true = [0, 1, 2] y_pred = [3, 3, 3] - mcc = assert_warns_message(RuntimeWarning, 'invalid value encountered', - matthews_corrcoef, y_true, y_pred) - assert_almost_equal(mcc, 0.0) + assert_almost_equal(matthews_corrcoef(y_true, y_pred), 0.0) + + # Also for ground truth with zero variance + y_true = [3, 3, 3] + y_pred = [0, 1, 2] + assert_almost_equal(matthews_corrcoef(y_true, y_pred), 0.0) # These two vectors have 0 correlation and hence mcc should be 0 y_1 = [0, 1, 2, 0, 1, 2, 0, 1, 2] @@ -754,16 +749,12 @@ def test_matthews_corrcoef_multiclass(): sample_weight=sample_weight), -1) # For the zero vector case, the corrcoef cannot be calculated and should - # result in a RuntimeWarning + # output 0 y_true = [0, 0, 1, 2] y_pred = [0, 0, 1, 2] sample_weight = [1, 1, 0, 0] - mcc = assert_warns_message(RuntimeWarning, 'invalid value encountered', - matthews_corrcoef, y_true, y_pred, - sample_weight=sample_weight) - - # But will output 0 - assert_almost_equal(mcc, 0.) + assert_almost_equal(matthews_corrcoef(y_true, y_pred, + sample_weight=sample_weight), 0.) 
@pytest.mark.parametrize('n_points', [100, 10000]) diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 8fc77748740d5..55ea23afbf9ec 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -187,30 +187,6 @@ def check_in_message(msg): return message in msg return result -def assert_warns_div0(func, *args, **kw): - """Assume that numpy's warning for divide by zero is raised. - - Handles the case of platforms that do not support warning on divide by - zero. - - Parameters - ---------- - func - *args - **kw - """ - - with np.errstate(divide='warn', invalid='warn'): - try: - assert_warns(RuntimeWarning, np.divide, 1, np.zeros(1)) - except AssertionError: - # This platform does not report numpy divide by zeros - return func(*args, **kw) - return assert_warns_message(RuntimeWarning, - 'invalid value encountered', - func, *args, **kw) - - # To remove when we support numpy 1.7 def assert_no_warnings(func, *args, **kw): """ From 7f35724cfb72519e8e02cd44341c580e499d9fb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Thu, 3 Jun 2021 13:29:01 +0200 Subject: [PATCH 448/478] TST Add TransformedTargetRegressor to test_meta_estimators_delegate_data_validation (#20175) Co-authored-by: Guillaume Lemaitre --- sklearn/compose/_column_transformer.py | 6 ++++++ sklearn/compose/_target.py | 6 ++++++ sklearn/tests/test_common.py | 2 +- sklearn/tests/test_metaestimators.py | 2 +- 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 6c15b81be98c2..ada175c7f32c6 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -141,6 +141,12 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): .. versionadded:: 1.0 + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying transformers expose such an attribute when fit. + + .. versionadded:: 0.24 + Notes ----- The order of the columns in the transformed feature matrix follows the diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index 12fe13ee848b9..af996623d8aa3 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -82,6 +82,12 @@ class TransformedTargetRegressor(RegressorMixin, BaseEstimator): transformer_ : object Transformer used in ``fit`` and ``predict``. + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying regressor exposes such an attribute when fit. + + .. 
versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 6fd57c9e8d4fc..848788647cf3f 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -261,7 +261,7 @@ def test_search_cv(estimator, check, request): # # check_classifiers_train would need to be updated with the error message N_FEATURES_IN_AFTER_FIT_MODULES_TO_IGNORE = { - 'compose', + 'feature_extraction', 'model_selection', 'multiclass', 'multioutput', diff --git a/sklearn/tests/test_metaestimators.py b/sklearn/tests/test_metaestimators.py index ad716c3e4cd2f..c7412c98d4290 100644 --- a/sklearn/tests/test_metaestimators.py +++ b/sklearn/tests/test_metaestimators.py @@ -169,7 +169,7 @@ def _generate_meta_estimator_instances_with_pipeline(): for _, Estimator in sorted(all_estimators()): sig = set(signature(Estimator).parameters) - if "estimator" in sig or "base_estimator" in sig: + if "estimator" in sig or "base_estimator" in sig or "regressor" in sig: if is_regressor(Estimator): estimator = make_pipeline(TfidfVectorizer(), Ridge()) param_grid = {"ridge__alpha": [0.1, 1.0]} From bd7ebf5aede5015c2127de6f7b670b446bb337a3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 3 Jun 2021 13:34:48 +0200 Subject: [PATCH 449/478] TST enable n_feature_in_ test for feature_extraction module --- sklearn/tests/test_common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 848788647cf3f..e891374f91051 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -261,7 +261,6 @@ def test_search_cv(estimator, check, request): # # check_classifiers_train would need to be updated with the error message N_FEATURES_IN_AFTER_FIT_MODULES_TO_IGNORE = { - 'feature_extraction', 'model_selection', 'multiclass', 'multioutput', From 3a23e26da6f61622f7ebfcf8dfda3575c38d50fb Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Thu, 3 Jun 2021 08:28:27 -0400 Subject: [PATCH 450/478] FIX Uses points instead of pixels in plot_tree (#20023) --- sklearn/tree/_export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index a9763128c3a7e..a4ba02d5f8932 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -611,7 +611,7 @@ def export(self, decision_tree, ax=None): def recurse(self, node, tree, ax, scale_x, scale_y, height, depth=0): import matplotlib.pyplot as plt kwargs = dict(bbox=self.bbox_args.copy(), ha='center', va='center', - zorder=100 - 10 * depth, xycoords='axes pixels', + zorder=100 - 10 * depth, xycoords='axes points', arrowprops=self.arrow_args.copy()) kwargs['arrowprops']['edgecolor'] = plt.rcParams['text.color'] From 6bfaceded8e99396ff18356c9b97a7a673b6d9e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Thu, 3 Jun 2021 14:41:23 +0200 Subject: [PATCH 451/478] MNT n_features_in through the multiclass module (#20193) --- sklearn/multiclass.py | 78 +++++++++++++++------- sklearn/tests/test_common.py | 1 - sklearn/tests/test_docstring_parameters.py | 4 +- sklearn/tests/test_metaestimators.py | 3 +- 4 files changed, 56 insertions(+), 30 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index d75556bf60ab4..99a6db2051030 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -50,13 +50,13 @@ from .utils._tags import _safe_tags from .utils.validation import _num_samples from .utils.validation import check_is_fitted -from .utils.validation import check_X_y, check_array +from .utils.validation import column_or_1d +from .utils.validation import _assert_all_finite from .utils.multiclass import (_check_partial_fit_first_call, check_classification_targets, _ovr_decision_function) from .utils.metaestimators import _safe_split, if_delegate_has_method from .utils.fixes import delayed -from .exceptions import NotFittedError from joblib import Parallel @@ -114,24 +114,28 @@ def _check_estimator(estimator): class _ConstantPredictor(BaseEstimator): def fit(self, X, y): + self._check_n_features(X, reset=True) self.y_ = y return self def predict(self, X): check_is_fitted(self) + self._check_n_features(X, reset=True) - return np.repeat(self.y_, X.shape[0]) + return np.repeat(self.y_, _num_samples(X)) def decision_function(self, X): check_is_fitted(self) + self._check_n_features(X, reset=True) - return np.repeat(self.y_, X.shape[0]) + return np.repeat(self.y_, _num_samples(X)) def predict_proba(self, X): check_is_fitted(self) + self._check_n_features(X, reset=True) return np.repeat([np.hstack([1 - self.y_, self.y_])], - X.shape[0], axis=0) + _num_samples(X), axis=0) class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin, @@ -219,6 +223,12 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin, multilabel_ : boolean Whether a OneVsRestClassifier is a multilabel classifier. + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + .. 
versionadded:: 0.24 + Examples -------- >>> import numpy as np @@ -282,6 +292,9 @@ def fit(self, X, y): self.label_binarizer_.classes_[i]]) for i, column in enumerate(columns)) + if hasattr(self.estimators_[0], "n_features_in_"): + self.n_features_in_ = self.estimators_[0].n_features_in_ + return self @if_delegate_has_method('estimator') @@ -338,6 +351,9 @@ def partial_fit(self, X, y, classes=None): delayed(_partial_fit_binary)(estimator, X, column) for estimator, column in zip(self.estimators_, columns)) + if hasattr(self.estimators_[0], "n_features_in_"): + self.n_features_in_ = self.estimators_[0].n_features_in_ + return self def predict(self, X): @@ -504,19 +520,6 @@ def _more_tags(self): def _first_estimator(self): return self.estimators_[0] - @property - def n_features_in_(self): - # For consistency with other estimators we raise a AttributeError so - # that hasattr() fails if the OVR estimator isn't fitted. - try: - check_is_fitted(self) - except NotFittedError as nfe: - raise AttributeError( - "{} object has no n_features_in_ attribute." - .format(self.__class__.__name__) - ) from nfe - return self.estimators_[0].n_features_in_ - def _fit_ovo_binary(estimator, X, y, i, j): """Fit a single binary estimator (one-vs-one).""" @@ -525,7 +528,7 @@ def _fit_ovo_binary(estimator, X, y, i, j): y_binary = np.empty(y.shape, int) y_binary[y == i] = 0 y_binary[y == j] = 1 - indcond = np.arange(X.shape[0])[cond] + indcond = np.arange(_num_samples(X))[cond] return _fit_binary(estimator, _safe_split(estimator, X, None, indices=indcond)[0], y_binary, classes=[i, j]), indcond @@ -593,6 +596,12 @@ class OneVsOneClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): (renaming of 0.25) and onward, `pairwise_indices_` will use the pairwise estimator tag instead. + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_iris @@ -626,6 +635,7 @@ def fit(self, X, y): ------- self """ + # We need to validate the data because we do a safe_indexing later. X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'], force_all_finite=False) check_classification_targets(y) @@ -642,6 +652,9 @@ def fit(self, X, y): self.estimators_ = estimators_indices[0] + if hasattr(self.estimators_[0], "n_features_in_"): + self.n_features_in_ = self.estimators_[0].n_features_in_ + pairwise = _is_pairwise(self) self.pairwise_indices_ = ( estimators_indices[1] if pairwise else None) @@ -686,8 +699,9 @@ def partial_fit(self, X, y, classes=None): "must be subset of {1}".format(np.unique(y), self.classes_)) - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], - force_all_finite=False) + X, y = self._validate_data( + X, y, accept_sparse=['csr', 'csc'], force_all_finite=False, + reset=_check_partial_fit_first_call(self, classes)) check_classification_targets(y) combinations = itertools.combinations(range(self.n_classes_), 2) self.estimators_ = Parallel( @@ -699,6 +713,9 @@ def partial_fit(self, X, y, classes=None): self.pairwise_indices_ = None + if hasattr(self.estimators_[0], "n_features_in_"): + self.n_features_in_ = self.estimators_[0].n_features_in_ + return self def predict(self, X): @@ -832,6 +849,12 @@ class OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): code_book_ : numpy array of shape [n_classes, code_size] Binary array containing the code of each class. 
+ n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.multiclass import OutputCodeClassifier @@ -886,7 +909,9 @@ def fit(self, X, y): ------- self """ - X, y = self._validate_data(X, y, accept_sparse=True) + y = column_or_1d(y, warn=True) + _assert_all_finite(y) + if self.code_size <= 0: raise ValueError("code_size should be greater than 0, got {0}" "".format(self.code_size)) @@ -897,6 +922,9 @@ def fit(self, X, y): self.classes_ = np.unique(y) n_classes = self.classes_.shape[0] + if n_classes == 0: + raise ValueError("OutputCodeClassifier can not be fit when no " + "class is present.") code_size_ = int(n_classes * self.code_size) # FIXME: there are more elaborate methods than generating the codebook @@ -912,12 +940,15 @@ def fit(self, X, y): classes_index = {c: i for i, c in enumerate(self.classes_)} Y = np.array([self.code_book_[classes_index[y[i]]] - for i in range(X.shape[0])], dtype=int) + for i in range(_num_samples(y))], dtype=int) self.estimators_ = Parallel(n_jobs=self.n_jobs)( delayed(_fit_binary)(self.estimator, X, Y[:, i]) for i in range(Y.shape[1])) + if hasattr(self.estimators_[0], "n_features_in_"): + self.n_features_in_ = self.estimators_[0].n_features_in_ + return self def predict(self, X): @@ -934,7 +965,6 @@ def predict(self, X): Predicted multi-class targets. """ check_is_fitted(self) - X = check_array(X, accept_sparse=True) Y = np.array([_predict_binary(e, X) for e in self.estimators_]).T pred = euclidean_distances(Y, self.code_book_).argmin(axis=1) return self.classes_[pred] diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index e891374f91051..bbffd7fa197cf 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -262,7 +262,6 @@ def test_search_cv(estimator, check, request): # check_classifiers_train would need to be updated with the error message N_FEATURES_IN_AFTER_FIT_MODULES_TO_IGNORE = { 'model_selection', - 'multiclass', 'multioutput', 'pipeline', } diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index bceaa21801872..97da48f1e6524 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -192,7 +192,6 @@ def _construct_searchcv_instance(SearchCV): 'linear_model', 'manifold', 'model_selection', - 'multiclass', 'multioutput', 'naive_bayes', 'neighbors', @@ -219,8 +218,7 @@ def test_fit_docstring_attributes(name, Estimator): 'CountVectorizer', 'DictVectorizer', 'FeatureUnion', 'GaussianRandomProjection', 'MultiOutputClassifier', 'MultiOutputRegressor', - 'NoSampleWeightWrapper', 'OneVsOneClassifier', - 'OutputCodeClassifier', 'Pipeline', 'RFE', 'RFECV', + 'NoSampleWeightWrapper', 'Pipeline', 'RFE', 'RFECV', 'RegressorChain', 'SelectFromModel', 'SparseCoder', 'SparseRandomProjection', 'SpectralBiclustering', 'StackingClassifier', diff --git a/sklearn/tests/test_metaestimators.py b/sklearn/tests/test_metaestimators.py index c7412c98d4290..9a19008c3b322 100644 --- a/sklearn/tests/test_metaestimators.py +++ b/sklearn/tests/test_metaestimators.py @@ -219,8 +219,7 @@ def _generate_meta_estimator_instances_with_pipeline(): "IterativeImputer", "MultiOutputClassifier", "MultiOutputRegressor", - "OneVsOneClassifier", - "OutputCodeClassifier", + "OneVsOneClassifier", # input validation can't be avoided "RANSACRegressor", "RFE", "RFECV", From 
1038024a438e2bc76e7e48edde7b7ca732dc506b Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 3 Jun 2021 09:47:34 -0400 Subject: [PATCH 452/478] CI Removes python 3.6 builds from wheel building (#20184) --- .github/workflows/wheels.yml | 5 +++-- pyproject.toml | 5 ++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index ffddf9ef88db3..a280c29c31683 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -36,7 +36,7 @@ jobs: name: Check build trigger run: bash build_tools/github/check_build_trigger.sh - # Build the wheels for Linux, Windows and macOS for Python 3.6 and newer + # Build the wheels for Linux, Windows and macOS for Python 3.7 and newer build_wheels: name: Build wheel for cp${{ matrix.python }}-${{ matrix.platform_id }}-${{ matrix.manylinux_image }} runs-on: ${{ matrix.os }} @@ -48,7 +48,7 @@ jobs: fail-fast: false matrix: os: [windows-latest, ubuntu-latest, macos-latest] - python: [36, 37, 38, 39] + python: [37, 38, 39] bitness: [32, 64] manylinux_image: [manylinux1, manylinux2010] include: @@ -102,6 +102,7 @@ jobs: CIBW_TEST_REQUIRES: pytest pandas threadpoolctl CIBW_TEST_COMMAND: bash {project}/build_tools/github/test_wheels.sh CIBW_TEST_COMMAND_WINDOWS: bash {project}/build_tools/github/test_windows_wheels.sh ${{ matrix.python }} ${{ matrix.bitness }} + CIBW_BUILD_VERBOSITY: 1 run: bash build_tools/github/build_wheels.sh diff --git a/pyproject.toml b/pyproject.toml index 84468f65341da..d172baaea7088 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,10 @@ requires = [ # wheels on PyPI # # see: https://github.com/scipy/oldest-supported-numpy/blob/master/setup.cfg - "oldest-supported-numpy", + "oldest-supported-numpy; python_version!='3.7' or platform_machine=='aarch64' or platform_system=='AIX' or platform_python_implementation == 'PyPy'", + + # Override oldest-supported-numpy setting because pandas 0.25.0 requires 1.14.6 + "numpy==1.14.6; python_version=='3.7' and platform_machine!='aarch64' and platform_system!='AIX' and platform_python_implementation != 'PyPy'", "scipy>=1.1.0", ] From 95f5fb48e161625027cd245f941d15148c9e7949 Mon Sep 17 00:00:00 2001 From: mlondschien <61679398+mlondschien@users.noreply.github.com> Date: Fri, 4 Jun 2021 14:08:05 +0200 Subject: [PATCH 453/478] FIX Fix typo in error message in `fetch_openml` (#20201) --- sklearn/datasets/_openml.py | 2 +- sklearn/datasets/tests/test_openml.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index ec3c3a9ae961d..2eedf57fa085e 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -840,7 +840,7 @@ def fetch_openml( raise ValueError( "Dataset data_id={} and version={} passed, but you can only " "specify a numeric data_id or a version, not " - "both.".format(data_id, name)) + "both.".format(data_id, version)) else: raise ValueError( "Neither name nor data_id are provided. 
Please provide name or " diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 663d2ae3088ed..39cd4c9ee1912 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1216,15 +1216,16 @@ def test_fetch_openml_raises_missing_values_target(monkeypatch, gzip_response): def test_fetch_openml_raises_illegal_argument(): - msg = 'Dataset data_id=' + msg = 'Dataset data_id=-1 and version=version passed, but you can only' with pytest.raises(ValueError, match=msg): - fetch_openml(data_id=-1, name="name") + fetch_openml(data_id=-1, name=None, version="version") + msg = "Dataset data_id=-1 and name=name passed, but you can only" with pytest.raises(ValueError, match=msg): - fetch_openml(data_id=-1, name=None, version="version") + fetch_openml(data_id=-1, name="nAmE") with pytest.raises(ValueError, match=msg): - fetch_openml(data_id=-1, name="name", version="version") + fetch_openml(data_id=-1, name="nAmE", version="version") msg = ( "Neither name nor data_id are provided. " From 7b965c7893089ce7f22aca383fa14521c69204c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Fauchereau?= Date: Fri, 4 Jun 2021 12:11:14 +0000 Subject: [PATCH 454/478] FIX Fix error when using Calibrated with Voting (#20087) --- doc/whats_new/v1.0.rst | 4 ++++ sklearn/calibration.py | 34 ++++++++++++++++++------------- sklearn/tests/test_calibration.py | 20 +++++++++++++++++- 3 files changed, 43 insertions(+), 15 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index fac578bcb1b03..fc7950f3590e0 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -138,6 +138,10 @@ Changelog :class:`calibration.CalibratedClassifierCV` can now properly be used on prefitted pipelines. :pr:`19641` by :user:`Alek Lefebvre `. +- |Fix| Fixed an error when using a ::class:`ensemble.VotingClassifier` + as `base_estimator` in ::class:`calibration.CalibratedClassifierCV`. + :pr:`20087` by :user:`Clément Fauchereau `. + :mod:`sklearn.cluster` ...................... 
diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 084f3bf242e3c..abdbed1bb797b 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -257,9 +257,10 @@ def fit(self, X, y, sample_weight=None): check_is_fitted(self.base_estimator, attributes=["classes_"]) self.classes_ = self.base_estimator.classes_ - pred_method = _get_prediction_method(base_estimator) + pred_method, method_name = _get_prediction_method(base_estimator) n_classes = len(self.classes_) - predictions = _compute_predictions(pred_method, X, n_classes) + predictions = _compute_predictions(pred_method, method_name, X, + n_classes) calibrated_classifier = _fit_calibrator( base_estimator, predictions, y, self.classes_, self.method, @@ -310,12 +311,13 @@ def fit(self, X, y, sample_weight=None): ) else: this_estimator = clone(base_estimator) - method_name = _get_prediction_method(this_estimator).__name__ + _, method_name = _get_prediction_method(this_estimator) pred_method = partial( cross_val_predict, estimator=this_estimator, X=X, y=y, cv=cv, method=method_name, n_jobs=self.n_jobs ) - predictions = _compute_predictions(pred_method, X, n_classes) + predictions = _compute_predictions(pred_method, method_name, X, + n_classes) if sample_weight is not None and supports_sw: this_estimator.fit(X, y, sample_weight) @@ -441,8 +443,9 @@ def _fit_classifier_calibrator_pair(estimator, X, y, train, test, supports_sw, estimator.fit(X_train, y_train) n_classes = len(classes) - pred_method = _get_prediction_method(estimator) - predictions = _compute_predictions(pred_method, X_test, n_classes) + pred_method, method_name = _get_prediction_method(estimator) + predictions = _compute_predictions(pred_method, method_name, X_test, + n_classes) calibrated_classifier = _fit_calibrator( estimator, predictions, y_test, classes, method, sample_weight=sw_test @@ -465,18 +468,21 @@ def _get_prediction_method(clf): ------- prediction_method : callable The prediction method. + method_name : str + The name of the prediction method. """ if hasattr(clf, 'decision_function'): method = getattr(clf, 'decision_function') + return method, 'decision_function' elif hasattr(clf, 'predict_proba'): method = getattr(clf, 'predict_proba') + return method, 'predict_proba' else: raise RuntimeError("'base_estimator' has no 'decision_function' or " "'predict_proba' method.") - return method -def _compute_predictions(pred_method, X, n_classes): +def _compute_predictions(pred_method, method_name, X, n_classes): """Return predictions for `X` and reshape binary outputs to shape (n_samples, 1). @@ -485,6 +491,9 @@ def _compute_predictions(pred_method, X, n_classes): pred_method : callable Prediction method. + method_name: str + Name of the prediction method + X : array-like or None Data used to obtain predictions. @@ -498,10 +507,6 @@ def _compute_predictions(pred_method, X, n_classes): (X.shape[0], 1). """ predictions = pred_method(X=X) - if hasattr(pred_method, '__name__'): - method_name = pred_method.__name__ - else: - method_name = signature(pred_method).parameters['method'].default if method_name == 'decision_function': if predictions.ndim == 1: @@ -634,8 +639,9 @@ def predict_proba(self, X): The predicted probabilities. Can be exact zeros. 
""" n_classes = len(self.classes) - pred_method = _get_prediction_method(self.base_estimator) - predictions = _compute_predictions(pred_method, X, n_classes) + pred_method, method_name = _get_prediction_method(self.base_estimator) + predictions = _compute_predictions(pred_method, method_name, X, + n_classes) label_encoder = LabelEncoder().fit(self.classes) pos_class_indices = label_encoder.transform( diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py index 53d620b41031c..210d90f99f845 100644 --- a/sklearn/tests/test_calibration.py +++ b/sklearn/tests/test_calibration.py @@ -20,7 +20,8 @@ from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import KFold, cross_val_predict from sklearn.naive_bayes import MultinomialNB -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor, + VotingClassifier) from sklearn.svm import LinearSVC from sklearn.isotonic import IsotonicRegression from sklearn.feature_extraction import DictVectorizer @@ -607,3 +608,20 @@ def test_calibrated_classifier_cv_deprecation(data): calibrators, calib_clf.calibrated_classifiers_[0].calibrators ): assert clf1 is clf2 + + +def test_calibration_votingclassifier(): + # Check that `CalibratedClassifier` works with `VotingClassifier`. + # The method `predict_proba` from `VotingClassifier` is dynamically + # defined via a property that only works when voting="soft". + X, y = make_classification(n_samples=10, n_features=5, + n_classes=2, random_state=7) + vote = VotingClassifier( + estimators=[('dummy'+str(i), DummyClassifier()) for i in range(3)], + voting="soft" + ) + vote.fit(X, y) + + calib_clf = CalibratedClassifierCV(base_estimator=vote, cv="prefit") + # smoke test: should not raise an error + calib_clf.fit(X, y) From a1a6b3a9602283792ec4091cdb990be1afab9163 Mon Sep 17 00:00:00 2001 From: murata-yu <67666318+murata-yu@users.noreply.github.com> Date: Fri, 4 Jun 2021 22:23:05 +0900 Subject: [PATCH 455/478] FIX Fix RandomForestRegressor doesn't accept max_samples=1.0 (#20159) Co-authored-by: Olivier Grisel Co-authored-by: Thomas J. Fan --- doc/whats_new/v1.0.rst | 6 ++++ sklearn/ensemble/_forest.py | 17 ++++++----- sklearn/ensemble/tests/test_forest.py | 43 +++++++++++++++++++++++---- 3 files changed, 53 insertions(+), 13 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index fc7950f3590e0..ece6ff15ac51b 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -270,6 +270,12 @@ Changelog :class:`ensemble.StackingClassifier` and :class:`ensemble.StackingRegressor`. :pr:`19564` by `Thomas Fan`_. +- |Fix| Fixed the range of the argument max_samples to be (0.0, 1.0] + in :class:`ensemble.RandomForestClassifier`, + :class:`ensemble.RandomForestRegressor`, where `max_samples=1.0` is + interpreted as using all `n_samples` for bootstrapping. :pr:`20159` by + :user:`murata-yu`. + :mod:`sklearn.feature_extraction` ................................. 
diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 5a93acd0c0554..06ca0c171efc6 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -86,7 +86,7 @@ def _get_n_samples_bootstrap(n_samples, max_samples): max_samples : int or float The maximum number of samples to draw from the total available: - if float, this indicates a fraction of the total and should be - the interval `(0, 1)`; + the interval `(0.0, 1.0]`; - if int, this indicates the exact number of samples; - if None, this indicates the total number of samples. @@ -105,8 +105,8 @@ def _get_n_samples_bootstrap(n_samples, max_samples): return max_samples if isinstance(max_samples, numbers.Real): - if not (0 < max_samples < 1): - msg = "`max_samples` must be in range (0, 1) but got value {}" + if not (0 < max_samples <= 1): + msg = "`max_samples` must be in range (0.0, 1.0] but got value {}" raise ValueError(msg.format(max_samples)) return round(n_samples * max_samples) @@ -1163,7 +1163,7 @@ class RandomForestClassifier(ForestClassifier): - If None (default), then draw `X.shape[0]` samples. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. Thus, - `max_samples` should be in the interval `(0, 1)`. + `max_samples` should be in the interval `(0.0, 1.0]`. .. versionadded:: 0.22 @@ -1473,7 +1473,7 @@ class RandomForestRegressor(ForestRegressor): - If None (default), then draw `X.shape[0]` samples. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. Thus, - `max_samples` should be in the interval `(0, 1)`. + `max_samples` should be in the interval `(0.0, 1.0]`. .. versionadded:: 0.22 @@ -1557,6 +1557,7 @@ class RandomForestRegressor(ForestRegressor): >>> print(regr.predict([[0, 0, 0, 0]])) [-8.32987858] """ + def __init__(self, n_estimators=100, *, criterion="squared_error", @@ -1789,7 +1790,7 @@ class ExtraTreesClassifier(ForestClassifier): - If None (default), then draw `X.shape[0]` samples. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. Thus, - `max_samples` should be in the interval `(0, 1)`. + `max_samples` should be in the interval `(0.0, 1.0]`. .. versionadded:: 0.22 @@ -1873,6 +1874,7 @@ class labels (multi-output problem). >>> clf.predict([[0, 0, 0, 0]]) array([1]) """ + def __init__(self, n_estimators=100, *, criterion="gini", @@ -2095,7 +2097,7 @@ class ExtraTreesRegressor(ForestRegressor): - If None (default), then draw `X.shape[0]` samples. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. Thus, - `max_samples` should be in the interval `(0, 1)`. + `max_samples` should be in the interval `(0.0, 1.0]`. .. versionadded:: 0.22 @@ -2168,6 +2170,7 @@ class ExtraTreesRegressor(ForestRegressor): >>> reg.score(X_test, y_test) 0.2708... 
""" + def __init__(self, n_estimators=100, *, criterion="squared_error", diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index c74a1ca0c603e..52615d037cf63 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -50,6 +50,8 @@ from sklearn.svm import LinearSVC from sklearn.utils.validation import check_random_state +from sklearn.metrics import mean_squared_error + from sklearn.tree._classes import SPARSE_SPLITTERS @@ -1419,16 +1421,14 @@ def test_forest_degenerate_feature_importances(): 'max_samples, exc_type, exc_msg', [(int(1e9), ValueError, "`max_samples` must be in range 1 to 6 but got value 1000000000"), - (1.0, ValueError, - r"`max_samples` must be in range \(0, 1\) but got value 1.0"), (2.0, ValueError, - r"`max_samples` must be in range \(0, 1\) but got value 2.0"), + r"`max_samples` must be in range \(0.0, 1.0\] but got value 2.0"), (0.0, ValueError, - r"`max_samples` must be in range \(0, 1\) but got value 0.0"), + r"`max_samples` must be in range \(0.0, 1.0\] but got value 0.0"), (np.nan, ValueError, - r"`max_samples` must be in range \(0, 1\) but got value nan"), + r"`max_samples` must be in range \(0.0, 1.0\] but got value nan"), (np.inf, ValueError, - r"`max_samples` must be in range \(0, 1\) but got value inf"), + r"`max_samples` must be in range \(0.0, 1.0\] but got value inf"), ('str max_samples?!', TypeError, r"`max_samples` should be int or float, but got " r"type '\'"), @@ -1443,6 +1443,37 @@ def test_max_samples_exceptions(name, max_samples, exc_type, exc_msg): est.fit(X, y) +@pytest.mark.parametrize('name', FOREST_REGRESSORS) +def test_max_samples_boundary_regressors(name): + X_train, X_test, y_train, y_test = train_test_split( + X_reg, y_reg, train_size=0.7, test_size=0.3, random_state=0) + + ms_1_model = FOREST_REGRESSORS[name](max_samples=1.0, random_state=0) + ms_1_predict = ms_1_model.fit(X_train, y_train).predict(X_test) + + ms_None_model = FOREST_REGRESSORS[name](max_samples=None, random_state=0) + ms_None_predict = ms_None_model.fit(X_train, y_train).predict(X_test) + + ms_1_ms = mean_squared_error(ms_1_predict, y_test) + ms_None_ms = mean_squared_error(ms_None_predict, y_test) + + assert ms_1_ms == pytest.approx(ms_None_ms) + + +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +def test_max_samples_boundary_classifiers(name): + X_train, X_test, y_train, _ = train_test_split( + X_large, y_large, random_state=0, stratify=y_large) + + ms_1_model = FOREST_CLASSIFIERS[name](max_samples=1.0, random_state=0) + ms_1_proba = ms_1_model.fit(X_train, y_train).predict_proba(X_test) + + ms_None_model = FOREST_CLASSIFIERS[name](max_samples=None, random_state=0) + ms_None_proba = ms_None_model.fit(X_train, y_train).predict_proba(X_test) + + np.testing.assert_allclose(ms_1_proba, ms_None_proba) + + def test_forest_y_sparse(): X = [[1, 2, 3]] y = csr_matrix([4, 5, 6]) From 36915ae390fab4742f98c82dc6802f072c4effa5 Mon Sep 17 00:00:00 2001 From: Brian Sun <52805678+bsun94@users.noreply.github.com> Date: Sat, 5 Jun 2021 22:45:23 -0400 Subject: [PATCH 456/478] ENH Adds Poisson criterion in RandomForestRegressor (#19836) Co-authored-by: Christian Lorentzen Co-authored-by: Alihan Zihna Co-authored-by: Alihan Zihna Co-authored-by: Chiara Marmo Co-authored-by: Olivier Grisel Co-authored-by: naozin555 <37050583+naozin555@users.noreply.github.com> Co-authored-by: Venkatachalam N Co-authored-by: Thomas J. 
Fan --- doc/whats_new/v1.0.rst | 4 ++ sklearn/ensemble/_forest.py | 18 +++++- sklearn/ensemble/tests/test_forest.py | 89 +++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index ece6ff15ac51b..b66c87815bae7 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -270,6 +270,10 @@ Changelog :class:`ensemble.StackingClassifier` and :class:`ensemble.StackingRegressor`. :pr:`19564` by `Thomas Fan`_. +- |Enhancement| Documented and tested support of the Poisson criterion for + :class:`ensemble.RandomForestRegressor`. :pr:`19836` by + :user:`Brian Sun `. + - |Fix| Fixed the range of the argument max_samples to be (0.0, 1.0] in :class:`ensemble.RandomForestClassifier`, :class:`ensemble.RandomForestRegressor`, where `max_samples=1.0` is diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 06ca0c171efc6..bc29c0362bb3e 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -323,6 +323,14 @@ def fit(self, X, y, sample_weight=None): # [:, np.newaxis] that does not. y = np.reshape(y, (-1, 1)) + if self.criterion == "poisson": + if np.any(y < 0): + raise ValueError("Some value(s) of y are negative which is " + "not allowed for Poisson regression.") + if np.sum(y) <= 0: + raise ValueError("Sum of y is not strictly positive which " + "is necessary for Poisson regression.") + self.n_outputs_ = y.shape[1] y, expanded_class_weight = self._validate_y_class_weight(y) @@ -1324,16 +1332,20 @@ class RandomForestRegressor(ForestRegressor): The default value of ``n_estimators`` changed from 10 to 100 in 0.22. - criterion : {"squared_error", "mse", "absolute_error", "mae"}, \ + criterion : {"squared_error", "mse", "absolute_error", "poisson"}, \ default="squared_error" The function to measure the quality of a split. Supported criteria are "squared_error" for the mean squared error, which is equal to - variance reduction as feature selection criterion, and "absolute_error" - for the mean absolute error. + variance reduction as feature selection criterion, "absolute_error" + for the mean absolute error, and "poisson" which uses reduction in + Poisson deviance to find splits. .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion. + .. versionadded:: 1.0 + Poisson criterion. + .. deprecated:: 1.0 Criterion "mse" was deprecated in v1.0 and will be removed in version 1.2. Use `criterion="squared_error"` which is equivalent. 
diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 52615d037cf63..6c4aa905abe55 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -27,6 +27,8 @@ import joblib from numpy.testing import assert_allclose +from sklearn.dummy import DummyRegressor +from sklearn.metrics import mean_poisson_deviance from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal @@ -185,6 +187,76 @@ def test_regression(name, criterion): check_regression_criterion(name, criterion) +def test_poisson_vs_mse(): + """Test that random forest with poisson criterion performs better than + mse for a poisson target.""" + rng = np.random.RandomState(42) + n_train, n_test, n_features = 500, 500, 10 + X = datasets.make_low_rank_matrix(n_samples=n_train + n_test, + n_features=n_features, random_state=rng) + X = np.abs(X) + X /= np.max(np.abs(X), axis=0) + # We create a log-linear Poisson model + coef = rng.uniform(low=-4, high=1, size=n_features) + y = rng.poisson(lam=np.exp(X @ coef)) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test, + random_state=rng) + + forest_poi = RandomForestRegressor( + criterion="poisson", + min_samples_leaf=10, + max_features="sqrt", + random_state=rng) + forest_mse = RandomForestRegressor( + criterion="squared_error", + min_samples_leaf=10, + max_features="sqrt", + random_state=rng) + + forest_poi.fit(X_train, y_train) + forest_mse.fit(X_train, y_train) + dummy = DummyRegressor(strategy="mean").fit(X_train, y_train) + + for X, y, val in [(X_train, y_train, "train"), (X_test, y_test, "test")]: + metric_poi = mean_poisson_deviance(y, forest_poi.predict(X)) + # squared_error forest might produce non-positive predictions => clip + # If y = 0 for those, the poisson deviance gets too good. + # If we drew more samples, we would eventually get y > 0 and the + # poisson deviance would explode, i.e. be undefined. Therefore, we do + # not clip to a tiny value like 1e-15, but to 0.1. This acts like a + # mild penalty to the non-positive predictions. + metric_mse = mean_poisson_deviance( + y, + np.clip(forest_mse.predict(X), 1e-6, None)) + metric_dummy = mean_poisson_deviance(y, dummy.predict(X)) + # As squared_error might correctly predict 0 in train set, its train + # score can be better than Poisson. This is no longer the case for the + # test set. But keep the above comment for clipping in mind. + if val == "test": + assert metric_poi < metric_mse + assert metric_poi < metric_dummy + + +@pytest.mark.parametrize('criterion', ('poisson', 'squared_error')) +def test_balance_property_random_forest(criterion): + """"Test that sum(y_pred)==sum(y_true) on the training set.""" + rng = np.random.RandomState(42) + n_train, n_test, n_features = 500, 500, 10 + X = datasets.make_low_rank_matrix(n_samples=n_train + n_test, + n_features=n_features, random_state=rng) + + coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0) + y = rng.poisson(lam=np.exp(X @ coef)) + + reg = RandomForestRegressor(criterion=criterion, + n_estimators=10, + bootstrap=False, + random_state=rng) + reg.fit(X, y) + + assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y)) + + def check_regressor_attributes(name): # Regression models should not have a classes_ attribute. 
r = FOREST_REGRESSORS[name](random_state=0) @@ -1367,6 +1439,23 @@ def test_min_impurity_decrease(): assert tree.min_impurity_decrease == 0.1 +def test_poisson_y_positive_check(): + est = RandomForestRegressor(criterion="poisson") + X = np.zeros((3, 3)) + + y = [-1, 1, 3] + err_msg = (r"Some value\(s\) of y are negative which is " + r"not allowed for Poisson regression.") + with pytest.raises(ValueError, match=err_msg): + est.fit(X, y) + + y = [0, 0, 0] + err_msg = (r"Sum of y is not strictly positive which " + r"is necessary for Poisson regression.") + with pytest.raises(ValueError, match=err_msg): + est.fit(X, y) + + # mypy error: Variable "DEFAULT_JOBLIB_BACKEND" is not valid type class MyBackend(DEFAULT_JOBLIB_BACKEND): # type: ignore def __init__(self, *args, **kwargs): From 9884ccd609b818e2a87ea1cb4dfde56a0b624860 Mon Sep 17 00:00:00 2001 From: Nanshan Li Date: Mon, 7 Jun 2021 00:58:35 +0800 Subject: [PATCH 457/478] TST Replace assert_warns from decomposition/tests (#20214) --- sklearn/decomposition/tests/test_fastica.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index 9f37ac25c2f76..4379b07697d0c 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -10,7 +10,6 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_warns from sklearn.decomposition import FastICA, fastica, PCA from sklearn.decomposition._fastica import _gs_decorrelation @@ -141,7 +140,9 @@ def test_fastica_nowhiten(): # test for issue #697 ica = FastICA(n_components=1, whiten=False, random_state=0) - assert_warns(UserWarning, ica.fit, m) + warn_msg = "Ignoring n_components with whiten=False." + with pytest.warns(UserWarning, match=warn_msg): + ica.fit(m) assert hasattr(ica, 'mixing_') @@ -164,9 +165,14 @@ def test_fastica_convergence_fail(): m = np.dot(mixing, s) # Do fastICA with tolerance 0. to ensure failing convergence - ica = FastICA(algorithm="parallel", n_components=2, random_state=rng, - max_iter=2, tol=0.) - assert_warns(ConvergenceWarning, ica.fit, m.T) + warn_msg = ( + "FastICA did not converge. Consider increasing tolerance " + "or the maximum number of iterations." + ) + with pytest.warns(ConvergenceWarning, match=warn_msg): + ica = FastICA(algorithm="parallel", n_components=2, random_state=rng, + max_iter=2, tol=0.) 
+ ica.fit(m.T) @pytest.mark.parametrize('add_noise', [True, False]) From 800aee6d48be102a27a6f6d3df1822e52c628951 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 7 Jun 2021 12:29:45 +0200 Subject: [PATCH 458/478] TST check n_features_in_ in pipeline module (#20192) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Olivier Grisel Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> Co-authored-by: Olivier Grisel --- sklearn/pipeline.py | 16 ++++++++ sklearn/tests/test_common.py | 1 - sklearn/tests/test_docstring_parameters.py | 5 +-- sklearn/tests/test_metaestimators.py | 44 ++++++++++++++-------- 4 files changed, 47 insertions(+), 19 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index e2ff6806ff3da..090d157b069bf 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -83,6 +83,13 @@ class Pipeline(_BaseComposition): Read-only attribute to access any step parameter by user given name. Keys are step names and values are steps parameters. + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying first estimator in `steps` exposes such an attribute + when fit. + + .. versionadded:: 0.24 + See Also -------- make_pipeline : Convenience function for simplified pipeline construction. @@ -826,6 +833,15 @@ class FeatureUnion(TransformerMixin, _BaseComposition): If True, the time elapsed while fitting each transformer will be printed as it is completed. + Attributes + ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying first transformer in `transformer_list` exposes such an + attribute when fit. + + .. versionadded:: 0.24 + See Also -------- make_union : Convenience function for simplified feature union diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index bbffd7fa197cf..5e190437ca4a9 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -263,7 +263,6 @@ def test_search_cv(estimator, check, request): N_FEATURES_IN_AFTER_FIT_MODULES_TO_IGNORE = { 'model_selection', 'multioutput', - 'pipeline', } N_FEATURES_IN_AFTER_FIT_ESTIMATORS = [ diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 97da48f1e6524..74a3e91a52a32 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -196,7 +196,6 @@ def _construct_searchcv_instance(SearchCV): 'naive_bayes', 'neighbors', 'neural_network', - 'pipeline', 'preprocessing', 'random_projection', 'semi_supervised', @@ -215,10 +214,10 @@ def test_fit_docstring_attributes(name, Estimator): attributes = doc['Attributes'] IGNORED = {'ClassifierChain', 'ColumnTransformer', - 'CountVectorizer', 'DictVectorizer', 'FeatureUnion', + 'CountVectorizer', 'DictVectorizer', 'GaussianRandomProjection', 'MultiOutputClassifier', 'MultiOutputRegressor', - 'NoSampleWeightWrapper', 'Pipeline', 'RFE', 'RFECV', + 'NoSampleWeightWrapper', 'RFE', 'RFECV', 'RegressorChain', 'SelectFromModel', 'SparseCoder', 'SparseRandomProjection', 'SpectralBiclustering', 'StackingClassifier', diff --git a/sklearn/tests/test_metaestimators.py b/sklearn/tests/test_metaestimators.py index 9a19008c3b322..5e9057429fa94 100644 --- a/sklearn/tests/test_metaestimators.py +++ b/sklearn/tests/test_metaestimators.py @@ -21,6 +21,7 @@ from sklearn.exceptions import NotFittedError from sklearn.semi_supervised import 
SelfTrainingClassifier from sklearn.linear_model import Ridge, LogisticRegression +from sklearn.preprocessing import StandardScaler, MaxAbsScaler class DelegatorData: @@ -185,6 +186,19 @@ def _generate_meta_estimator_instances_with_pipeline(): else: yield Estimator(estimator) + elif "transformer_list" in sig: + # FeatureUnion + transformer_list = [ + ("trans1", make_pipeline(TfidfVectorizer(), MaxAbsScaler())), + ( + "trans2", + make_pipeline( + TfidfVectorizer(), StandardScaler(with_mean=False) + ), + ), + ] + yield Estimator(transformer_list) + elif "estimators" in sig: # stacking, voting if is_regressor(Estimator): @@ -211,21 +225,21 @@ def _generate_meta_estimator_instances_with_pipeline(): # They should be able to work on any data and delegate data validation to # their inner estimator(s). DATA_VALIDATION_META_ESTIMATORS_TO_IGNORE = [ - "AdaBoostClassifier", - "AdaBoostRegressor", - "BaggingClassifier", - "BaggingRegressor", - "ClassifierChain", - "IterativeImputer", - "MultiOutputClassifier", - "MultiOutputRegressor", - "OneVsOneClassifier", # input validation can't be avoided - "RANSACRegressor", - "RFE", - "RFECV", - "RegressorChain", - "SelfTrainingClassifier", - "SequentialFeatureSelector" # not applicable (2D data mandatory) + "AdaBoostClassifier", + "AdaBoostRegressor", + "BaggingClassifier", + "BaggingRegressor", + "ClassifierChain", + "IterativeImputer", + "MultiOutputClassifier", + "MultiOutputRegressor", + "OneVsOneClassifier", # input validation can't be avoided + "RANSACRegressor", + "RFE", + "RFECV", + "RegressorChain", + "SelfTrainingClassifier", + "SequentialFeatureSelector", # not applicable (2D data mandatory) ] DATA_VALIDATION_META_ESTIMATORS = [ From 778125645fbc84d6749c7b506662e12deb90c018 Mon Sep 17 00:00:00 2001 From: mlondschien <61679398+mlondschien@users.noreply.github.com> Date: Mon, 7 Jun 2021 16:45:22 +0200 Subject: [PATCH 459/478] Allow `n_knots=None` if knots are explicitly specified in `SplineTransformer` (#20191) Co-authored-by: Olivier Grisel --- sklearn/preprocessing/_polynomial.py | 15 +++++++++------ sklearn/preprocessing/tests/test_polynomial.py | 6 +++--- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index ac4703dbb4cb2..930e85c783711 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -350,7 +350,8 @@ class SplineTransformer(TransformerMixin, BaseEstimator): ---------- n_knots : int, default=5 Number of knots of the splines if `knots` equals one of - {'uniform', 'quantile'}. Must be larger or equal 2. + {'uniform', 'quantile'}. Must be larger or equal 2. Ignored if `knots` + is array-like. degree : int, default=3 The polynomial degree of the spline basis. 
Must be a non-negative @@ -546,15 +547,17 @@ def fit(self, X, y=None): ): raise ValueError("degree must be a non-negative integer.") - if not ( - isinstance(self.n_knots, numbers.Integral) and self.n_knots >= 2 - ): - raise ValueError("n_knots must be a positive integer >= 2.") - if isinstance(self.knots, str) and self.knots in [ "uniform", "quantile", ]: + if not ( + isinstance(self.n_knots, numbers.Integral) + and self.n_knots >= 2 + ): + raise ValueError("n_knots must be a positive integer >= 2, " + f"got: {self.n_knots}") + base_knots = self._get_base_knot_positions( X, n_knots=self.n_knots, knots=self.knots ) diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py index 1f70ec9854a54..b9be4e775b8d3 100644 --- a/sklearn/preprocessing/tests/test_polynomial.py +++ b/sklearn/preprocessing/tests/test_polynomial.py @@ -96,9 +96,9 @@ def test_spline_transformer_manual_knot_input(): """ X = np.arange(20).reshape(10, 2) knots = [[0.5, 1], [1.5, 2], [5, 10]] - st1 = SplineTransformer(degree=3, knots=knots).fit(X) + st1 = SplineTransformer(degree=3, knots=knots, n_knots=None).fit(X) knots = np.asarray(knots) - st2 = SplineTransformer(degree=3, knots=knots).fit(X) + st2 = SplineTransformer(degree=3, knots=knots, n_knots=None).fit(X) for i in range(X.shape[1]): assert_allclose(st1.bsplines_[i].t, st2.bsplines_[i].t) @@ -216,7 +216,7 @@ def test_spline_transformer_linear_regression(bias, intercept): ("uniform", 12, 8), ( [[-1.0, 0.0], [0, 1.0], [0.1, 2.0], [0.2, 3.0], [0.3, 4.0], [1, 5.0]], - 100, # this gets ignored. + None, 3 ) ]) From 673625b29466310fa86a06b0a1577150cd34cc8a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 7 Jun 2021 18:15:17 +0200 Subject: [PATCH 460/478] FIX make check_complex_data deterministic (#20221) --- sklearn/utils/estimator_checks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 2058c8308ec29..cb1c96adbd153 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -967,6 +967,7 @@ def check_complex_data(name, estimator_orig): # Something both valid for classification and regression y = rng.randint(low=0, high=2, size=10) + 1j estimator = clone(estimator_orig) + set_random_state(estimator, random_state=0) with raises(ValueError, match="Complex data not supported"): estimator.fit(X, y) From b15e312b29ddc9d527aa33002a0844b21e8dfb5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Mon, 7 Jun 2021 23:48:17 +0200 Subject: [PATCH 461/478] TST test_fit_docstring_attributes include properties (#20190) --- sklearn/cluster/_bicluster.py | 3 +++ sklearn/model_selection/_search.py | 8 ++++++ sklearn/tests/test_docstring_parameters.py | 29 ++++++++++++++++++++-- 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index c8ff1bb036662..9267052b48f75 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -255,6 +255,9 @@ class SpectralCoclustering(BaseSpectral): column_labels_ : array-like of shape (n_cols,) The bicluster label of each column. + biclusters_ : tuple of two ndarrays + The tuple contains the `rows_` and `columns_` arrays. 
+ Examples -------- >>> from sklearn.cluster import SpectralCoclustering diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 07ad3d7dbafe5..3ee0bcc4ec153 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -1177,6 +1177,10 @@ class GridSearchCV(BaseSearchCV): multimetric_ : bool Whether or not the scorers compute several metrics. + classes_ : ndarray of shape (n_classes,) + The classes labels. This is present only if ``refit`` is specified and + the underlying estimator is a classifier. + Notes ----- The parameters selected are those that maximize the score of the left out @@ -1499,6 +1503,10 @@ class RandomizedSearchCV(BaseSearchCV): multimetric_ : bool Whether or not the scorers compute several metrics. + classes_ : ndarray of shape (n_classes,) + The classes labels. This is present only if ``refit`` is specified and + the underlying estimator is a classifier. + Notes ----- The parameters selected are those that maximize the score of the held-out diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 74a3e91a52a32..a3a0605308c79 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -224,6 +224,7 @@ def test_fit_docstring_attributes(name, Estimator): 'StackingRegressor', 'TfidfVectorizer', 'VotingClassifier', 'VotingRegressor', 'SequentialFeatureSelector', 'HalvingGridSearchCV', 'HalvingRandomSearchCV'} + if Estimator.__name__ in IGNORED or Estimator.__name__.startswith('_'): pytest.skip("Estimator cannot be fit easily to test fit attributes") @@ -284,10 +285,34 @@ def test_fit_docstring_attributes(name, Estimator): with ignore_warnings(category=FutureWarning): assert hasattr(est, attr.name) - fit_attr = [k for k in est.__dict__.keys() if k.endswith('_') - and not k.startswith('_')] + fit_attr = _get_all_fitted_attributes(est) fit_attr_names = [attr.name for attr in attributes] undocumented_attrs = set(fit_attr).difference(fit_attr_names) undocumented_attrs = set(undocumented_attrs).difference(skipped_attributes) assert not undocumented_attrs,\ "Undocumented attributes: {}".format(undocumented_attrs) + + +def _get_all_fitted_attributes(estimator): + "Get all the fitted attributes of an estimator including properties" + # attributes + fit_attr = list(estimator.__dict__.keys()) + + # properties + with warnings.catch_warnings(): + warnings.filterwarnings("error", category=FutureWarning) + + for name in dir(estimator.__class__): + obj = getattr(estimator.__class__, name) + if not isinstance(obj, property): + continue + + # ignore properties that raises an AttributeError and deprecated + # properties + try: + getattr(estimator, name) + except (AttributeError, FutureWarning): + continue + fit_attr.append(name) + + return [k for k in fit_attr if k.endswith('_') and not k.startswith('_')] From 7f308675a75cabb2222d61b5d6f293e85c43581c Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 7 Jun 2021 19:39:34 -0400 Subject: [PATCH 462/478] FIX Uses the color max for colormap in ConfusionMatrixDisplay (#19784) --- doc/whats_new/v1.0.rst | 3 +++ sklearn/metrics/_plot/confusion_matrix.py | 2 +- .../_plot/tests/test_confusion_matrix_display.py | 14 ++++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index b66c87815bae7..02c77459ddc22 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -403,6 +403,9 @@ Changelog are integral. 
:pr:`9843` by :user:`Jon Crall `. +- |Fix| :meth:`metrics.ConfusionMatrixDisplay.plot` uses the correct max + for colormap. :pr:`19784` by `Thomas Fan`_. + - |Fix| Samples with zero `sample_weight` values do not affect the results from :func:`metrics.det_curve`, :func:`metrics.precision_recall_curve` and :func:`metrics.roc_curve`. diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index dd941a7e28e43..06d2d002a8191 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -119,7 +119,7 @@ def plot(self, *, include_values=True, cmap='viridis', n_classes = cm.shape[0] self.im_ = ax.imshow(cm, interpolation='nearest', cmap=cmap) self.text_ = None - cmap_min, cmap_max = self.im_.cmap(0), self.im_.cmap(256) + cmap_min, cmap_max = self.im_.cmap(0), self.im_.cmap(1.0) if include_values: self.text_ = np.empty_like(cm, dtype=object) diff --git a/sklearn/metrics/_plot/tests/test_confusion_matrix_display.py b/sklearn/metrics/_plot/tests/test_confusion_matrix_display.py index ed0bc04117396..b1498afae89ae 100644 --- a/sklearn/metrics/_plot/tests/test_confusion_matrix_display.py +++ b/sklearn/metrics/_plot/tests/test_confusion_matrix_display.py @@ -380,3 +380,17 @@ def test_confusion_matrix_with_unknown_labels(pyplot, constructor_name): display_labels = [tick.get_text() for tick in disp.ax_.get_xticklabels()] expected_labels = [str(i) for i in range(n_classes + 1)] assert_array_equal(expected_labels, display_labels) + + +def test_colormap_max(pyplot): + """Check that the max color is used for the color of the text.""" + + from matplotlib import cm + gray = cm.get_cmap('gray', 1024) + confusion_matrix = np.array([[1.0, 0.0], [0.0, 1.0]]) + + disp = ConfusionMatrixDisplay(confusion_matrix) + disp.plot(cmap=gray) + + color = disp.text_[1, 0].get_color() + assert_allclose(color, [1.0, 1.0, 1.0, 1.0]) From 5c3cb6b0af04344d41d542b718d682604d6aa685 Mon Sep 17 00:00:00 2001 From: solosilence Date: Tue, 8 Jun 2021 06:29:59 +0530 Subject: [PATCH 463/478] STY Changing .format method to f-string formatting (#20215) --- benchmarks/bench_20newsgroups.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/benchmarks/bench_20newsgroups.py b/benchmarks/bench_20newsgroups.py index 8efc740e937da..9546c8f1d6a39 100644 --- a/benchmarks/bench_20newsgroups.py +++ b/benchmarks/bench_20newsgroups.py @@ -46,18 +46,16 @@ print("20 newsgroups") print("=============") - print("X_train.shape = {0}".format(X_train.shape)) - print("X_train.format = {0}".format(X_train.format)) - print("X_train.dtype = {0}".format(X_train.dtype)) - print("X_train density = {0}" - "".format(X_train.nnz / np.product(X_train.shape))) - print("y_train {0}".format(y_train.shape)) - print("X_test {0}".format(X_test.shape)) - print("X_test.format = {0}".format(X_test.format)) - print("X_test.dtype = {0}".format(X_test.dtype)) - print("y_test {0}".format(y_test.shape)) + print(f"X_train.shape = {X_train.shape}") + print(f"X_train.format = {X_train.format}") + print(f"X_train.dtype = {X_train.dtype}") + print(f"X_train density = {X_train.nnz / np.product(X_train.shape)}") + print(f"y_train {y_train.shape}") + print(f"X_test {X_test.shape}") + print(f"X_test.format = {X_test.format}") + print(f"X_test.dtype = {X_test.dtype}") + print(f"y_test {y_test.shape}") print() - print("Classifier Training") print("===================") accuracy, train_time, test_time = {}, {}, {} From c53d33ea965edee4fd59f85181694efc437c0e8b Mon Sep 17 
00:00:00 2001 From: Shao Yang Hong Date: Wed, 9 Jun 2021 16:01:00 +0800 Subject: [PATCH 464/478] [MRG] Listed valid metrics in neighbors.rst (#19379) Co-authored-by: Julien Jerphanion Co-authored-by: Thomas J. Fan Co-authored-by: Chiara Marmo --- doc/modules/neighbors.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index bb84b79e8570a..f394f011af11a 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -464,6 +464,20 @@ leaf nodes. The level of this switch can be specified with the parameter ``leaf_size`` is not referenced for brute force queries. +Valid Metrics for Nearest Neighbor Algorithms +--------------------------------------------- + +For a list of available metrics, see the documentation of the :class:`DistanceMetric` +class. + +A list of valid metrics for any of the above algorithms can be obtained by using their +``valid_metric`` attribute. For example, valid metrics for ``KDTree`` can be generated by: + + >>> from sklearn.neighbors import KDTree + >>> print(sorted(KDTree.valid_metrics)) + ['chebyshev', 'cityblock', 'euclidean', 'infinity', 'l1', 'l2', 'manhattan', 'minkowski', 'p'] + + .. _nearest_centroid_classifier: Nearest Centroid Classifier From 45fc4b76f3ebcf36d3c470d80c85f652a7a0c322 Mon Sep 17 00:00:00 2001 From: Nanshan Li Date: Wed, 9 Jun 2021 17:04:58 +0800 Subject: [PATCH 465/478] DOC Document n_features_in_ in cluster (#20228) --- sklearn/cluster/_affinity_propagation.py | 3 +++ sklearn/cluster/_agglomerative.py | 6 ++++++ sklearn/cluster/_bicluster.py | 6 ++++++ sklearn/cluster/_birch.py | 3 +++ sklearn/cluster/_dbscan.py | 3 +++ sklearn/cluster/_kmeans.py | 6 ++++++ sklearn/cluster/_mean_shift.py | 3 +++ sklearn/cluster/_optics.py | 3 +++ sklearn/cluster/_spectral.py | 3 +++ sklearn/tests/test_docstring_parameters.py | 1 - 10 files changed, 36 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 59620ab31f63d..67c1fb42b650b 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -309,6 +309,9 @@ class AffinityPropagation(ClusterMixin, BaseEstimator): n_iter_ : int Number of iterations taken to converge. + n_features_in_ : int + Number of features seen during :term:`fit`. + Notes ----- For an example, see :ref:`examples/cluster/plot_affinity_propagation.py diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 4b0089b707233..05f57ff238bcf 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -773,6 +773,9 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): .. versionadded:: 0.21 ``n_connected_components_`` was added to replace ``n_components_``. + n_features_in_ : int + Number of features seen during :term:`fit`. + children_ : array-like of shape (n_samples-1, 2) The children of each non-leaf node. Values less than `n_samples` correspond to leaves of the tree which are the original samples. @@ -1039,6 +1042,9 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): .. versionadded:: 0.21 ``n_connected_components_`` was added to replace ``n_components_``. + n_features_in_ : int + Number of features seen during :term:`fit`. + children_ : array-like of shape (n_nodes-1, 2) The children of each non-leaf node. Values less than `n_features` correspond to leaves of the tree which are the original samples. 
diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index 9267052b48f75..1be7dd4e64186 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -258,6 +258,9 @@ class SpectralCoclustering(BaseSpectral): biclusters_ : tuple of two ndarrays The tuple contains the `rows_` and `columns_` arrays. + n_features_in_ : int + Number of features seen during :term:`fit`. + Examples -------- >>> from sklearn.cluster import SpectralCoclustering @@ -395,6 +398,9 @@ class SpectralBiclustering(BaseSpectral): column_labels_ : array-like of shape (n_cols,) Column partition labels. + n_features_in_ : int + Number of features seen during :term:`fit`. + Examples -------- >>> from sklearn.cluster import SpectralBiclustering diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index da1bf894f03f8..81c9312f1488a 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -401,6 +401,9 @@ class Birch(ClusterMixin, TransformerMixin, BaseEstimator): if partial_fit is used instead of fit, they are assigned to the last batch of data. + n_features_in_ : int + Number of features seen during :term:`fit`. + See Also -------- MiniBatchKMeans : Alternative implementation that does incremental updates diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index bbc3470256e90..abbb35e6e04af 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -217,6 +217,9 @@ class DBSCAN(ClusterMixin, BaseEstimator): Cluster labels for each point in the dataset given to fit(). Noisy samples are given the label -1. + n_features_in_ : int + Number of features seen during :term:`fit`. + Examples -------- >>> from sklearn.cluster import DBSCAN diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 8b24be6ace987..fc9ba7a868d10 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -766,6 +766,9 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): n_iter_ : int Number of iterations run. + n_features_in_ : int + Number of features seen during :term:`fit`. + See Also -------- MiniBatchKMeans : Alternative online implementation that does incremental @@ -1465,6 +1468,9 @@ class MiniBatchKMeans(KMeans): This attribute is deprecated in 0.24 and will be removed in 1.1 (renaming of 0.26). + n_features_in_ : int + Number of features seen during :term:`fit`. + See Also -------- KMeans : The classic implementation of the clustering method based on the diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index f48ef46e8dbef..147ec6c626eb0 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -311,6 +311,9 @@ class MeanShift(ClusterMixin, BaseEstimator): .. versionadded:: 0.22 + n_features_in_ : int + Number of features seen during :term:`fit`. + Examples -------- >>> from sklearn.cluster import MeanShift diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index af0e8531aa7b8..0f2b96346660b 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -178,6 +178,9 @@ class OPTICS(ClusterMixin, BaseEstimator): ``X[ordering_][start:end + 1]`` form a cluster. Only available when ``cluster_method='xi'``. + n_features_in_ : int + Number of features seen during :term:`fit`. + See Also -------- DBSCAN : A similar clustering for a specified neighborhood radius (eps). 
diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index cda6dac64ee54..de0192987f595 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -418,6 +418,9 @@ class SpectralClustering(ClusterMixin, BaseEstimator): labels_ : ndarray of shape (n_samples,) Labels of each point + n_features_in_ : int + Number of features seen during :term:`fit`. + Examples -------- >>> from sklearn.cluster import SpectralClustering diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index a3a0605308c79..85d8ad0cf6a36 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -176,7 +176,6 @@ def _construct_searchcv_instance(SearchCV): N_FEATURES_MODULES_TO_IGNORE = { - 'cluster', 'compose', 'covariance', 'decomposition', From a25382629b6c3a2bb41d486a45f9dde6ccd021dc Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 9 Jun 2021 11:22:59 +0200 Subject: [PATCH 466/478] TST make sure to test SearchCV on both classification and regression (#20202) --- sklearn/tests/test_common.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 5e190437ca4a9..6588c677854ac 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -12,6 +12,7 @@ import re import pkgutil from inspect import isgenerator +from itertools import product from functools import partial import pytest @@ -212,8 +213,11 @@ def test_class_support_removed(): def _generate_search_cv_instances(): - for SearchCV, (Estimator, param_grid) in zip( - [GridSearchCV, RandomizedSearchCV], + for SearchCV, (Estimator, param_grid) in product( + [ + GridSearchCV, + RandomizedSearchCV, + ], [ (Ridge, {"alpha": [0.1, 1.0]}), (LogisticRegression, {"C": [0.1, 1.0]}), @@ -221,8 +225,11 @@ def _generate_search_cv_instances(): ): yield SearchCV(Estimator(), param_grid) - for SearchCV, (Estimator, param_grid) in zip( - [GridSearchCV, RandomizedSearchCV], + for SearchCV, (Estimator, param_grid) in product( + [ + GridSearchCV, + RandomizedSearchCV, + ], [ (Ridge, {"ridge__alpha": [0.1, 1.0]}), (LogisticRegression, {"logisticregression__C": [0.1, 1.0]}), From 007da8db4a90de82aa6ca46fc51e33c846599994 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 9 Jun 2021 10:49:58 -0400 Subject: [PATCH 467/478] FIX Do not reset for non-fit in multiclass (#20205) --- sklearn/multiclass.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 99a6db2051030..ad420506a9694 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -114,25 +114,34 @@ def _check_estimator(estimator): class _ConstantPredictor(BaseEstimator): def fit(self, X, y): - self._check_n_features(X, reset=True) + check_params = dict(force_all_finite=False, dtype=None, + ensure_2d=False, accept_sparse=True) + self._validate_data(X, y, reset=True, + validate_separately=(check_params, check_params)) self.y_ = y return self def predict(self, X): check_is_fitted(self) - self._check_n_features(X, reset=True) + self._validate_data(X, force_all_finite=False, dtype=None, + accept_sparse=True, + ensure_2d=False, reset=False) return np.repeat(self.y_, _num_samples(X)) def decision_function(self, X): check_is_fitted(self) - self._check_n_features(X, reset=True) + self._validate_data(X, force_all_finite=False, dtype=None, + accept_sparse=True, + ensure_2d=False, reset=False) return np.repeat(self.y_, _num_samples(X)) def predict_proba(self, X): check_is_fitted(self) - self._check_n_features(X, reset=True) + self._validate_data(X, force_all_finite=False, dtype=None, + accept_sparse=True, + ensure_2d=False, reset=False) return np.repeat([np.hstack([1 - self.y_, self.y_])], _num_samples(X), axis=0) From 1cd282d600088d2547d827af72a99e036106417a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 9 Jun 2021 16:58:03 +0200 Subject: [PATCH 468/478] DOC add n_features_in_ in the documentation (#20236) --- sklearn/cluster/_affinity_propagation.py | 2 + sklearn/cluster/_agglomerative.py | 4 + sklearn/cluster/_bicluster.py | 4 + sklearn/cluster/_birch.py | 2 + sklearn/cluster/_dbscan.py | 2 + sklearn/cluster/_kmeans.py | 4 + sklearn/cluster/_mean_shift.py | 2 + sklearn/cluster/_optics.py | 2 + sklearn/cluster/_spectral.py | 2 + sklearn/covariance/_elliptic_envelope.py | 5 ++ sklearn/covariance/_empirical_covariance.py | 5 ++ sklearn/covariance/_graph_lasso.py | 10 +++ sklearn/covariance/_robust_covariance.py | 5 ++ sklearn/covariance/_shrunk_covariance.py | 15 ++++ sklearn/decomposition/_dict_learning.py | 10 +++ sklearn/decomposition/_factor_analysis.py | 5 ++ sklearn/decomposition/_fastica.py | 5 ++ sklearn/decomposition/_incremental_pca.py | 5 ++ sklearn/decomposition/_kernel_pca.py | 5 ++ sklearn/decomposition/_lda.py | 5 ++ sklearn/decomposition/_nmf.py | 5 ++ sklearn/decomposition/_pca.py | 5 ++ sklearn/decomposition/_sparse_pca.py | 10 +++ sklearn/decomposition/_truncated_svd.py | 5 ++ sklearn/discriminant_analysis.py | 10 +++ sklearn/dummy.py | 10 +++ sklearn/ensemble/_bagging.py | 10 +++ sklearn/ensemble/_forest.py | 25 +++++++ sklearn/ensemble/_gb.py | 10 +++ .../gradient_boosting.py | 8 ++ sklearn/ensemble/_iforest.py | 5 ++ sklearn/ensemble/_weight_boosting.py | 10 +++ .../_univariate_selection.py | 30 ++++++++ .../feature_selection/_variance_threshold.py | 5 ++ sklearn/gaussian_process/_gpc.py | 5 ++ sklearn/gaussian_process/_gpr.py | 5 ++ sklearn/impute/_base.py | 10 +++ sklearn/impute/_iterative.py | 5 ++ sklearn/impute/_knn.py | 5 ++ sklearn/kernel_approximation.py | 24 ++++++ sklearn/kernel_ridge.py | 5 ++ sklearn/linear_model/_base.py | 5 ++ sklearn/linear_model/_bayes.py | 10 +++ sklearn/linear_model/_coordinate_descent.py | 40 ++++++++++ sklearn/linear_model/_glm/glm.py 
| 15 ++++ sklearn/linear_model/_huber.py | 5 ++ sklearn/linear_model/_least_angle.py | 24 ++++++ sklearn/linear_model/_logistic.py | 9 +++ sklearn/linear_model/_omp.py | 10 +++ sklearn/linear_model/_passive_aggressive.py | 10 +++ sklearn/linear_model/_perceptron.py | 5 ++ sklearn/linear_model/_quantile.py | 5 ++ sklearn/linear_model/_ransac.py | 5 ++ sklearn/linear_model/_ridge.py | 20 +++++ sklearn/linear_model/_stochastic_gradient.py | 15 ++++ sklearn/linear_model/_theil_sen.py | 5 ++ sklearn/manifold/_isomap.py | 5 ++ sklearn/manifold/_locally_linear.py | 5 ++ sklearn/manifold/_mds.py | 5 ++ sklearn/manifold/_spectral_embedding.py | 5 ++ sklearn/manifold/_t_sne.py | 5 ++ .../_search_successive_halving.py | 14 ++++ sklearn/naive_bayes.py | 31 +++++++- sklearn/neighbors/_classification.py | 10 +++ sklearn/neighbors/_graph.py | 10 +++ sklearn/neighbors/_kde.py | 5 ++ sklearn/neighbors/_lof.py | 5 ++ sklearn/neighbors/_nca.py | 5 ++ sklearn/neighbors/_nearest_centroid.py | 5 ++ sklearn/neighbors/_regression.py | 10 +++ sklearn/neighbors/_unsupervised.py | 5 ++ .../neural_network/_multilayer_perceptron.py | 10 +++ sklearn/neural_network/_rbm.py | 5 ++ sklearn/preprocessing/_data.py | 75 +++++++++++++++---- sklearn/preprocessing/_discretization.py | 11 ++- sklearn/preprocessing/_polynomial.py | 45 ++++++----- sklearn/semi_supervised/_label_propagation.py | 10 +++ sklearn/semi_supervised/_self_training.py | 5 ++ sklearn/svm/_classes.py | 35 +++++++++ sklearn/tests/test_docstring_parameters.py | 52 +++++-------- sklearn/tree/_classes.py | 20 +++++ 81 files changed, 809 insertions(+), 73 deletions(-) diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 67c1fb42b650b..cf0da5c5bc0f3 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -312,6 +312,8 @@ class AffinityPropagation(ClusterMixin, BaseEstimator): n_features_in_ : int Number of features seen during :term:`fit`. + .. versionadded:: 0.24 + Notes ----- For an example, see :ref:`examples/cluster/plot_affinity_propagation.py diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 05f57ff238bcf..a1adb8492ab89 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -776,6 +776,8 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): n_features_in_ : int Number of features seen during :term:`fit`. + .. versionadded:: 0.24 + children_ : array-like of shape (n_samples-1, 2) The children of each non-leaf node. Values less than `n_samples` correspond to leaves of the tree which are the original samples. @@ -1045,6 +1047,8 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): n_features_in_ : int Number of features seen during :term:`fit`. + .. versionadded:: 0.24 + children_ : array-like of shape (n_nodes-1, 2) The children of each non-leaf node. Values less than `n_features` correspond to leaves of the tree which are the original samples. diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index 1be7dd4e64186..939f044002f2d 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -261,6 +261,8 @@ class SpectralCoclustering(BaseSpectral): n_features_in_ : int Number of features seen during :term:`fit`. + .. 
versionadded:: 0.24 + Examples -------- >>> from sklearn.cluster import SpectralCoclustering @@ -401,6 +403,8 @@ class SpectralBiclustering(BaseSpectral): n_features_in_ : int Number of features seen during :term:`fit`. + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.cluster import SpectralBiclustering diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 81c9312f1488a..fc4bfdcfc902d 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -404,6 +404,8 @@ class Birch(ClusterMixin, TransformerMixin, BaseEstimator): n_features_in_ : int Number of features seen during :term:`fit`. + .. versionadded:: 0.24 + See Also -------- MiniBatchKMeans : Alternative implementation that does incremental updates diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index abbb35e6e04af..e862ee1080ace 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -220,6 +220,8 @@ class DBSCAN(ClusterMixin, BaseEstimator): n_features_in_ : int Number of features seen during :term:`fit`. + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.cluster import DBSCAN diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index fc9ba7a868d10..6b54ec99ae825 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -769,6 +769,8 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): n_features_in_ : int Number of features seen during :term:`fit`. + .. versionadded:: 0.24 + See Also -------- MiniBatchKMeans : Alternative online implementation that does incremental @@ -1471,6 +1473,8 @@ class MiniBatchKMeans(KMeans): n_features_in_ : int Number of features seen during :term:`fit`. + .. versionadded:: 0.24 + See Also -------- KMeans : The classic implementation of the clustering method based on the diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index 147ec6c626eb0..619d52cb7313b 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -314,6 +314,8 @@ class MeanShift(ClusterMixin, BaseEstimator): n_features_in_ : int Number of features seen during :term:`fit`. + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.cluster import MeanShift diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index 0f2b96346660b..1d04ea7a3214f 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -181,6 +181,8 @@ class OPTICS(ClusterMixin, BaseEstimator): n_features_in_ : int Number of features seen during :term:`fit`. + .. versionadded:: 0.24 + See Also -------- DBSCAN : A similar clustering for a specified neighborhood radius (eps). diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index de0192987f595..8cdbd859fde02 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -421,6 +421,8 @@ class SpectralClustering(ClusterMixin, BaseEstimator): n_features_in_ : int Number of features seen during :term:`fit`. + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.cluster import SpectralClustering diff --git a/sklearn/covariance/_elliptic_envelope.py b/sklearn/covariance/_elliptic_envelope.py index ad7904dc7831a..3e0c6a41d5913 100644 --- a/sklearn/covariance/_elliptic_envelope.py +++ b/sklearn/covariance/_elliptic_envelope.py @@ -83,6 +83,11 @@ class EllipticEnvelope(OutlierMixin, MinCovDet): Mahalanobis distances of the training set (on which :meth:`fit` is called) observations. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/covariance/_empirical_covariance.py b/sklearn/covariance/_empirical_covariance.py index 02bddd0f50330..9c3d94c863c72 100644 --- a/sklearn/covariance/_empirical_covariance.py +++ b/sklearn/covariance/_empirical_covariance.py @@ -125,6 +125,11 @@ class EmpiricalCovariance(BaseEstimator): Estimated pseudo-inverse matrix. (stored only if store_precision is True) + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/covariance/_graph_lasso.py b/sklearn/covariance/_graph_lasso.py index 091d4f82e7e3e..398a8af72f3a9 100644 --- a/sklearn/covariance/_graph_lasso.py +++ b/sklearn/covariance/_graph_lasso.py @@ -339,6 +339,11 @@ class GraphicalLasso(EmpiricalCovariance): n_iter_ : int Number of iterations run. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np @@ -635,6 +640,11 @@ class GraphicalLassoCV(GraphicalLasso): n_iter_ : int Number of iterations run for the optimal alpha. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/covariance/_robust_covariance.py b/sklearn/covariance/_robust_covariance.py index 337ba23f19059..2323d14d3359a 100644 --- a/sklearn/covariance/_robust_covariance.py +++ b/sklearn/covariance/_robust_covariance.py @@ -582,6 +582,11 @@ class MinCovDet(EmpiricalCovariance): Mahalanobis distances of the training set (on which :meth:`fit` is called) observations. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/covariance/_shrunk_covariance.py b/sklearn/covariance/_shrunk_covariance.py index 5fe590b33a1db..a4dea261f2a45 100644 --- a/sklearn/covariance/_shrunk_covariance.py +++ b/sklearn/covariance/_shrunk_covariance.py @@ -91,6 +91,11 @@ class ShrunkCovariance(EmpiricalCovariance): Estimated pseudo inverse matrix. (stored only if store_precision is True) + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np @@ -356,6 +361,11 @@ class LedoitWolf(EmpiricalCovariance): Coefficient in the convex combination used for the computation of the shrunk estimate. Range is [0, 1]. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np @@ -533,6 +543,11 @@ class OAS(EmpiricalCovariance): coefficient in the convex combination used for the computation of the shrunk estimate. Range is [0, 1]. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 80b64570b3401..030ac06b454b1 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -1259,6 +1259,11 @@ class DictionaryLearning(_BaseSparseCoding, BaseEstimator): error_ : array vector of errors at each iteration + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Number of iterations run. 
@@ -1492,6 +1497,11 @@ class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): `A` `(n_components, n_components)` is the dictionary covariance matrix. `B` `(n_features, n_components)` is the data approximation matrix. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Number of iterations run. diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index 830e81e9268d5..f3167ff225584 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -120,6 +120,11 @@ class FactorAnalysis(TransformerMixin, BaseEstimator): mean_ : ndarray of shape (n_features,) Per-feature empirical mean, estimated from the training set. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_digits diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 6c374e6e420f8..5faf1985d3fc9 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -362,6 +362,11 @@ def my_g(x): mean_ : ndarray of shape(n_features,) The mean over features. Only set if `self.whiten` is True. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int If the algorithm is "deflation", n_iter is the maximum number of iterations run across all components. Else diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index 486d4a22d8cdb..b1221d69cf914 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -107,6 +107,11 @@ class IncrementalPCA(_BasePCA): batch_size_ : int Inferred batch size from ``batch_size``. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_digits diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 1e1cdb1722029..70a12f5cb2e38 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -164,6 +164,11 @@ class KernelPCA(TransformerMixin, BaseEstimator): The data used to fit the model. If `copy_X=False`, then `X_fit_` is a reference. This attribute is used for the calls to transform. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_digits diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 75b123a118338..3739a66a871e3 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -248,6 +248,11 @@ class LatentDirichletAllocation(TransformerMixin, BaseEstimator): n_batch_iter_ : int Number of iterations of the EM step. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Number of passes over the dataset. diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index c8239147eb6c4..39d38af4c5f5a 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -1179,6 +1179,11 @@ class NMF(TransformerMixin, BaseEstimator): n_iter_ : int Actual number of iterations. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 765320ccdb5a8..afeedeba28edb 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -264,6 +264,11 @@ class PCA(_BasePCA): Equal to the average of (min(n_features, n_samples) - n_components) smallest eigenvalues of the covariance matrix of X. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + See Also -------- KernelPCA : Kernel Principal Component Analysis. diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index 7f280db3a3af6..19ff950228f62 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -88,6 +88,11 @@ class SparsePCA(TransformerMixin, BaseEstimator): Per-feature empirical mean, estimated from the training set. Equal to ``X.mean(axis=0)``. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np @@ -279,6 +284,11 @@ class MiniBatchSparsePCA(SparsePCA): Per-feature empirical mean, estimated from the training set. Equal to ``X.mean(axis=0)``. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 7aa36c59da00e..677c6f1f36fb7 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -84,6 +84,11 @@ class TruncatedSVD(TransformerMixin, BaseEstimator): The singular values are equal to the 2-norms of the ``n_components`` variables in the lower-dimensional space. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.decomposition import TruncatedSVD diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 4d94b19574f53..3cb6cc1712f29 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -278,6 +278,11 @@ class LinearDiscriminantAnalysis(LinearClassifierMixin, classes_ : array-like of shape (n_classes,) Unique class labels. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + See Also -------- QuadraticDiscriminantAnalysis : Quadratic Discriminant Analysis. @@ -732,6 +737,11 @@ class QuadraticDiscriminantAnalysis(ClassifierMixin, BaseEstimator): classes_ : ndarray of shape (n_classes,) Unique class labels. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 575b38aa7d2a8..d78336730fc99 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -75,6 +75,11 @@ class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator): n_outputs_ : int Number of outputs. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + sparse_output_ : bool True if the array returned from predict is to be in sparse CSC format. Is automatically set to True if the input y is passed in sparse format. @@ -425,6 +430,11 @@ class DummyRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): Mean or median or quantile of the training targets or constant value given by the user. 
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
     n_outputs_ : int
         Number of outputs.

diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py
index a4be68ba5e2d6..d63c42d8f5539 100644
--- a/sklearn/ensemble/_bagging.py
+++ b/sklearn/ensemble/_bagging.py
@@ -537,6 +537,11 @@ class BaggingClassifier(ClassifierMixin, BaseBagging):
             Attribute `n_features_` was deprecated in version 1.0 and will be
             removed in 1.2. Use `n_features_in_` instead.

+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
     estimators_ : list of estimators
         The collection of fitted base estimators.

@@ -928,6 +933,11 @@ class BaggingRegressor(RegressorMixin, BaseBagging):
             Attribute `n_features_` was deprecated in version 1.0 and will be
             removed in 1.2. Use `n_features_in_` instead.

+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
     estimators_ : list of estimators
         The collection of fitted sub-estimators.

diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py
index bc29c0362bb3e..ef2de299c27ea 100644
--- a/sklearn/ensemble/_forest.py
+++ b/sklearn/ensemble/_forest.py
@@ -1199,6 +1199,11 @@ class labels (multi-output problem).
             Attribute `n_features_` was deprecated in version 1.0 and will be
             removed in 1.2. Use `n_features_in_` instead.

+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
     n_outputs_ : int
         The number of outputs when ``fit`` is performed.

@@ -1516,6 +1521,11 @@ class RandomForestRegressor(ForestRegressor):
             Attribute `n_features_` was deprecated in version 1.0 and will be
             removed in 1.2. Use `n_features_in_` instead.

+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
     n_outputs_ : int
         The number of outputs when ``fit`` is performed.

@@ -1841,6 +1851,11 @@ class labels (multi-output problem).
             Attribute `n_features_` was deprecated in version 1.0 and will be
             removed in 1.2. Use `n_features_in_` instead.

+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
     n_outputs_ : int
         The number of outputs when ``fit`` is performed.

@@ -2140,6 +2155,11 @@ class ExtraTreesRegressor(ForestRegressor):
             Attribute `n_features_` was deprecated in version 1.0 and will be
             removed in 1.2. Use `n_features_in_` instead.

+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
     n_outputs_ : int
         The number of outputs.

@@ -2368,6 +2388,11 @@ class RandomTreesEmbedding(BaseForest):
             Attribute `n_features_` was deprecated in version 1.0 and will be
             removed in 1.2. Use `n_features_in_` instead.

+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
     n_outputs_ : int
         The number of outputs when ``fit`` is performed.

diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py
index 78fee588ecf4e..496757ee9d605 100644
--- a/sklearn/ensemble/_gb.py
+++ b/sklearn/ensemble/_gb.py
@@ -1052,6 +1052,11 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting):
             Attribute `n_features_` was deprecated in version 1.0 and will be
             removed in 1.2. Use `n_features_in_` instead.

+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
     n_classes_ : int
         The number of classes.
@@ -1604,6 +1609,11 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): Attribute `n_features_` was deprecated in version 1.0 and will be removed in 1.2. Use `n_features_in_` instead. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + max_features_ : int The inferred value of max_features. diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 99eb0d265b100..b33b0652ca5be 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -1030,6 +1030,10 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): is_categorical_ : ndarray, shape (n_features, ) or None Boolean mask for the categorical features. ``None`` if there are no categorical features. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 Examples -------- @@ -1288,6 +1292,10 @@ class HistGradientBoostingClassifier(ClassifierMixin, is_categorical_ : ndarray, shape (n_features, ) or None Boolean mask for the categorical features. ``None`` if there are no categorical features. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 Examples -------- diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 3d2ac0928bd3f..fb8614ae0528e 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -147,6 +147,11 @@ class IsolationForest(OutlierMixin, BaseBagging): Attribute `n_features_` was deprecated in version 1.0 and will be removed in 1.2. Use `n_features_in_` instead. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Notes ----- The implementation is based on an ensemble of ExtraTreeRegressor. The diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index 1b6689b50fafc..7d146e428a50b 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -359,6 +359,11 @@ class AdaBoostClassifier(ClassifierMixin, BaseWeightBoosting): high cardinality features (many unique values). See :func:`sklearn.inspection.permutation_importance` as an alternative. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + See Also -------- AdaBoostRegressor : An AdaBoost regressor that begins by fitting a @@ -935,6 +940,11 @@ class AdaBoostRegressor(RegressorMixin, BaseWeightBoosting): high cardinality features (many unique values). See :func:`sklearn.inspection.permutation_importance` as an alternative. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.ensemble import AdaBoostRegressor diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 989288dbb4ec7..f74ca0e0ac2e2 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -453,6 +453,11 @@ class SelectPercentile(_BaseFilter): pvalues_ : array-like of shape (n_features,) p-values of feature scores, None if `score_func` returned only scores. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_digits @@ -539,6 +544,11 @@ class SelectKBest(_BaseFilter): pvalues_ : array-like of shape (n_features,) p-values of feature scores, None if `score_func` returned only scores. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_digits @@ -624,6 +634,11 @@ class SelectFpr(_BaseFilter): pvalues_ : array-like of shape (n_features,) p-values of feature scores. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_breast_cancer @@ -698,6 +713,11 @@ class SelectFdr(_BaseFilter): pvalues_ : array-like of shape (n_features,) p-values of feature scores. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + References ---------- https://en.wikipedia.org/wiki/False_discovery_rate @@ -768,6 +788,11 @@ class SelectFwe(_BaseFilter): pvalues_ : array-like of shape (n_features,) p-values of feature scores. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + See Also -------- f_classif : ANOVA F-value between label/feature for classification tasks. @@ -823,6 +848,11 @@ class GenericUnivariateSelect(_BaseFilter): pvalues_ : array-like of shape (n_features,) p-values of feature scores, None if `score_func` returned scores only. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_breast_cancer diff --git a/sklearn/feature_selection/_variance_threshold.py b/sklearn/feature_selection/_variance_threshold.py index 39892876a6478..aabbc44ab8fc8 100644 --- a/sklearn/feature_selection/_variance_threshold.py +++ b/sklearn/feature_selection/_variance_threshold.py @@ -28,6 +28,11 @@ class VarianceThreshold(SelectorMixin, BaseEstimator): variances_ : array, shape (n_features,) Variances of individual features. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Notes ----- Allows NaN in the input. diff --git a/sklearn/gaussian_process/_gpc.py b/sklearn/gaussian_process/_gpc.py index d2b418b131c2f..491c33b9621e8 100644 --- a/sklearn/gaussian_process/_gpc.py +++ b/sklearn/gaussian_process/_gpc.py @@ -576,6 +576,11 @@ def optimizer(obj_func, initial_theta, bounds): n_classes_ : int The number of classes in the training data + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_iris diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index ae9e5c403fcf2..4583e013d06df 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -137,6 +137,11 @@ def optimizer(obj_func, initial_theta, bounds): log_marginal_likelihood_value_ : float The log-marginal-likelihood of ``self.kernel_.theta`` + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import make_friedman2 diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 85303f29c93e9..396b3b95234dc 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -187,6 +187,11 @@ class SimpleImputer(_BaseImputer): Indicator used to add binary indicators for missing values. ``None`` if add_indicator is False. 
+ n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + See Also -------- IterativeImputer : Multivariate imputation of missing values. @@ -604,6 +609,11 @@ class MissingIndicator(TransformerMixin, BaseEstimator): They are computed during ``fit``. For ``features='all'``, it is to ``range(n_features)``. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index f5688fa96d238..3832bd9d35aa0 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -164,6 +164,11 @@ class IterativeImputer(_BaseImputer): Number of iteration rounds that occurred. Will be less than ``self.max_iter`` if early stopping criterion was reached. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_features_with_missing_ : int Number of features with missing values. diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index b9cfe0e1a60a0..f32232512dcde 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -76,6 +76,11 @@ class KNNImputer(_BaseImputer): Indicator used to add binary indicators for missing values. ``None`` if add_indicator is False. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + References ---------- * Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index e7020dea0e970..d6d67fe85e941 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -77,6 +77,11 @@ class PolynomialCountSketch(BaseEstimator, TransformerMixin): Array with random entries in {+1, -1}, used to represent the 2-wise independent hash functions for Count Sketch computation. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.kernel_approximation import PolynomialCountSketch @@ -228,6 +233,10 @@ class RBFSampler(TransformerMixin, BaseEstimator): Random projection directions drawn from the Fourier transform of the RBF kernel. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 Examples -------- @@ -340,6 +349,11 @@ class SkewedChi2Sampler(TransformerMixin, BaseEstimator): Bias term, which will be added to the data. It is uniformly distributed between 0 and 2*pi. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.kernel_approximation import SkewedChi2Sampler @@ -462,6 +476,11 @@ class AdditiveChi2Sampler(TransformerMixin, BaseEstimator): Stored sampling interval. Specified as a parameter if sample_steps not in {1,2,3}. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_digits @@ -690,6 +709,11 @@ class Nystroem(TransformerMixin, BaseEstimator): Normalization matrix needed for embedding. Square root of the kernel matrix on ``components_``. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + Examples -------- >>> from sklearn import datasets, svm diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index e562c22daed2f..2bb0b83763625 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -89,6 +89,11 @@ class KernelRidge(MultiOutputMixin, RegressorMixin, BaseEstimator): kernel == "precomputed" this is instead the precomputed training matrix, of shape (n_samples, n_samples). + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + References ---------- * Kevin P. Murphy diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 09eeced4f3a09..3a55e3b0090c5 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -560,6 +560,11 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel): Independent term in the linear model. Set to 0.0 if `fit_intercept = False`. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + See Also -------- Ridge : Ridge regression addresses some of the diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py index 1d25ac20aa34e..aabd3d2e0f5a2 100644 --- a/sklearn/linear_model/_bayes.py +++ b/sklearn/linear_model/_bayes.py @@ -131,6 +131,11 @@ class BayesianRidge(RegressorMixin, LinearModel): If `normalize=True`, parameter used to scale data to a unit standard deviation. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn import linear_model @@ -491,6 +496,11 @@ class ARDRegression(RegressorMixin, LinearModel): If `normalize=True`, parameter used to scale data to a unit standard deviation. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn import linear_model diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index da50a3a817a38..99517ff6e5bbf 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -666,6 +666,11 @@ class ElasticNet(MultiOutputMixin, RegressorMixin, LinearModel): Given param alpha, the dual gaps at the end of the optimization, same shape as each observation of y. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.linear_model import ElasticNet @@ -993,6 +998,11 @@ class Lasso(ElasticNet): Number of iterations run by the coordinate descent solver to reach the specified tolerance. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn import linear_model @@ -1482,6 +1492,11 @@ class LassoCV(RegressorMixin, LinearModelCV): Number of iterations run by the coordinate descent solver to reach the specified tolerance for the optimal alpha. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.linear_model import LassoCV @@ -1667,6 +1682,11 @@ class ElasticNetCV(RegressorMixin, LinearModelCV): Number of iterations run by the coordinate descent solver to reach the specified tolerance for the optimal alpha. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + Examples -------- >>> from sklearn.linear_model import ElasticNetCV @@ -1848,6 +1868,11 @@ class MultiTaskElasticNet(Lasso): (n_tasks, n_features) Sparse representation of the `coef_`. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn import linear_model @@ -2049,6 +2074,11 @@ class MultiTaskLasso(MultiTaskElasticNet): (n_tasks, n_features) Sparse representation of the `coef_`. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn import linear_model @@ -2228,6 +2258,11 @@ class MultiTaskElasticNetCV(RegressorMixin, LinearModelCV): dual_gap_ : float The dual gap at the end of the optimization for the optimal alpha. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn import linear_model @@ -2407,6 +2442,11 @@ class MultiTaskLassoCV(RegressorMixin, LinearModelCV): dual_gap_ : float The dual gap at the end of the optimization for the optimal alpha. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.linear_model import MultiTaskLassoCV diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 7d98f7734b322..5da65c77cf2f4 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -433,6 +433,11 @@ class PoissonRegressor(GeneralizedLinearRegressor): intercept_ : float Intercept (a.k.a. bias) added to linear predictor. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Actual number of iterations used in the solver. @@ -517,6 +522,11 @@ class GammaRegressor(GeneralizedLinearRegressor): intercept_ : float Intercept (a.k.a. bias) added to linear predictor. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Actual number of iterations used in the solver. @@ -633,6 +643,11 @@ class TweedieRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in the solver. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples ---------- >>> from sklearn import linear_model diff --git a/sklearn/linear_model/_huber.py b/sklearn/linear_model/_huber.py index a8ae066d9ff63..93cdb4ae8b5dc 100644 --- a/sklearn/linear_model/_huber.py +++ b/sklearn/linear_model/_huber.py @@ -179,6 +179,11 @@ class HuberRegressor(LinearModel, RegressorMixin, BaseEstimator): scale_ : float The value by which ``|y - X'w - c|`` is scaled down. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Number of iterations that ``scipy.optimize.minimize(method="L-BFGS-B")`` has run for. diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 3485344b99e02..a1fe31557cbe6 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -899,6 +899,11 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): The number of iterations taken by lars_path to find the grid of alphas for each target. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + Examples -------- >>> from sklearn import linear_model @@ -1157,6 +1162,11 @@ class LassoLars(Lars): The number of iterations taken by lars_path to find the grid of alphas for each target. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn import linear_model @@ -1420,6 +1430,11 @@ class LarsCV(Lars): n_iter_ : array-like or int the number of iterations run by Lars with the optimal alpha. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.linear_model import LarsCV @@ -1654,6 +1669,11 @@ class LassoLarsCV(LarsCV): active_ : list of int Indices of active variables at the end of the path. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.linear_model import LassoLarsCV @@ -1799,6 +1819,10 @@ class LassoLarsIC(LassoLars): chosen. This value is larger by a factor of ``n_samples`` compared to Eqns. 2.15 and 2.16 in (Zou et al, 2007). + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 Examples -------- diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index abca6bb30e71f..c4876486e16de 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -1212,6 +1212,11 @@ class LogisticRegression(LinearClassifierMixin, corresponds to outcome 1 (True) and `-intercept_` corresponds to outcome 0 (False). + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : ndarray of shape (n_classes,) or (1, ) Actual number of iterations for all classes. If binary or multinomial, it returns only 1 element. For liblinear solver, only the maximum @@ -1764,6 +1769,10 @@ class LogisticRegressionCV(LogisticRegression, If ``penalty='elasticnet'``, the shape is ``(n_classes, n_folds, n_cs, n_l1_ratios)`` or ``(1, n_folds, n_cs, n_l1_ratios)``. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 Examples -------- diff --git a/sklearn/linear_model/_omp.py b/sklearn/linear_model/_omp.py index c362fd4d73469..d61f8ba82a20c 100644 --- a/sklearn/linear_model/_omp.py +++ b/sklearn/linear_model/_omp.py @@ -592,6 +592,11 @@ class OrthogonalMatchingPursuit(MultiOutputMixin, RegressorMixin, LinearModel): `n_nonzero_coefs` is None and `tol` is None this value is either set to 10% of `n_features` or 1, whichever is greater. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.linear_model import OrthogonalMatchingPursuit @@ -835,6 +840,11 @@ class OrthogonalMatchingPursuitCV(RegressorMixin, LinearModel): Number of active features across every target for the model refit with the best hyperparameters got by cross-validating across all folds. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + Examples -------- >>> from sklearn.linear_model import OrthogonalMatchingPursuitCV diff --git a/sklearn/linear_model/_passive_aggressive.py b/sklearn/linear_model/_passive_aggressive.py index 678061be3c691..3a0a82debcc7b 100644 --- a/sklearn/linear_model/_passive_aggressive.py +++ b/sklearn/linear_model/_passive_aggressive.py @@ -120,6 +120,11 @@ class PassiveAggressiveClassifier(BaseSGDClassifier): intercept_ : array, shape = [1] if n_classes == 2 else [n_classes] Constants in decision function. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int The actual number of iterations to reach the stopping criterion. For multiclass fits, it is the maximum over every binary fit. @@ -354,6 +359,11 @@ class PassiveAggressiveRegressor(BaseSGDRegressor): intercept_ : array, shape = [1] if n_classes == 2 else [n_classes] Constants in decision function. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int The actual number of iterations to reach the stopping criterion. diff --git a/sklearn/linear_model/_perceptron.py b/sklearn/linear_model/_perceptron.py index b2bb145b904c8..632996cd00c48 100644 --- a/sklearn/linear_model/_perceptron.py +++ b/sklearn/linear_model/_perceptron.py @@ -117,6 +117,11 @@ class Perceptron(BaseSGDClassifier): The function that determines the loss, or difference between the output of the algorithm and the target values. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int The actual number of iterations to reach the stopping criterion. For multiclass fits, it is the maximum over every binary fit. diff --git a/sklearn/linear_model/_quantile.py b/sklearn/linear_model/_quantile.py index bf8fea4552c9d..a39f48a804ffc 100644 --- a/sklearn/linear_model/_quantile.py +++ b/sklearn/linear_model/_quantile.py @@ -59,6 +59,11 @@ class QuantileRegressor(LinearModel, RegressorMixin, BaseEstimator): intercept_ : float The intercept of the model, aka bias term. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int The actual number of iterations performed by the solver. diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index f53785cfe0ced..5ee5b1e2fa502 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -192,6 +192,11 @@ class RANSACRegressor(MetaEstimatorMixin, RegressorMixin, .. versionadded:: 0.19 + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.linear_model import RANSACRegressor diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 433e0c4313efc..d82aca05fee7c 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -717,6 +717,11 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): .. versionadded:: 0.17 + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + See Also -------- RidgeClassifier : Ridge classifier. @@ -877,6 +882,11 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): classes_ : ndarray of shape (n_classes,) The classes labels. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + See Also -------- Ridge : Ridge regression. @@ -1793,6 +1803,11 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): .. 
versionadded:: 0.23 + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_diabetes @@ -1908,6 +1923,11 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): classes_ : ndarray of shape (n_classes,) The classes labels. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_breast_cancer diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 78565178706a8..eb84c06ac93b3 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -934,6 +934,11 @@ class SGDClassifier(BaseSGDClassifier): Number of weight updates performed during training. Same as ``(n_iter_ * n_samples)``. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + See Also -------- sklearn.svm.LinearSVC : Linear support vector classification. @@ -1538,6 +1543,11 @@ class SGDRegressor(BaseSGDRegressor): Number of weight updates performed during training. Same as ``(n_iter_ * n_samples)``. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np @@ -1693,6 +1703,11 @@ class SGDOneClassSVM(BaseSGD, OutlierMixin): loss_function_ : concrete ``LossFunction`` + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/linear_model/_theil_sen.py b/sklearn/linear_model/_theil_sen.py index 4c75613c28a9b..c14b6979ef4d9 100644 --- a/sklearn/linear_model/_theil_sen.py +++ b/sklearn/linear_model/_theil_sen.py @@ -272,6 +272,11 @@ class TheilSenRegressor(RegressorMixin, LinearModel): Number of combinations taken into account from 'n choose k', where n is the number of samples and k is the number of subsamples. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.linear_model import TheilSenRegressor diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index 63be19c1c287d..4cf3b1885d2d0 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -104,6 +104,11 @@ class Isomap(TransformerMixin, BaseEstimator): dist_matrix_ : array-like, shape (n_samples, n_samples) Stores the geodesic distance matrix of training data. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_digits diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index 0fcd5f543c4d0..17e829270f1a7 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -603,6 +603,11 @@ class LocallyLinearEmbedding(TransformerMixin, reconstruction_error_ : float Reconstruction error associated with `embedding_` + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + nbrs_ : NearestNeighbors object Stores nearest neighbors instance, including BallTree or KDtree if applicable. 
diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index d92ab67767fa3..f833f24f981a3 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -343,6 +343,11 @@ class MDS(BaseEstimator): - or constructs a dissimilarity matrix from data using Euclidean distances. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int The number of iterations corresponding to the best stress. diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index 49e64401b6c00..01bdf06b92ed0 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -440,6 +440,11 @@ class SpectralEmbedding(BaseEstimator): affinity_matrix_ : ndarray of shape (n_samples, n_samples) Affinity_matrix constructed from samples or precomputed. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_neighbors_ : int Number of nearest neighbors effectively used. diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index 8e42d48f4ef07..7142909ae292c 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -627,6 +627,11 @@ class TSNE(BaseEstimator): kl_divergence_ : float Kullback-Leibler divergence after optimization. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Number of iterations run. diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 2f5c465d6cf41..81c70945b894e 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -560,6 +560,13 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): This is present only if ``refit`` is not False. + multimetric_ : bool + Whether or not the scorers compute several metrics. + + classes_ : ndarray of shape (n_classes,) + The classes labels. This is present only if ``refit`` is specified and + the underlying estimator is a classifier. + See Also -------- :class:`HalvingRandomSearchCV`: @@ -850,6 +857,13 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): This is present only if ``refit`` is not False. + multimetric_ : bool + Whether or not the scorers compute several metrics. + + classes_ : ndarray of shape (n_classes,) + The classes labels. This is present only if ``refit`` is specified and + the underlying estimator is a classifier. + See Also -------- :class:`HalvingGridSearchCV`: diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 7e936ac3a0c8e..7c46a771a2fd4 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -150,10 +150,15 @@ class GaussianNB(_BaseNB): probability of each class. classes_ : ndarray of shape (n_classes,) - class labels known to the classifier + class labels known to the classifier. epsilon_ : float - absolute additive value to variances + absolute additive value to variances. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 sigma_ : ndarray of shape (n_classes, n_features) Variance of each feature per class. @@ -168,7 +173,7 @@ class labels known to the classifier .. versionadded:: 1.0 theta_ : ndarray of shape (n_classes, n_features) - mean of each feature per class + mean of each feature per class. 
Examples -------- @@ -767,6 +772,11 @@ class MultinomialNB(_BaseDiscreteNB): Attribute `n_features_` was deprecated in version 1.0 and will be removed in 1.2. Use `n_features_in_` instead. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np @@ -896,6 +906,11 @@ class ComplementNB(_BaseDiscreteNB): Attribute `n_features_` was deprecated in version 1.0 and will be removed in 1.2. Use `n_features_in_` instead. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np @@ -1016,6 +1031,11 @@ class BernoulliNB(_BaseDiscreteNB): Attribute `n_features_` was deprecated in version 1.0 and will be removed in 1.2. Use `n_features_in_` instead. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np @@ -1157,6 +1177,11 @@ class CategoricalNB(_BaseDiscreteNB): Attribute `n_features_` was deprecated in version 1.0 and will be removed in 1.2. Use `n_features_in_` instead. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_categories_ : ndarray of shape (n_features,), dtype=np.int64 Number of categories for each feature. This value is inferred from the data or set by the minimum number of categories. diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 1fd1fb01c9762..76dd3db7444ab 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -101,6 +101,11 @@ class KNeighborsClassifier(KNeighborsMixin, `p` parameter value if the `effective_metric_` attribute is set to 'minkowski'. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_samples_fit_ : int Number of samples in the fitted data. @@ -365,6 +370,11 @@ class RadiusNeighborsClassifier(RadiusNeighborsMixin, `p` parameter value if the `effective_metric_` attribute is set to 'minkowski'. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_samples_fit_ : int Number of samples in the fitted data. diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index 7676d42d62c18..247aef31ba2f7 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -286,6 +286,11 @@ class KNeighborsTransformer(KNeighborsMixin, `p` parameter value if the `effective_metric_` attribute is set to 'minkowski'. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_samples_fit_ : int Number of samples in the fitted data. @@ -468,6 +473,11 @@ class RadiusNeighborsTransformer(RadiusNeighborsMixin, `p` parameter value if the `effective_metric_` attribute is set to 'minkowski'. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_samples_fit_ : int Number of samples in the fitted data. diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 816b023e0f23e..1ebd713b16e69 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -71,6 +71,11 @@ class KernelDensity(BaseEstimator): Attributes ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + tree_ : ``BinaryTree`` instance The tree algorithm for fast generalized N-point problems. 
diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py index 941b9de781f9a..7b87076516687 100644 --- a/sklearn/neighbors/_lof.py +++ b/sklearn/neighbors/_lof.py @@ -157,6 +157,11 @@ class LocalOutlierFactor(KNeighborsMixin, effective_metric_params_ : dict The effective additional keyword arguments for the metric function. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_samples_fit_ : int It is the number of samples in the fitted data. diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index 5951b66ea7dbf..a3701a28909e8 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -121,6 +121,11 @@ class NeighborhoodComponentsAnalysis(TransformerMixin, BaseEstimator): components_ : ndarray of shape (n_components, n_features) The linear transformation learned during fitting. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Counts the number of iterations performed by the optimizer. diff --git a/sklearn/neighbors/_nearest_centroid.py b/sklearn/neighbors/_nearest_centroid.py index c5f6a612b0395..4908465d7fafd 100644 --- a/sklearn/neighbors/_nearest_centroid.py +++ b/sklearn/neighbors/_nearest_centroid.py @@ -55,6 +55,11 @@ class NearestCentroid(ClassifierMixin, BaseEstimator): classes_ : array of shape (n_classes,) The unique classes labels. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.neighbors import NearestCentroid diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index be60abcc64cb5..64a4e3df8fcae 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -106,6 +106,11 @@ class KNeighborsRegressor(KNeighborsMixin, `p` parameter value if the `effective_metric_` attribute is set to 'minkowski'. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_samples_fit_ : int Number of samples in the fitted data. @@ -313,6 +318,11 @@ class RadiusNeighborsRegressor(RadiusNeighborsMixin, `p` parameter value if the `effective_metric_` attribute is set to 'minkowski'. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_samples_fit_ : int Number of samples in the fitted data. diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index 0f14c56e8bac2..df452ff4ff1fa 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -72,6 +72,11 @@ class NearestNeighbors(KNeighborsMixin, effective_metric_params_ : dict Parameters for the metric used to compute distances to neighbors. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_samples_fit_ : int Number of samples in the fitted data. diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index 72120ad369275..e6c1ba340a7b3 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -886,6 +886,11 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): The ith element in the list represents the bias vector corresponding to layer i + 1. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int The number of iterations the solver has run. 
@@ -1310,6 +1315,11 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): The ith element in the list represents the bias vector corresponding to layer i + 1. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int The number of iterations the solver has run. diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index b69a2c496a2c9..42a9eb81e30cd 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -85,6 +85,11 @@ class BernoulliRBM(TransformerMixin, BaseEstimator): where batch_size in the number of examples per minibatch and n_components is the number of hidden units. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 393693fc87d2d..82e6d5d85ec19 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -307,6 +307,11 @@ class MinMaxScaler(TransformerMixin, BaseEstimator): .. versionadded:: 0.17 *data_range_* + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_samples_seen_ : int The number of samples processed by the estimator. It will be reset on new calls to fit, but increments across @@ -657,6 +662,11 @@ class StandardScaler(TransformerMixin, BaseEstimator): The variance for each feature in the training set. Used to compute `scale_`. Equal to ``None`` when ``with_std=False``. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_samples_seen_ : int or ndarray of shape (n_features,) The number of samples processed by the estimator for each feature. If there are no missing samples, the ``n_samples_seen`` will be an @@ -990,6 +1000,11 @@ class MaxAbsScaler(TransformerMixin, BaseEstimator): max_abs_ : ndarray of shape (n_features,) Per feature maximum absolute value. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_samples_seen_ : int The number of samples processed by the estimator. Will be reset on new calls to fit, but increments across ``partial_fit`` calls. @@ -1300,6 +1315,11 @@ class RobustScaler(TransformerMixin, BaseEstimator): .. versionadded:: 0.17 *scale_* attribute. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.preprocessing import RobustScaler @@ -1701,19 +1721,12 @@ class Normalizer(TransformerMixin, BaseEstimator): copy (if the input is already a numpy array or a scipy.sparse CSR matrix). - Examples - -------- - >>> from sklearn.preprocessing import Normalizer - >>> X = [[4, 1, 2, 2], - ... [1, 3, 9, 3], - ... [5, 7, 5, 1]] - >>> transformer = Normalizer().fit(X) # fit does nothing. - >>> transformer - Normalizer() - >>> transformer.transform(X) - array([[0.8, 0.2, 0.4, 0.4], - [0.1, 0.3, 0.9, 0.3], - [0.5, 0.7, 0.5, 0.1]]) + Attributes + ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 Notes ----- @@ -1727,6 +1740,20 @@ class Normalizer(TransformerMixin, BaseEstimator): See Also -------- normalize : Equivalent function without the estimator API. + + Examples + -------- + >>> from sklearn.preprocessing import Normalizer + >>> X = [[4, 1, 2, 2], + ... [1, 3, 9, 3], + ... [5, 7, 5, 1]] + >>> transformer = Normalizer().fit(X) # fit does nothing. 
+ >>> transformer + Normalizer() + >>> transformer.transform(X) + array([[0.8, 0.2, 0.4, 0.4], + [0.1, 0.3, 0.9, 0.3], + [0.5, 0.7, 0.5, 0.1]]) """ def __init__(self, norm='l2', *, copy=True): @@ -1856,6 +1883,13 @@ class Binarizer(TransformerMixin, BaseEstimator): set to False to perform inplace binarization and avoid a copy (if the input is already a numpy array or a scipy.sparse CSR matrix). + Attributes + ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.preprocessing import Binarizer @@ -1972,6 +2006,11 @@ class KernelCenterer(TransformerMixin, BaseEstimator): K_fit_all_ : float Average of kernel matrix. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + References ---------- .. [1] `Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller. @@ -2199,6 +2238,11 @@ class QuantileTransformer(TransformerMixin, BaseEstimator): references_ : ndarray of shape (n_quantiles, ) Quantiles of references. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np @@ -2724,6 +2768,11 @@ class PowerTransformer(TransformerMixin, BaseEstimator): lambdas_ : ndarray of float of shape (n_features,) The parameters of the power transformation for the selected features. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index d7565ff2fb4b3..327c6211d66f2 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -64,13 +64,18 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): Attributes ---------- + bin_edges_ : ndarray of ndarray of shape (n_features,) + The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )`` + Ignored features will have empty arrays. + n_bins_ : ndarray of shape (n_features,), dtype=np.int_ Number of bins per feature. Bins whose width are too small (i.e., <= 1e-8) are removed with a warning. - bin_edges_ : ndarray of ndarray of shape (n_features,) - The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )`` - Ignored features will have empty arrays. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 See Also -------- diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 930e85c783711..6c520354b379d 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -53,26 +53,6 @@ class PolynomialFeatures(TransformerMixin, BaseEstimator): .. versionadded:: 0.21 - Examples - -------- - >>> import numpy as np - >>> from sklearn.preprocessing import PolynomialFeatures - >>> X = np.arange(6).reshape(3, 2) - >>> X - array([[0, 1], - [2, 3], - [4, 5]]) - >>> poly = PolynomialFeatures(2) - >>> poly.fit_transform(X) - array([[ 1., 0., 1., 0., 0., 1.], - [ 1., 2., 3., 4., 6., 9.], - [ 1., 4., 5., 16., 20., 25.]]) - >>> poly = PolynomialFeatures(interaction_only=True) - >>> poly.fit_transform(X) - array([[ 1., 0., 1., 0.], - [ 1., 2., 3., 6.], - [ 1., 4., 5., 20.]]) - Attributes ---------- powers_ : ndarray of shape (n_output_features, n_input_features) @@ -81,6 +61,11 @@ class PolynomialFeatures(TransformerMixin, BaseEstimator): n_input_features_ : int The total number of input features. 
+ n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_output_features_ : int The total number of polynomial output features. The number of output features is computed by iterating over all suitably sized combinations @@ -99,6 +84,26 @@ class PolynomialFeatures(TransformerMixin, BaseEstimator): See :ref:`examples/linear_model/plot_polynomial_interpolation.py ` + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import PolynomialFeatures + >>> X = np.arange(6).reshape(3, 2) + >>> X + array([[0, 1], + [2, 3], + [4, 5]]) + >>> poly = PolynomialFeatures(2) + >>> poly.fit_transform(X) + array([[ 1., 0., 1., 0., 0., 1.], + [ 1., 2., 3., 4., 6., 9.], + [ 1., 4., 5., 16., 20., 25.]]) + >>> poly = PolynomialFeatures(interaction_only=True) + >>> poly.fit_transform(X) + array([[ 1., 0., 1., 0.], + [ 1., 2., 3., 6.], + [ 1., 4., 5., 20.]]) """ def __init__(self, degree=2, *, interaction_only=False, include_bias=True, order='C'): diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index e89dfab9310ab..944b6b7acb149 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -350,6 +350,11 @@ class LabelPropagation(BaseLabelPropagation): transduction_ : ndarray of shape (n_samples) Label assigned to each item via the transduction. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Number of iterations run. @@ -463,6 +468,11 @@ class LabelSpreading(BaseLabelPropagation): transduction_ : ndarray of shape (n_samples,) Label assigned to each item via the transduction. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Number of iterations run. diff --git a/sklearn/semi_supervised/_self_training.py b/sklearn/semi_supervised/_self_training.py index 54fa9ba45e1b8..761909903e8b0 100644 --- a/sklearn/semi_supervised/_self_training.py +++ b/sklearn/semi_supervised/_self_training.py @@ -86,6 +86,11 @@ class SelfTrainingClassifier(MetaEstimatorMixin, BaseEstimator): When a sample has iteration -1, the sample was not labeled in any iteration. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int The number of rounds of self-training, that is the number of times the base estimator is fitted on relabeled variants of the training set. diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 050855c25c06a..8946e77ef905f 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -114,6 +114,11 @@ class LinearSVC(LinearClassifierMixin, classes_ : ndarray of shape (n_classes,) The unique classes labels. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Maximum number of iterations run across all classes. @@ -331,6 +336,11 @@ class LinearSVR(RegressorMixin, LinearModel): intercept_ : ndarray of shape (1) if n_classes == 2 else (n_classes) Constants in decision function. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Maximum number of iterations run across all classes. @@ -583,6 +593,11 @@ class SVC(BaseSVC): intercept_ : ndarray of shape (n_classes * (n_classes - 1) / 2,) Constants in decision function. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + support_ : ndarray of shape (n_SV) Indices of support vectors. @@ -803,6 +818,11 @@ class NuSVC(BaseSVC): intercept_ : ndarray of shape (n_classes * (n_classes - 1) / 2,) Constants in decision function. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + support_ : ndarray of shape (n_SV,) Indices of support vectors. @@ -981,6 +1001,11 @@ class SVR(RegressorMixin, BaseLibSVM): intercept_ : ndarray of shape (1,) Constants in decision function. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_support_ : ndarray of shape (n_classes,), dtype=int32 Number of support vectors for each class. @@ -1133,6 +1158,11 @@ class NuSVR(RegressorMixin, BaseLibSVM): intercept_ : ndarray of shape (1,) Constants in decision function. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_support_ : ndarray of shape (n_classes,), dtype=int32 Number of support vectors for each class. @@ -1281,6 +1311,11 @@ class OneClassSVM(OutlierMixin, BaseLibSVM): intercept_ : ndarray of shape (1,) Constant in the decision function. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_support_ : ndarray of shape (n_classes,), dtype=int32 Number of support vectors for each class. diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 85d8ad0cf6a36..ae9a29622c4aa 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -176,30 +176,8 @@ def _construct_searchcv_instance(SearchCV): N_FEATURES_MODULES_TO_IGNORE = { - 'compose', - 'covariance', - 'decomposition', - 'discriminant_analysis', - 'dummy', - 'ensemble', - 'feature_selection', - 'gaussian_process', - 'impute', - 'isotonic', - 'kernel_approximation', - 'kernel_ridge', - 'linear_model', - 'manifold', 'model_selection', 'multioutput', - 'naive_bayes', - 'neighbors', - 'neural_network', - 'preprocessing', - 'random_projection', - 'semi_supervised', - 'svm', - 'tree' } @@ -212,22 +190,28 @@ def test_fit_docstring_attributes(name, Estimator): doc = docscrape.ClassDoc(Estimator) attributes = doc['Attributes'] - IGNORED = {'ClassifierChain', 'ColumnTransformer', - 'CountVectorizer', 'DictVectorizer', - 'GaussianRandomProjection', - 'MultiOutputClassifier', 'MultiOutputRegressor', - 'NoSampleWeightWrapper', 'RFE', 'RFECV', - 'RegressorChain', 'SelectFromModel', - 'SparseCoder', 'SparseRandomProjection', - 'SpectralBiclustering', 'StackingClassifier', - 'StackingRegressor', 'TfidfVectorizer', 'VotingClassifier', - 'VotingRegressor', 'SequentialFeatureSelector', - 'HalvingGridSearchCV', 'HalvingRandomSearchCV'} + IGNORED = { + 'ClassifierChain', + 'CountVectorizer', 'DictVectorizer', + 'GaussianRandomProjection', + 'MultiOutputClassifier', 'MultiOutputRegressor', + 'NoSampleWeightWrapper', 'RFE', 'RFECV', + 'RegressorChain', 'SelectFromModel', + 'SparseCoder', 'SparseRandomProjection', + 'SpectralBiclustering', 'StackingClassifier', + 'StackingRegressor', 'TfidfVectorizer', 'VotingClassifier', + 'VotingRegressor', 'SequentialFeatureSelector', + } if Estimator.__name__ in IGNORED or Estimator.__name__.startswith('_'): pytest.skip("Estimator cannot be fit easily to test fit attributes") - if Estimator.__name__ in ("RandomizedSearchCV", "GridSearchCV"): + if Estimator.__name__ in ( + "HalvingRandomSearchCV", + "RandomizedSearchCV", + "HalvingGridSearchCV", + "GridSearchCV", + 
): est = _construct_searchcv_instance(Estimator) else: est = _construct_instance(Estimator) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index a79a850f3b7c7..ba5bf2873bf18 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -796,6 +796,11 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): n_features_ : int The number of features when ``fit`` is performed. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_outputs_ : int The number of outputs when ``fit`` is performed. @@ -1161,6 +1166,11 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): n_features_ : int The number of features when ``fit`` is performed. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_outputs_ : int The number of outputs when ``fit`` is performed. @@ -1477,6 +1487,11 @@ class ExtraTreeClassifier(DecisionTreeClassifier): n_features_ : int The number of features when ``fit`` is performed. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_outputs_ : int The number of outputs when ``fit`` is performed. @@ -1699,6 +1714,11 @@ class ExtraTreeRegressor(DecisionTreeRegressor): n_features_ : int The number of features when ``fit`` is performed. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + feature_importances_ : ndarray of shape (n_features,) Return impurity-based feature importances (the higher, the more important the feature). From bf380eb4a9f67a7dc80528d11f8e239680144a65 Mon Sep 17 00:00:00 2001 From: amrcode Date: Wed, 23 Sep 2020 12:23:22 -0400 Subject: [PATCH 469/478] Update _supervised.py --- sklearn/metrics/cluster/_supervised.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 7814e7ba50e1c..38119449ff487 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -735,10 +735,10 @@ def mutual_info_score(labels_true, labels_pred, *, contingency=None): Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets. + A clustering of the data into disjoint subsets (U). labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets. + A clustering of the data into disjoint subsets (V). contingency : {ndarray, sparse matrix} of shape \ (n_classes_true, n_classes_pred), default=None @@ -749,7 +749,7 @@ def mutual_info_score(labels_true, labels_pred, *, contingency=None): Returns ------- mi : float - Mutual information, a non-negative value + Mutual information, a non-negative value, measured in nats using the natural logarithm Notes ----- @@ -823,10 +823,10 @@ def adjusted_mutual_info_score(labels_true, labels_pred, *, Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets. + A clustering of the data into disjoint subsets (U). labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets. + A clustering of the data into disjoint subsets (V). average_method : str, default='arithmetic' How to compute the normalizer in the denominator. 
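The wording changes in this commit pin down the units of the mutual information scores. A small doctest-style sketch of that claim (assuming the existing ``sklearn.metrics`` implementation): for a labeling compared with itself, the mutual information equals the entropy of the labeling, which here is ln(2) nats rather than 1 bit because the natural logarithm is used.

    >>> import numpy as np
    >>> from sklearn.metrics import mutual_info_score
    >>> labels = [0, 0, 1, 1]
    >>> mi = mutual_info_score(labels, labels)  # MI of a labeling with itself is its entropy
    >>> bool(np.isclose(mi, np.log(2)))
    True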
Possible options @@ -843,7 +843,8 @@ def adjusted_mutual_info_score(labels_true, labels_pred, *, ami: float (upperlimited by 1.0) The AMI returns a value of 1 when the two partitions are identical (ie perfectly matched). Random partitions (independent labellings) have - an expected AMI around 0 on average hence can be negative. + an expected AMI around 0 on average hence can be negative. The value is + in adjusted nats (based on the natural logarithm). See Also -------- @@ -959,7 +960,8 @@ def normalized_mutual_info_score(labels_true, labels_pred, *, Returns ------- nmi : float - score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling + score between 0.0 and 1.0 in normalized nats (based on the natural + logarithm). 1.0 stands for perfectly complete labeling. See Also -------- From 7421170c067faebcb30ef55df438c5f1044f88dd Mon Sep 17 00:00:00 2001 From: amrcode Date: Tue, 20 Oct 2020 08:31:48 -0400 Subject: [PATCH 470/478] Update _supervised.py Fix lint issues in doc updates. --- sklearn/metrics/cluster/_supervised.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 38119449ff487..a4e349649a968 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -749,7 +749,8 @@ def mutual_info_score(labels_true, labels_pred, *, contingency=None): Returns ------- mi : float - Mutual information, a non-negative value, measured in nats using the natural logarithm + Mutual information, a non-negative value, measured in nats using the + natural logarithm Notes ----- @@ -960,7 +961,7 @@ def normalized_mutual_info_score(labels_true, labels_pred, *, Returns ------- nmi : float - score between 0.0 and 1.0 in normalized nats (based on the natural + score between 0.0 and 1.0 in normalized nats (based on the natural logarithm). 1.0 stands for perfectly complete labeling. See Also From 7cb0dbecd4477f8b690e333bd2c3e623f40d2828 Mon Sep 17 00:00:00 2001 From: amrcode Date: Wed, 21 Oct 2020 08:42:19 -0400 Subject: [PATCH 471/478] Update sklearn/metrics/cluster/_supervised.py to add a period in the return value doc Co-authored-by: Chiara Marmo --- sklearn/metrics/cluster/_supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index a4e349649a968..0de0d3a57657f 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -750,7 +750,7 @@ def mutual_info_score(labels_true, labels_pred, *, contingency=None): ------- mi : float Mutual information, a non-negative value, measured in nats using the - natural logarithm + natural logarithm. Notes ----- From 14df87ff74bb792507e463c2a21189957960b113 Mon Sep 17 00:00:00 2001 From: ahagen Date: Tue, 5 Jan 2021 09:02:32 -0500 Subject: [PATCH 472/478] Updates from review --- sklearn/metrics/cluster/_supervised.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 0de0d3a57657f..4eb56c1caaccb 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -735,10 +735,10 @@ def mutual_info_score(labels_true, labels_pred, *, contingency=None): Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets (U). 
+ A clustering of the data into disjoint subsets, called $U$ in the above formula. labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets (V). + A clustering of the data into disjoint subsets, called $V$ in the above formula. contingency : {ndarray, sparse matrix} of shape \ (n_classes_true, n_classes_pred), default=None @@ -824,10 +824,10 @@ def adjusted_mutual_info_score(labels_true, labels_pred, *, Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets (U). + A clustering of the data into disjoint subsets, called $U$ in the above formula. labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets (V). + A clustering of the data into disjoint subsets, called $V$ in the above formula. average_method : str, default='arithmetic' How to compute the normalizer in the denominator. Possible options @@ -961,7 +961,7 @@ def normalized_mutual_info_score(labels_true, labels_pred, *, Returns ------- nmi : float - score between 0.0 and 1.0 in normalized nats (based on the natural + Score between 0.0 and 1.0 in normalized nats (based on the natural logarithm). 1.0 stands for perfectly complete labeling. See Also From 6c1c7879f3ae94ec17a1c718d5ad7a127291fd37 Mon Sep 17 00:00:00 2001 From: ahagen Date: Tue, 5 Jan 2021 12:15:50 -0500 Subject: [PATCH 473/478] Fix line lengths --- sklearn/metrics/cluster/_supervised.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 4eb56c1caaccb..095675e691b8b 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -735,10 +735,12 @@ def mutual_info_score(labels_true, labels_pred, *, contingency=None): Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets, called $U$ in the above formula. + A clustering of the data into disjoint subsets, called $U$ in the + above formula. labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets, called $V$ in the above formula. + A clustering of the data into disjoint subsets, called $V$ in the + above formula. contingency : {ndarray, sparse matrix} of shape \ (n_classes_true, n_classes_pred), default=None @@ -824,10 +826,12 @@ def adjusted_mutual_info_score(labels_true, labels_pred, *, Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets, called $U$ in the above formula. + A clustering of the data into disjoint subsets, called $U$ in the + above formula. labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets, called $V$ in the above formula. + A clustering of the data into disjoint subsets, called $V$ in the + above formula. average_method : str, default='arithmetic' How to compute the normalizer in the denominator. 
Possible options From b6e1275397ef5a54443dadf4f80f8f8afdeaad23 Mon Sep 17 00:00:00 2001 From: amrcode Date: Tue, 2 Feb 2021 08:58:57 -0500 Subject: [PATCH 474/478] Update _supervised.py Line length adjustment --- sklearn/metrics/cluster/_supervised.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 095675e691b8b..0865f5af82e1d 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -710,8 +710,8 @@ def v_measure_score(labels_true, labels_pred, *, beta=1.0): def mutual_info_score(labels_true, labels_pred, *, contingency=None): """Mutual Information between two clusterings. - The Mutual Information is a measure of the similarity between two labels of - the same data. Where :math:`|U_i|` is the number of the samples + The Mutual Information is a measure of the similarity between two labels + of the same data. Where :math:`|U_i|` is the number of the samples in cluster :math:`U_i` and :math:`|V_j|` is the number of the samples in cluster :math:`V_j`, the Mutual Information between clusterings :math:`U` and :math:`V` is given as: From 85aa9ec4cc101db6e5a34611633b3c93d78204c9 Mon Sep 17 00:00:00 2001 From: ahagen Date: Tue, 2 Feb 2021 09:49:35 -0500 Subject: [PATCH 475/478] Remove trailing whitespace --- sklearn/metrics/cluster/_supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 0865f5af82e1d..d7695f52e522e 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -710,7 +710,7 @@ def v_measure_score(labels_true, labels_pred, *, beta=1.0): def mutual_info_score(labels_true, labels_pred, *, contingency=None): """Mutual Information between two clusterings. - The Mutual Information is a measure of the similarity between two labels + The Mutual Information is a measure of the similarity between two labels of the same data. Where :math:`|U_i|` is the number of the samples in cluster :math:`U_i` and :math:`|V_j|` is the number of the samples in cluster :math:`V_j`, the Mutual Information From e8ff40438fc212ab0f288601ab26abae9399c040 Mon Sep 17 00:00:00 2001 From: amrcode Date: Fri, 11 Jun 2021 08:14:43 -0400 Subject: [PATCH 476/478] Update math notation --- sklearn/metrics/cluster/_supervised.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index d7695f52e522e..390e4cd279cf9 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -725,21 +725,22 @@ def mutual_info_score(labels_true, labels_pred, *, contingency=None): a permutation of the class or cluster label values won't change the score value in any way. - This metric is furthermore symmetric: switching ``label_true`` with - ``label_pred`` will return the same score value. This can be useful to - measure the agreement of two independent label assignments strategies - on the same dataset when the real ground truth is not known. + This metric is furthermore symmetric: switching :math:`U` (i.e + ``label_true``) with :math:`V` (i.e. ``label_pred``) will return the + same score value. This can be useful to measure the agreement of two + independent label assignments strategies on the same dataset when the + real ground truth is not known. Read more in the :ref:`User Guide `. 
Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets, called $U$ in the + A clustering of the data into disjoint subsets, called :math:`U` in the above formula. labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets, called $V$ in the + A clustering of the data into disjoint subsets, called :math:`V` in the above formula. contingency : {ndarray, sparse matrix} of shape \ @@ -813,10 +814,10 @@ def adjusted_mutual_info_score(labels_true, labels_pred, *, a permutation of the class or cluster label values won't change the score value in any way. - This metric is furthermore symmetric: switching ``label_true`` with - ``label_pred`` will return the same score value. This can be useful to - measure the agreement of two independent label assignments strategies - on the same dataset when the real ground truth is not known. + This metric is furthermore symmetric: switching :math:`U` (``label_true``) + with :math:`V` (``labels_pred``) will return the same score value. This can + be useful to measure the agreement of two independent label assignments + strategies on the same dataset when the real ground truth is not known. Be mindful that this function is an order of magnitude slower than other metrics, such as the Adjusted Rand Index. @@ -826,11 +827,11 @@ def adjusted_mutual_info_score(labels_true, labels_pred, *, Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets, called $U$ in the + A clustering of the data into disjoint subsets, called :math:`U` in the above formula. labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets, called $V$ in the + A clustering of the data into disjoint subsets, called :math:`V` in the above formula. average_method : str, default='arithmetic' From bdc8ba9083f9dcfe4bc2eade5a50e3a01eb069f6 Mon Sep 17 00:00:00 2001 From: ahagen Date: Fri, 11 Jun 2021 08:19:49 -0400 Subject: [PATCH 477/478] Fix line lengths --- sklearn/metrics/cluster/_supervised.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 390e4cd279cf9..cf27012a3bbd7 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -736,12 +736,12 @@ def mutual_info_score(labels_true, labels_pred, *, contingency=None): Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets, called :math:`U` in the - above formula. + A clustering of the data into disjoint subsets, called :math:`U` in + the above formula. labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets, called :math:`V` in the - above formula. + A clustering of the data into disjoint subsets, called :math:`V` in + the above formula. contingency : {ndarray, sparse matrix} of shape \ (n_classes_true, n_classes_pred), default=None @@ -827,12 +827,12 @@ def adjusted_mutual_info_score(labels_true, labels_pred, *, Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets, called :math:`U` in the - above formula. + A clustering of the data into disjoint subsets, called :math:`U` in + the above formula. labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets, called :math:`V` in the - above formula. 
+ A clustering of the data into disjoint subsets, called :math:`V` in + the above formula. average_method : str, default='arithmetic' How to compute the normalizer in the denominator. Possible options From 26e2064d6739795331072ef002aee44d000d7de6 Mon Sep 17 00:00:00 2001 From: ahagen Date: Fri, 11 Jun 2021 08:59:22 -0400 Subject: [PATCH 478/478] Remove trailing whitespace --- sklearn/metrics/cluster/_supervised.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index cf27012a3bbd7..636ba3e189394 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -725,10 +725,10 @@ def mutual_info_score(labels_true, labels_pred, *, contingency=None): a permutation of the class or cluster label values won't change the score value in any way. - This metric is furthermore symmetric: switching :math:`U` (i.e - ``label_true``) with :math:`V` (i.e. ``label_pred``) will return the - same score value. This can be useful to measure the agreement of two - independent label assignments strategies on the same dataset when the + This metric is furthermore symmetric: switching :math:`U` (i.e + ``label_true``) with :math:`V` (i.e. ``label_pred``) will return the + same score value. This can be useful to measure the agreement of two + independent label assignments strategies on the same dataset when the real ground truth is not known. Read more in the :ref:`User Guide `. @@ -736,11 +736,11 @@ def mutual_info_score(labels_true, labels_pred, *, contingency=None): Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets, called :math:`U` in + A clustering of the data into disjoint subsets, called :math:`U` in the above formula. labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets, called :math:`V` in + A clustering of the data into disjoint subsets, called :math:`V` in the above formula. contingency : {ndarray, sparse matrix} of shape \ @@ -814,9 +814,9 @@ def adjusted_mutual_info_score(labels_true, labels_pred, *, a permutation of the class or cluster label values won't change the score value in any way. - This metric is furthermore symmetric: switching :math:`U` (``label_true``) - with :math:`V` (``labels_pred``) will return the same score value. This can - be useful to measure the agreement of two independent label assignments + This metric is furthermore symmetric: switching :math:`U` (``label_true``) + with :math:`V` (``labels_pred``) will return the same score value. This can + be useful to measure the agreement of two independent label assignments strategies on the same dataset when the real ground truth is not known. Be mindful that this function is an order of magnitude slower than other @@ -827,11 +827,11 @@ def adjusted_mutual_info_score(labels_true, labels_pred, *, Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets, called :math:`U` in + A clustering of the data into disjoint subsets, called :math:`U` in the above formula. labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets, called :math:`V` in + A clustering of the data into disjoint subsets, called :math:`V` in the above formula. average_method : str, default='arithmetic'
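The symmetry these docstrings now state explicitly can be checked directly. A doctest-style sketch (again assuming the existing ``sklearn.metrics`` functions), where the prediction is the same partition as the ground truth with the labels permuted:

    >>> import numpy as np
    >>> from sklearn.metrics import adjusted_mutual_info_score
    >>> labels_true = [0, 0, 1, 1, 2, 2]
    >>> labels_pred = [1, 1, 0, 0, 2, 2]  # same partition, labels permuted
    >>> bool(np.isclose(adjusted_mutual_info_score(labels_true, labels_pred),
    ...                 adjusted_mutual_info_score(labels_pred, labels_true)))
    True
    >>> bool(np.isclose(adjusted_mutual_info_score(labels_true, labels_pred), 1.0))
    True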