From eedda91913983e774fe32e11f10ffab92cf137bf Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan"
Date: Wed, 16 Dec 2020 03:58:39 -0500
Subject: [PATCH 001/478] BUG Fixes fetch_kddcup99 for return_X_y and as_frame
 (#19011)

---
 sklearn/datasets/_kddcup99.py         | 6 +++---
 sklearn/datasets/tests/test_common.py | 8 ++++++++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/sklearn/datasets/_kddcup99.py b/sklearn/datasets/_kddcup99.py
index e5c8bb2f298de..539b7ffaf862e 100644
--- a/sklearn/datasets/_kddcup99.py
+++ b/sklearn/datasets/_kddcup99.py
@@ -199,15 +199,15 @@ def fetch_kddcup99(*, subset=None, data_home=None, shuffle=False,
     with open(join(module_path, 'descr', 'kddcup99.rst')) as rst_file:
         fdescr = rst_file.read()
 
-    if return_X_y:
-        return data, target
-
     frame = None
     if as_frame:
         frame, data, target = _convert_data_dataframe(
            "fetch_kddcup99", data, target, feature_names, target_names
        )
 
+    if return_X_y:
+        return data, target
+
     return Bunch(
         data=data,
         target=target,
diff --git a/sklearn/datasets/tests/test_common.py b/sklearn/datasets/tests/test_common.py
index 073eb2023eedf..2a905b75e94eb 100644
--- a/sklearn/datasets/tests/test_common.py
+++ b/sklearn/datasets/tests/test_common.py
@@ -75,6 +75,14 @@ def check_as_frame(bunch, dataset_func,
     if expected_target_dtype is not None:
         assert np.all(frame_bunch.target.dtypes == expected_target_dtype)
 
+    # Test for return_X_y and as_frame=True
+    frame_X, frame_y = dataset_func(as_frame=True, return_X_y=True)
+    assert isinstance(frame_X, pd.DataFrame)
+    if frame_y.ndim > 1:
+        assert isinstance(frame_X, pd.DataFrame)
+    else:
+        assert isinstance(frame_y, pd.Series)
+
 
 def _skip_network_tests():
     return os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '1'

From 38b7155b11946ac2e97883424db8339ac95e3d93 Mon Sep 17 00:00:00 2001
From: Neal Fultz
Date: Wed, 16 Dec 2020 01:07:42 -0800
Subject: [PATCH 002/478] DOC fix citations for de Leeuw in IsotonicRegression
 (#18952)

Co-authored-by: Thomas J. Fan
---
 sklearn/isotonic.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py
index 35d0004aa4a73..b57ce23f8cc52 100644
--- a/sklearn/isotonic.py
+++ b/sklearn/isotonic.py
@@ -190,7 +190,7 @@ class IsotonicRegression(RegressorMixin, TransformerMixin, BaseEstimator):
 
     Notes
     -----
-    Ties are broken using the secondary method from Leeuw, 1977.
+    Ties are broken using the secondary method from de Leeuw, 1977.
References ---------- @@ -201,11 +201,11 @@ class IsotonicRegression(RegressorMixin, TransformerMixin, BaseEstimator): Isotone Optimization in R : Pool-Adjacent-Violators Algorithm (PAVA) and Active Set Methods - Leeuw, Hornik, Mair + de Leeuw, Hornik, Mair Journal of Statistical Software 2009 Correctness of Kruskal's algorithms for monotone regression with ties - Leeuw, Psychometrica, 1977 + de Leeuw, Psychometrica, 1977 Examples -------- From 6b605584a2232a68f18b2b68536457e1a2118ca3 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 16 Dec 2020 11:35:25 +0100 Subject: [PATCH 003/478] Trigger [cd build] to test #18782 From d304331b450344e4660550b15d8174b15fb616c7 Mon Sep 17 00:00:00 2001 From: RamyaNP <56212418+RamyaNP@users.noreply.github.com> Date: Wed, 16 Dec 2020 22:05:19 +0530 Subject: [PATCH 004/478] DOC fix multiclass AUC formulas in user guide (#18559) --- doc/modules/model_evaluation.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index e092137ab7982..0bc08f24bb19c 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1423,7 +1423,7 @@ uniformly: .. math:: - \frac{2}{c(c-1)}\sum_{j=1}^{c}\sum_{k > j}^c (\text{AUC}(j | k) + + \frac{1}{c(c-1)}\sum_{j=1}^{c}\sum_{k > j}^c (\text{AUC}(j | k) + \text{AUC}(k | j)) where :math:`c` is the number of classes and :math:`\text{AUC}(j | k)` is the @@ -1438,7 +1438,7 @@ prevalence: .. math:: - \frac{2}{c(c-1)}\sum_{j=1}^{c}\sum_{k > j}^c p(j \cup k)( + \frac{1}{c(c-1)}\sum_{j=1}^{c}\sum_{k > j}^c p(j \cup k)( \text{AUC}(j | k) + \text{AUC}(k | j)) where :math:`c` is the number of classes. This algorithm is used by setting From e1408d05c56a9cee22127b2edb9e5ecbd26852bc Mon Sep 17 00:00:00 2001 From: Brian Rice Date: Thu, 17 Dec 2020 02:32:24 -0600 Subject: [PATCH 005/478] MNT better error message in RidgeCV (#19020) --- sklearn/linear_model/_ridge.py | 2 +- sklearn/linear_model/tests/test_ridge.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 199a1cd760660..f3f1074312f60 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -1466,7 +1466,7 @@ def fit(self, X, y, sample_weight=None): if np.any(self.alphas <= 0): raise ValueError( - "alphas must be positive. Got {} containing some " + "alphas must be strictly positive. 
Got {} containing some " "negative or null value instead.".format(self.alphas)) X, y, X_offset, y_offset, X_scale = LinearModel._preprocess_data( diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index 7d52de903aee5..2da9a60fb301e 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -1126,13 +1126,13 @@ def test_ridgecv_negative_alphas(): # Negative integers ridge = RidgeCV(alphas=(-1, -10, -100)) assert_raises_regex(ValueError, - "alphas must be positive", + "alphas must be strictly positive", ridge.fit, X, y) # Negative floats ridge = RidgeCV(alphas=(-0.1, -1.0, -10.0)) assert_raises_regex(ValueError, - "alphas must be positive", + "alphas must be strictly positive", ridge.fit, X, y) From 3bca0412c10b89bb474bcf2f38442e2b1f36e6f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Carlos=20Alfaro=20Jim=C3=A9nez?= Date: Thu, 17 Dec 2020 11:41:01 +0100 Subject: [PATCH 006/478] CI Avoid Travis stuck on failure (#19018) --- .travis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 21fa7789495a7..4702fb63c497c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -88,9 +88,9 @@ jobs: - BUILD_WHEEL=true - CIBW_BUILD=cp39-manylinux_aarch64 -install: source build_tools/travis/install.sh -script: source build_tools/travis/script.sh -after_success: source build_tools/travis/after_success.sh +install: source build_tools/travis/install.sh || travis_terminate 1 +script: source build_tools/travis/script.sh || travis_terminate 1 +after_success: source build_tools/travis/after_success.sh || travis_terminate 1 notifications: webhooks: From e406860586036fae87269cf795497591d1c48827 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 17 Dec 2020 14:20:54 -0500 Subject: [PATCH 007/478] CI Only build wheels on main repo (#19026) --- .github/workflows/wheels.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 3d4244861d53d..ac1d495642049 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -22,6 +22,7 @@ jobs: check_build_trigger: name: Check build trigger runs-on: ubuntu-latest + if: github.repository == 'scikit-learn/scikit-learn' outputs: build: ${{ steps.check_build_trigger.outputs.build }} @@ -86,7 +87,7 @@ jobs: SKLEARN_SKIP_NETWORK_TESTS=1 SKLEARN_BUILD_PARALLEL=3 CIBW_BUILD: cp${{ matrix.python }}-${{ matrix.platform_id }} - CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: bash build_tools/github/repair_windows_wheels.sh {wheel} {dest_dir} ${{ matrix.bitness }} + CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: bash build_tools/github/repair_windows_wheels.sh {wheel} {dest_dir} ${{ matrix.bitness }} CIBW_BEFORE_TEST_WINDOWS: bash build_tools/github/build_minimal_windows_image.sh ${{ matrix.python }} ${{ matrix.bitness }} CIBW_TEST_REQUIRES: pytest pandas threadpoolctl CIBW_TEST_COMMAND: bash {project}/build_tools/github/test_wheels.sh From be4f8a509f1382a9bbd24194bcfd19c6563fcf31 Mon Sep 17 00:00:00 2001 From: Thomas A Caswell Date: Thu, 17 Dec 2020 17:06:10 -0500 Subject: [PATCH 008/478] DOC correct some typo in bug_triaging.rst (#19032) --- doc/developers/bug_triaging.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/developers/bug_triaging.rst b/doc/developers/bug_triaging.rst index 2cd87590ca511..ff96ec9f0faae 100644 --- a/doc/developers/bug_triaging.rst +++ b/doc/developers/bug_triaging.rst @@ -1,7 +1,7 @@ .. 
_bug_triaging: Bug triaging and issue curation -================================ +=============================== The `issue tracker `_ is important to the communication in the project: it helps @@ -10,7 +10,7 @@ priorities. For this reason, it is important to curate it, adding labels to issues and closing issues that are not necessary. Working on issues to improve them --------------------------------------- +--------------------------------- Improving issues increases their chances of being successfully resolved. Guidelines on submitting good issues can be found :ref:`here @@ -36,7 +36,7 @@ The following actions are typically useful: Online discussions may be harder than it seems at first glance, in particular given that a person new to open-source may have a very - different understanding of the process than a seasonned maintainer. + different understanding of the process than a seasoned maintainer. Overall, it is useful to stay positive and assume good will. `The following article @@ -44,14 +44,14 @@ The following actions are typically useful: explores how to lead online discussions in the context of open source. Working on PRs to help review ------------------------------- +----------------------------- Reviewing code is also encouraged. Contributors and users are welcome to participate to the review process following our :ref:`review guidelines `. Triaging operations for members of the core and triage teams -------------------------------------------------------------- +------------------------------------------------------------ In addition to the above, members of the core team and the triage team can do the following important tasks: @@ -91,7 +91,7 @@ See the github description for `roles in the organization should be closed. A typical workflow for triaging issues ----------------------------------------- +-------------------------------------- The following workflow [1]_ is a good way to approach issue triaging: From 105d37a51bbab43613a0d678d7bf0e3d728314e1 Mon Sep 17 00:00:00 2001 From: Amol Deshmukh <34318357+des137@users.noreply.github.com> Date: Fri, 18 Dec 2020 05:16:46 -0500 Subject: [PATCH 009/478] DOC Revises a sentence in the description of RFE (#19033) --- doc/modules/feature_selection.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/feature_selection.rst b/doc/modules/feature_selection.rst index 7a4993520c1d5..0b758bf72bc0c 100644 --- a/doc/modules/feature_selection.rst +++ b/doc/modules/feature_selection.rst @@ -119,12 +119,12 @@ Recursive feature elimination ============================= Given an external estimator that assigns weights to features (e.g., the -coefficients of a linear model), recursive feature elimination (:class:`RFE`) +coefficients of a linear model), the goal of recursive feature elimination (:class:`RFE`) is to select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features and the importance of each feature is obtained either through any specific attribute (such as ``coef_``, ``feature_importances_``) or callable. Then, the least important -features are pruned from current set of features.That procedure is recursively +features are pruned from current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached. 
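The feature_selection.rst paragraph patched above describes RFE's pruning loop in prose. A minimal sketch of that loop using scikit-learn's public RFE API follows; it is an illustration, not part of any patch above, and the toy dataset and the logistic-regression base estimator are arbitrary assumptions rather than anything specified by the patch.

# Illustration only (not part of the patches above): recursive feature
# elimination as described in the feature_selection.rst paragraph.
# The toy dataset and the logistic-regression estimator are arbitrary choices.
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Toy data: 10 features, only 3 of which are informative.
X, y = make_classification(n_samples=200, n_features=10, n_informative=3,
                           random_state=0)

# Train the external estimator, prune the least important feature, and
# repeat on the reduced feature set until 3 features remain.
selector = RFE(LogisticRegression(max_iter=1000), n_features_to_select=3,
               step=1)
selector.fit(X, y)

print(selector.support_)   # boolean mask of the selected features
print(selector.ranking_)   # 1 = selected; larger values were pruned earlier

After fitting, support_ gives the mask of retained features and ranking_ records how early each discarded feature was pruned, which is the recursive behaviour the documentation change describes.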
From 51bd34378f9c9c813c44778b9b03a6925ee6dc2c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 18 Dec 2020 11:17:22 +0100 Subject: [PATCH 010/478] MNT skip preprocessing.rst when pandas is not installed (#19016) --- doc/conftest.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/doc/conftest.py b/doc/conftest.py index c950303acf280..4496bb74152ac 100644 --- a/doc/conftest.py +++ b/doc/conftest.py @@ -3,8 +3,6 @@ from os.path import join import warnings -import numpy as np - from sklearn.utils import IS_PYPY from sklearn.utils._testing import SkipTest from sklearn.utils._testing import check_skip_network @@ -72,6 +70,13 @@ def setup_grid_search(): raise SkipTest("Skipping grid_search.rst, pandas not installed") +def setup_preprocessing(): + try: + import pandas # noqa + except ImportError: + raise SkipTest("Skipping preprocessing.rst, pandas not installed") + + def setup_unsupervised_learning(): try: import skimage # noqa @@ -105,5 +110,7 @@ def pytest_runtest_setup(item): setup_impute() elif fname.endswith('modules/grid_search.rst'): setup_grid_search() + elif fname.endswith('modules/preprocessing.rst'): + setup_preprocessing() elif fname.endswith('statistical_inference/unsupervised_learning.rst'): setup_unsupervised_learning() From 2218ec46227c92301ac6837c4a8ae9b8dc5d3960 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 18 Dec 2020 17:09:19 +0100 Subject: [PATCH 011/478] MNT change 0.25 to 1.0 and 0.26 to 1.1 in deprecation messages (#19005) Co-authored-by: Thomas J. Fan --- doc/developers/develop.rst | 4 +- doc/glossary.rst | 6 +- doc/modules/classes.rst | 4 +- doc/whats_new/v0.23.rst | 4 +- doc/whats_new/v0.24.rst | 41 +++++---- sklearn/base.py | 9 +- sklearn/calibration.py | 8 +- sklearn/cluster/_affinity_propagation.py | 19 ++-- sklearn/cluster/_bicluster.py | 6 +- sklearn/cluster/_kmeans.py | 26 +++--- sklearn/cluster/_spectral.py | 4 +- .../tests/test_affinity_propagation.py | 11 +-- sklearn/cluster/tests/test_bicluster.py | 4 +- sklearn/cluster/tests/test_k_means.py | 12 +-- sklearn/cluster/tests/test_spectral.py | 2 +- sklearn/covariance/_graph_lasso.py | 15 ++-- .../covariance/tests/test_graphical_lasso.py | 7 +- sklearn/cross_decomposition/_pls.py | 90 ++++++++++--------- sklearn/cross_decomposition/tests/test_pls.py | 10 +-- sklearn/decomposition/_dict_learning.py | 8 +- sklearn/decomposition/_kernel_pca.py | 4 +- sklearn/decomposition/_nmf.py | 2 +- .../decomposition/tests/test_dict_learning.py | 2 +- .../decomposition/tests/test_kernel_pca.py | 2 +- sklearn/decomposition/tests/test_nmf.py | 20 ++--- sklearn/ensemble/_forest.py | 16 ++-- sklearn/ensemble/_gb.py | 36 ++++---- .../ensemble/tests/test_gradient_boosting.py | 8 +- sklearn/exceptions.py | 4 +- sklearn/inspection/_partial_dependence.py | 10 +-- .../inspection/_plot/partial_dependence.py | 2 +- .../tests/test_partial_dependence.py | 4 +- sklearn/kernel_ridge.py | 4 +- sklearn/linear_model/_stochastic_gradient.py | 15 ++-- .../tests/test_passive_aggressive.py | 2 +- sklearn/linear_model/tests/test_sgd.py | 2 +- sklearn/manifold/_mds.py | 4 +- sklearn/manifold/_spectral_embedding.py | 4 +- sklearn/manifold/_t_sne.py | 24 ++--- sklearn/manifold/tests/test_mds.py | 4 +- .../manifold/tests/test_spectral_embedding.py | 2 +- sklearn/manifold/tests/test_t_sne.py | 2 +- sklearn/metrics/pairwise.py | 18 ++-- sklearn/metrics/tests/test_pairwise.py | 2 +- sklearn/model_selection/_search.py | 8 +- sklearn/model_selection/tests/test_search.py | 8 +- 
.../model_selection/tests/test_validation.py | 5 +- sklearn/multiclass.py | 26 +++--- sklearn/naive_bayes.py | 16 ++-- sklearn/neighbors/_base.py | 4 +- sklearn/neighbors/_regression.py | 4 +- sklearn/neighbors/tests/test_neighbors.py | 2 +- sklearn/pipeline.py | 4 +- sklearn/preprocessing/_data.py | 4 +- sklearn/preprocessing/tests/test_data.py | 4 +- sklearn/svm/_base.py | 4 +- sklearn/svm/_classes.py | 8 +- sklearn/svm/tests/test_svm.py | 4 +- sklearn/tests/test_base.py | 5 +- sklearn/tests/test_calibration.py | 2 +- sklearn/tests/test_docstring_parameters.py | 8 +- sklearn/tests/test_kernel_ridge.py | 2 +- sklearn/tests/test_multiclass.py | 16 ++-- sklearn/tests/test_naive_bayes.py | 8 +- sklearn/tree/_classes.py | 41 +++++---- sklearn/tree/_criterion.pyx | 2 +- sklearn/tree/_export.py | 6 +- sklearn/tree/tests/test_export.py | 6 +- sklearn/tree/tests/test_tree.py | 2 +- sklearn/utils/estimator_checks.py | 8 +- sklearn/utils/fixes.py | 6 +- sklearn/utils/metaestimators.py | 5 +- sklearn/utils/tests/test_fixes.py | 2 +- sklearn/utils/tests/test_validation.py | 12 +-- sklearn/utils/validation.py | 17 ++-- 75 files changed, 393 insertions(+), 339 deletions(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index 08ce24933dd8e..c68becf18f93c 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -228,8 +228,8 @@ to slice rows and columns. .. deprecated:: 0.24 - The _pairwise attribute is deprecated in 0.24. From 0.26 onward, - the `pairwise` estimator tag should be used instead. + The _pairwise attribute is deprecated in 0.24. From 1.1 (renaming of 0.26) + onward, the `pairwise` estimator tag should be used instead. Universal attributes ^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/glossary.rst b/doc/glossary.rst index cb4bb9e3fd3d6..30e647be1c0f4 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -389,9 +389,9 @@ General Concepts .. deprecated:: 0.24 - The _pairwise attribute is deprecated in 0.24. From 0.26 - onward, the `pairwise` estimator tag should be used - instead. + The _pairwise attribute is deprecated in 0.24. From 1.1 + (renaming of 0.26) onward, the `pairwise` estimator tag + should be used instead. For more detailed info, see :ref:`estimator_tags`. diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 2e9ab3884b1b5..84f8097cbbe9d 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1637,5 +1637,5 @@ Utilities from joblib: Recently deprecated =================== -To be removed in 0.25 ---------------------- +To be removed in 1.0 (renaming of 0.25) +--------------------------------------- diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index da9f2f01d29a2..598d9adc5cef4 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -166,8 +166,8 @@ In an effort to promote clear and non-ambiguous use of the library, most constructor and function parameters are now expected to be passed as keyword arguments (i.e. using the `param=value` syntax) instead of positional. To ease the transition, a `FutureWarning` is raised if a keyword-only parameter -is used as positional. In version 0.25, these parameters will be strictly -keyword-only, and a `TypeError` will be raised. +is used as positional. In version 1.0 (renaming of 0.25), these parameters +will be strictly keyword-only, and a `TypeError` will be raised. :issue:`15005` by `Joel Nothman`_, `Adrin Jalali`_, `Thomas Fan`_, and `Nicolas Hug`_. 
See `SLEP009 `_ diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index a5b0ec36d62aa..7197b74b94faa 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -104,8 +104,8 @@ Changelog initial cluster centroids. :pr:`17937` by :user:`g-walsh` - |API| :class:`cluster.MiniBatchKMeans` attributes, `counts_` and - `init_size_`, are deprecated and will be removed in 0.26. :pr:`17864` by - :user:`Jérémie du Boisberranger `. + `init_size_`, are deprecated and will be removed in 1.1 (renaming of 0.26). + :pr:`17864` by :user:`Jérémie du Boisberranger `. :mod:`sklearn.compose` ...................... @@ -128,7 +128,8 @@ Changelog - |API| Deprecates `cv_alphas_` in favor of `cv_results_['alphas']` and `grid_scores_` in favor of split scores in `cv_results_` in :class:`covariance.GraphicalLassoCV`. `cv_alphas_` and `grid_scores_` will be - removed in version 0.26. :pr:`16392` by `Thomas Fan`_. + removed in version 1.1 (renaming of 0.26). + :pr:`16392` by `Thomas Fan`_. :mod:`sklearn.cross_decomposition` .................................. @@ -149,7 +150,7 @@ Changelog - |API| For :class:`cross_decomposition.NMF`, the `init` value, when 'init=None' and n_components <= min(n_samples, n_features) will be changed from - `'nndsvd'` to `'nndsvda'` in 0.26. + `'nndsvd'` to `'nndsvda'` in 1.1 (renaming of 0.26). :pr:`18525` by :user:`Chiara Marmo `. - |API| The bounds of the `n_components` parameter is now restricted: @@ -159,20 +160,23 @@ Changelog and :class:`cross_decomposition.PLSCanonical`. - into `[1, n_features]` or :class:`cross_decomposition.PLSRegression`. - An error will be raised in 0.26. :pr:`17095` by `Nicolas Hug`_. + An error will be raised in 1.1 (renaming of 0.26). + :pr:`17095` by `Nicolas Hug`_. - |API| For :class:`cross_decomposition.PLSSVD`, :class:`cross_decomposition.CCA`, and :class:`cross_decomposition.PLSCanonical`, the `x_scores_` and `y_scores_` - attributes were deprecated and will be removed in 0.26. They can be - retrieved by calling `transform` on the training data. The `norm_y_weights` - attribute will also be removed. :pr:`17095` by `Nicolas Hug`_. + attributes were deprecated and will be removed in 1.1 (renaming of 0.26). + They can be retrieved by calling `transform` on the training data. + The `norm_y_weights` attribute will also be removed. + :pr:`17095` by `Nicolas Hug`_. - |API| For :class:`cross_decomposition.PLSRegression`, :class:`cross_decomposition.PLSCanonical`, :class:`cross_decomposition.CCA`, and :class:`cross_decomposition.PLSSVD`, the `x_mean_`, `y_mean_`, `x_std_`, and - `y_std_` attributes were deprecated and will be removed in 0.26. + `y_std_` attributes were deprecated and will be removed in 1.1 + (renaming of 0.26). :pr:`18768` by :user:`Maren Westermann `. - |Fix| :class:`decomposition.TruncatedSVD` becomes deterministic by using the @@ -240,8 +244,9 @@ Changelog - |Fix| Fix :class:`decomposition.SparseCoder` such that it follows scikit-learn API and support cloning. The attribute `components_` is - deprecated in 0.24 and will be removed in 0.26. This attribute was - redundant with the `dictionary` attribute and constructor parameter. + deprecated in 0.24 and will be removed in 1.1 (renaming of 0.26). + This attribute was redundant with the `dictionary` attribute and constructor + parameter. :pr:`17679` by :user:`Xavier Dupré `. 
- |Fix| :meth:`TruncatedSVD.fit_transform` consistently returns the same @@ -302,7 +307,8 @@ Changelog - |API| :class:`exceptions.ChangedBehaviorWarning` and :class:`exceptions.NonBLASDotWarning` are deprecated and will be removed in - v0.26, :pr:`17804` by `Adrin Jalali`_. + 1.1 (renaming of 0.26). + :pr:`17804` by `Adrin Jalali`_. :mod:`sklearn.feature_extraction` ................................. @@ -390,7 +396,8 @@ Changelog :user:`Roei Kahny `. - |API| Positional arguments are deprecated in - :meth:`inspection.PartialDependenceDisplay.plot` and will error in 0.26. + :meth:`inspection.PartialDependenceDisplay.plot` and will error in 1.1 + (renaming of 0.26). :pr:`18293` by `Thomas Fan`_. :mod:`sklearn.isotonic` @@ -458,8 +465,8 @@ Changelog - |Enhancement| Add `square_distances` parameter to :class:`manifold.TSNE`, which provides backward compatibility during deprecation of legacy squaring - behavior. Distances will be squared by default in 0.26, and this parameter - will be removed in 0.28. :pr:`17662` by + behavior. Distances will be squared by default in 1.1 (renaming of 0.26), + and this parameter will be removed in 1.3. :pr:`17662` by :user:`Joshua Newton `. - |Fix| :class:`manifold.MDS` now correctly sets its `_pairwise` attribute. @@ -645,8 +652,8 @@ Changelog - |API| The attributes ``coef_`` and ``intercept_`` are now deprecated in :class:`naive_bayes.MultinomialNB`, :class:`naive_bayes.ComplementNB`, :class:`naive_bayes.BernoulliNB` and :class:`naive_bayes.CategoricalNB`, - and will be removed in v0.26. :pr:`17427` by - :user:`Juan Carlos Alfaro Jiménez `. + and will be removed in v1.1 (renaming of 0.26). + :pr:`17427` by :user:`Juan Carlos Alfaro Jiménez `. :mod:`sklearn.neighbors` ........................ diff --git a/sklearn/base.py b/sklearn/base.py index 3d49ec4fe96f6..3626e931aa9cf 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -845,9 +845,12 @@ def _is_pairwise(estimator): if has_pairwise_attribute: if pairwise_attribute != pairwise_tag: - warnings.warn("_pairwise was deprecated in 0.24 and will be " - "removed in 0.26. Set the estimator tags of your " - "estimator instead", FutureWarning) + warnings.warn( + "_pairwise was deprecated in 0.24 and will be removed in 1.1 " + "(renaming of 0.26). Set the estimator tags of your estimator " + "instead", + FutureWarning + ) return pairwise_attribute # use pairwise tag when the attribute is not present diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 46faf680923f5..3c997c906497c 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -599,7 +599,7 @@ class _CalibratedClassifier: .. deprecated:: 0.24 `calibrators_` is deprecated from 0.24 and will be removed in - 0.26. Use `calibrators` instead. + 1.1 (renaming of 0.26). Use `calibrators` instead. """ def __init__(self, base_estimator, calibrators, *, classes, method='sigmoid'): @@ -608,11 +608,11 @@ def __init__(self, base_estimator, calibrators, *, classes, self.classes = classes self.method = method - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated( # type: ignore - "calibrators_ is deprecated in 0.24 and will be removed in 0.26. " - "Use calibrators instead." + "calibrators_ is deprecated in 0.24 and will be removed in 1.1" + "(renaming of 0.26). Use calibrators instead." 
) @property def calibrators_(self): diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 9937962095895..cb9230cd2382f 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -145,13 +145,14 @@ def affinity_propagation(S, *, preference=None, convergence_iter=15, else (np.array([0]), np.array([0] * n_samples))) if random_state == 'warn': - warnings.warn(("'random_state' has been introduced in 0.23. " - "It will be set to None starting from 0.25 which " - "means that results will differ at every function " - "call. Set 'random_state' to None to silence this " - "warning, or to 0 to keep the behavior of versions " - "<0.23."), - FutureWarning) + warnings.warn( + "'random_state' has been introduced in 0.23. It will be set to " + "None starting from 1.0 (renaming of 0.25) which means that " + "results will differ at every function call. Set 'random_state' " + "to None to silence this warning, or to 0 to keep the behavior of " + "versions <0.23.", + FutureWarning + ) random_state = 0 random_state = check_random_state(random_state) @@ -375,10 +376,10 @@ def __init__(self, *, damping=.5, max_iter=200, convergence_iter=15, self.affinity = affinity self.random_state = random_state - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): return self.affinity == "precomputed" diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index d80463f211aba..6d293206bddd8 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -118,7 +118,7 @@ def fit(self, X, y=None): """ if self.n_jobs != 'deprecated': warnings.warn("'n_jobs' was deprecated in version 0.23 and will be" - " removed in 0.25.", FutureWarning) + " removed in 1.0 (renaming of 0.25).", FutureWarning) X = self._validate_data(X, accept_sparse='csr', dtype=np.float64) self._check_parameters() @@ -240,7 +240,7 @@ class SpectralCoclustering(BaseSpectral): .. deprecated:: 0.23 ``n_jobs`` was deprecated in version 0.23 and will be removed in - 0.25. + 1.0 (renaming of 0.25). random_state : int, RandomState instance, default=None Used for randomizing the singular value decomposition and the k-means @@ -392,7 +392,7 @@ class SpectralBiclustering(BaseSpectral): .. deprecated:: 0.23 ``n_jobs`` was deprecated in version 0.23 and will be removed in - 0.25. + 1.0 (renaming of 0.25). random_state : int, RandomState instance, default=None Used for randomizing the singular value decomposition and the k-means diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index f23df27dc8ad5..d10dfba0d08b3 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -212,7 +212,7 @@ def k_means(X, n_clusters, *, sample_weight=None, init='k-means++', .. deprecated:: 0.23 'precompute_distances' was deprecated in version 0.23 and will be - removed in 0.25. It has no effect. + removed in 1.0 (renaming of 0.25). It has no effect. n_init : int, default=10 Number of time the k-means algorithm will be run with different @@ -254,7 +254,7 @@ def k_means(X, n_clusters, *, sample_weight=None, init='k-means++', .. deprecated:: 0.23 ``n_jobs`` was deprecated in version 0.23 and will be removed in - 0.25. + 1.0 (renaming of 0.25). 
algorithm : {"auto", "full", "elkan"}, default="auto" K-means algorithm to use. The classical EM-style algorithm is "full". @@ -657,7 +657,7 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): .. deprecated:: 0.23 'precompute_distances' was deprecated in version 0.22 and will be - removed in 0.25. It has no effect. + removed in 1.0 (renaming of 0.25). It has no effect. verbose : int, default=0 Verbosity mode. @@ -686,7 +686,7 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): .. deprecated:: 0.23 ``n_jobs`` was deprecated in version 0.23 and will be removed in - 0.25. + 1.0 (renaming of 0.25). algorithm : {"auto", "full", "elkan"}, default="auto" K-means algorithm to use. The classical EM-style algorithm is "full". @@ -784,13 +784,13 @@ def _check_params(self, X): # precompute_distances if self.precompute_distances != 'deprecated': warnings.warn("'precompute_distances' was deprecated in version " - "0.23 and will be removed in 0.25. It has no " - "effect", FutureWarning) + "0.23 and will be removed in 1.0 (renaming of 0.25)" + ". It has no effect", FutureWarning) # n_jobs if self.n_jobs != 'deprecated': warnings.warn("'n_jobs' was deprecated in version 0.23 and will be" - " removed in 0.25.", FutureWarning) + " removed in 1.0 (renaming of 0.25).", FutureWarning) self._n_threads = self.n_jobs else: self._n_threads = None @@ -1512,13 +1512,15 @@ class MiniBatchKMeans(KMeans): Weigth sum of each cluster. .. deprecated:: 0.24 - This attribute is deprecated in 0.24 and will be removed in 0.26. + This attribute is deprecated in 0.24 and will be removed in + 1.1 (renaming of 0.26). init_size_ : int The effective number of samples used for the initialization. .. deprecated:: 0.24 - This attribute is deprecated in 0.24 and will be removed in 0.26. + This attribute is deprecated in 0.24 and will be removed in + 1.1 (renaming of 0.26). 
See Also -------- @@ -1577,19 +1579,19 @@ def __init__(self, n_clusters=8, *, init='k-means++', max_iter=100, self.reassignment_ratio = reassignment_ratio @deprecated("The attribute 'counts_' is deprecated in 0.24" # type: ignore - " and will be removed in 0.26.") + " and will be removed in 1.1 (renaming of 0.26).") @property def counts_(self): return self._counts @deprecated("The attribute 'init_size_' is deprecated in " # type: ignore - "0.24 and will be removed in 0.26.") + "0.24 and will be removed in 1.1 (renaming of 0.26).") @property def init_size_(self): return self._init_size @deprecated("The attribute 'random_state_' is deprecated " # type: ignore - "in 0.24 and will be removed in 0.26.") + "in 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def random_state_(self): return getattr(self, "_random_state", None) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index f9a01dc2c00da..79a0b77954028 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -571,10 +571,10 @@ def _more_tags(self): return {'pairwise': self.affinity in ["precomputed", "precomputed_nearest_neighbors"]} - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): return self.affinity in ["precomputed", diff --git a/sklearn/cluster/tests/test_affinity_propagation.py b/sklearn/cluster/tests/test_affinity_propagation.py index f3e367ddf022f..446b0f43c74d9 100644 --- a/sklearn/cluster/tests/test_affinity_propagation.py +++ b/sklearn/cluster/tests/test_affinity_propagation.py @@ -204,16 +204,11 @@ def test_affinity_propagation_random_state(): assert np.mean((centers0 - centers76) ** 2) > 1 -# FIXME: to be removed in 0.25 +# FIXME: to be removed in 1.0 def test_affinity_propagation_random_state_warning(): # test that a warning is raised when random_state is not defined. X = np.array([[0, 0], [1, 1], [-2, -2]]) - match = ("'random_state' has been introduced in 0.23. " - "It will be set to None starting from 0.25 which " - "means that results will differ at every function " - "call. Set 'random_state' to None to silence this " - "warning, or to 0 to keep the behavior of versions " - "<0.23.") + match = "'random_state' has been introduced in 0.23." 
with pytest.warns(FutureWarning, match=match): AffinityPropagation().fit(X) @@ -246,7 +241,7 @@ def test_affinity_propagation_float32(): assert_array_equal(afp.labels_, expected) -# TODO: Remove in 0.26 +# TODO: Remove in 1.1 def test_affinity_propagation_pairwise_is_deprecated(): afp = AffinityPropagation(affinity='precomputed') msg = r"Attribute _pairwise was deprecated in version 0\.24" diff --git a/sklearn/cluster/tests/test_bicluster.py b/sklearn/cluster/tests/test_bicluster.py index 6e3e664c622a8..97ca3db0201b6 100644 --- a/sklearn/cluster/tests/test_bicluster.py +++ b/sklearn/cluster/tests/test_bicluster.py @@ -267,9 +267,9 @@ def test_n_features_in_(est): @pytest.mark.parametrize("klass", [SpectralBiclustering, SpectralCoclustering]) @pytest.mark.parametrize("n_jobs", [None, 1]) def test_n_jobs_deprecated(klass, n_jobs): - # FIXME: remove in 0.25 + # FIXME: remove in 1.0 depr_msg = ("'n_jobs' was deprecated in version 0.23 and will be removed " - "in 0.25.") + "in 1.0") S, _, _ = make_biclusters((30, 30), 3, noise=0.5, random_state=0) est = klass(random_state=0, n_jobs=n_jobs) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 063781bbd6532..341b00c5c137f 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -854,9 +854,9 @@ def test_result_of_kmeans_equal_in_diff_n_threads(): @pytest.mark.parametrize("precompute_distances", ["auto", False, True]) def test_precompute_distance_deprecated(precompute_distances): - # FIXME: remove in 0.25 + # FIXME: remove in 1.0 depr_msg = ("'precompute_distances' was deprecated in version 0.23 and " - "will be removed in 0.25.") + "will be removed in 1.0") X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) kmeans = KMeans(n_clusters=2, n_init=1, init='random', random_state=0, precompute_distances=precompute_distances) @@ -867,9 +867,9 @@ def test_precompute_distance_deprecated(precompute_distances): @pytest.mark.parametrize("n_jobs", [None, 1]) def test_n_jobs_deprecated(n_jobs): - # FIXME: remove in 0.25 + # FIXME: remove in 1.0 depr_msg = ("'n_jobs' was deprecated in version 0.23 and will be removed " - "in 0.25.") + "in 1.0") X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) kmeans = KMeans(n_clusters=2, n_init=1, init='random', random_state=0, n_jobs=n_jobs) @@ -881,9 +881,9 @@ def test_n_jobs_deprecated(n_jobs): @pytest.mark.parametrize("attr", ["counts_", "init_size_", "random_state_"]) def test_minibatch_kmeans_deprecated_attributes(attr): # check that we raise a deprecation warning when accessing `init_size_` - # FIXME: remove in 0.26 + # FIXME: remove in 1.1 depr_msg = (f"The attribute '{attr}' is deprecated in 0.24 and will be " - f"removed in 0.26.") + f"removed in 1.1") km = MiniBatchKMeans(n_clusters=2, n_init=1, init='random', random_state=0) km.fit(X) diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 19b1496d2719e..2c0ac67016749 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -268,7 +268,7 @@ def test_verbose(assign_labels, capsys): assert re.search(r"Iteration [0-9]+, inertia", captured.out) -# TODO: Remove in 0.26 +# TODO: Remove in 1.1 @pytest.mark.parametrize("affinity", ["precomputed", "precomputed_nearest_neighbors"]) def test_pairwise_is_deprecated(affinity): diff --git a/sklearn/covariance/_graph_lasso.py b/sklearn/covariance/_graph_lasso.py index c43b465def374..6dc88fb7908fb 100644 --- 
a/sklearn/covariance/_graph_lasso.py +++ b/sklearn/covariance/_graph_lasso.py @@ -607,14 +607,16 @@ class GraphicalLassoCV(GraphicalLasso): .. deprecated:: 0.24 The `cv_alphas_` attribute is deprecated in version 0.24 in favor - of `cv_results_['alphas']` and will be removed in version 0.26. + of `cv_results_['alphas']` and will be removed in version + 1.1 (renaming of 0.26). grid_scores_ : ndarray of shape (n_alphas, n_folds) Log-likelihood score on left-out data across folds. .. deprecated:: 0.24 The `grid_scores_` attribute is deprecated in version 0.24 in favor - of `cv_results_` and will be removed in version 0.26. + of `cv_results_` and will be removed in version + 1.1 (renaming of 0.26). cv_results_ : dict of ndarrays A dict with keys: @@ -828,11 +830,11 @@ def fit(self, X, y=None): verbose=inner_verbose, return_n_iter=True) return self - # TODO: Remove in 0.26 when grid_scores_ is deprecated + # TODO: Remove in 1.1 when grid_scores_ is deprecated # mypy error: Decorated property not supported @deprecated( # type: ignore "The grid_scores_ attribute is deprecated in version 0.24 in favor " - "of cv_results_ and will be removed in version 0.26" + "of cv_results_ and will be removed in version 1.1 (renaming of 0.26)." ) @property def grid_scores_(self): @@ -842,11 +844,12 @@ def grid_scores_(self): [self.cv_results_["split{}_score".format(i)] for i in range(n_alphas)]).T - # TODO: Remove in 0.26 when cv_alphas_ is deprecated + # TODO: Remove in 1.1 when cv_alphas_ is deprecated # mypy error: Decorated property not supported @deprecated( # type: ignore "The cv_alphas_ attribute is deprecated in version 0.24 in favor " - "of cv_results_['alpha'] and will be removed in version 0.26" + "of cv_results_['alpha'] and will be removed in version 1.1 " + "(renaming of 0.26)." 
) @property def cv_alphas_(self): diff --git a/sklearn/covariance/tests/test_graphical_lasso.py b/sklearn/covariance/tests/test_graphical_lasso.py index 2030056f34ab5..9bcce6673dd65 100644 --- a/sklearn/covariance/tests/test_graphical_lasso.py +++ b/sklearn/covariance/tests/test_graphical_lasso.py @@ -152,7 +152,7 @@ def test_graphical_lasso_cv(random_state=1): GraphicalLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1).fit(X) -# TODO: Remove in 0.26 when grid_scores_ is deprecated +# TODO: Remove in 1.1 when grid_scores_ is deprecated def test_graphical_lasso_cv_grid_scores_and_cv_alphas_deprecated(): splits = 4 n_alphas = 5 @@ -168,13 +168,14 @@ def test_graphical_lasso_cv_grid_scores_and_cv_alphas_deprecated(): total_alphas = n_refinements * n_alphas + 1 msg = (r"The grid_scores_ attribute is deprecated in version 0\.24 in " - r"favor of cv_results_ and will be removed in version 0\.26") + r"favor of cv_results_ and will be removed in version 1\.1 " + r"\(renaming of 0\.26\).") with pytest.warns(FutureWarning, match=msg): assert cov.grid_scores_.shape == (total_alphas, splits) msg = (r"The cv_alphas_ attribute is deprecated in version 0\.24 in " r"favor of cv_results_\['alpha'\] and will be removed in version " - r"0\.26") + r"1\.1 \(renaming of 0\.26\)") with pytest.warns(FutureWarning, match=msg): assert len(cov.cv_alphas_) == total_alphas diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index f35c049c37ae8..7c1dc303e361f 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -181,12 +181,13 @@ def fit(self, X, Y): # see Wegelin page 25 rank_upper_bound = p if not 1 <= n_components <= rank_upper_bound: - # TODO: raise an error in 0.26 + # TODO: raise an error in 1.1 warnings.warn( f"As of version 0.24, n_components({n_components}) should " f"be in [1, n_features]." f"n_components={rank_upper_bound} will be used instead. " - f"In version 0.26, an error will be raised.", + f"In version 1.1 (renaming of 0.26), an error will be " + f"raised.", FutureWarning ) n_components = rank_upper_bound @@ -195,13 +196,14 @@ def fit(self, X, Y): # X and the rank of Y: see Wegelin page 12 rank_upper_bound = min(n, p, q) if not 1 <= self.n_components <= rank_upper_bound: - # TODO: raise an error in 0.26 + # TODO: raise an error in 1.1 warnings.warn( f"As of version 0.24, n_components({n_components}) should " f"be in [1, min(n_features, n_samples, n_targets)] = " f"[1, {rank_upper_bound}]. " f"n_components={rank_upper_bound} will be used instead. 
" - f"In version 0.26, an error will be raised.", + f"In version 1.1 (renaming of 0.26), an error will be " + f"raised.", FutureWarning ) n_components = rank_upper_bound @@ -210,7 +212,7 @@ def fit(self, X, Y): raise ValueError("algorithm should be 'svd' or 'nipals', got " f"{self.algorithm}.") - self._norm_y_weights = (self.deflation_mode == 'canonical') # 0.26 + self._norm_y_weights = (self.deflation_mode == 'canonical') # 1.1 norm_y_weights = self._norm_y_weights # Scale (in place) @@ -406,60 +408,60 @@ def fit_transform(self, X, y=None): # mypy error: Decorated property not supported @deprecated( # type: ignore "Attribute norm_y_weights was deprecated in version 0.24 and " - "will be removed in 0.26.") + "will be removed in 1.1 (renaming of 0.26).") @property def norm_y_weights(self): return self._norm_y_weights @deprecated( # type: ignore "Attribute x_mean_ was deprecated in version 0.24 and " - "will be removed in 0.26.") + "will be removed in 1.1 (renaming of 0.26).") @property def x_mean_(self): return self._x_mean @deprecated( # type: ignore "Attribute y_mean_ was deprecated in version 0.24 and " - "will be removed in 0.26.") + "will be removed in 1.1 (renaming of 0.26).") @property def y_mean_(self): return self._y_mean @deprecated( # type: ignore "Attribute x_std_ was deprecated in version 0.24 and " - "will be removed in 0.26.") + "will be removed in 1.1 (renaming of 0.26).") @property def x_std_(self): return self._x_std @deprecated( # type: ignore "Attribute y_std_ was deprecated in version 0.24 and " - "will be removed in 0.26.") + "will be removed in 1.1 (renaming of 0.26).") @property def y_std_(self): return self._y_std @property def x_scores_(self): - # TODO: raise error in 0.26 instead + # TODO: raise error in 1.1 instead if not isinstance(self, PLSRegression): pass warnings.warn( "Attribute x_scores_ was deprecated in version 0.24 and " - "will be removed in 0.26. Use est.transform(X) on the " - "training data instead.", + "will be removed in 1.1 (renaming of 0.26). Use " + "est.transform(X) on the training data instead.", FutureWarning ) return self._x_scores @property def y_scores_(self): - # TODO: raise error in 0.26 instead + # TODO: raise error in 1.1 instead if not isinstance(self, PLSRegression): warnings.warn( "Attribute y_scores_ was deprecated in version 0.24 and " - "will be removed in 0.26. Use est.transform(X) on the " - "training data instead.", + "will be removed in 1.1 (renaming of 0.26). Use " + "est.transform(X) on the training data instead.", FutureWarning ) return self._y_scores @@ -625,15 +627,17 @@ class PLSCanonical(_PLS): The transformed training samples. .. deprecated:: 0.24 - `x_scores_` is deprecated in 0.24 and will be removed in 0.26. You - can just call `transform` on the training data instead. + `x_scores_` is deprecated in 0.24 and will be removed in 1.1 + (renaming of 0.26). You can just call `transform` on the training + data instead. y_scores_ : ndarray of shape (n_samples, n_components) The transformed training targets. .. deprecated:: 0.24 - `y_scores_` is deprecated in 0.24 and will be removed in 0.26. You - can just call `transform` on the training data instead. + `y_scores_` is deprecated in 0.24 and will be removed in 1.1 + (renaming of 0.26). You can just call `transform` on the training + data instead. x_rotations_ : ndarray of shape (n_features, n_components) The projection matrix used to transform `X`. @@ -735,15 +739,17 @@ class CCA(_PLS): The transformed training samples. .. 
deprecated:: 0.24 - `x_scores_` is deprecated in 0.24 and will be removed in 0.26. You - can just call `transform` on the training data instead. + `x_scores_` is deprecated in 0.24 and will be removed in 1.1 + (renaming of 0.26). You can just call `transform` on the training + data instead. y_scores_ : ndarray of shape (n_samples, n_components) The transformed training targets. .. deprecated:: 0.24 - `y_scores_` is deprecated in 0.24 and will be removed in 0.26. You - can just call `transform` on the training data instead. + `y_scores_` is deprecated in 0.24 and will be removed in 1.1 + (renaming of 0.26). You can just call `transform` on the training + data instead. x_rotations_ : ndarray of shape (n_features, n_components) The projection matrix used to transform `X`. @@ -824,15 +830,17 @@ class PLSSVD(TransformerMixin, BaseEstimator): The transformed training samples. .. deprecated:: 0.24 - `x_scores_` is deprecated in 0.24 and will be removed in 0.26. You - can just call `transform` on the training data instead. + `x_scores_` is deprecated in 0.24 and will be removed in 1.1 + (renaming of 0.26). You can just call `transform` on the training + data instead. y_scores_ : ndarray of shape (n_samples, n_components) The transformed training targets. .. deprecated:: 0.24 - `y_scores_` is deprecated in 0.24 and will be removed in 0.26. You - can just call `transform` on the training data instead. + `y_scores_` is deprecated in 0.24 and will be removed in 1.1 + (renaming of 0.26). You can just call `transform` on the training + data instead. Examples -------- @@ -886,13 +894,13 @@ def fit(self, X, Y): n_components = self.n_components rank_upper_bound = min(X.shape[0], X.shape[1], Y.shape[1]) if not 1 <= n_components <= rank_upper_bound: - # TODO: raise an error in 0.26 + # TODO: raise an error in 1.1 warnings.warn( f"As of version 0.24, n_components({n_components}) should be " f"in [1, min(n_features, n_samples, n_targets)] = " f"[1, {rank_upper_bound}]. " f"n_components={rank_upper_bound} will be used instead. " - f"In version 0.26, an error will be raised.", + f"In version 1.1 (renaming of 0.26), an error will be raised.", FutureWarning ) n_components = rank_upper_bound @@ -908,8 +916,8 @@ def fit(self, X, Y): U, Vt = svd_flip(U, Vt) V = Vt.T - self._x_scores = np.dot(X, U) # TODO: remove in 0.26 - self._y_scores = np.dot(Y, V) # TODO: remove in 0.26 + self._x_scores = np.dot(X, U) # TODO: remove in 1.1 + self._y_scores = np.dot(Y, V) # TODO: remove in 1.1 self.x_weights_ = U self.y_weights_ = V return self @@ -917,8 +925,9 @@ def fit(self, X, Y): # mypy error: Decorated property not supported @deprecated( # type: ignore "Attribute x_scores_ was deprecated in version 0.24 and " - "will be removed in 0.26. Use est.transform(X) on the " - "training data instead.") + "will be removed in 1.1 (renaming of 0.26). Use est.transform(X) on " + "the training data instead." + ) @property def x_scores_(self): return self._x_scores @@ -926,36 +935,37 @@ def x_scores_(self): # mypy error: Decorated property not supported @deprecated( # type: ignore "Attribute y_scores_ was deprecated in version 0.24 and " - "will be removed in 0.26. Use est.transform(X, Y) on the " - "training data instead.") + "will be removed in 1.1 (renaming of 0.26). Use est.transform(X, Y) " + "on the training data instead." 
+ ) @property def y_scores_(self): return self._y_scores @deprecated( # type: ignore "Attribute x_mean_ was deprecated in version 0.24 and " - "will be removed in 0.26.") + "will be removed in 1.1 (renaming of 0.26).") @property def x_mean_(self): return self._x_mean @deprecated( # type: ignore "Attribute y_mean_ was deprecated in version 0.24 and " - "will be removed in 0.26.") + "will be removed in 1.1 (renaming of 0.26).") @property def y_mean_(self): return self._y_mean @deprecated( # type: ignore "Attribute x_std_ was deprecated in version 0.24 and " - "will be removed in 0.26.") + "will be removed in 1.1 (renaming of 0.26).") @property def x_std_(self): return self._x_std @deprecated( # type: ignore "Attribute y_std_ was deprecated in version 0.24 and " - "will be removed in 0.26.") + "will be removed in 1.1 (renaming of 0.26).") @property def y_std_(self): return self._y_std diff --git a/sklearn/cross_decomposition/tests/test_pls.py b/sklearn/cross_decomposition/tests/test_pls.py index a36a95ed153cf..c01e790ca1644 100644 --- a/sklearn/cross_decomposition/tests/test_pls.py +++ b/sklearn/cross_decomposition/tests/test_pls.py @@ -315,7 +315,7 @@ def test_convergence_fail(): pls_nipals.fit(X, Y) -@pytest.mark.filterwarnings('ignore:.*scores_ was deprecated') # 0.26 +@pytest.mark.filterwarnings('ignore:.*scores_ was deprecated') # 1.1 @pytest.mark.parametrize('Est', (PLSSVD, PLSRegression, PLSCanonical)) def test_attibutes_shapes(Est): # Make sure attributes are of the correct shape depending on n_components @@ -439,7 +439,7 @@ def test_scale_and_stability(Est, X, Y): @pytest.mark.parametrize('n_components', (0, 4)) def test_n_components_bounds(Est, n_components): # n_components should be in [1, min(n_samples, n_features, n_targets)] - # TODO: catch error instead of warning in 0.26 + # TODO: catch error instead of warning in 1.1 rng = np.random.RandomState(0) X = rng.randn(10, 5) Y = rng.randn(10, 3) @@ -454,7 +454,7 @@ def test_n_components_bounds(Est, n_components): @pytest.mark.parametrize('n_components', (0, 6)) def test_n_components_bounds_pls_regression(n_components): # For PLSRegression, the upper bound for n_components is n_features - # TODO: catch error instead of warning in 0.26 + # TODO: catch error instead of warning in 1.1 rng = np.random.RandomState(0) X = rng.randn(10, 5) Y = rng.randn(10, 3) @@ -471,7 +471,7 @@ def test_scores_deprecations(Est): # Make sure x_scores_ and y_scores_ are deprecated. # It's not deprecated for PLSRegression because y_score_ is different from # transform(Y_train) - # TODO: remove attributes and test in 0.26 + # TODO: remove attributes and test in 1.1 rng = np.random.RandomState(0) X = rng.randn(10, 5) Y = rng.randn(10, 3) @@ -492,7 +492,7 @@ def test_norm_y_weights_deprecation(Est): est.norm_y_weights -# TODO: Remove test in 0.26 +# TODO: Remove test in 1.1 @pytest.mark.parametrize('Estimator', (PLSRegression, PLSCanonical, CCA, PLSSVD)) @pytest.mark.parametrize('attribute', diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 74a4d4f4d17a4..781f288b70351 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -1024,8 +1024,8 @@ class SparseCoder(_BaseSparseCoding, BaseEstimator): The unchanged dictionary atoms. .. deprecated:: 0.24 - This attribute is deprecated in 0.24 and will be removed in 0.26. - Use `dictionary` instead. + This attribute is deprecated in 0.24 and will be removed in + 1.1 (renaming of 0.26). Use `dictionary` instead. 
Examples -------- @@ -1089,8 +1089,8 @@ def fit(self, X, y=None): return self @deprecated("The attribute 'components_' is deprecated " # type: ignore - "in 0.24 and will be removed in 0.26. Use the " - "'dictionary' instead.") + "in 0.24 and will be removed in 1.1 (renaming of 0.26). Use " + "the 'dictionary' instead.") @property def components_(self): return self.dictionary diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index ff1d2d869834f..5655eddb0bf31 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -167,10 +167,10 @@ def __init__(self, n_components=None, *, kernel="linear", self.n_jobs = n_jobs self.copy_X = copy_X - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): return self.kernel == "precomputed" diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index d801b418d5a18..5d01060951ae1 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -312,7 +312,7 @@ def _initialize_nmf(X, n_components, init='warn', eps=1e-6, warnings.warn(("The 'init' value, when 'init=None' and " "n_components is less than n_samples and " "n_features, will be changed from 'nndsvd' to " - "'nndsvda' in 0.26."), FutureWarning) + "'nndsvda' in 1.1 (renaming of 0.26)."), FutureWarning) init = None check_non_negative(X, "NMF initialization") diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py index b1f67a187acd2..c9590f3136678 100644 --- a/sklearn/decomposition/tests/test_dict_learning.py +++ b/sklearn/decomposition/tests/test_dict_learning.py @@ -557,7 +557,7 @@ def test_sparse_coder_common_transformer(): check_transformers_unfitted(sc.__class__.__name__, sc) -# TODO: remove in 0.26 +# TODO: remove in 1.1 def test_sparse_coder_deprecation(): # check that we raise a deprecation warning when accessing `components_` rng = np.random.RandomState(777) diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py index 2b12314b3c980..2acccb0df6781 100644 --- a/sklearn/decomposition/tests/test_kernel_pca.py +++ b/sklearn/decomposition/tests/test_kernel_pca.py @@ -316,7 +316,7 @@ def test_32_64_decomposition_shape(): kpca.fit_transform(X.astype(np.float32)).shape) -# TODO: Remove in 0.26 +# TODO: Remove in 1.1 def test_kernel_pcc_pairwise_is_deprecated(): kp = KernelPCA(kernel='precomputed') msg = r"Attribute _pairwise was deprecated in version 0\.24" diff --git a/sklearn/decomposition/tests/test_nmf.py b/sklearn/decomposition/tests/test_nmf.py index ff6b4ed8b4245..88c1ba406ad99 100644 --- a/sklearn/decomposition/tests/test_nmf.py +++ b/sklearn/decomposition/tests/test_nmf.py @@ -42,7 +42,7 @@ def test_initialize_nn_output(): def test_parameter_checking(): A = np.ones((2, 2)) name = 'spam' - # FIXME : should be removed in 0.26 + # FIXME : should be removed in 1.1 init = 'nndsvda' msg = "Invalid solver parameter: got 'spam' instead of one of" assert_raise_message(ValueError, msg, NMF(solver=name, init=init).fit, A) @@ -179,7 +179,7 @@ def test_n_components_greater_n_features(): # Smoke test for the case of more components than features. 
rng = np.random.mtrand.RandomState(42) A = np.abs(rng.randn(30, 10)) - # FIXME : should be removed in 0.26 + # FIXME : should be removed in 1.1 init = 'random' NMF(n_components=15, random_state=0, tol=1e-2, init=init).fit(A) @@ -441,7 +441,7 @@ def test_nmf_regularization(): rng = np.random.mtrand.RandomState(42) X = np.abs(rng.randn(n_samples, n_features)) - # FIXME : should be removed in 0.26 + # FIXME : should be removed in 1.1 init = 'nndsvda' # L1 regularization should increase the number of zeros l1_ratio = 1. @@ -552,7 +552,7 @@ def test_nmf_dtype_match(dtype_in, dtype_out, solver, regularization): # Check that NMF preserves dtype (float32 and float64) X = np.random.RandomState(0).randn(20, 15).astype(dtype_in, copy=False) np.abs(X, out=X) - # FIXME : should be removed in 0.26 + # FIXME : should be removed in 1.1 init = 'nndsvda' nmf = NMF(solver=solver, regularization=regularization, init=init) @@ -568,7 +568,7 @@ def test_nmf_float32_float64_consistency(solver, regularization): # Check that the result of NMF is the same between float32 and float64 X = np.random.RandomState(0).randn(50, 7) np.abs(X, out=X) - # FIXME : should be removed in 0.26 + # FIXME : should be removed in 1.1 init = 'nndsvda' nmf32 = NMF(solver=solver, regularization=regularization, random_state=0, init=init) @@ -595,13 +595,13 @@ def test_nmf_custom_init_dtype_error(): non_negative_factorization(X, H=H, update_H=False) -# FIXME : should be removed in 0.26 +# FIXME : should be removed in 1.1 def test_init_default_deprecation(): # Test FutureWarning on init default - msg = ("The 'init' value, when 'init=None' and " - "n_components is less than n_samples and " - "n_features, will be changed from 'nndsvd' to " - "'nndsvda' in 0.26.") + msg = (r"The 'init' value, when 'init=None' and " + r"n_components is less than n_samples and " + r"n_features, will be changed from 'nndsvd' to " + r"'nndsvda' in 1.1 \(renaming of 0.26\).") rng = np.random.mtrand.RandomState(42) A = np.abs(rng.randn(6, 5)) with pytest.warns(FutureWarning, match=msg): diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 81fc319fdfadb..ff1e781f7e166 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -991,8 +991,8 @@ class RandomForestClassifier(ForestClassifier): ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it - will be removed in 0.25. Use ``min_impurity_decrease`` instead. - + will be removed in 1.0 (renaming of 0.25). + Use ``min_impurity_decrease`` instead. bootstrap : bool, default=True Whether bootstrap samples are used when building trees. If False, the @@ -1314,7 +1314,8 @@ class RandomForestRegressor(ForestRegressor): ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it - will be removed in 0.25. Use ``min_impurity_decrease`` instead. + will be removed in 1.0 (renaming of 0.25). + Use ``min_impurity_decrease`` instead. bootstrap : bool, default=True Whether bootstrap samples are used when building trees. If False, the @@ -1596,7 +1597,8 @@ class ExtraTreesClassifier(ForestClassifier): ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it - will be removed in 0.25. Use ``min_impurity_decrease`` instead. 
+ will be removed in 1.0 (renaming of 0.25). + Use ``min_impurity_decrease`` instead. bootstrap : bool, default=False Whether bootstrap samples are used when building trees. If False, the @@ -1914,7 +1916,8 @@ class ExtraTreesRegressor(ForestRegressor): ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it - will be removed in 0.25. Use ``min_impurity_decrease`` instead. + will be removed in 1.0 (renaming of 0.25). + Use ``min_impurity_decrease`` instead. bootstrap : bool, default=False Whether bootstrap samples are used when building trees. If False, the @@ -2173,7 +2176,8 @@ class RandomTreesEmbedding(BaseForest): ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it - will be removed in 0.25. Use ``min_impurity_decrease`` instead. + will be removed in 1.0 (renaming of 0.25). + Use ``min_impurity_decrease`` instead. sparse_output : bool, default=True Whether or not to return a sparse CSR matrix, as default behavior, diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index a25b716edc22b..15f5404f4701c 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -398,7 +398,7 @@ def fit(self, X, y, sample_weight=None, monitor=None): self : object """ if self.criterion == 'mae': - # TODO: This should raise an error from 0.26 + # TODO: This should raise an error from 1.1 self._warn_mae_for_criterion() # if not warmstart - clear the estimator state @@ -812,8 +812,9 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): .. versionadded:: 0.18 .. deprecated:: 0.24 `criterion='mae'` is deprecated and will be removed in version - 0.26. Use `criterion='friedman_mse'` or `'mse'` instead, as trees - should use a least-square criterion in Gradient Boosting. + 1.1 (renaming of 0.26). Use `criterion='friedman_mse'` or `'mse'` + instead, as trees should use a least-square criterion in + Gradient Boosting. min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: @@ -878,7 +879,8 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it - will be removed in 0.25. Use ``min_impurity_decrease`` instead. + will be removed in 1.0 (renaming of 0.25). + Use ``min_impurity_decrease`` instead. init : estimator or 'zero', default=None An estimator object that is used to compute the initial predictions. @@ -1112,9 +1114,9 @@ def _validate_y(self, y, sample_weight): return y def _warn_mae_for_criterion(self): - # TODO: This should raise an error from 0.26 + # TODO: This should raise an error from 1.1 warnings.warn("criterion='mae' was deprecated in version 0.24 and " - "will be removed in version 0.26. Use " + "will be removed in version 1.1 (renaming of 0.26). Use " "criterion='friedman_mse' or 'mse' instead, as trees " "should use a least-square criterion in Gradient " "Boosting.", FutureWarning) @@ -1339,8 +1341,8 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): .. versionadded:: 0.18 .. deprecated:: 0.24 `criterion='mae'` is deprecated and will be removed in version - 0.26. The correct way of minimizing the absolute error is to use - `loss='lad'` instead. 
+ 1.1 (renaming of 0.26). The correct way of minimizing the absolute + error is to use `loss='lad'` instead. min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: @@ -1405,7 +1407,8 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): ``min_impurity_split`` has been deprecated in favor of ``min_impurity_decrease`` in 0.19. The default value of ``min_impurity_split`` has changed from 1e-7 to 0 in 0.23 and it - will be removed in 0.25. Use ``min_impurity_decrease`` instead. + will be removed in 1.0 (renaming of 0.25). + Use ``min_impurity_decrease`` instead. init : estimator or 'zero', default=None An estimator object that is used to compute the initial predictions. @@ -1535,7 +1538,7 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): .. deprecated:: 0.24 Attribute ``n_classes_`` was deprecated in version 0.24 and - will be removed in 0.26. + will be removed in 1.1 (renaming of 0.26). n_estimators_ : int The number of estimators as selected by early stopping (if @@ -1623,11 +1626,11 @@ def _validate_y(self, y, sample_weight=None): return y def _warn_mae_for_criterion(self): - # TODO: This should raise an error from 0.26 + # TODO: This should raise an error from 1.1 warnings.warn("criterion='mae' was deprecated in version 0.24 and " - "will be removed in version 0.26. The correct way of " - "minimizing the absolute error is to use loss='lad' " - "instead.", FutureWarning) + "will be removed in version 1.1 (renaming of 0.26). The " + "correct way of minimizing the absolute error is to use " + " loss='lad' instead.", FutureWarning) def predict(self, X): """Predict regression target for X. @@ -1692,10 +1695,11 @@ def apply(self, X): leaves = leaves.reshape(X.shape[0], self.estimators_.shape[0]) return leaves - # FIXME: to be removed in 0.26 + # FIXME: to be removed in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute n_classes_ was deprecated " # type: ignore - "in version 0.24 and will be removed in 0.26.") + "in version 0.24 and will be removed in 1.1 " + "(renaming of 0.26).") @property def n_classes_(self): try: diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 812ff16933758..256b79db4865c 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -1308,7 +1308,7 @@ def test_gbr_degenerate_feature_importances(): np.zeros(10, dtype=np.float64)) -# TODO: Remove in 0.26 when `n_classes_` is deprecated +# TODO: Remove in 1.1 when `n_classes_` is deprecated def test_gbr_deprecated_attr(): # check that accessing n_classes_ in GradientBoostingRegressor raises # a deprecation warning @@ -1320,7 +1320,7 @@ def test_gbr_deprecated_attr(): gbr.n_classes_ -# TODO: Remove in 0.26 when `n_classes_` is deprecated +# TODO: Remove in 1.1 when `n_classes_` is deprecated @pytest.mark.filterwarnings("ignore:Attribute n_classes_ was deprecated") def test_attr_error_raised_if_not_fitted(): # check that accessing n_classes_ in not fitted GradientBoostingRegressor @@ -1335,7 +1335,7 @@ def test_attr_error_raised_if_not_fitted(): gbr.n_classes_ -# TODO: Update in 0.26 to check for the error raised +# TODO: Update in 1.1 to check for the error raised @pytest.mark.parametrize('estimator', [ GradientBoostingClassifier(criterion='mae'), GradientBoostingRegressor(criterion='mae') @@ -1344,6 +1344,6 @@ def test_criterion_mae_deprecation(estimator): # checks whether a 
deprecation warning is issued when criterion='mae' # is used. msg = ("criterion='mae' was deprecated in version 0.24 and " - "will be removed in version 0.26.") + "will be removed in version 1.1") with pytest.warns(FutureWarning, match=msg): estimator.fit(X, y) diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index 3a3188be35468..2ab7545705115 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -41,7 +41,7 @@ class NotFittedError(ValueError, AttributeError): @deprecated("ChangedBehaviorWarning is deprecated in 0.24 and will be removed " - "in 0.26") + "in 1.1") class ChangedBehaviorWarning(UserWarning): """Warning class used to notify the user of any change in the behavior. @@ -114,7 +114,7 @@ class FitFailedWarning(RuntimeWarning): @deprecated("NonBLASDotWarning is deprecated in 0.24 and will be removed in " - "0.26") + "1.1") class NonBLASDotWarning(EfficiencyWarning): """Warning used when the dot operation does not use BLAS. diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index 60646c5126ded..1e9c0c9718a51 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -307,7 +307,7 @@ def partial_dependence(estimator, X, features, *, response_method='auto', .. versionadded:: 0.24 .. deprecated:: 0.24 - `kind='legacy'` is deprecated and will be removed in version 0.26. + `kind='legacy'` is deprecated and will be removed in version 1.1. `kind='average'` will be the new default. It is intended to migrate from the ndarray output to :class:`~sklearn.utils.Bunch` output. @@ -504,12 +504,12 @@ def partial_dependence(estimator, X, features, *, response_method='auto', if kind == 'legacy': warnings.warn( "A Bunch will be returned in place of 'predictions' from version" - " 0.26 with partial dependence results accessible via the " - "'average' key. In the meantime, pass kind='average' to get the " - "future behaviour.", + " 1.1 (renaming of 0.26) with partial dependence results " + "accessible via the 'average' key. In the meantime, pass " + "kind='average' to get the future behaviour.", FutureWarning ) - # TODO 0.26: Remove kind == 'legacy' section + # TODO 1.1: Remove kind == 'legacy' section return averaged_predictions, values elif kind == 'average': return Bunch(average=averaged_predictions, values=values) diff --git a/sklearn/inspection/_plot/partial_dependence.py b/sklearn/inspection/_plot/partial_dependence.py index 4790c2bb3842a..d6604d7ae675f 100644 --- a/sklearn/inspection/_plot/partial_dependence.py +++ b/sklearn/inspection/_plot/partial_dependence.py @@ -817,7 +817,7 @@ def _plot_two_way_partial_dependence( ax.set_xlabel(self.feature_names[feature_idx[0]]) ax.set_ylabel(self.feature_names[feature_idx[1]]) - @_deprecate_positional_args(version="0.26") + @_deprecate_positional_args(version="1.1") def plot(self, *, ax=None, n_cols=3, line_kw=None, contour_kw=None): """Plot partial dependence plots. 
diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 9fc1b2683545b..997c61c0e5f8b 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -100,7 +100,7 @@ def test_output_shape(Estimator, method, data, grid_resolution, est, X=X, features=features, method=method, kind=kind, grid_resolution=grid_resolution ) - # FIXME: Remove 'legacy' support in 0.26 + # FIXME: Remove 'legacy' support in 1.1 pdp, axes = result if kind == 'legacy' else (result, result["values"]) expected_pdp_shape = (n_targets, @@ -711,7 +711,7 @@ def test_warning_for_kind_legacy(): est.fit(X, y) err_msg = ("A Bunch will be returned in place of 'predictions' from " - "version 0.26") + "version 1.1") with pytest.warns(FutureWarning, match=err_msg): partial_dependence(est, X=X, features=[1, 2]) diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index 119b27e9084ae..8a27ea572b344 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -136,10 +136,10 @@ def _get_kernel(self, X, Y=None): def _more_tags(self): return {'pairwise': self.kernel == 'precomputed'} - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): return self.kernel == "precomputed" diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index e99116ca4f3e3..7b019e5545534 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -291,7 +291,8 @@ def _make_validation_score_cb(self, validation_mask, X, y, sample_weight, # mypy error: Decorated property not supported @deprecated("Attribute standard_coef_ was deprecated " # type: ignore - "in version 0.23 and will be removed in 0.25.") + "in version 0.23 and will be removed in 1.0 " + "(renaming of 0.25).") @property def standard_coef_(self): return self._standard_coef @@ -299,7 +300,7 @@ def standard_coef_(self): # mypy error: Decorated property not supported @deprecated( # type: ignore "Attribute standard_intercept_ was deprecated " - "in version 0.23 and will be removed in 0.25." + "in version 0.23 and will be removed in 1.0 (renaming of 0.25)." ) @property def standard_intercept_(self): @@ -307,14 +308,16 @@ def standard_intercept_(self): # mypy error: Decorated property not supported @deprecated("Attribute average_coef_ was deprecated " # type: ignore - "in version 0.23 and will be removed in 0.25.") + "in version 0.23 and will be removed in 1.0 " + "(renaming of 0.25).") @property def average_coef_(self): return self._average_coef # mypy error: Decorated property not supported @deprecated("Attribute average_intercept_ was deprecated " # type: ignore - "in version 0.23 and will be removed in 0.25.") + "in version 0.23 and will be removed in 1.0 " + "(renaming of 0.25).") @property def average_intercept_(self): return self._average_intercept @@ -1531,14 +1534,14 @@ class SGDRegressor(BaseSGDRegressor): .. deprecated:: 0.23 Attribute ``average_coef_`` was deprecated - in version 0.23 and will be removed in 0.25. + in version 0.23 and will be removed in 1.0 (renaming of 0.25). average_intercept_ : ndarray of shape (1,) The averaged intercept term. Only available if ``average=True``. .. 
deprecated:: 0.23 Attribute ``average_intercept_`` was deprecated - in version 0.23 and will be removed in 0.25. + in version 0.23 and will be removed in 1.0 (renaming of 0.25). n_iter_ : int The actual number of iterations before reaching the stopping criterion. diff --git a/sklearn/linear_model/tests/test_passive_aggressive.py b/sklearn/linear_model/tests/test_passive_aggressive.py index 27381059eaf33..f67a768844213 100644 --- a/sklearn/linear_model/tests/test_passive_aggressive.py +++ b/sklearn/linear_model/tests/test_passive_aggressive.py @@ -267,7 +267,7 @@ def test_regressor_undefined_methods(): for meth in ("transform",): assert_raises(AttributeError, lambda x: getattr(reg, x), meth) -# TODO: remove in 0.25 +# TODO: remove in 1.0 @pytest.mark.parametrize('klass', [PassiveAggressiveClassifier, PassiveAggressiveRegressor]) def test_passive_aggressive_deprecated_attr(klass): diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 8e8d3f94b6c99..d5063981ff9aa 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -269,7 +269,7 @@ def test_plain_has_no_average_attr(klass): assert not hasattr(clf, '_standard_coef') -# TODO: remove in 0.25 +# TODO: remove in 1.0 @pytest.mark.parametrize('klass', [SGDClassifier, SGDRegressor]) def test_sgd_deprecated_attr(klass): est = klass(average=True, eta0=.01) diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index ac749d737c762..6a144e3033e8e 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -389,10 +389,10 @@ def __init__(self, n_components=2, *, metric=True, n_init=4, def _more_tags(self): return {'pairwise': self.dissimilarity == 'precomputed'} - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): return self.dissimilarity == "precomputed" diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index b77da83ecad30..70f817904ac65 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -472,10 +472,10 @@ def _more_tags(self): return {'pairwise': self.affinity in ["precomputed", "precomputed_nearest_neighbors"]} - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): return self.affinity in ["precomputed", diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index d07a9faf62d35..b6072a6e198c4 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -611,8 +611,8 @@ class TSNE(BaseEstimator): legacy squaring behavior. .. deprecated:: 0.24 Legacy squaring behavior was deprecated in 0.24. The ``'legacy'`` - value will be removed in 0.26, at which point the default value will - change to ``True``. + value will be removed in 1.1 (renaming of 0.26), at which point the + default value will change to ``True``. 
Attributes ---------- @@ -675,7 +675,7 @@ def __init__(self, n_components=2, *, perplexity=30.0, self.method = method self.angle = angle self.n_jobs = n_jobs - # TODO Revisit deprecation of square_distances for 0.26-0.28 (#12401) + # TODO Revisit deprecation of square_distances for 1.1-1.3 (#12401) self.square_distances = square_distances def _fit(self, X, skip_num_points=0): @@ -688,14 +688,16 @@ def _fit(self, X, skip_num_points=0): if self.square_distances not in [True, 'legacy']: raise ValueError("'square_distances' must be True or 'legacy'.") if self.metric != "euclidean" and self.square_distances is not True: - warnings.warn(("'square_distances' has been introduced in 0.24" - "to help phase out legacy squaring behavior. The " - "'legacy' setting will be removed in 0.26, and the " - "default setting will be changed to True. In 0.28, " - "'square_distances' will be removed altogether," - "and distances will be squared by default. Set " - "'square_distances'=True to silence this warning."), - FutureWarning) + warnings.warn( + "'square_distances' has been introduced in 0.24 to help phase " + "out legacy squaring behavior. The 'legacy' setting will be " + "removed in 1.1 (renaming of 0.26), and the default setting " + "will be changed to True. In 1.3, 'square_distances' will be " + "removed altogether, and distances will be squared by " + "default. Set 'square_distances'=True to silence this " + "warning.", + FutureWarning + ) if self.method == 'barnes_hut': X = self._validate_data(X, accept_sparse=['csr'], ensure_min_samples=2, diff --git a/sklearn/manifold/tests/test_mds.py b/sklearn/manifold/tests/test_mds.py index 7ca046385a6ed..6e2016c798772 100644 --- a/sklearn/manifold/tests/test_mds.py +++ b/sklearn/manifold/tests/test_mds.py @@ -65,7 +65,7 @@ def test_MDS(): mds_clf.fit(sim) -# TODO: Remove in 0.26 +# TODO: Remove in 1.1 def test_MDS_pairwise_deprecated(): mds_clf = mds.MDS(metric='precomputed') msg = r"Attribute _pairwise was deprecated in version 0\.24" @@ -73,7 +73,7 @@ def test_MDS_pairwise_deprecated(): mds_clf._pairwise -# TODO: Remove in 0.26 +# TODO: Remove in 1.1 @ignore_warnings(category=FutureWarning) @pytest.mark.parametrize("dissimilarity, expected_pairwise", [ ("precomputed", True), diff --git a/sklearn/manifold/tests/test_spectral_embedding.py b/sklearn/manifold/tests/test_spectral_embedding.py index 02cbd303134e6..8fcf113874927 100644 --- a/sklearn/manifold/tests/test_spectral_embedding.py +++ b/sklearn/manifold/tests/test_spectral_embedding.py @@ -347,7 +347,7 @@ def test_spectral_embedding_first_eigen_vector(): assert np.std(embedding[:, 1]) > 1e-3 -# TODO: Remove in 0.26 +# TODO: Remove in 1.1 @pytest.mark.parametrize("affinity", ["precomputed", "precomputed_nearest_neighbors"]) def test_spectral_embedding_pairwise_deprecated(affinity): diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 97d6ea5ce5933..716c031d4f5bf 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -898,7 +898,7 @@ def test_tsne_with_different_distance_metrics(): @ignore_warnings(category=FutureWarning) def test_tsne_different_square_distances(method, metric, square_distances): # Make sure that TSNE works for different square_distances settings - # FIXME remove test when square_distances=True becomes the default in 0.26 + # FIXME remove test when square_distances=True becomes the default in 1.1 random_state = check_random_state(0) n_components_original = 3 n_components_embedding = 2 diff --git 
a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 5263b593e9594..a3b4accc03655 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1447,18 +1447,24 @@ def _precompute_metric_params(X, Y, metric=None, **kwds): if X is Y: V = np.var(X, axis=0, ddof=1, dtype=dtype) else: - warnings.warn("from version 0.25, pairwise_distances for " - "metric='seuclidean' will require V to be " - "specified if Y is passed.", FutureWarning) + warnings.warn( + "from version 1.0 (renaming of 0.25), pairwise_distances for " + "metric='seuclidean' will require V to be specified if Y is " + "passed.", + FutureWarning + ) V = np.var(np.vstack([X, Y]), axis=0, ddof=1, dtype=dtype) return {'V': V} if metric == "mahalanobis" and 'VI' not in kwds: if X is Y: VI = np.linalg.inv(np.cov(X.T)).T else: - warnings.warn("from version 0.25, pairwise_distances for " - "metric='mahalanobis' will require VI to be " - "specified if Y is passed.", FutureWarning) + warnings.warn( + "from version 1.0 (renaming of 0.25), pairwise_distances for " + "metric='mahalanobis' will require VI to be specified if Y " + "is passed.", + FutureWarning + ) VI = np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T return {'VI': VI} return {} diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 92dea4e791dfe..88c285421fca6 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -1281,7 +1281,7 @@ def test_pairwise_distances_data_derived_params(n_jobs, metric, dist_function, params = {'VI': np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T} expected_dist_explicit_params = cdist(X, Y, metric=metric, **params) - # TODO: Remove warn_checker in 0.25 + # TODO: Remove warn_checker in 1.0 if y_is_x: warn_checker = pytest.warns(None) else: diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 51f43debf78ed..213204b50c2a7 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -314,10 +314,10 @@ def __len__(self): return self.n_iter -# FIXME Remove fit_grid_point in 0.25 +# FIXME Remove fit_grid_point in 1.0 @deprecated( "fit_grid_point is deprecated in version 0.23 " - "and will be removed in version 0.25" + "and will be removed in version 1.0 (renaming of 0.25)" ) def fit_grid_point(X, y, estimator, parameters, train, test, scorer, verbose, error_score=np.nan, **fit_params): @@ -440,10 +440,10 @@ def _more_tags(self): "DataConversionWarning not caught"}, } - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): # allows cross-validation to see 'precomputed' metrics diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index b1194600c530d..af2ca92aee26b 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1266,7 +1266,7 @@ def test_grid_search_correct_score_results(): assert_almost_equal(correct_score, cv_scores[i]) -# FIXME remove test_fit_grid_point as the function will be removed on 0.25 +# FIXME remove test_fit_grid_point as the function will be removed on 1.0 @ignore_warnings(category=FutureWarning) def test_fit_grid_point(): X, y = make_classification(random_state=0) @@ -1297,13 +1297,13 @@ def 
test_fit_grid_point(): # FIXME remove test_fit_grid_point_deprecated as -# fit_grid_point will be removed on 0.25 +# fit_grid_point will be removed on 1.0 def test_fit_grid_point_deprecated(): X, y = make_classification(random_state=0) svc = LinearSVC(random_state=0) scorer = make_scorer(accuracy_score) msg = ("fit_grid_point is deprecated in version 0.23 " - "and will be removed in version 0.25") + "and will be removed in version 1.0") params = {'C': 0.1} train, test = next(StratifiedKFold().split(X, y)) @@ -1963,7 +1963,7 @@ def _more_tags(self): assert pairwise == cv._get_tags()['pairwise'], attr_message -# TODO: Remove in 0.26 +# TODO: Remove in 1.1 @ignore_warnings(category=FutureWarning) def test_search_cv__pairwise_property_delegated_to_base_estimator(): """ diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 4437a7a4cb35c..8405d3b38c452 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -1962,7 +1962,7 @@ def custom_scorer(clf, X, y): assert "test_{}".format(name) in cv_results -# TODO: Remove in 0.26 when the _pairwise attribute is removed +# TODO: Remove in 1.1 when the _pairwise attribute is removed def test_validation_pairwise(): # checks the interactions between the pairwise estimator tag # and the _pairwise attribute @@ -1981,7 +1981,6 @@ def _more_tags(self): return {'pairwise': False} svm = IncorrectTagSVM(kernel='precomputed') - msg = ("_pairwise was deprecated in 0.24 and will be removed in 0.26. " - "Set the estimator tags of your estimator instead") + msg = "_pairwise was deprecated in 0.24 and will be removed in 1.1" with pytest.warns(FutureWarning, match=msg): cross_validate(svm, linear_kernel, y, cv=2) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 182a412f8313f..da29fdd4daf11 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -186,7 +186,7 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin, .. deprecated:: 0.24 This attribute is deprecated in 0.24 and will - be removed in 0.26. If you use this attribute + be removed in 1.1 (renaming of 0.26). If you use this attribute in :class:`~sklearn.feature_selection.RFE` or :class:`~sklearn.feature_selection.SelectFromModel`, you may pass a callable to the `importance_getter` @@ -200,7 +200,7 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin, .. deprecated:: 0.24 This attribute is deprecated in 0.24 and will - be removed in 0.26. If you use this attribute + be removed in 1.1 (renaming of 0.26). If you use this attribute in :class:`~sklearn.feature_selection.RFE` or :class:`~sklearn.feature_selection.SelectFromModel`, you may pass a callable to the `importance_getter` @@ -456,10 +456,10 @@ def multilabel_(self): def n_classes_(self): return len(self.classes_) - # TODO: Remove coef_ attribute in 0.26 + # TODO: Remove coef_ attribute in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute coef_ was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26. " + "version 0.24 and will be removed in 1.1 (renaming of 0.26). 
" "If you observe this warning while using RFE " "or SelectFromModel, use the importance_getter " "parameter instead.") @@ -474,10 +474,10 @@ def coef_(self): return sp.vstack(coefs) return np.vstack(coefs) - # TODO: Remove intercept_ attribute in 0.26 + # TODO: Remove intercept_ attribute in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute intercept_ was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26. " + "version 0.24 and will be removed in 1.1 (renaming of 0.26). " "If you observe this warning while using RFE " "or SelectFromModel, use the importance_getter " "parameter instead.") @@ -489,10 +489,10 @@ def intercept_(self): "Base estimator doesn't have an intercept_ attribute.") return np.array([e.intercept_.ravel() for e in self.estimators_]) - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): """Indicate if wrapped estimator is using a precomputed Gram matrix""" @@ -591,9 +591,9 @@ class OneVsOneClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): .. deprecated:: 0.24 - The _pairwise attribute is deprecated in 0.24. From 0.26 and - onward, `pairwise_indices_` will use the pairwise estimator tag - instead. + The _pairwise attribute is deprecated in 0.24. From 1.1 + (renaming of 0.25) and onward, `pairwise_indices_` will use the + pairwise estimator tag instead. Examples -------- @@ -769,10 +769,10 @@ def decision_function(self, X): def n_classes_(self): return len(self.classes_) - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): """Indicate if wrapped estimator is using a precomputed Gram matrix""" diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 93a2da00549de..bcc7a9d24ce1c 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -648,7 +648,7 @@ def _init_counters(self, n_effective_classes, n_features): # mypy error: Decorated property not supported @deprecated("Attribute coef_ was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def coef_(self): return (self.feature_log_prob_[1:] @@ -656,7 +656,7 @@ def coef_(self): # mypy error: Decorated property not supported @deprecated("Attribute intercept_ was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def intercept_(self): return (self.class_log_prior_[1:] @@ -708,7 +708,8 @@ class MultinomialNB(_BaseDiscreteNB): as a linear model. .. deprecated:: 0.24 - ``coef_`` is deprecated in 0.24 and will be removed in 0.26. + ``coef_`` is deprecated in 0.24 and will be removed in 1.1 + (renaming of 0.26). feature_count_ : ndarray of shape (n_classes, n_features) Number of samples encountered for each (class, feature) @@ -724,7 +725,8 @@ class MultinomialNB(_BaseDiscreteNB): as a linear model. .. deprecated:: 0.24 - ``intercept_`` is deprecated in 0.24 and will be removed in 0.26. 
+ ``intercept_`` is deprecated in 0.24 and will be removed in 1.1 + (renaming of 0.26). n_features_ : int Number of features of each sample. @@ -830,7 +832,8 @@ class ComplementNB(_BaseDiscreteNB): as a linear model. .. deprecated:: 0.24 - ``coef_`` is deprecated in 0.24 and will be removed in 0.26. + ``coef_`` is deprecated in 0.24 and will be removed in 1.1 + (renaming of 0.26). feature_all_ : ndarray of shape (n_features,) Number of samples encountered for each feature during fitting. This @@ -848,7 +851,8 @@ class ComplementNB(_BaseDiscreteNB): as a linear model. .. deprecated:: 0.24 - ``coef_`` is deprecated in 0.24 and will be removed in 0.26. + ``coef_`` is deprecated in 0.24 and will be removed in 1.1 + (renaming of 0.26). n_features_ : int Number of features of each sample. diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 1e666043347cf..54cf473b2ab75 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -528,10 +528,10 @@ def _more_tags(self): # For cross-validation routines to split data correctly return {'pairwise': self.metric == 'precomputed'} - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): # For cross-validation routines to split data correctly diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 49bc199a86ec6..9bf28f037294a 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -160,10 +160,10 @@ def _more_tags(self): # For cross-validation routines to split data correctly return {'pairwise': self.metric == 'precomputed'} - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): # For cross-validation routines to split data correctly diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index ab0b793176b04..513df1edb1bec 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1733,7 +1733,7 @@ def test_auto_algorithm(X, metric, metric_params, expected_algo): assert model._fit_method == expected_algo -# TODO: Remove in 0.26 +# TODO: Remove in 1.1 @pytest.mark.parametrize("NearestNeighbors", [neighbors.KNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.NearestNeighbors]) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 6df8cddc476c4..00aad1a8e5315 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -629,10 +629,10 @@ def _more_tags(self): # check if first estimator expects pairwise input return {'pairwise': _safe_tags(self.steps[0][1], "pairwise")} - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): # check if first estimator expects pairwise input diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 3f6cbb7546439..478d41ecc768a 100644 --- 
a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -2312,10 +2312,10 @@ def transform(self, K, copy=True): def _more_tags(self): return {'pairwise': True} - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1.") @property def _pairwise(self): return True diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 4fef462b9d849..b0fbee8db9455 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2246,7 +2246,7 @@ def test_cv_pipeline_precomputed(): # did the pipeline set the pairwise attribute? assert pipeline._get_tags()['pairwise'] - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 msg = r"Attribute _pairwise was deprecated in version 0\.24" with pytest.warns(FutureWarning, match=msg): assert pipeline._pairwise @@ -2258,7 +2258,7 @@ def test_cv_pipeline_precomputed(): assert_array_almost_equal(y_true, y_pred) -# TODO: Remove in 0.26 +# TODO: Remove in 1.1 def test_pairwise_deprecated(): kcent = KernelCenterer() msg = r"Attribute _pairwise was deprecated in version 0\.24" diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index c5196a5801607..fa09badf64691 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -107,10 +107,10 @@ def _more_tags(self): # Used by cross_val_score. return {'pairwise': self.kernel == 'precomputed'} - # TODO: Remove in 0.26 + # TODO: Remove in 1.1 # mypy error: Decorated property not supported @deprecated("Attribute _pairwise was deprecated in " # type: ignore - "version 0.24 and will be removed in 0.26.") + "version 0.24 and will be removed in 1.1 (renaming of 0.26).") @property def _pairwise(self): # Used by cross_val_score. 
diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 295ff577b642e..908ece408bb1d 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1048,7 +1048,7 @@ def __init__(self, *, kernel='rbf', degree=3, gamma='scale', # mypy error: Decorated property not supported @deprecated( # type: ignore "The probA_ attribute is deprecated in version 0.23 and will be " - "removed in version 0.25.") + "removed in version 1.0 (renaming of 0.25).") @property def probA_(self): return self._probA @@ -1056,7 +1056,7 @@ def probA_(self): # mypy error: Decorated property not supported @deprecated( # type: ignore "The probB_ attribute is deprecated in version 0.23 and will be " - "removed in version 0.25.") + "removed in version 1.0 (renaming of 0.25).") @property def probB_(self): return self._probB @@ -1434,7 +1434,7 @@ def predict(self, X): # mypy error: Decorated property not supported @deprecated( # type: ignore "The probA_ attribute is deprecated in version 0.23 and will be " - "removed in version 0.25.") + "removed in version 1.0.") @property def probA_(self): return self._probA @@ -1442,7 +1442,7 @@ def probA_(self): # mypy error: Decorated property not supported @deprecated( # type: ignore "The probB_ attribute is deprecated in version 0.23 and will be " - "removed in version 0.25.") + "removed in version 1.0.") @property def probB_(self): return self._probB diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index ad8402e5bbd18..4d57f4f7da450 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -1235,7 +1235,7 @@ def test_n_support_oneclass_svr(): assert reg.n_support_ == 4 -# TODO: Remove in 0.25 when probA_ and probB_ are deprecated +# TODO: Remove in 1.0 when probA_ and probB_ are deprecated @pytest.mark.parametrize("SVMClass, data", [ (svm.OneClassSVM, (X, )), (svm.SVR, (X, Y)) @@ -1245,7 +1245,7 @@ def test_svm_probA_proB_deprecated(SVMClass, data, deprecated_prob): clf = SVMClass().fit(*data) msg = ("The {} attribute is deprecated in version 0.23 and will be " - "removed in version 0.25.").format(deprecated_prob) + "removed in version 1.0").format(deprecated_prob) with pytest.warns(FutureWarning, match=msg): getattr(clf, deprecated_prob) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 0c07db459d128..7dd8d02f3c0bf 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -540,7 +540,7 @@ def test_repr_html_wraps(): assert "' - f'
') + container_id = "sk-" + str(uuid.uuid4()) + style_template = Template(_STYLE) + style_with_id = style_template.substitute(id=container_id) + out.write(f'' + f'
' + '
') _write_estimator_html(out, estimator, estimator.__class__.__name__, str(estimator), first_call=True) out.write('
') From 4b72b579606ef50524e9c24b1304d4dfc11defbc Mon Sep 17 00:00:00 2001 From: Benjamin Pedigo Date: Wed, 17 Feb 2021 03:49:46 -0500 Subject: [PATCH 174/478] DOC Fix typo in LDA User Guide (#19468) --- doc/modules/lda_qda.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/lda_qda.rst b/doc/modules/lda_qda.rst index e8f25d2c66930..962d65705f75a 100644 --- a/doc/modules/lda_qda.rst +++ b/doc/modules/lda_qda.rst @@ -136,7 +136,7 @@ Mathematical formulation of LDA dimensionality reduction First note that the K means :math:`\mu_k` are vectors in :math:`\mathcal{R}^d`, and they lie in an affine subspace :math:`H` of -dimension at least :math:`K - 1` (2 points lie on a line, 3 points lie on a +dimension at most :math:`K - 1` (2 points lie on a line, 3 points lie on a plane, etc). As mentioned above, we can interpret LDA as assigning :math:`x` to the class From 80d674e3cd2288f8c8c331d44bffa18006db7f46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Carlos=20Alfaro=20Jim=C3=A9nez?= Date: Wed, 17 Feb 2021 11:10:06 +0100 Subject: [PATCH 175/478] DOC Fix closing backtick in IterativeImputer (#19476) Co-authored-by: Nicolas Hug --- sklearn/impute/_iterative.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 81ae946459a3a..f5688fa96d238 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -68,8 +68,8 @@ class IterativeImputer(_BaseImputer): Maximum number of imputation rounds to perform before returning the imputations computed during the final round. A round is a single imputation of each feature with missing values. The stopping criterion - is met once `max(abs(X_t - X_{t-1}))/max(abs(X[known_vals]))` < tol, - where `X_t` is `X` at iteration `t. Note that early stopping is only + is met once `max(abs(X_t - X_{t-1}))/max(abs(X[known_vals])) < tol`, + where `X_t` is `X` at iteration `t`. Note that early stopping is only applied if ``sample_posterior=False``. tol : float, default=1e-3 From 4d60a815d84531ba91bf097e9c814460113a7b72 Mon Sep 17 00:00:00 2001 From: Nodar Okroshiashvili Date: Wed, 17 Feb 2021 17:41:20 +0400 Subject: [PATCH 176/478] DOC Remove extra word from LOF docstring (#19477) --- sklearn/neighbors/_lof.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py index 8ba39b315f891..29bf1a5e73f91 100644 --- a/sklearn/neighbors/_lof.py +++ b/sklearn/neighbors/_lof.py @@ -114,7 +114,7 @@ class LocalOutlierFactor(KNeighborsMixin, By default, LocalOutlierFactor is only meant to be used for outlier detection (novelty=False). Set novelty to True if you want to use LocalOutlierFactor for novelty detection. In this case be aware that - that you should only use predict, decision_function and score_samples + you should only use predict, decision_function and score_samples on new unseen data and not on the training set. .. 
versionadded:: 0.20 From 66f67dd9362983a1d446ccf752b54c72c30fc090 Mon Sep 17 00:00:00 2001 From: Nigel Bosch Date: Thu, 18 Feb 2021 02:06:52 -0600 Subject: [PATCH 177/478] DOC Fix broken link to wikipedia in semi-supervised UG (#19481) --- doc/modules/semi_supervised.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/semi_supervised.rst b/doc/modules/semi_supervised.rst index 484484fca5a78..7c1ea8f296a49 100644 --- a/doc/modules/semi_supervised.rst +++ b/doc/modules/semi_supervised.rst @@ -27,7 +27,7 @@ labeled points and a large amount of unlabeled points. Semi-supervised algorithms need to make assumptions about the distribution of the dataset in order to achieve performance gains. See `here - `_ + `_ for more details. .. _self_training: From cc13313b26852169dff3fdf80c40008c233ce40f Mon Sep 17 00:00:00 2001 From: Zito Relova Date: Thu, 18 Feb 2021 07:43:54 -0800 Subject: [PATCH 178/478] TST remove assert_warns in feature_extraction/tests/ module (#19439) --- sklearn/feature_extraction/tests/test_text.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index ebe13cc0c240a..767b04ddb5d95 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -29,8 +29,7 @@ from numpy.testing import assert_array_equal from sklearn.utils import IS_PYPY from sklearn.utils._testing import (assert_almost_equal, - assert_warns_message, assert_raise_message, - assert_no_warnings, + assert_raise_message, fails_if_pypy, assert_allclose_dense_sparse, skip_if_32bit) @@ -386,8 +385,8 @@ def test_countvectorizer_uppercase_in_vocab(): " be matched with any documents") vectorizer = CountVectorizer(lowercase=True, vocabulary=vocabulary) - assert_warns_message(UserWarning, message, - vectorizer.fit_transform, vocabulary) + with pytest.warns(UserWarning, match=message): + vectorizer.fit_transform(vocabulary) def test_tf_idf_smoothing(): @@ -429,8 +428,8 @@ def test_tfidf_no_smoothing(): tr = TfidfTransformer(smooth_idf=False, norm='l2') in_warning_message = 'divide by zero' - assert_warns_message(RuntimeWarning, in_warning_message, - tr.fit_transform, X).toarray() + with pytest.warns(RuntimeWarning, match=in_warning_message): + tr.fit_transform(X).toarray() def test_sublinear_tf(): @@ -1213,27 +1212,29 @@ def _check_stop_words_consistency(estimator): @fails_if_pypy def test_vectorizer_stop_words_inconsistent(): - lstr = "['and', 'll', 've']" + lstr = r"\['and', 'll', 've'\]" message = ('Your stop_words may be inconsistent with your ' 'preprocessing. Tokenizing the stop words generated ' 'tokens %s not in stop_words.' 
% lstr) for vec in [CountVectorizer(), TfidfVectorizer(), HashingVectorizer()]: vec.set_params(stop_words=["you've", "you", "you'll", 'AND']) - assert_warns_message(UserWarning, message, vec.fit_transform, - ['hello world']) + with pytest.warns(UserWarning, match=message): + vec.fit_transform(['hello world']) # reset stop word validation del vec._stop_words_id assert _check_stop_words_consistency(vec) is False # Only one warning per stop list - assert_no_warnings(vec.fit_transform, ['hello world']) + with pytest.warns(None) as record: + vec.fit_transform(['hello world']) + assert not len(record) assert _check_stop_words_consistency(vec) is None # Test caching of inconsistency assessment vec.set_params(stop_words=["you've", "you", "you'll", 'blah', 'AND']) - assert_warns_message(UserWarning, message, vec.fit_transform, - ['hello world']) + with pytest.warns(UserWarning, match=message): + vec.fit_transform(['hello world']) @skip_if_32bit From 321799971be8ede64d4603c93687becd5701d30f Mon Sep 17 00:00:00 2001 From: Zito Relova Date: Thu, 18 Feb 2021 07:50:27 -0800 Subject: [PATCH 179/478] TST Replace the use of assert_warns messages in cluster/tests/ module (#19437) --- .../tests/test_affinity_propagation.py | 33 +++++++++++-------- sklearn/cluster/tests/test_birch.py | 4 +-- .../tests/test_feature_agglomeration.py | 10 ++++-- sklearn/cluster/tests/test_hierarchical.py | 15 +++++---- sklearn/cluster/tests/test_spectral.py | 4 +-- 5 files changed, 38 insertions(+), 28 deletions(-) diff --git a/sklearn/cluster/tests/test_affinity_propagation.py b/sklearn/cluster/tests/test_affinity_propagation.py index 446b0f43c74d9..51b4fd425349e 100644 --- a/sklearn/cluster/tests/test_affinity_propagation.py +++ b/sklearn/cluster/tests/test_affinity_propagation.py @@ -8,9 +8,7 @@ from scipy.sparse import csr_matrix from sklearn.exceptions import ConvergenceWarning -from sklearn.utils._testing import ( - assert_array_equal, assert_warns, - assert_warns_message, assert_no_warnings) +from sklearn.utils._testing import assert_array_equal from sklearn.cluster import AffinityPropagation from sklearn.cluster._affinity_propagation import ( @@ -72,6 +70,7 @@ def test_affinity_propagation(): with pytest.raises(TypeError): af_2.fit(csr_matrix((3, 3))) + def test_affinity_propagation_predict(): # Test AffinityPropagation.predict af = AffinityPropagation(affinity="euclidean", random_state=63) @@ -104,7 +103,8 @@ def test_affinity_propagation_fit_non_convergence(): # Force non-convergence by allowing only a single iteration af = AffinityPropagation(preference=-10, max_iter=1, random_state=82) - assert_warns(ConvergenceWarning, af.fit, X) + with pytest.warns(ConvergenceWarning): + af.fit(X) assert_array_equal(np.empty((0, 2)), af.cluster_centers_) assert_array_equal(np.array([-1, -1, -1]), af.labels_) @@ -114,24 +114,28 @@ def test_affinity_propagation_equal_mutual_similarities(): S = -euclidean_distances(X, squared=True) # setting preference > similarity - cluster_center_indices, labels = assert_warns_message( - UserWarning, "mutually equal", affinity_propagation, S, preference=0) + with pytest.warns(UserWarning, match="mutually equal"): + cluster_center_indices, labels = affinity_propagation( + S, preference=0) # expect every sample to become an exemplar assert_array_equal([0, 1], cluster_center_indices) assert_array_equal([0, 1], labels) # setting preference < similarity - cluster_center_indices, labels = assert_warns_message( - UserWarning, "mutually equal", affinity_propagation, S, preference=-10) + with 
pytest.warns(UserWarning, match="mutually equal"): + cluster_center_indices, labels = affinity_propagation( + S, preference=-10) # expect one cluster, with arbitrary (first) sample as exemplar assert_array_equal([0], cluster_center_indices) assert_array_equal([0, 0], labels) # setting different preferences - cluster_center_indices, labels = assert_no_warnings( - affinity_propagation, S, preference=[-20, -10], random_state=37) + with pytest.warns(None) as record: + cluster_center_indices, labels = affinity_propagation( + S, preference=[-20, -10], random_state=37) + assert not len(record) # expect one cluster, with highest-preference sample as exemplar assert_array_equal([1], cluster_center_indices) @@ -144,14 +148,15 @@ def test_affinity_propagation_predict_non_convergence(): X = np.array([[0, 0], [1, 1], [-2, -2]]) # Force non-convergence by allowing only a single iteration - af = assert_warns(ConvergenceWarning, - AffinityPropagation(preference=-10, - max_iter=1, random_state=75).fit, X) + with pytest.warns(ConvergenceWarning): + af = AffinityPropagation(preference=-10, + max_iter=1, random_state=75).fit(X) # At prediction time, consider new samples as noise since there are no # clusters to_predict = np.array([[2, 2], [3, 3], [4, 4]]) - y = assert_warns(ConvergenceWarning, af.predict, to_predict) + with pytest.warns(ConvergenceWarning): + y = af.predict(to_predict) assert_array_equal(np.array([-1, -1, -1]), y) diff --git a/sklearn/cluster/tests/test_birch.py b/sklearn/cluster/tests/test_birch.py index 37c9a083842b1..e199c897f97ef 100644 --- a/sklearn/cluster/tests/test_birch.py +++ b/sklearn/cluster/tests/test_birch.py @@ -17,7 +17,6 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_warns def test_n_samples_leaves_roots(): @@ -92,7 +91,8 @@ def test_n_clusters(): # Test that a small number of clusters raises a warning. brc4 = Birch(threshold=10000.) 
- assert_warns(ConvergenceWarning, brc4.fit, X) + with pytest.warns(ConvergenceWarning): + brc4.fit(X) def test_sparse_X(): diff --git a/sklearn/cluster/tests/test_feature_agglomeration.py b/sklearn/cluster/tests/test_feature_agglomeration.py index 8afb5854252f3..ebc2fe49d7a7f 100644 --- a/sklearn/cluster/tests/test_feature_agglomeration.py +++ b/sklearn/cluster/tests/test_feature_agglomeration.py @@ -3,8 +3,8 @@ """ # Authors: Sergul Aydore 2017 import numpy as np +import pytest from sklearn.cluster import FeatureAgglomeration -from sklearn.utils._testing import assert_no_warnings from sklearn.utils._testing import assert_array_almost_equal @@ -16,8 +16,12 @@ def test_feature_agglomeration(): pooling_func=np.mean) agglo_median = FeatureAgglomeration(n_clusters=n_clusters, pooling_func=np.median) - assert_no_warnings(agglo_mean.fit, X) - assert_no_warnings(agglo_median.fit, X) + with pytest.warns(None) as record: + agglo_mean.fit(X) + assert not len(record) + with pytest.warns(None) as record: + agglo_median.fit(X) + assert not len(record) assert np.size(np.unique(agglo_mean.labels_)) == n_clusters assert np.size(np.unique(agglo_median.labels_)) == n_clusters assert np.size(agglo_mean.labels_) == X.shape[1] diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 26f30dcd87847..b5a2d9bbf2701 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -33,7 +33,6 @@ from sklearn.cluster._hierarchical_fast import average_merge, max_merge from sklearn.utils._fast_dict import IntFloatDict from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_warns from sklearn.datasets import make_moons, make_circles @@ -94,17 +93,18 @@ def test_unstructured_linkage_tree(): # With specified a number of clusters just for the sake of # raising a warning and testing the warning code with ignore_warnings(): - children, n_nodes, n_leaves, parent = assert_warns( - UserWarning, ward_tree, this_X.T, n_clusters=10) + with pytest.warns(UserWarning): + children, n_nodes, n_leaves, parent = ward_tree( + this_X.T, n_clusters=10) n_nodes = 2 * X.shape[1] - 1 assert len(children) + n_leaves == n_nodes for tree_builder in _TREE_BUILDERS.values(): for this_X in (X, X[0]): with ignore_warnings(): - children, n_nodes, n_leaves, parent = assert_warns( - UserWarning, tree_builder, this_X.T, n_clusters=10) - + with pytest.warns(UserWarning): + children, n_nodes, n_leaves, parent = tree_builder( + this_X.T, n_clusters=10) n_nodes = 2 * X.shape[1] - 1 assert len(children) + n_leaves == n_nodes @@ -550,7 +550,8 @@ def test_connectivity_fixing_non_lil(): m = np.array([[True, False], [False, True]]) c = grid_to_graph(n_x=2, n_y=2, mask=m) w = AgglomerativeClustering(connectivity=c, linkage='ward') - assert_warns(UserWarning, w.fit, x) + with pytest.warns(UserWarning): + w.fit(x) def test_int_float_dict(): diff --git a/sklearn/cluster/tests/test_spectral.py b/sklearn/cluster/tests/test_spectral.py index 7af3b8089a09c..6962e98917ed0 100644 --- a/sklearn/cluster/tests/test_spectral.py +++ b/sklearn/cluster/tests/test_spectral.py @@ -10,7 +10,6 @@ from sklearn.utils import check_random_state from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_warns_message from sklearn.cluster import SpectralClustering, spectral_clustering from sklearn.cluster._spectral import discretize @@ -132,7 +131,8 @@ def test_affinities(): # nearest neighbors affinity sp = 
SpectralClustering(n_clusters=2, affinity='nearest_neighbors', random_state=0) - assert_warns_message(UserWarning, 'not fully connected', sp.fit, X) + with pytest.warns(UserWarning, match='not fully connected'): + sp.fit(X) assert adjusted_rand_score(y, sp.labels_) == 1 sp = SpectralClustering(n_clusters=2, gamma=2, random_state=0) From c282d6cdb7647fceb1f73a5e343c112c42a21a04 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 18 Feb 2021 10:55:00 -0500 Subject: [PATCH 180/478] DOC clarify that n_iter_no_change is not just for early-stopping for SGD (#19462) --- sklearn/linear_model/_stochastic_gradient.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 948910e61b51c..65f6cc6966ba4 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -813,6 +813,8 @@ class SGDClassifier(BaseSGDClassifier): The stopping criterion. If it is not None, training will stop when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive epochs. + Convergence is checked against the training loss or the + validation loss depending on the `early_stopping` parameter. .. versionadded:: 0.19 @@ -884,7 +886,10 @@ class SGDClassifier(BaseSGDClassifier): Added 'validation_fraction' option n_iter_no_change : int, default=5 - Number of iterations with no improvement to wait before early stopping. + Number of iterations with no improvement to wait before stopping + fitting. + Convergence is checked against the training loss or the + validation loss depending on the `early_stopping` parameter. .. versionadded:: 0.20 Added 'n_iter_no_change' option @@ -1431,6 +1436,8 @@ class SGDRegressor(BaseSGDRegressor): The stopping criterion. If it is not None, training will stop when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive epochs. + Convergence is checked against the training loss or the + validation loss depending on the `early_stopping` parameter. .. versionadded:: 0.19 @@ -1495,7 +1502,10 @@ class SGDRegressor(BaseSGDRegressor): Added 'validation_fraction' option n_iter_no_change : int, default=5 - Number of iterations with no improvement to wait before early stopping. + Number of iterations with no improvement to wait before stopping + fitting. + Convergence is checked against the training loss or the + validation loss depending on the `early_stopping` parameter. .. versionadded:: 0.20 Added 'n_iter_no_change' option From eec623a8c5219b293625090e7ea5a01975741e37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?xavier=20dupr=C3=A9?= Date: Thu, 18 Feb 2021 18:28:23 +0100 Subject: [PATCH 181/478] DOC Replace onnxmltools by sklearn-onnx (#19484) --- doc/related_projects.rst | 2 +- doc/roadmap.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/related_projects.rst b/doc/related_projects.rst index 0f8f2c21eabc5..acc2689388896 100644 --- a/doc/related_projects.rst +++ b/doc/related_projects.rst @@ -97,7 +97,7 @@ enhance the functionality of scikit-learn's estimators. **Model export for production** -- `onnxmltools `_ Serializes many +- `sklearn-onnx `_ Serialization of many Scikit-learn pipelines to `ONNX `_ for interchange and prediction. diff --git a/doc/roadmap.rst b/doc/roadmap.rst index 7076e22b40287..30c9f58339502 100644 --- a/doc/roadmap.rst +++ b/doc/roadmap.rst @@ -223,7 +223,7 @@ the document up to date as we work on these issues. 
(to be discussed); * Extend documentation to mention how to deploy models in Python-free - environments for instance `ONNX `_. + environments for instance `ONNX `_. and use the above best practices to assess predictive consistency between scikit-learn and ONNX prediction functions on validation set. * Document good practices to detect temporal distribution drift for deployed From bea9211cdbed5f5beb6f7f283831373a642cb0a4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 18 Feb 2021 18:59:09 +0100 Subject: [PATCH 182/478] DOC update whats new 0.24 for backport (#19434) --- doc/whats_new/v0.24.rst | 14 ++++++++++++++ doc/whats_new/v1.0.rst | 11 ----------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 891d238c0ac43..6f2584dccdd10 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -19,6 +19,13 @@ Changelog with `sample_weight` parameter and `least_absolute_deviation` loss function. :pr:`19407` by :user:`Vadim Ushtanit `. +:mod:`sklearn.linear_model` +........................... + +- |Fix|: Fixed a bug in :class:`linear_model.LogisticRegression`: the + sample_weight object is not modified anymore. :pr:`19182` by + :user:`Yosuke KOBAYASHI `. + :mod:`sklearn.preprocessing` ............................ @@ -27,6 +34,13 @@ Changelog `'use_encoded_value'` strategies. :pr:`19234` by `Guillaume Lemaitre `. +:mod:`sklearn.semi_supervised` +.............................. + +- |Fix| Avoid NaN during label propagation in + :class:`~sklearn.semi_supervised.LabelPropagation`. + :pr:`19271` by :user:`Zhaowei Wang `. + :mod:`sklearn.utils` .................... diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 024fefe3fd825..518aec8f4d7ba 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -122,10 +122,6 @@ Changelog not corresponding to their objective. :pr:`19172` by :user:`Mathurin Massias ` -- |Fix|: Fixed a bug in :class:`linear_model.LogisticRegression`: the - sample_weight object is not modified anymore. :pr:`19182` by - :user:`Yosuke KOBAYASHI `. - - |API|: The parameter ``normalize`` of :class:`linear_model.LinearRegression` is deprecated and will be removed in 1.2. Motivation for this deprecation: ``normalize`` parameter did not take any @@ -180,13 +176,6 @@ Changelog for non-English characters. :pr:`18959` by :user:`Zero ` and :user:`wstates `. -:mod:`sklearn.semi_supervised` -.............................. - -- |Fix| Avoid NaN during label propagation in - :class:`~sklearn.semi_supervised.LabelPropagation`. - :pr:`19271` by :user:`Zhaowei Wang `. 
- Code and Documentation Contributors ----------------------------------- From 6a6217f15f654284d227c27c9179bb02a793e811 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?xavier=20dupr=C3=A9?= Date: Thu, 18 Feb 2021 19:43:40 +0100 Subject: [PATCH 183/478] ENH Add mean_pinball_loss metric for quantile regression (#19415) Co-authored-by: Olivier Grisel Co-authored-by: Christian Lorentzen --- doc/modules/classes.rst | 1 + doc/modules/model_evaluation.rst | 71 +++- doc/whats_new/v1.0.rst | 4 + .../plot_gradient_boosting_quantile.py | 347 +++++++++++++++--- .../test_gradient_boosting_loss_functions.py | 18 +- sklearn/metrics/__init__.py | 2 + sklearn/metrics/_regression.py | 83 +++++ sklearn/metrics/tests/test_common.py | 8 +- sklearn/metrics/tests/test_regression.py | 126 ++++++- 9 files changed, 595 insertions(+), 65 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 65d555f978df0..c658bc6b12452 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -991,6 +991,7 @@ details. metrics.mean_poisson_deviance metrics.mean_gamma_deviance metrics.mean_tweedie_deviance + metrics.mean_pinball_loss Multilabel ranking metrics -------------------------- diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 86e64f997cdd8..c807af982e277 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -416,7 +416,7 @@ defined as .. math:: - \texttt{accuracy}(y, \hat{y}) = \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples}-1} 1(\hat{y}_i = y_i) + \texttt{accuracy}(y, \hat{y}) = \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples}-1} 1(\hat{y}_i = y_i) where :math:`1(x)` is the `indicator function `_. @@ -1960,8 +1960,8 @@ Regression metrics The :mod:`sklearn.metrics` module implements several loss, score, and utility functions to measure regression performance. Some of those have been enhanced to handle the multioutput case: :func:`mean_squared_error`, -:func:`mean_absolute_error`, :func:`explained_variance_score` and -:func:`r2_score`. +:func:`mean_absolute_error`, :func:`explained_variance_score`, +:func:`r2_score` and :func:`mean_pinball_loss`. These functions have an ``multioutput`` keyword argument which specifies the @@ -2354,6 +2354,71 @@ the difference in errors decreases. Finally, by setting, ``power=2``:: we would get identical errors. The deviance when ``power=2`` is thus only sensitive to relative errors. +.. _pinball_loss: + +Pinball loss +------------ + +The :func:`mean_pinball_loss` function is used to evaluate the predictive +performance of quantile regression models. The `pinball loss +`_ is equivalent +to :func:`mean_absolute_error` when the quantile parameter ``alpha`` is set to +0.5. + +.. math:: + + \text{pinball}(y, \hat{y}) = \frac{1}{n_{\text{samples}}} \sum_{i=0}^{n_{\text{samples}}-1} \alpha \max(y_i - \hat{y}_i, 0) + (1 - \alpha) \max(\hat{y}_i - y_i, 0) + +Here is a small example of usage of the :func:`mean_pinball_loss` function:: + + >>> from sklearn.metrics import mean_pinball_loss + >>> y_true = [1, 2, 3] + >>> mean_pinball_loss(y_true, [0, 2, 3], alpha=0.1) + 0.03... + >>> mean_pinball_loss(y_true, [1, 2, 4], alpha=0.1) + 0.3... + >>> mean_pinball_loss(y_true, [0, 2, 3], alpha=0.9) + 0.3... + >>> mean_pinball_loss(y_true, [1, 2, 4], alpha=0.9) + 0.03... 
+ >>> mean_pinball_loss(y_true, y_true, alpha=0.1) + 0.0 + >>> mean_pinball_loss(y_true, y_true, alpha=0.9) + 0.0 + +It is possible to build a scorer object with a specific choice of alpha:: + + >>> from sklearn.metrics import make_scorer + >>> mean_pinball_loss_95p = make_scorer(mean_pinball_loss, alpha=0.95) + +Such a scorer can be used to evaluate the generalization performance of a +quantile regressor via cross-validation: + + >>> from sklearn.datasets import make_regression + >>> from sklearn.model_selection import cross_val_score + >>> from sklearn.ensemble import GradientBoostingRegressor + >>> + >>> X, y = make_regression(n_samples=100, random_state=0) + >>> estimator = GradientBoostingRegressor( + ... loss="quantile", + ... alpha=0.95, + ... random_state=0, + ... ) + >>> cross_val_score(estimator, X, y, cv=5, scoring=mean_pinball_loss_95p) + array([11.1..., 10.4... , 24.4..., 9.2..., 12.9...]) + +It is also possible to build scorer objects for hyper-parameter tuning. The +sign of the loss must be switched to ensure that greater means better as +explained in the example linked below. + +.. topic:: Example: + + * See :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py` + for an example of using a the pinball loss to evaluate and tune the + hyper-parameters of quantile regression models on data with non-symmetric + noise and outliers. + + .. _clustering_metrics: Clustering metrics diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 518aec8f4d7ba..1005920b891d3 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -145,6 +145,10 @@ Changelog class methods and will be removed in 1.2. :pr:`18543` by `Guillaume Lemaitre`_. +- |Feature| :func:`metrics.mean_pinball_loss` exposes the pinball loss for + quantile regression. :pr:`19415` by :user:`Xavier Dupré ` + and :user:`Oliver Grisel `. + :mod:`sklearn.naive_bayes` .......................... diff --git a/examples/ensemble/plot_gradient_boosting_quantile.py b/examples/ensemble/plot_gradient_boosting_quantile.py index ef40a2247bcc5..f29a87fe6cff7 100644 --- a/examples/ensemble/plot_gradient_boosting_quantile.py +++ b/examples/ensemble/plot_gradient_boosting_quantile.py @@ -3,77 +3,330 @@ Prediction Intervals for Gradient Boosting Regression ===================================================== -This example shows how quantile regression can be used -to create prediction intervals. +This example shows how quantile regression can be used to create prediction +intervals. """ - +# %% +# Generate some data for a synthetic regression problem by applying the +# function f to uniformly sampled random inputs. import numpy as np -import matplotlib.pyplot as plt - -from sklearn.ensemble import GradientBoostingRegressor - -np.random.seed(1) +from sklearn.model_selection import train_test_split def f(x): """The function to predict.""" return x * np.sin(x) -#---------------------------------------------------------------------- -# First the noiseless case -X = np.atleast_2d(np.random.uniform(0, 10.0, size=100)).T -X = X.astype(np.float32) -# Observations -y = f(X).ravel() +rng = np.random.RandomState(42) +X = np.atleast_2d(rng.uniform(0, 10.0, size=1000)).T +expected_y = f(X).ravel() + +# %% +# To make the problem interesting, we generate observations of the target y as +# the sum of a deterministic term computed by the function f and a random noise +# term that follows a centered `log-normal +# `_. 
To make this even +# more interesting we consider the case where the amplitude of the noise +# depends on the input variable x (heteroscedastic noise). +# +# The lognormal distribution is non-symmetric and long tailed: observing large +# outliers is likely but it is impossible to observe small outliers. +sigma = 0.5 + X.ravel() / 10 +noise = rng.lognormal(sigma=sigma) - np.exp(sigma ** 2 / 2) +y = expected_y + noise + +# %% +# Split into train, test datasets: +X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + +# %% +# Fitting non-linear quantile and least squares regressors +# -------------------------------------------------------- +# +# Fit gradient boosting models trained with the quantile loss and +# alpha=0.05, 0.5, 0.95. +# +# The models obtained for alpha=0.05 and alpha=0.95 produce a 90% confidence +# interval (95% - 5% = 90%). +# +# The model trained with alpha=0.5 produces a regression of the median: on +# average, there should be the same number of target observations above and +# below the predicted values. +from sklearn.ensemble import GradientBoostingRegressor +from sklearn.metrics import mean_pinball_loss, mean_squared_error + -dy = 1.5 + 1.0 * np.random.random(y.shape) -noise = np.random.normal(0, dy) -y += noise -y = y.astype(np.float32) +all_models = {} +common_params = dict( + learning_rate=0.05, + n_estimators=250, + max_depth=2, + min_samples_leaf=9, + min_samples_split=9, +) +for alpha in [0.05, 0.5, 0.95]: + gbr = GradientBoostingRegressor(loss='quantile', alpha=alpha, + **common_params) + all_models["q %1.2f" % alpha] = gbr.fit(X_train, y_train) -# Mesh the input space for evaluations of the real function, the prediction and -# its MSE +# %% +# For the sake of comparison, also fit a baseline model trained with the usual +# least squares loss (ls), also known as the mean squared error (MSE). +gbr_ls = GradientBoostingRegressor(loss='ls', **common_params) +all_models["ls"] = gbr_ls.fit(X_train, y_train) + +# %% +# Create an evenly spaced evaluation set of input values spanning the [0, 10] +# range. xx = np.atleast_2d(np.linspace(0, 10, 1000)).T -xx = xx.astype(np.float32) -alpha = 0.95 +# %% +# Plot the true conditional mean function f, the prediction of the conditional +# mean (least squares loss), the conditional median and the conditional 90% +# interval (from 5th to 95th conditional percentiles). +import matplotlib.pyplot as plt + + +y_pred = all_models['ls'].predict(xx) +y_lower = all_models['q 0.05'].predict(xx) +y_upper = all_models['q 0.95'].predict(xx) +y_med = all_models['q 0.50'].predict(xx) + +fig = plt.figure(figsize=(10, 10)) +plt.plot(xx, f(xx), 'g:', linewidth=3, label=r'$f(x) = x\,\sin(x)$') +plt.plot(X_test, y_test, 'b.', markersize=10, label='Test observations') +plt.plot(xx, y_med, 'r-', label='Predicted median', color="orange") +plt.plot(xx, y_pred, 'r-', label='Predicted mean') +plt.plot(xx, y_upper, 'k-') +plt.plot(xx, y_lower, 'k-') +plt.fill_between(xx.ravel(), y_lower, y_upper, alpha=0.4, + label='Predicted 90% interval') +plt.xlabel('$x$') +plt.ylabel('$f(x)$') +plt.ylim(-10, 25) +plt.legend(loc='upper left') +plt.show() + +# %% +# Comparing the predicted median with the predicted mean, we note that the +# median is on average below the mean as the noise is skewed towards high +# values (large outliers). The median estimate also seems to be smoother +# because of its natural robustness to outliers. 
+#
+# Also observe that the inductive bias of gradient boosting trees is
+# unfortunately keeping our 0.05 quantile from fully capturing the sinusoidal
+# shape of the signal, in particular around x=8. Tuning hyper-parameters can
+# reduce this effect as shown in the last part of this notebook.
+#
+# Analysis of the error metrics
+# -----------------------------
+#
+# Measure the models with :func:`mean_squared_error` and
+# :func:`mean_pinball_loss` metrics on the training dataset.
+import pandas as pd
+
+
+def highlight_min(x):
+    x_min = x.min()
+    return ['font-weight: bold' if v == x_min else ''
+            for v in x]
+
+
+results = []
+for name, gbr in sorted(all_models.items()):
+    metrics = {'model': name}
+    y_pred = gbr.predict(X_train)
+    for alpha in [0.05, 0.5, 0.95]:
+        metrics["pbl=%1.2f" % alpha] = mean_pinball_loss(
+            y_train, y_pred, alpha=alpha)
+    metrics['MSE'] = mean_squared_error(y_train, y_pred)
+    results.append(metrics)
+
+pd.DataFrame(results).set_index('model').style.apply(highlight_min)
+
+# %%
+# One column shows all models evaluated by the same metric. The minimum number
+# on a column should be obtained when the model is trained and measured with
+# the same metric. This should always be the case on the training set if the
+# training converged.
+#
+# Note that because the target distribution is asymmetric, the expected
+# conditional mean and conditional median are significantly different and
+# therefore one could not use the least squares model to get a good estimate
+# of the conditional median nor the converse.
+#
+# If the target distribution were symmetric and had no outliers (e.g. with a
+# Gaussian noise), then median estimator and the least squares estimator would
+# have yielded similar predictions.
+#
+# We then do the same on the test set.
+results = []
+for name, gbr in sorted(all_models.items()):
+    metrics = {'model': name}
+    y_pred = gbr.predict(X_test)
+    for alpha in [0.05, 0.5, 0.95]:
+        metrics["pbl=%1.2f" % alpha] = mean_pinball_loss(
+            y_test, y_pred, alpha=alpha)
+    metrics['MSE'] = mean_squared_error(y_test, y_pred)
+    results.append(metrics)
 
-clf = GradientBoostingRegressor(loss='quantile', alpha=alpha,
-                                n_estimators=250, max_depth=3,
-                                learning_rate=.1, min_samples_leaf=9,
-                                min_samples_split=9)
+pd.DataFrame(results).set_index('model').style.apply(highlight_min)
 
-clf.fit(X, y)
 
-# Make the prediction on the meshed x-axis
-y_upper = clf.predict(xx)
 
+# %%
+# Errors are higher meaning the models slightly overfitted the data. It still
+# shows that the best test metric is obtained when the model is trained by
+# minimizing this same metric.
+#
+# Note that the conditional median estimator is competitive with the least
+# squares estimator in terms of MSE on the test set: this can be explained by
+# the fact the least squares estimator is very sensitive to large outliers
+# which can cause significant overfitting. This can be seen on the right hand
+# side of the previous plot. The conditional median estimator is biased
+# (underestimation for this asymmetric noise) but is also naturally robust to
+# outliers and overfits less.
+#
+# Calibration of the confidence interval
+# --------------------------------------
+#
+# We can also evaluate the ability of the two extreme quantile estimators to
+# produce a well-calibrated conditional 90%-confidence interval. 
+# +# To do this we can compute the fraction of observations that fall between the +# predictions: +def coverage_fraction(y, y_low, y_high): + return np.mean(np.logical_and(y >= y_low, y <= y_high)) -clf.set_params(alpha=1.0 - alpha) -clf.fit(X, y) -# Make the prediction on the meshed x-axis -y_lower = clf.predict(xx) +coverage_fraction(y_train, + all_models['q 0.05'].predict(X_train), + all_models['q 0.95'].predict(X_train)) -clf.set_params(loss='ls') -clf.fit(X, y) +# %% +# On the training set the calibration is very close to the expected coverage +# value for a 90% confidence interval. +coverage_fraction(y_test, + all_models['q 0.05'].predict(X_test), + all_models['q 0.95'].predict(X_test)) -# Make the prediction on the meshed x-axis -y_pred = clf.predict(xx) -# Plot the function, the prediction and the 95% confidence interval based on -# the MSE -fig = plt.figure() -plt.plot(xx, f(xx), 'g:', label=r'$f(x) = x\,\sin(x)$') -plt.plot(X, y, 'b.', markersize=10, label=u'Observations') -plt.plot(xx, y_pred, 'r-', label=u'Prediction') +# %% +# On the test set, the estimated confidence interval is slightly too narrow. +# Note, however, that we would need to wrap those metrics in a cross-validation +# loop to assess their variability under data resampling. +# +# Tuning the hyper-parameters of the quantile regressors +# ------------------------------------------------------ +# +# In the plot above, we observed that the 5th percentile regressor seems to +# underfit and could not adapt to sinusoidal shape of the signal. +# +# The hyper-parameters of the model were approximately hand-tuned for the +# median regressor and there is no reason than the same hyper-parameters are +# suitable for the 5th percentile regressor. +# +# To confirm this hypothesis, we tune the hyper-parameters of a new regressor +# of the 5th percentile by selecting the best model parameters by +# cross-validation on the pinball loss with alpha=0.05: + +# %% +from sklearn.model_selection import RandomizedSearchCV +from sklearn.metrics import make_scorer +from pprint import pprint + + +param_grid = dict( + learning_rate=[0.01, 0.05, 0.1], + n_estimators=[100, 150, 200, 250, 300], + max_depth=[2, 5, 10, 15, 20], + min_samples_leaf=[1, 5, 10, 20, 30, 50], + min_samples_split=[2, 5, 10, 20, 30, 50], +) +alpha = 0.05 +neg_mean_pinball_loss_05p_scorer = make_scorer( + mean_pinball_loss, + alpha=alpha, + greater_is_better=False, # maximize the negative loss +) +gbr = GradientBoostingRegressor(loss="quantile", alpha=alpha, random_state=0) +search_05p = RandomizedSearchCV( + gbr, + param_grid, + n_iter=10, # increase this if computational budget allows + scoring=neg_mean_pinball_loss_05p_scorer, + n_jobs=2, + random_state=0, +).fit(X_train, y_train) +pprint(search_05p.best_params_) + +# %% +# We observe that the search procedure identifies that deeper trees are needed +# to get a good fit for the 5th percentile regressor. Deeper trees are more +# expressive and less likely to underfit. +# +# Let's now tune the hyper-parameters for the 95th percentile regressor. 
We +# need to redefine the `scoring` metric used to select the best model, along +# with adjusting the alpha parameter of the inner gradient boosting estimator +# itself: +from sklearn.base import clone + +alpha = 0.95 +neg_mean_pinball_loss_95p_scorer = make_scorer( + mean_pinball_loss, + alpha=alpha, + greater_is_better=False, # maximize the negative loss +) +search_95p = clone(search_05p).set_params( + estimator__alpha=alpha, + scoring=neg_mean_pinball_loss_95p_scorer, +) +search_95p.fit(X_train, y_train) +pprint(search_95p.best_params_) + +# %% +# This time, shallower trees are selected and lead to a more constant piecewise +# and therefore more robust estimation of the 95th percentile. This is +# beneficial as it avoids overfitting the large outliers of the log-normal +# additive noise. +# +# We can confirm this intuition by displaying the predicted 90% confidence +# interval comprised by the predictions of those two tuned quantile regressors: +# the prediction of the upper 95th percentile has a much coarser shape than the +# prediction of the lower 5th percentile: +y_lower = search_05p.predict(xx) +y_upper = search_95p.predict(xx) + +fig = plt.figure(figsize=(10, 10)) +plt.plot(xx, f(xx), 'g:', linewidth=3, label=r'$f(x) = x\,\sin(x)$') +plt.plot(X_test, y_test, 'b.', markersize=10, label='Test observations') plt.plot(xx, y_upper, 'k-') plt.plot(xx, y_lower, 'k-') -plt.fill(np.concatenate([xx, xx[::-1]]), - np.concatenate([y_upper, y_lower[::-1]]), - alpha=.5, fc='b', ec='None', label='95% prediction interval') +plt.fill_between(xx.ravel(), y_lower, y_upper, alpha=0.4, + label='Predicted 90% interval') plt.xlabel('$x$') plt.ylabel('$f(x)$') -plt.ylim(-10, 20) +plt.ylim(-10, 25) plt.legend(loc='upper left') +plt.title("Prediction with tuned hyper-parameters") plt.show() + +# %% +# The plot looks qualitatively better than for the untuned models, especially +# for the shape of the of lower quantile. +# +# We now quantitatively evaluate the joint-calibration of the pair of +# estimators: +coverage_fraction(y_train, + search_05p.predict(X_train), + search_95p.predict(X_train)) +# %% +coverage_fraction(y_test, + search_05p.predict(X_test), + search_95p.predict(X_test)) +# %% +# The calibration of the tuned pair is sadly not better on the test set: the +# width of the estimated confidence interval is still too narrow. +# +# Again, we would need to wrap this study in a cross-validation loop to +# better assess the variability of those estimates. 
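For reference, the metric wired into the example above can be cross-checked against the
pinball loss formula added to the user guide earlier in this commit. The sketch below is
an illustration only, not part of the patch: it assumes nothing beyond NumPy and the
:func:`~sklearn.metrics.mean_pinball_loss` function introduced here, and it also shows
the sign-flipped scorer pattern used for the hyper-parameter searches::

    import numpy as np
    from sklearn.metrics import make_scorer, mean_pinball_loss

    rng = np.random.RandomState(0)
    y_true = rng.lognormal(size=100)
    y_pred = np.full_like(y_true, np.median(y_true))
    alpha = 0.95

    # Direct translation of the user-guide formula:
    # mean of alpha * max(y - y_hat, 0) + (1 - alpha) * max(y_hat - y, 0)
    diff = y_true - y_pred
    manual_loss = np.mean(alpha * np.maximum(diff, 0)
                          + (1 - alpha) * np.maximum(-diff, 0))
    assert np.isclose(manual_loss,
                      mean_pinball_loss(y_true, y_pred, alpha=alpha))

    # Lower pinball loss is better, so model selection utilities that
    # maximize a score need the negated version of the metric.
    neg_mean_pinball_loss_95p = make_scorer(
        mean_pinball_loss, alpha=alpha, greater_is_better=False)

Such a scorer object can then be passed as ``scoring=...`` to ``GridSearchCV`` or
``RandomizedSearchCV``, exactly as done for the 5th and 95th percentile regressors in the
example above.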
diff --git a/sklearn/ensemble/tests/test_gradient_boosting_loss_functions.py b/sklearn/ensemble/tests/test_gradient_boosting_loss_functions.py index d0300ddc371c7..4d7ea9bfe9bb3 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting_loss_functions.py +++ b/sklearn/ensemble/tests/test_gradient_boosting_loss_functions.py @@ -8,6 +8,7 @@ from pytest import approx from sklearn.utils import check_random_state +from sklearn.metrics import mean_pinball_loss from sklearn.ensemble._gb_losses import RegressionLossFunction from sklearn.ensemble._gb_losses import LeastSquaresError from sklearn.ensemble._gb_losses import LeastAbsoluteError @@ -115,6 +116,8 @@ def test_quantile_loss_function(): y_found = QuantileLossFunction(0.9)(x, np.zeros_like(x)) y_expected = np.asarray([0.1, 0.0, 0.9]).mean() np.testing.assert_allclose(y_found, y_expected) + y_found_p = mean_pinball_loss(x, np.zeros_like(x), alpha=0.9) + np.testing.assert_allclose(y_found, y_found_p) def test_sample_weight_deviance(): @@ -293,10 +296,11 @@ def test_init_raw_predictions_values(): @pytest.mark.parametrize('seed', range(5)) -def test_lad_equals_quantile_50(seed): +@pytest.mark.parametrize('alpha', [0.4, 0.5, 0.6]) +def test_lad_equals_quantiles(seed, alpha): # Make sure quantile loss with alpha = .5 is equivalent to LAD lad = LeastAbsoluteError() - ql = QuantileLossFunction(alpha=0.5) + ql = QuantileLossFunction(alpha=alpha) n_samples = 50 rng = np.random.RandomState(seed) @@ -305,9 +309,15 @@ def test_lad_equals_quantile_50(seed): lad_loss = lad(y_true, raw_predictions) ql_loss = ql(y_true, raw_predictions) - assert lad_loss == approx(2 * ql_loss) + if alpha == 0.5: + assert lad_loss == approx(2 * ql_loss) weights = np.linspace(0, 1, n_samples) ** 2 lad_weighted_loss = lad(y_true, raw_predictions, sample_weight=weights) ql_weighted_loss = ql(y_true, raw_predictions, sample_weight=weights) - assert lad_weighted_loss == approx(2 * ql_weighted_loss) + if alpha == 0.5: + assert lad_weighted_loss == approx(2 * ql_weighted_loss) + pbl_weighted_loss = mean_pinball_loss(y_true, raw_predictions, + sample_weight=weights, + alpha=alpha) + assert pbl_weighted_loss == approx(ql_weighted_loss) diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 84e7c98e29324..bca22e3916c61 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -69,6 +69,7 @@ from ._regression import mean_squared_log_error from ._regression import median_absolute_error from ._regression import mean_absolute_percentage_error +from ._regression import mean_pinball_loss from ._regression import r2_score from ._regression import mean_tweedie_deviance from ._regression import mean_poisson_deviance @@ -133,6 +134,7 @@ 'mean_absolute_error', 'mean_squared_error', 'mean_squared_log_error', + 'mean_pinball_loss', 'mean_poisson_deviance', 'mean_gamma_deviance', 'mean_tweedie_deviance', diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 0d8fddd0ba24e..7edf7924e50e1 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -43,6 +43,7 @@ "mean_squared_log_error", "median_absolute_error", "mean_absolute_percentage_error", + "mean_pinball_loss", "r2_score", "explained_variance_score", "mean_tweedie_deviance", @@ -194,6 +195,88 @@ def mean_absolute_error(y_true, y_pred, *, return np.average(output_errors, weights=multioutput) +def mean_pinball_loss(y_true, y_pred, *, + sample_weight=None, + alpha=0.5, + multioutput='uniform_average'): + """Pinball loss for quantile regression. 
+ + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + alpha: double, slope of the pinball loss, default=0.5, + this loss is equivalent to :ref:`mean_absolute_error` when `alpha=0.5`, + `alpha=0.95` is minimized by estimators of the 95th percentile. + + multioutput : {'raw_values', 'uniform_average'} or array-like of shape \ + (n_outputs,), default='uniform_average' + Defines aggregating of multiple output values. + Array-like value defines weights used to average errors. + + 'raw_values' : + Returns a full set of errors in case of multioutput input. + + 'uniform_average' : + Errors of all outputs are averaged with uniform weight. + Returns + ------- + loss : float or ndarray of floats + If multioutput is 'raw_values', then mean absolute error is returned + for each output separately. + If multioutput is 'uniform_average' or an ndarray of weights, then the + weighted average of all output errors is returned. + + The pinball loss output is a non-negative floating point. The best + value is 0.0. + + Examples + -------- + >>> from sklearn.metrics import mean_pinball_loss + >>> y_true = [1, 2, 3] + >>> mean_pinball_loss(y_true, [0, 2, 3], alpha=0.1) + 0.03... + >>> mean_pinball_loss(y_true, [1, 2, 4], alpha=0.1) + 0.3... + >>> mean_pinball_loss(y_true, [0, 2, 3], alpha=0.9) + 0.3... + >>> mean_pinball_loss(y_true, [1, 2, 4], alpha=0.9) + 0.03... + >>> mean_pinball_loss(y_true, y_true, alpha=0.1) + 0.0 + >>> mean_pinball_loss(y_true, y_true, alpha=0.9) + 0.0 + """ + y_type, y_true, y_pred, multioutput = _check_reg_targets( + y_true, y_pred, multioutput) + check_consistent_length(y_true, y_pred, sample_weight) + diff = y_true - y_pred + sign = (diff >= 0).astype(diff.dtype) + loss = alpha * sign * diff - (1 - alpha) * (1 - sign) * diff + output_errors = np.average(loss, weights=sample_weight, axis=0) + if isinstance(multioutput, str): + if multioutput == 'raw_values': + return output_errors + elif multioutput == 'uniform_average': + # pass None as weights to np.average: uniform mean + multioutput = None + else: + raise ValueError("multioutput is expected to be 'raw_values' " + "or 'uniform_average' but we got %r" + " instead." 
% multioutput) + + return np.average(output_errors, weights=multioutput) + + def mean_absolute_percentage_error(y_true, y_pred, sample_weight=None, multioutput='uniform_average'): diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 181baf19de3c2..dbf1bdd458f1a 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -50,6 +50,7 @@ from sklearn.metrics import mean_gamma_deviance from sklearn.metrics import median_absolute_error from sklearn.metrics import multilabel_confusion_matrix +from sklearn.metrics import mean_pinball_loss from sklearn.metrics import precision_recall_curve from sklearn.metrics import precision_score from sklearn.metrics import r2_score @@ -101,6 +102,7 @@ "max_error": max_error, "mean_absolute_error": mean_absolute_error, "mean_squared_error": mean_squared_error, + "mean_pinball_loss": mean_pinball_loss, "median_absolute_error": median_absolute_error, "mean_absolute_percentage_error": mean_absolute_percentage_error, "explained_variance_score": explained_variance_score, @@ -437,7 +439,8 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): # Regression metrics with "multioutput-continuous" format support MULTIOUTPUT_METRICS = { "mean_absolute_error", "median_absolute_error", "mean_squared_error", - "r2_score", "explained_variance_score", "mean_absolute_percentage_error" + "r2_score", "explained_variance_score", "mean_absolute_percentage_error", + "mean_pinball_loss" } # Symmetric with respect to their input arguments y_true and y_pred @@ -460,6 +463,9 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): "matthews_corrcoef_score", "mean_absolute_error", "mean_squared_error", "median_absolute_error", "max_error", + # Pinball loss is only symmetric for alpha=0.5 which is the default. + "mean_pinball_loss", + "cohen_kappa_score", "mean_normal_deviance" } diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 5b8406cf7a61f..8e935173d3319 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -1,5 +1,6 @@ import numpy as np +from scipy import optimize from numpy.testing import assert_allclose from itertools import product import pytest @@ -7,6 +8,8 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal +from sklearn.dummy import DummyRegressor +from sklearn.model_selection import GridSearchCV from sklearn.metrics import explained_variance_score from sklearn.metrics import mean_absolute_error @@ -15,23 +18,30 @@ from sklearn.metrics import median_absolute_error from sklearn.metrics import mean_absolute_percentage_error from sklearn.metrics import max_error +from sklearn.metrics import mean_pinball_loss from sklearn.metrics import r2_score from sklearn.metrics import mean_tweedie_deviance +from sklearn.metrics import make_scorer from sklearn.metrics._regression import _check_reg_targets -from ...exceptions import UndefinedMetricWarning +from sklearn.exceptions import UndefinedMetricWarning def test_regression_metrics(n_samples=50): y_true = np.arange(n_samples) y_pred = y_true + 1 + y_pred_2 = y_true - 1 assert_almost_equal(mean_squared_error(y_true, y_pred), 1.) assert_almost_equal(mean_squared_log_error(y_true, y_pred), mean_squared_error(np.log(1 + y_true), np.log(1 + y_pred))) assert_almost_equal(mean_absolute_error(y_true, y_pred), 1.) 
+ assert_almost_equal(mean_pinball_loss(y_true, y_pred), 0.5) + assert_almost_equal(mean_pinball_loss(y_true, y_pred_2), 0.5) + assert_almost_equal(mean_pinball_loss(y_true, y_pred, alpha=0.4), 0.6) + assert_almost_equal(mean_pinball_loss(y_true, y_pred_2, alpha=0.4), 0.4) assert_almost_equal(median_absolute_error(y_true, y_pred), 1.) mape = mean_absolute_percentage_error(y_true, y_pred) assert np.isfinite(mape) @@ -90,6 +100,9 @@ def test_multioutput_regression(): error = mean_absolute_error(y_true, y_pred) assert_almost_equal(error, (1. + 2. / 3) / 4.) + error = mean_pinball_loss(y_true, y_pred) + assert_almost_equal(error, (1. + 2. / 3) / 8.) + error = np.around(mean_absolute_percentage_error(y_true, y_pred), decimals=2) assert np.isfinite(error) @@ -104,15 +117,16 @@ def test_multioutput_regression(): def test_regression_metrics_at_limits(): - assert_almost_equal(mean_squared_error([0.], [0.]), 0.00, 2) - assert_almost_equal(mean_squared_error([0.], [0.], squared=False), 0.00, 2) - assert_almost_equal(mean_squared_log_error([0.], [0.]), 0.00, 2) - assert_almost_equal(mean_absolute_error([0.], [0.]), 0.00, 2) - assert_almost_equal(mean_absolute_percentage_error([0.], [0.]), 0.00, 2) - assert_almost_equal(median_absolute_error([0.], [0.]), 0.00, 2) - assert_almost_equal(max_error([0.], [0.]), 0.00, 2) - assert_almost_equal(explained_variance_score([0.], [0.]), 1.00, 2) - assert_almost_equal(r2_score([0., 1], [0., 1]), 1.00, 2) + assert_almost_equal(mean_squared_error([0.], [0.]), 0.0) + assert_almost_equal(mean_squared_error([0.], [0.], squared=False), 0.0) + assert_almost_equal(mean_squared_log_error([0.], [0.]), 0.0) + assert_almost_equal(mean_absolute_error([0.], [0.]), 0.0) + assert_almost_equal(mean_pinball_loss([0.], [0.]), 0.0) + assert_almost_equal(mean_absolute_percentage_error([0.], [0.]), 0.0) + assert_almost_equal(median_absolute_error([0.], [0.]), 0.0) + assert_almost_equal(max_error([0.], [0.]), 0.0) + assert_almost_equal(explained_variance_score([0.], [0.]), 1.0) + assert_almost_equal(r2_score([0., 1], [0., 1]), 1.0) err_msg = ("Mean Squared Logarithmic Error cannot be used when targets " "contain negative values.") with pytest.raises(ValueError, match=err_msg): @@ -207,6 +221,11 @@ def test_regression_multioutput_array(): mse = mean_squared_error(y_true, y_pred, multioutput='raw_values') mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values') + err_msg = ("multioutput is expected to be 'raw_values' " + "or 'uniform_average' but we got 'variance_weighted' instead.") + with pytest.raises(ValueError, match=err_msg): + mean_pinball_loss(y_true, y_pred, multioutput='variance_weighted') + pbl = mean_pinball_loss(y_true, y_pred, multioutput='raw_values') mape = mean_absolute_percentage_error(y_true, y_pred, multioutput='raw_values') r = r2_score(y_true, y_pred, multioutput='raw_values') @@ -214,6 +233,7 @@ def test_regression_multioutput_array(): assert_array_almost_equal(mse, [0.125, 0.5625], decimal=2) assert_array_almost_equal(mae, [0.25, 0.625], decimal=2) + assert_array_almost_equal(pbl, [0.25/2, 0.625/2], decimal=2) assert_array_almost_equal(mape, [0.0778, 0.2262], decimal=2) assert_array_almost_equal(r, [0.95, 0.93], decimal=2) assert_array_almost_equal(evs, [0.95, 0.93], decimal=2) @@ -224,9 +244,11 @@ def test_regression_multioutput_array(): y_pred = [[1, 1]]*4 mse = mean_squared_error(y_true, y_pred, multioutput='raw_values') mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values') + pbl = mean_pinball_loss(y_true, y_pred, 
multioutput='raw_values') r = r2_score(y_true, y_pred, multioutput='raw_values') assert_array_almost_equal(mse, [1., 1.], decimal=2) assert_array_almost_equal(mae, [1., 1.], decimal=2) + assert_array_almost_equal(pbl, [0.5, 0.5], decimal=2) assert_array_almost_equal(r, [0., 0.], decimal=2) r = r2_score([[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput='raw_values') @@ -330,3 +352,87 @@ def test_mean_absolute_percentage_error(): y_true = random_number_generator.exponential(size=100) y_pred = 1.2 * y_true assert mean_absolute_percentage_error(y_true, y_pred) == pytest.approx(0.2) + + +@pytest.mark.parametrize("distribution", + ["normal", "lognormal", "exponential", "uniform"]) +@pytest.mark.parametrize("target_quantile", [0.05, 0.5, 0.75]) +def test_mean_pinball_loss_on_constant_predictions( + distribution, + target_quantile +): + if not hasattr(np, "quantile"): + pytest.skip("This test requires a more recent version of numpy " + "with support for np.quantile.") + + # Check that the pinball loss is minimized by the empirical quantile. + n_samples = 3000 + rng = np.random.RandomState(42) + data = getattr(rng, distribution)(size=n_samples) + + # Compute the best possible pinball loss for any constant predictor: + best_pred = np.quantile(data, target_quantile) + best_constant_pred = np.full(n_samples, fill_value=best_pred) + best_pbl = mean_pinball_loss(data, best_constant_pred, + alpha=target_quantile) + + # Evaluate the loss on a grid of quantiles + candidate_predictions = np.quantile(data, np.linspace(0, 1, 100)) + for pred in candidate_predictions: + # Compute the pinball loss of a constant predictor: + constant_pred = np.full(n_samples, fill_value=pred) + pbl = mean_pinball_loss(data, constant_pred, alpha=target_quantile) + + # Check that the loss of this constant predictor is greater or equal + # than the loss of using the optimal quantile (up to machine + # precision): + assert pbl >= best_pbl - np.finfo(best_pbl.dtype).eps + + # Check that the value of the pinball loss matches the analytical + # formula. + expected_pbl = ( + (pred - data[data < pred]).sum() * (1 - target_quantile) + + (data[data >= pred] - pred).sum() * target_quantile + ) + expected_pbl /= n_samples + assert_almost_equal(expected_pbl, pbl) + + # Check that we can actually recover the target_quantile by minimizing the + # pinball loss w.r.t. the constant prediction quantile. + def objective_func(x): + constant_pred = np.full(n_samples, fill_value=x) + return mean_pinball_loss(data, constant_pred, alpha=target_quantile) + + result = optimize.minimize(objective_func, data.mean(), + method="Nelder-Mead") + assert result.success + # The minimum is not unique with limited data, hence the large tolerance. + assert result.x == pytest.approx(best_pred, rel=1e-2) + assert result.fun == pytest.approx(best_pbl) + + +def test_dummy_quantile_parameter_tuning(): + # Integration test to check that it is possible to use the pinball loss to + # tune the hyperparameter of a quantile regressor. This is conceptually + # similar to the previous test but using the scikit-learn estimator and + # scoring API instead. 
+ n_samples = 1000 + rng = np.random.RandomState(0) + X = rng.normal(size=(n_samples, 5)) # Ignored + y = rng.exponential(size=n_samples) + + all_quantiles = [0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95] + for alpha in all_quantiles: + neg_mean_pinball_loss = make_scorer( + mean_pinball_loss, + alpha=alpha, + greater_is_better=False, + ) + regressor = DummyRegressor(strategy="quantile", quantile=0.25) + grid_search = GridSearchCV( + regressor, + param_grid=dict(quantile=all_quantiles), + scoring=neg_mean_pinball_loss, + ).fit(X, y) + + assert grid_search.best_params_["quantile"] == pytest.approx(alpha) From e9c6fcaa17b983858400465fd39a2616c980c3db Mon Sep 17 00:00:00 2001 From: Dmitry Kobak Date: Thu, 18 Feb 2021 19:55:43 +0100 Subject: [PATCH 184/478] Fix underflow issues due to float precision in TSNE (#19472) --- doc/whats_new/v1.0.rst | 10 +++++++++- sklearn/manifold/_utils.pyx | 18 +++++++++--------- sklearn/manifold/tests/test_t_sne.py | 12 ++++++++++++ 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 1005920b891d3..66272c97d7a16 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -22,7 +22,8 @@ parameters, may produce different models from the previous version. This often occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. - +- |Fix| :class:`manifold.TSNE` now avoids numerical underflow issues during + affinity matrix computation. Details are listed in the changelog below. @@ -134,6 +135,13 @@ Changelog :pr:`17743` by :user:`Maria Telenczuk ` and :user:`Alexandre Gramfort `. +:mod:`sklearn.manifold` +....................... + +- |Fix| Change numerical precision to prevent underflow issues + during affinity matrix computation for :class:`manifold.TSNE`. + :pr:`19472` by :user:`Dmitry Kobak `. + :mod:`sklearn.metrics` ...................... diff --git a/sklearn/manifold/_utils.pyx b/sklearn/manifold/_utils.pyx index 0cc2b0af137cc..cd6ade795ae91 100644 --- a/sklearn/manifold/_utils.pyx +++ b/sklearn/manifold/_utils.pyx @@ -51,18 +51,18 @@ cpdef np.ndarray[np.float32_t, ndim=2] _binary_search_perplexity( cdef long n_neighbors = sqdistances.shape[1] cdef int using_neighbors = n_neighbors < n_samples # Precisions of conditional Gaussian distributions - cdef float beta - cdef float beta_min - cdef float beta_max - cdef float beta_sum = 0.0 + cdef double beta + cdef double beta_min + cdef double beta_max + cdef double beta_sum = 0.0 # Use log scale - cdef float desired_entropy = math.log(desired_perplexity) - cdef float entropy_diff + cdef double desired_entropy = math.log(desired_perplexity) + cdef double entropy_diff - cdef float entropy - cdef float sum_Pi - cdef float sum_disti_Pi + cdef double entropy + cdef double sum_Pi + cdef double sum_disti_Pi cdef long i, j, k, l # This array is later used as a 32bit array. It has multiple intermediate diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 716c031d4f5bf..bd0cc3df339bf 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -116,6 +116,18 @@ def test_binary_search(): assert_almost_equal(mean_perplexity, desired_perplexity, decimal=3) +def test_binary_search_underflow(): + # Test if the binary search finds Gaussians with desired perplexity. + # A more challenging case than the one above, producing numeric + # underflow in float precision (see issue #19471 and PR #19472). 
+ random_state = check_random_state(42) + data = random_state.randn(1, 90).astype(np.float32) + 100 + desired_perplexity = 30.0 + P = _binary_search_perplexity(data, desired_perplexity, verbose=0) + perplexity = 2 ** -np.nansum(P[0, 1:] * np.log2(P[0, 1:])) + assert_almost_equal(perplexity, desired_perplexity, decimal=3) + + def test_binary_search_neighbors(): # Binary perplexity search approximation. # Should be approximately equal to the slow method when we use From 0c39dd32ea23922f52312615d65521f03ed247a3 Mon Sep 17 00:00:00 2001 From: James Alan Preiss Date: Fri, 19 Feb 2021 01:39:49 -0800 Subject: [PATCH 185/478] DOC enet_path docstring: fix sub/superscript swap (#19493) --- sklearn/linear_model/_coordinate_descent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 9b50ea93c78c2..4fdeb783db194 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -332,7 +332,7 @@ def enet_path(X, y, *, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, For multi-output tasks it is:: - (1 / (2 * n_samples)) * ||Y - XW||^Fro_2 + (1 / (2 * n_samples)) * ||Y - XW||_Fro^2 + alpha * l1_ratio * ||W||_21 + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2 From 43241b1979f901a44b3c30ac58e005b0179d784a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 19 Feb 2021 11:41:17 +0100 Subject: [PATCH 186/478] DOC fix missing closing quote in user guide --- doc/modules/preprocessing.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index a339b4bfae4e2..e1b4c5599c3b5 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -878,8 +878,9 @@ three middle diagonals are non-zero for ``degree=2``. The higher the degree, the more overlapping of the splines. Interestingly, a :class:`SplineTransformer` of ``degree=0`` is the same as -:class:`~sklearn.preprocessing.KBinsDiscretizer` with ``encode='onehot-dense`` -and ``n_bins = n_knots - 1`` if ``knots = strategy``. +:class:`~sklearn.preprocessing.KBinsDiscretizer` with +``encode='onehot-dense'`` and ``n_bins = n_knots - 1`` if +``knots = strategy``. .. topic:: Examples: From b5e55f79fdfcb0f41f0cfb279e54a123822bca43 Mon Sep 17 00:00:00 2001 From: Atsushi Nukariya Date: Fri, 19 Feb 2021 19:44:09 +0900 Subject: [PATCH 187/478] TST replace assert_warns* by pytest.warns in model_selection/tests (#19458) --- sklearn/model_selection/_validation.py | 2 +- sklearn/model_selection/tests/test_search.py | 20 ++++++--- sklearn/model_selection/tests/test_split.py | 5 +-- .../model_selection/tests/test_validation.py | 41 ++++++++++++------- 4 files changed, 43 insertions(+), 25 deletions(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 7a52b656e1804..8452c4eafbf90 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -1473,7 +1473,7 @@ def _translate_train_sizes(train_sizes, n_max_training_samples): if n_ticks > train_sizes_abs.shape[0]: warnings.warn("Removed duplicate entries from 'train_sizes'. Number " "of ticks will be less than the size of " - "'train_sizes' %d instead of %d)." + "'train_sizes': %d instead of %d." 
% (train_sizes_abs.shape[0], n_ticks), RuntimeWarning) return train_sizes_abs diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 5e63716164b6f..f9e0babebe3ad 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -14,8 +14,6 @@ import pytest from sklearn.utils._testing import ( - assert_warns, - assert_warns_message, assert_raise_message, assert_array_equal, assert_array_almost_equal, @@ -1433,7 +1431,12 @@ def test_grid_search_failing_classifier(): # error in this test. gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy', refit=False, error_score=0.0) - assert_warns(FitFailedWarning, gs.fit, X, y) + warning_message = ( + "Estimator fit failed. The score on this train-test partition " + "for these parameters will be set to 0.0.*." + ) + with pytest.warns(FitFailedWarning, match=warning_message): + gs.fit(X, y) n_candidates = len(gs.cv_results_['params']) # Ensure that grid scores were set to zero as required for those fits @@ -1449,7 +1452,12 @@ def get_cand_scores(i): gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy', refit=False, error_score=float('nan')) - assert_warns(FitFailedWarning, gs.fit, X, y) + warning_message = ( + "Estimator fit failed. The score on this train-test partition " + "for these parameters will be set to nan." + ) + with pytest.warns(FitFailedWarning, match=warning_message): + gs.fit(X, y) n_candidates = len(gs.cv_results_['params']) assert all(np.all(np.isnan(get_cand_scores(cand_i))) for cand_i in range(n_candidates) @@ -1492,8 +1500,8 @@ def test_parameters_sampler_replacement(): 'than n_iter=%d. Running %d iterations. For ' 'exhaustive searches, use GridSearchCV.' % (grid_size, n_iter, grid_size)) - assert_warns_message(UserWarning, expected_warning, - list, sampler) + with pytest.warns(UserWarning, match=expected_warning): + list(sampler) # degenerates to GridSearchCV if n_iter the same as grid_size sampler = ParameterSampler(params, n_iter=8) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 5d91a505238ef..183a2eab84b63 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -14,7 +14,6 @@ from sklearn.utils._testing import assert_raises_regexp from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import ignore_warnings from sklearn.utils.validation import _num_samples @@ -193,8 +192,8 @@ def test_kfold_valueerrors(): y = np.array([3, 3, -1, -1, 3]) skf_3 = StratifiedKFold(3) - assert_warns_message(Warning, "The least populated class", - next, skf_3.split(X2, y)) + with pytest.warns(Warning, match="The least populated class"): + next(skf_3.split(X2, y)) # Check that despite the warning the folds are still computed even # though all the classes are not necessarily represented at on each diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 8405d3b38c452..8bb853bcd51b4 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -17,8 +17,6 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_raises from sklearn.utils._testing import 
assert_raise_message -from sklearn.utils._testing import assert_warns -from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import assert_raises_regex from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal @@ -857,13 +855,12 @@ def split(self, X, y=None, groups=None): X, y = load_iris(return_X_y=True) - warning_message = ('Number of classes in training fold (2) does ' - 'not match total number of classes (3). ' + warning_message = (r'Number of classes in training fold \(2\) does ' + r'not match total number of classes \(3\). ' 'Results may not be appropriate for your use case.') - assert_warns_message(RuntimeWarning, warning_message, - cross_val_predict, - LogisticRegression(solver="liblinear"), - X, y, method='predict_proba', cv=KFold(2)) + with pytest.warns(RuntimeWarning, match=warning_message): + cross_val_predict(LogisticRegression(solver="liblinear"), + X, y, method='predict_proba', cv=KFold(2)) def test_cross_val_predict_decision_function_shape(): @@ -1210,9 +1207,13 @@ def test_learning_curve_remove_duplicate_sample_sizes(): n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=0) estimator = MockImprovingEstimator(2) - train_sizes, _, _ = assert_warns( - RuntimeWarning, learning_curve, estimator, X, y, cv=3, - train_sizes=np.linspace(0.33, 1.0, 3)) + warning_message = ( + "Removed duplicate entries from 'train_sizes'. Number of ticks " + "will be less than the size of 'train_sizes': 2 instead of 3." + ) + with pytest.warns(RuntimeWarning, match=warning_message): + train_sizes, _, _ = learning_curve( + estimator, X, y, cv=3, train_sizes=np.linspace(0.33, 1.0, 3)) assert_array_equal(train_sizes, [1, 2]) @@ -1753,8 +1754,13 @@ def test_fit_and_score_failing(): # passing error score to trigger the warning message fit_and_score_kwargs = {'error_score': 0} # check if the warning message type is as expected - assert_warns(FitFailedWarning, _fit_and_score, *fit_and_score_args, - **fit_and_score_kwargs) + warning_message = ( + "Estimator fit failed. The score on this train-test partition for " + "these parameters will be set to %f." + % (fit_and_score_kwargs['error_score']) + ) + with pytest.warns(FitFailedWarning, match=warning_message): + _fit_and_score(*fit_and_score_args, **fit_and_score_kwargs) # since we're using FailingClassfier, our error will be the following error_message = "ValueError: Failing classifier failed as required" # the warning message we're expecting to see @@ -1769,8 +1775,13 @@ def test_warn_trace(msg): mtb = split[0] + '\n' + split[-1] return warning_message in mtb # check traceback is included - assert_warns_message(FitFailedWarning, test_warn_trace, _fit_and_score, - *fit_and_score_args, **fit_and_score_kwargs) + warning_message = ( + "Estimator fit failed. The score on this train-test partition for " + "these parameters will be set to %f." 
+ % (fit_and_score_kwargs['error_score']) + ) + with pytest.warns(FitFailedWarning, match=warning_message): + _fit_and_score(*fit_and_score_args, **fit_and_score_kwargs) fit_and_score_kwargs = {'error_score': 'raise'} # check if exception was raised, with default error_score='raise' From b169bc09b06bc257186feebb9706d38048365987 Mon Sep 17 00:00:00 2001 From: Maren Westermann Date: Sat, 20 Feb 2021 09:50:57 +0100 Subject: [PATCH 188/478] FIX RuntimeWarning by dividing by zero in test_sanity_check_pls_regression_constant_column_Y (#19480) --- sklearn/cross_decomposition/tests/test_pls.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/cross_decomposition/tests/test_pls.py b/sklearn/cross_decomposition/tests/test_pls.py index c01e790ca1644..04c791fd4154a 100644 --- a/sklearn/cross_decomposition/tests/test_pls.py +++ b/sklearn/cross_decomposition/tests/test_pls.py @@ -148,9 +148,12 @@ def test_sanity_check_pls_regression_constant_column_Y(): x_loadings_sign_flip = np.sign(expected_x_loadings / pls.x_loadings_) x_weights_sign_flip = np.sign(expected_x_weights / pls.x_weights_) - y_loadings_sign_flip = np.sign(expected_y_loadings / pls.y_loadings_) + # we ignore the first full-zeros row for y + y_loadings_sign_flip = np.sign(expected_y_loadings[1:] / + pls.y_loadings_[1:]) + assert_array_equal(x_loadings_sign_flip, x_weights_sign_flip) - assert_array_equal(x_loadings_sign_flip[1:], y_loadings_sign_flip[1:]) + assert_array_equal(x_loadings_sign_flip[1:], y_loadings_sign_flip) def test_sanity_check_pls_canonical(): From 70c6ac9d04c396faaf604c2fd1d3945f25e4d6d4 Mon Sep 17 00:00:00 2001 From: Ashish Date: Sat, 20 Feb 2021 22:31:19 +0530 Subject: [PATCH 189/478] DOC Fixed typos in documentation (#19511) --- sklearn/inspection/_permutation_importance.py | 2 +- sklearn/model_selection/_search.py | 4 ++-- sklearn/model_selection/_validation.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py index 9f2bdb0916254..2a7b6cd23147b 100644 --- a/sklearn/inspection/_permutation_importance.py +++ b/sklearn/inspection/_permutation_importance.py @@ -115,7 +115,7 @@ def permutation_importance(estimator, X, y, *, scoring=None, n_repeats=5, - a single string (see :ref:`scoring_parameter`); - a callable (see :ref:`scoring`) that returns a single value. - If `scoring` reprents multiple scores, one can use: + If `scoring` represents multiple scores, one can use: - a list or tuple of unique strings; - a callable returning a dictionary where the keys are the metric diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index e9c498816eae2..abe3b87488d8c 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -1006,7 +1006,7 @@ class GridSearchCV(BaseSearchCV): - a single string (see :ref:`scoring_parameter`); - a callable (see :ref:`scoring`) that returns a single value. - If `scoring` reprents multiple scores, one can use: + If `scoring` represents multiple scores, one can use: - a list or tuple of unique strings; - a callable returning a dictionary where the keys are the metric @@ -1346,7 +1346,7 @@ class RandomizedSearchCV(BaseSearchCV): - a single string (see :ref:`scoring_parameter`); - a callable (see :ref:`scoring`) that returns a single value. 
- If `scoring` reprents multiple scores, one can use: + If `scoring` represents multiple scores, one can use: - a list or tuple of unique strings; - a callable returning a dictionary where the keys are the metric diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 8452c4eafbf90..63f9a53fcf91f 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -74,7 +74,7 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, - a single string (see :ref:`scoring_parameter`); - a callable (see :ref:`scoring`) that returns a single value. - If `scoring` reprents multiple scores, one can use: + If `scoring` represents multiple scores, one can use: - a list or tuple of unique strings; - a callable returning a dictionary where the keys are the metric From 23f8df3c8e96697cc965116416bcfc85f3a716e8 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 22 Feb 2021 08:38:43 +0100 Subject: [PATCH 190/478] TST Improve ridge solver consistency tests (#19503) --- sklearn/linear_model/tests/test_ridge.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index 6e8a6761dda26..8e33514af83f9 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -39,6 +39,7 @@ from sklearn.model_selection import cross_val_predict from sklearn.model_selection import LeaveOneOut +from sklearn.preprocessing import minmax_scale from sklearn.utils import check_random_state from sklearn.datasets import make_multilabel_classification @@ -415,24 +416,32 @@ def _make_sparse_offset_regression( @pytest.mark.parametrize( 'n_samples,dtype,proportion_nonzero', [(20, 'float32', .1), (40, 'float32', 1.), (20, 'float64', .2)]) +@pytest.mark.parametrize('normalize', [True, False]) @pytest.mark.parametrize('seed', np.arange(3)) def test_solver_consistency( - solver, proportion_nonzero, n_samples, dtype, sparse_X, seed): + solver, proportion_nonzero, n_samples, dtype, sparse_X, seed, + normalize): alpha = 1. noise = 50. if proportion_nonzero > .9 else 500. X, y = _make_sparse_offset_regression( bias=10, n_features=30, proportion_nonzero=proportion_nonzero, noise=noise, random_state=seed, n_samples=n_samples) + if not normalize: + # Manually scale the data to avoid pathological cases. We use + # minmax_scale to deal with the sparse case without breaking + # the sparsity pattern. 
+ X = minmax_scale(X) svd_ridge = Ridge( - solver='svd', normalize=True, alpha=alpha).fit(X, y) + solver='svd', normalize=normalize, alpha=alpha).fit(X, y) X = X.astype(dtype, copy=False) y = y.astype(dtype, copy=False) if sparse_X: X = sp.csr_matrix(X) if solver == 'ridgecv': - ridge = RidgeCV(alphas=[alpha], normalize=True) + ridge = RidgeCV(alphas=[alpha], normalize=normalize) else: - ridge = Ridge(solver=solver, tol=1e-10, normalize=True, alpha=alpha) + ridge = Ridge(solver=solver, tol=1e-10, normalize=normalize, + alpha=alpha) ridge.fit(X, y) assert_allclose( ridge.coef_, svd_ridge.coef_, atol=1e-3, rtol=1e-3) From c3eb5eda0fe08d3c0341031a23211b89721ae3a8 Mon Sep 17 00:00:00 2001 From: Haidar Almubarak Date: Mon, 22 Feb 2021 11:54:31 +0300 Subject: [PATCH 191/478] TST replace assert_raise_* by pytest.raises in neighbors module (#19388) Co-authored-by: SteveKola Co-authored-by: Olivier Grisel --- sklearn/cluster/tests/test_hierarchical.py | 7 +- sklearn/cluster/tests/test_mean_shift.py | 7 +- sklearn/cluster/tests/test_optics.py | 7 +- sklearn/neighbors/tests/test_dist_metrics.py | 7 +- sklearn/neighbors/tests/test_kde.py | 39 +++-- sklearn/neighbors/tests/test_lof.py | 20 +-- sklearn/neighbors/tests/test_nca.py | 165 +++++++++--------- .../neighbors/tests/test_nearest_centroid.py | 6 +- sklearn/neighbors/tests/test_neighbors.py | 139 ++++++++------- 9 files changed, 203 insertions(+), 194 deletions(-) diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index b5a2d9bbf2701..1f835a52f0105 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -17,7 +17,6 @@ from sklearn.metrics.cluster import adjusted_rand_score from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import ignore_warnings from sklearn.cluster import ward_tree @@ -140,7 +139,8 @@ def test_zero_cosine_linkage_tree(): X = np.array([[0, 1], [0, 0]]) msg = 'Cosine affinity cannot be used when X contains zero vectors' - assert_raise_message(ValueError, msg, linkage_tree, X, affinity='cosine') + with pytest.raises(ValueError, match=msg): + linkage_tree(X, affinity='cosine') @pytest.mark.parametrize('n_clusters, distance_threshold', @@ -644,7 +644,8 @@ def test_agg_n_clusters(): agc = AgglomerativeClustering(n_clusters=n_clus) msg = ("n_clusters should be an integer greater than 0." " %s was provided." % str(agc.n_clusters)) - assert_raise_message(ValueError, msg, agc.fit, X) + with pytest.raises(ValueError, match=msg): + agc.fit(X) def test_affinity_passed_to_fix_connectivity(): diff --git a/sklearn/cluster/tests/test_mean_shift.py b/sklearn/cluster/tests/test_mean_shift.py index 7d2300711466a..2feb5363c28c8 100644 --- a/sklearn/cluster/tests/test_mean_shift.py +++ b/sklearn/cluster/tests/test_mean_shift.py @@ -11,7 +11,6 @@ from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import assert_allclose from sklearn.cluster import MeanShift @@ -73,7 +72,8 @@ def test_estimate_bandwidth_with_sparse_matrix(): # Test estimate_bandwidth with sparse matrix X = sparse.lil_matrix((1000, 1000)) msg = "A sparse matrix was passed, but dense data is required." 
- assert_raise_message(TypeError, msg, estimate_bandwidth, X) + with pytest.raises(TypeError, match=msg): + estimate_bandwidth(X) def test_parallel(): @@ -103,7 +103,8 @@ def test_meanshift_all_orphans(): # init away from the data, crash with a sensible warning ms = MeanShift(bandwidth=0.1, seeds=[[-9, -9], [-10, -10]]) msg = "No point was within bandwidth=0.1" - assert_raise_message(ValueError, msg, ms.fit, X,) + with pytest.raises(ValueError, match=msg): + ms.fit(X,) def test_unfitted(): diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index 8578c68d0f0dc..b253173c0b957 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -13,7 +13,6 @@ from sklearn.cluster import DBSCAN from sklearn.utils import shuffle from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import assert_allclose from sklearn.cluster.tests.common import generate_clustered_data @@ -181,7 +180,8 @@ def test_minimum_number_of_sample_check(): clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1) # Run the fit - assert_raise_message(ValueError, msg, clust.fit, X) + with pytest.raises(ValueError, match=msg): + clust.fit(X) def test_bad_extract(): @@ -195,7 +195,8 @@ def test_bad_extract(): clust = OPTICS(max_eps=5.0 * 0.03, cluster_method='dbscan', eps=0.3, min_samples=10) - assert_raise_message(ValueError, msg, clust.fit, X) + with pytest.raises(ValueError, match=msg): + clust.fit(X) def test_bad_reachability(): diff --git a/sklearn/neighbors/tests/test_dist_metrics.py b/sklearn/neighbors/tests/test_dist_metrics.py index 441bcc134fe6b..05e0f4294ebb6 100644 --- a/sklearn/neighbors/tests/test_dist_metrics.py +++ b/sklearn/neighbors/tests/test_dist_metrics.py @@ -10,7 +10,6 @@ from sklearn.neighbors import DistanceMetric from sklearn.neighbors import BallTree from sklearn.utils import check_random_state -from sklearn.utils._testing import assert_raises_regex from sklearn.utils.fixes import sp_version, parse_version @@ -207,9 +206,9 @@ def wrong_distance(x, y): return "1" X = np.ones((5, 2)) - assert_raises_regex(TypeError, - "Custom distance function must accept two vectors", - BallTree, X, metric=wrong_distance) + msg = "Custom distance function must accept two vectors" + with pytest.raises(TypeError, match=msg): + BallTree(X, metric=wrong_distance) def test_input_data_size(): diff --git a/sklearn/neighbors/tests/test_kde.py b/sklearn/neighbors/tests/test_kde.py index cff7ffafe5acd..90ce667e5c284 100644 --- a/sklearn/neighbors/tests/test_kde.py +++ b/sklearn/neighbors/tests/test_kde.py @@ -2,7 +2,7 @@ import pytest -from sklearn.utils._testing import assert_allclose, assert_raises +from sklearn.utils._testing import assert_allclose from sklearn.neighbors import KernelDensity, KDTree, NearestNeighbors from sklearn.neighbors._ball_tree import kernel_norm from sklearn.pipeline import make_pipeline @@ -92,7 +92,8 @@ def test_kernel_density_sampling(n_samples=100, n_features=3): # check unsupported kernels for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']: kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X) - assert_raises(NotImplementedError, kde.sample, 100) + with pytest.raises(NotImplementedError): + kde.sample(100) # non-regression test: used to return a scalar X = rng.randn(4, 1) @@ -111,8 +112,8 @@ def test_kde_algorithm_metric_choice(algorithm, metric): Y = rng.randn(10, 2) if algorithm == 'kd_tree' and metric not in 
KDTree.valid_metrics: - assert_raises(ValueError, KernelDensity, - algorithm=algorithm, metric=metric) + with pytest.raises(ValueError): + KernelDensity(algorithm=algorithm, metric=metric) else: kde = KernelDensity(algorithm=algorithm, metric=metric) kde.fit(X) @@ -129,21 +130,23 @@ def test_kde_score(n_samples=100, n_features=3): def test_kde_badargs(): - assert_raises(ValueError, KernelDensity, - algorithm='blah') - assert_raises(ValueError, KernelDensity, - bandwidth=0) - assert_raises(ValueError, KernelDensity, - kernel='blah') - assert_raises(ValueError, KernelDensity, - metric='blah') - assert_raises(ValueError, KernelDensity, - algorithm='kd_tree', metric='blah') + with pytest.raises(ValueError): + KernelDensity(algorithm='blah') + with pytest.raises(ValueError): + KernelDensity(bandwidth=0) + with pytest.raises(ValueError): + KernelDensity(kernel='blah') + with pytest.raises(ValueError): + KernelDensity(metric='blah') + with pytest.raises(ValueError): + KernelDensity(algorithm='kd_tree', metric='blah') kde = KernelDensity() - assert_raises(ValueError, kde.fit, np.random.random((200, 10)), - sample_weight=np.random.random((200, 10))) - assert_raises(ValueError, kde.fit, np.random.random((200, 10)), - sample_weight=-np.random.random(200)) + with pytest.raises(ValueError): + kde.fit(np.random.random((200, 10)), + sample_weight=np.random.random((200, 10))) + with pytest.raises(ValueError): + kde.fit(np.random.random((200, 10)), + sample_weight=-np.random.random(200)) def test_kde_pipeline_gridsearch(): diff --git a/sklearn/neighbors/tests/test_lof.py b/sklearn/neighbors/tests/test_lof.py index 750fc57a8f457..5d479d5b141f7 100644 --- a/sklearn/neighbors/tests/test_lof.py +++ b/sklearn/neighbors/tests/test_lof.py @@ -6,7 +6,7 @@ import numpy as np from sklearn import neighbors - +import re import pytest from numpy.testing import assert_array_equal @@ -15,9 +15,6 @@ from sklearn.utils import check_random_state from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_warns_message -from sklearn.utils._testing import assert_raises -from sklearn.utils._testing import assert_raises_regex from sklearn.utils.estimator_checks import check_estimator from sklearn.utils.estimator_checks import check_outlier_corruption @@ -125,9 +122,9 @@ def test_n_neighbors_attribute(): assert clf.n_neighbors_ == X.shape[0] - 1 clf = neighbors.LocalOutlierFactor(n_neighbors=500) - assert_warns_message(UserWarning, - "n_neighbors will be set to (n_samples - 1)", - clf.fit, X) + msg = "n_neighbors will be set to (n_samples - 1)" + with pytest.warns(UserWarning, match=re.escape(msg)): + clf.fit(X) assert clf.n_neighbors_ == X.shape[0] - 1 @@ -149,7 +146,8 @@ def test_score_samples(): def test_contamination(): X = [[1, 1], [1, 0]] clf = neighbors.LocalOutlierFactor(contamination=0.6) - assert_raises(ValueError, clf.fit, X) + with pytest.raises(ValueError): + clf.fit(X) def test_novelty_errors(): @@ -161,12 +159,14 @@ def test_novelty_errors(): # predict, decision_function and score_samples raise ValueError for method in ['predict', 'decision_function', 'score_samples']: msg = ('{} is not available when novelty=False'.format(method)) - assert_raises_regex(AttributeError, msg, getattr, clf, method) + with pytest.raises(AttributeError, match=msg): + getattr(clf, method) # check errors for novelty=True clf = neighbors.LocalOutlierFactor(novelty=True) msg = 'fit_predict is not available when novelty=True' - assert_raises_regex(AttributeError, msg, getattr, clf, 'fit_predict') 
+ with pytest.raises(AttributeError, match=msg): + getattr(clf, 'fit_predict') def test_novelty_training_scores(): diff --git a/sklearn/neighbors/tests/test_nca.py b/sklearn/neighbors/tests/test_nca.py index 0bf6d3c0d1763..e7fc741899209 100644 --- a/sklearn/neighbors/tests/test_nca.py +++ b/sklearn/neighbors/tests/test_nca.py @@ -15,8 +15,6 @@ from sklearn import clone from sklearn.exceptions import ConvergenceWarning from sklearn.utils import check_random_state -from sklearn.utils._testing import (assert_raises, - assert_raise_message, assert_warns_message) from sklearn.datasets import load_iris, make_classification, make_blobs from sklearn.neighbors import NeighborhoodComponentsAnalysis from sklearn.metrics import pairwise_distances @@ -127,38 +125,42 @@ def test_params_validation(): rng = np.random.RandomState(42) # TypeError - assert_raises(TypeError, NCA(max_iter='21').fit, X, y) - assert_raises(TypeError, NCA(verbose='true').fit, X, y) - assert_raises(TypeError, NCA(tol='1').fit, X, y) - assert_raises(TypeError, NCA(n_components='invalid').fit, X, y) - assert_raises(TypeError, NCA(warm_start=1).fit, X, y) + with pytest.raises(TypeError): + NCA(max_iter='21').fit(X, y) + with pytest.raises(TypeError): + NCA(verbose='true').fit(X, y) + with pytest.raises(TypeError): + NCA(tol='1').fit(X, y) + with pytest.raises(TypeError): + NCA(n_components='invalid').fit(X, y) + with pytest.raises(TypeError): + NCA(warm_start=1).fit(X, y) # ValueError - assert_raise_message(ValueError, - "`init` must be 'auto', 'pca', 'lda', 'identity', " - "'random' or a numpy array of shape " - "(n_components, n_features).", - NCA(init=1).fit, X, y) - assert_raise_message(ValueError, - '`max_iter`= -1, must be >= 1.', - NCA(max_iter=-1).fit, X, y) - + msg = ( + r"`init` must be 'auto', 'pca', 'lda', 'identity', 'random' or a " + r"numpy array of shape (n_components, n_features)." + ) + with pytest.raises(ValueError, match=re.escape(msg)): + NCA(init=1).fit(X, y) + with pytest.raises(ValueError, match='`max_iter`= -1, must be >= 1.'): + NCA(max_iter=-1).fit(X, y) init = rng.rand(5, 3) - assert_raise_message(ValueError, - 'The output dimensionality ({}) of the given linear ' - 'transformation `init` cannot be greater than its ' - 'input dimensionality ({}).' - .format(init.shape[0], init.shape[1]), - NCA(init=init).fit, X, y) - + msg = ( + f"The output dimensionality ({init.shape[0]}) " + "of the given linear transformation `init` cannot be " + f"greater than its input dimensionality ({init.shape[1]})." + ) + with pytest.raises(ValueError, match=re.escape(msg)): + NCA(init=init).fit(X, y) n_components = 10 - assert_raise_message(ValueError, - 'The preferred dimensionality of the ' - 'projected space `n_components` ({}) cannot ' - 'be greater than the given data ' - 'dimensionality ({})!' - .format(n_components, X.shape[1]), - NCA(n_components=n_components).fit, X, y) + msg = ( + "The preferred dimensionality of the projected space " + f"`n_components` ({n_components}) cannot be greater " + f"than the given data dimensionality ({X.shape[1]})!" 
+ ) + with pytest.raises(ValueError, match=re.escape(msg)): + NCA(n_components=n_components).fit(X, y) def test_transformation_dimensions(): @@ -167,17 +169,15 @@ def test_transformation_dimensions(): # Fail if transformation input dimension does not match inputs dimensions transformation = np.array([[1, 2], [3, 4]]) - assert_raises(ValueError, - NeighborhoodComponentsAnalysis(init=transformation).fit, - X, y) + with pytest.raises(ValueError): + NeighborhoodComponentsAnalysis(init=transformation).fit(X, y) # Fail if transformation output dimension is larger than # transformation input dimension transformation = np.array([[1, 2], [3, 4], [5, 6]]) # len(transformation) > len(transformation[0]) - assert_raises(ValueError, - NeighborhoodComponentsAnalysis(init=transformation).fit, - X, y) + with pytest.raises(ValueError): + NeighborhoodComponentsAnalysis(init=transformation).fit(X, y) # Pass otherwise transformation = np.arange(9).reshape(3, 3) @@ -194,24 +194,25 @@ def test_n_components(): # n_components = X.shape[1] != transformation.shape[0] n_components = X.shape[1] nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) - assert_raise_message(ValueError, - 'The preferred dimensionality of the ' - 'projected space `n_components` ({}) does not match ' - 'the output dimensionality of the given ' - 'linear transformation `init` ({})!' - .format(n_components, init.shape[0]), - nca.fit, X, y) + msg = ( + "The preferred dimensionality of the projected space " + f"`n_components` ({n_components}) does not match the output " + "dimensionality of the given linear transformation " + f"`init` ({init.shape[0]})!" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X, y) # n_components > X.shape[1] n_components = X.shape[1] + 2 nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) - assert_raise_message(ValueError, - 'The preferred dimensionality of the ' - 'projected space `n_components` ({}) cannot ' - 'be greater than the given data ' - 'dimensionality ({})!' - .format(n_components, X.shape[1]), - nca.fit, X, y) + msg = ( + "The preferred dimensionality of the projected space " + f"`n_components` ({n_components}) cannot be greater than " + f"the given data dimensionality ({X.shape[1]})!" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X, y) # n_components < X.shape[1] nca = NeighborhoodComponentsAnalysis(n_components=2, init='identity') @@ -249,34 +250,37 @@ def test_init_transformation(): # init.shape[1] must match X.shape[1] init = rng.rand(X.shape[1], X.shape[1] + 1) nca = NeighborhoodComponentsAnalysis(init=init) - assert_raise_message(ValueError, - 'The input dimensionality ({}) of the given ' - 'linear transformation `init` must match the ' - 'dimensionality of the given inputs `X` ({}).' - .format(init.shape[1], X.shape[1]), - nca.fit, X, y) + msg = ( + f"The input dimensionality ({init.shape[1]}) of the given " + "linear transformation `init` must match the " + f"dimensionality of the given inputs `X` ({X.shape[1]})." + ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X, y) # init.shape[0] must be <= init.shape[1] init = rng.rand(X.shape[1] + 1, X.shape[1]) nca = NeighborhoodComponentsAnalysis(init=init) - assert_raise_message(ValueError, - 'The output dimensionality ({}) of the given ' - 'linear transformation `init` cannot be ' - 'greater than its input dimensionality ({}).' 
- .format(init.shape[0], init.shape[1]), - nca.fit, X, y) + msg = ( + f"The output dimensionality ({init.shape[0]}) of the given " + "linear transformation `init` cannot be " + f"greater than its input dimensionality ({init.shape[1]})." + ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X, y) # init.shape[0] must match n_components init = rng.rand(X.shape[1], X.shape[1]) n_components = X.shape[1] - 2 nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) - assert_raise_message(ValueError, - 'The preferred dimensionality of the ' - 'projected space `n_components` ({}) does not match ' - 'the output dimensionality of the given ' - 'linear transformation `init` ({})!' - .format(n_components, init.shape[0]), - nca.fit, X, y) + msg = ( + "The preferred dimensionality of the " + f"projected space `n_components` ({n_components}) " + "does not match the output dimensionality of the given " + f"linear transformation `init` ({init.shape[0]})!" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X, y) @pytest.mark.parametrize('n_samples', [3, 5, 7, 11]) @@ -325,13 +329,13 @@ def test_warm_start_validation(): X_less_features, y = make_classification(n_samples=30, n_features=4, n_classes=4, n_redundant=0, n_informative=4, random_state=0) - assert_raise_message(ValueError, - 'The new inputs dimensionality ({}) does not ' - 'match the input dimensionality of the ' - 'previously learned transformation ({}).' - .format(X_less_features.shape[1], - nca.components_.shape[1]), - nca.fit, X_less_features, y) + msg = ( + f"The new inputs dimensionality ({X_less_features.shape[1]}) " + "does not match the input dimensionality of the previously learned " + f"transformation ({nca.components_.shape[1]})." + ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X_less_features, y) def test_warm_start_effectiveness(): @@ -466,7 +470,8 @@ def test_callback(capsys): y = iris_target nca = NeighborhoodComponentsAnalysis(callback='my_cb') - assert_raises(ValueError, nca.fit, X, y) + with pytest.raises(ValueError): + nca.fit(X, y) max_iter = 10 @@ -515,9 +520,9 @@ def callback(self, transformation, n_iter): def test_convergence_warning(): nca = NeighborhoodComponentsAnalysis(max_iter=2, verbose=1) cls_name = nca.__class__.__name__ - assert_warns_message(ConvergenceWarning, - '[{}] NCA did not converge'.format(cls_name), - nca.fit, iris_data, iris_target) + msg = '[{}] NCA did not converge'.format(cls_name) + with pytest.warns(ConvergenceWarning, match=re.escape(msg)): + nca.fit(iris_data, iris_target) @pytest.mark.parametrize('param, value', [('n_components', np.int32(3)), diff --git a/sklearn/neighbors/tests/test_nearest_centroid.py b/sklearn/neighbors/tests/test_nearest_centroid.py index 451aeff377e19..f91cae74b0585 100644 --- a/sklearn/neighbors/tests/test_nearest_centroid.py +++ b/sklearn/neighbors/tests/test_nearest_centroid.py @@ -3,12 +3,12 @@ """ import numpy as np +import pytest from scipy import sparse as sp from numpy.testing import assert_array_equal from sklearn.neighbors import NearestCentroid from sklearn import datasets -from sklearn.utils._testing import assert_raises # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] @@ -56,7 +56,7 @@ def test_classification_toy(): def test_precomputed(): clf = NearestCentroid(metric='precomputed') - with assert_raises(ValueError): + with pytest.raises(ValueError): clf.fit(X, y) @@ -158,5 +158,5 @@ def test_features_zero_var(): y[0] = 1 clf = NearestCentroid(shrink_threshold=0.1) - 
with assert_raises(ValueError): + with pytest.raises(ValueError): clf.fit(X, y) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index a4b55afd090c3..8ce52119faa02 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1,6 +1,7 @@ from itertools import product import pytest +import re import numpy as np from scipy.sparse import (bsr_matrix, coo_matrix, csc_matrix, csr_matrix, dok_matrix, lil_matrix, issparse) @@ -19,11 +20,6 @@ from sklearn.pipeline import make_pipeline from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_raises -from sklearn.utils._testing import assert_raises_regex -from sklearn.utils._testing import assert_warns -from sklearn.utils._testing import assert_warns_message -from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import ignore_warnings from sklearn.utils.validation import check_random_state from sklearn.utils.fixes import sp_version, parse_version @@ -128,18 +124,21 @@ def test_n_neighbors_datatype(): msg = "Expected n_neighbors > 0. Got -3" neighbors_ = neighbors.NearestNeighbors(n_neighbors=3.) - assert_raises_regex(TypeError, expected_msg, neighbors_.fit, X) - assert_raises_regex(ValueError, msg, - neighbors_.kneighbors, X=X, n_neighbors=-3) - assert_raises_regex(TypeError, expected_msg, - neighbors_.kneighbors, X=X, n_neighbors=3.) + with pytest.raises(TypeError, match=expected_msg): + neighbors_.fit(X) + with pytest.raises(ValueError, match=msg): + neighbors_.kneighbors(X=X, n_neighbors=-3) + with pytest.raises(TypeError, match=expected_msg): + neighbors_.kneighbors(X=X, n_neighbors=3.) def test_not_fitted_error_gets_raised(): X = [[1]] neighbors_ = neighbors.NearestNeighbors() - assert_raises(NotFittedError, neighbors_.kneighbors_graph, X) - assert_raises(NotFittedError, neighbors_.radius_neighbors_graph, X) + with pytest.raises(NotFittedError): + neighbors_.kneighbors_graph(X) + with pytest.raises(NotFittedError): + neighbors_.radius_neighbors_graph(X) @ignore_warnings(category=EfficiencyWarning) @@ -181,7 +180,8 @@ def check_precomputed(make_train_test, estimators): assert_array_almost_equal(ind_X, ind_D) # Must raise a ValueError if the matrix is not of correct shape - assert_raises(ValueError, getattr(nbrs_D, method), X) + with pytest.raises(ValueError): + getattr(nbrs_D, method)(X) target = np.arange(X.shape[0]) for Est in estimators: @@ -295,14 +295,15 @@ def test_precomputed_sparse_invalid(): dist_csr = csr_matrix(dist) neigh.fit(dist_csr) msg = "2 neighbors per samples are required, but some samples have only 1" - assert_raises_regex(ValueError, msg, neigh.kneighbors, None, n_neighbors=1) + with pytest.raises(ValueError, match=msg): + neigh.kneighbors(None, n_neighbors=1) # Checks error with inconsistent distance matrix dist = np.array([[5., 2., 1.], [-2., 0., 3.], [1., 3., 0.]]) dist_csr = csr_matrix(dist) msg = "Negative values in data passed to precomputed distance matrix." 
- assert_raises_regex(ValueError, msg, neigh.kneighbors, dist_csr, - n_neighbors=1) + with pytest.raises(ValueError, match=msg): + neigh.kneighbors(dist_csr, n_neighbors=1) def test_precomputed_cross_validation(): @@ -486,7 +487,8 @@ def test_radius_neighbors_classifier_when_no_neighbors(): assert_array_equal(np.array([1, 2]), clf.predict(z1)) if outlier_label is None: - assert_raises(ValueError, clf.predict, z2) + with pytest.raises(ValueError): + clf.predict(z2) def test_radius_neighbors_classifier_outlier_labeling(): @@ -526,13 +528,15 @@ def test_radius_neighbors_classifier_outlier_labeling(): def check_array_exception(): clf = RNC(radius=1, outlier_label=[[5]]) clf.fit(X, y) - assert_raises(TypeError, check_array_exception) + with pytest.raises(TypeError): + check_array_exception() # test invalid outlier_label dtype def check_dtype_exception(): clf = RNC(radius=1, outlier_label='a') clf.fit(X, y) - assert_raises(TypeError, check_dtype_exception) + with pytest.raises(TypeError): + check_dtype_exception() # test most frequent clf = RNC(radius=1, outlier_label='most_frequent') @@ -553,7 +557,8 @@ def check_warning(): clf = RNC(radius=1, outlier_label=4) clf.fit(X, y) clf.predict_proba([[1], [15]]) - assert_warns(UserWarning, check_warning) + with pytest.warns(UserWarning): + check_warning() # test multi output same outlier label y_multi = [[0, 1], [2, 1], [2, 2], [1, 2], [1, 2], @@ -580,7 +585,8 @@ def check_warning(): def check_exception(): clf = RNC(radius=1, outlier_label=[0, 1, 2]) clf.fit(X, y_multi) - assert_raises(ValueError, check_exception) + with pytest.raises(ValueError): + check_exception() def test_radius_neighbors_classifier_zero_distance(): @@ -934,10 +940,8 @@ def test_radius_neighbors_regressor(n_samples=40, X_test_nan = np.full((1, n_features), -1.) 
empty_warning_msg = ("One or more samples have no neighbors " "within specified radius; predicting NaN.") - pred = assert_warns_message(UserWarning, - empty_warning_msg, - neigh.predict, - X_test_nan) + with pytest.warns(UserWarning, match=re.escape(empty_warning_msg)): + pred = neigh.predict(X_test_nan) assert np.all(np.isnan(pred)) @@ -1044,8 +1048,7 @@ def test_neighbors_iris(): rgs = neighbors.KNeighborsRegressor(n_neighbors=5, algorithm=algorithm) rgs.fit(iris.data, iris.target) - assert (np.mean(rgs.predict(iris.data).round() == iris.target) > - 0.95) + assert (np.mean(rgs.predict(iris.data).round() == iris.target) > 0.95) def test_neighbors_digits(): @@ -1166,9 +1169,8 @@ def test_radius_neighbors_graph_sparse(seed=36): def test_neighbors_badargs(): # Test bad argument values: these should all raise ValueErrors - assert_raises(ValueError, - neighbors.NearestNeighbors, - algorithm='blah') + with pytest.raises(ValueError): + neighbors.NearestNeighbors(algorithm='blah') X = rng.random_sample((10, 2)) Xsparse = csr_matrix(X) @@ -1179,49 +1181,45 @@ def test_neighbors_badargs(): neighbors.RadiusNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.RadiusNeighborsRegressor): - assert_raises(ValueError, - cls, - weights='blah') - assert_raises(ValueError, - cls, p=-1) - assert_raises(ValueError, - cls, algorithm='blah') + with pytest.raises(ValueError): + cls(weights='blah') + with pytest.raises(ValueError): + cls(p=-1) + with pytest.raises(ValueError): + cls(algorithm='blah') nbrs = cls(algorithm='ball_tree', metric='haversine') - assert_raises(ValueError, - nbrs.predict, - X) - assert_raises(ValueError, - ignore_warnings(nbrs.fit), - Xsparse, y) + with pytest.raises(ValueError): + nbrs.predict(X) + with pytest.raises(ValueError): + ignore_warnings(nbrs.fit(Xsparse, y)) nbrs = cls(metric='haversine', algorithm='brute') nbrs.fit(X3, y) - assert_raise_message(ValueError, - "Haversine distance only valid in 2 dimensions", - nbrs.predict, - X3) + msg = "Haversine distance only valid in 2 dimensions" + with pytest.raises(ValueError, match=msg): + nbrs.predict(X3) nbrs = cls() - assert_raises(ValueError, - nbrs.fit, - np.ones((0, 2)), np.ones(0)) - assert_raises(ValueError, - nbrs.fit, - X[:, :, None], y) + with pytest.raises(ValueError): + nbrs.fit(np.ones((0, 2)), np.ones(0)) + with pytest.raises(ValueError): + nbrs.fit(X[:, :, None], y) nbrs.fit(X, y) - assert_raises(ValueError, - nbrs.predict, - [[]]) + with pytest.raises(ValueError): + nbrs.predict([[]]) if (issubclass(cls, neighbors.KNeighborsClassifier) or issubclass(cls, neighbors.KNeighborsRegressor)): nbrs = cls(n_neighbors=-1) - assert_raises(ValueError, nbrs.fit, X, y) + with pytest.raises(ValueError): + nbrs.fit(X, y) nbrs = neighbors.NearestNeighbors().fit(X) - assert_raises(ValueError, nbrs.kneighbors_graph, X, mode='blah') - assert_raises(ValueError, nbrs.radius_neighbors_graph, X, mode='blah') + with pytest.raises(ValueError): + nbrs.kneighbors_graph(X, mode='blah') + with pytest.raises(ValueError): + nbrs.radius_neighbors_graph(X, mode='blah') def test_neighbors_metrics(n_samples=20, n_features=3, @@ -1257,10 +1255,10 @@ def test_neighbors_metrics(n_samples=20, n_features=3, # KD tree doesn't support all metrics if (algorithm == 'kd_tree' and metric not in neighbors.KDTree.valid_metrics): - assert_raises(ValueError, - neighbors.NearestNeighbors, - algorithm=algorithm, - metric=metric, metric_params=metric_params) + with pytest.raises(ValueError): + neighbors.NearestNeighbors(algorithm=algorithm, + metric=metric, + 
metric_params=metric_params) continue neigh = neighbors.NearestNeighbors(n_neighbors=n_neighbors, algorithm=algorithm, @@ -1363,8 +1361,8 @@ def test_valid_brute_metric_for_auto_algorithm(): def test_metric_params_interface(): - assert_warns(SyntaxWarning, neighbors.KNeighborsClassifier, - metric_params={'p': 3}) + with pytest.warns(SyntaxWarning): + neighbors.KNeighborsClassifier(metric_params={'p': 3}) def test_predict_sparse_ball_kd_tree(): @@ -1375,7 +1373,8 @@ def test_predict_sparse_ball_kd_tree(): nbrs2 = neighbors.KNeighborsRegressor(1, algorithm='ball_tree') for model in [nbrs1, nbrs2]: model.fit(X, y) - assert_raises(ValueError, model.predict, csr_matrix(X)) + with pytest.raises(ValueError): + model.predict(csr_matrix(X)) def test_non_euclidean_kneighbors(): @@ -1406,12 +1405,12 @@ def test_non_euclidean_kneighbors(): # Raise error when wrong parameters are supplied, X_nbrs = neighbors.NearestNeighbors(n_neighbors=3, metric='manhattan') X_nbrs.fit(X) - assert_raises(ValueError, neighbors.kneighbors_graph, X_nbrs, 3, - metric='euclidean') + with pytest.raises(ValueError): + neighbors.kneighbors_graph(X_nbrs, 3, metric='euclidean') X_nbrs = neighbors.NearestNeighbors(radius=radius, metric='manhattan') X_nbrs.fit(X) - assert_raises(ValueError, neighbors.radius_neighbors_graph, X_nbrs, - radius, metric='euclidean') + with pytest.raises(ValueError): + neighbors.radius_neighbors_graph(X_nbrs, radius, metric='euclidean') def check_object_arrays(nparray, list_check): From dbd68b2846905efb3682db46c798298b1fd3d6c2 Mon Sep 17 00:00:00 2001 From: Maria Telenczuk Date: Mon, 22 Feb 2021 11:48:54 +0100 Subject: [PATCH 192/478] MRG fix Normalize for linear models when used with sample_weight (#19426) Co-authored-by: Alexandre Gramfort Co-authored-by: Guillaume Lemaitre Co-authored-by: Olivier Grisel --- doc/whats_new/v1.0.rst | 6 + sklearn/linear_model/_base.py | 40 ++--- sklearn/linear_model/tests/test_base.py | 138 ++++++++++++----- .../tests/test_coordinate_descent.py | 145 ++++++++++++------ 4 files changed, 222 insertions(+), 107 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 66272c97d7a16..25e0b369bebd3 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -135,6 +135,12 @@ Changelog :pr:`17743` by :user:`Maria Telenczuk ` and :user:`Alexandre Gramfort `. +- |Fix|: `sample_weight` are now fully taken into account in linear models + when `normalize=True` for both feature centering and feature + scaling. + :pr:`19426` by :user:`Alexandre Gramfort ` and + :user:`Maria Telenczuk `. + :mod:`sklearn.manifold` ....................... diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index f84d4234c193c..61005cb4b5d4a 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -33,6 +33,7 @@ from ..utils.validation import _deprecate_positional_args from ..utils import check_random_state from ..utils.extmath import safe_sparse_dot +from ..utils.extmath import _incremental_mean_and_var from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale from ..utils.fixes import sparse_lsqr from ..utils._seq_dataset import ArrayDataset32, CSRDataset32 @@ -40,7 +41,6 @@ from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils.fixes import delayed -from ..preprocessing import normalize as f_normalize # TODO: bayesian_ridge_regression and bayesian_regression_ard # should be squashed into its respective objects. 
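Note on the hunk below: it rewrites `_preprocess_data` so that, with `fit_intercept=True`, features are centered on the sample_weight-weighted mean and, when `normalize=True`, divided by the weighted standard deviation times `sqrt(n_samples)` (near-constant features keep a scale of 1). The following is only an illustrative NumPy sketch of that relationship, mirroring the check added in `test_preprocess_data_weighted` further down; the variable names are placeholders, not part of the patch.

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(20, 3)
sample_weight = rng.rand(20)

# weighted mean and variance, as the new code path computes them
X_mean = np.average(X, axis=0, weights=sample_weight)
X_var = np.average((X - X_mean) ** 2, axis=0, weights=sample_weight)

# normalize=True divides the centered data by sqrt(weighted variance * n_samples)
X_scale = np.sqrt(X_var * X.shape[0])
Xt = (X - X_mean) / X_scale

# i.e. weighted standardization up to a factor of sqrt(n_samples)
np.testing.assert_allclose(
    Xt * np.sqrt(X.shape[0]), (X - X_mean) / np.sqrt(X_var)
)
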
@@ -229,33 +229,33 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, if fit_intercept: if sp.issparse(X): - X_offset, X_var = mean_variance_axis(X, axis=0) + X_offset, X_var = mean_variance_axis( + X, axis=0, weights=sample_weight + ) if not return_mean: X_offset[:] = X.dtype.type(0) + else: + X_offset, X_var, _ = _incremental_mean_and_var( + X, last_mean=0., last_variance=0., last_sample_count=0., + sample_weight=sample_weight + ) - if normalize: + X_offset = X_offset.astype(X.dtype) + X -= X_offset - # TODO: f_normalize could be used here as well but the function - # inplace_csr_row_normalize_l2 must be changed such that it - # can return also the norms computed internally + X_var = X_var.astype(X.dtype, copy=False) - # transform variance to norm in-place - X_var *= X.shape[0] - X_scale = np.sqrt(X_var, X_var) - del X_var - X_scale[X_scale == 0] = 1 + if normalize: + X_var *= X.shape[0] + X_scale = np.sqrt(X_var, out=X_var) + X_scale[X_scale < 10 * np.finfo(X_scale.dtype).eps] = 1. + if sp.issparse(X): inplace_column_scale(X, 1. / X_scale) else: - X_scale = np.ones(X.shape[1], dtype=X.dtype) - + X /= X_scale else: - X_offset = np.average(X, axis=0, weights=sample_weight) - X -= X_offset - if normalize: - X, X_scale = f_normalize(X, axis=0, copy=False, - return_norm=True) - else: - X_scale = np.ones(X.shape[1], dtype=X.dtype) + X_scale = np.ones(X.shape[1], dtype=X.dtype) + y_offset = np.average(y, axis=0, weights=sample_weight) y = y - y_offset else: diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index 75cc9dd5fd8f1..56ee18f5f0d06 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -1,5 +1,6 @@ # Author: Alexandre Gramfort # Fabian Pedregosa +# Maria Telenczuk # # License: BSD 3 clause @@ -24,6 +25,7 @@ from sklearn.datasets import make_sparse_uncorrelated from sklearn.datasets import make_regression from sklearn.datasets import load_iris +from sklearn.preprocessing import StandardScaler rng = np.random.RandomState(0) rtol = 1e-6 @@ -407,31 +409,31 @@ def test_preprocess_data(): X = rng.rand(n_samples, n_features) y = rng.rand(n_samples) expected_X_mean = np.mean(X, axis=0) - expected_X_norm = np.std(X, axis=0) * np.sqrt(X.shape[0]) + expected_X_scale = np.std(X, axis=0) * np.sqrt(X.shape[0]) expected_y_mean = np.mean(y, axis=0) - Xt, yt, X_mean, y_mean, X_norm = \ + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=False, normalize=False) assert_array_almost_equal(X_mean, np.zeros(n_features)) assert_array_almost_equal(y_mean, 0) - assert_array_almost_equal(X_norm, np.ones(n_features)) + assert_array_almost_equal(X_scale, np.ones(n_features)) assert_array_almost_equal(Xt, X) assert_array_almost_equal(yt, y) - Xt, yt, X_mean, y_mean, X_norm = \ + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=True, normalize=False) assert_array_almost_equal(X_mean, expected_X_mean) assert_array_almost_equal(y_mean, expected_y_mean) - assert_array_almost_equal(X_norm, np.ones(n_features)) + assert_array_almost_equal(X_scale, np.ones(n_features)) assert_array_almost_equal(Xt, X - expected_X_mean) assert_array_almost_equal(yt, y - expected_y_mean) - Xt, yt, X_mean, y_mean, X_norm = \ + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=True, normalize=True) assert_array_almost_equal(X_mean, expected_X_mean) assert_array_almost_equal(y_mean, expected_y_mean) - assert_array_almost_equal(X_norm, expected_X_norm) - 
assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_norm) + assert_array_almost_equal(X_scale, expected_X_scale) + assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_scale) assert_array_almost_equal(yt, y - expected_y_mean) @@ -461,36 +463,94 @@ def test_preprocess_data_multioutput(): assert_array_almost_equal(yt, y - y_mean) -def test_preprocess_data_weighted(): +@pytest.mark.parametrize("is_sparse", [False, True]) +def test_preprocess_data_weighted(is_sparse): n_samples = 200 - n_features = 2 + n_features = 4 + # Generate random data with 50% of zero values to make sure + # that the sparse variant of this test is actually sparse. This also + # shifts the mean value for each columns in X further away from + # zero. X = rng.rand(n_samples, n_features) + X[X < 0.5] = 0. + + # Scale the first feature of X to be 10 larger than the other to + # better check the impact of feature scaling. + X[:, 0] *= 10 + + # Constant non-zero feature: this edge-case is currently not handled + # correctly for sparse data, see: + # https://github.com/scikit-learn/scikit-learn/issues/19450 + # X[:, 2] = 1. + + # Constant zero feature (non-materialized in the sparse case) + X[:, 3] = 0. y = rng.rand(n_samples) + sample_weight = rng.rand(n_samples) expected_X_mean = np.average(X, axis=0, weights=sample_weight) expected_y_mean = np.average(y, axis=0, weights=sample_weight) - # XXX: if normalize=True, should we expect a weighted standard deviation? - # Currently not weighted, but calculated with respect to weighted mean - expected_X_norm = (np.sqrt(X.shape[0]) * - np.mean((X - expected_X_mean) ** 2, axis=0) ** .5) + X_sample_weight_avg = np.average(X, weights=sample_weight, axis=0) + X_sample_weight_var = np.average((X - X_sample_weight_avg)**2, + weights=sample_weight, + axis=0) + expected_X_scale = np.sqrt(X_sample_weight_var) * np.sqrt(n_samples) + + # near constant features should not be scaled + expected_X_scale[expected_X_scale < 10 * np.finfo(np.float64).eps] = 1 + + if is_sparse: + X = sparse.csr_matrix(X) - Xt, yt, X_mean, y_mean, X_norm = \ + # normalize is False + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=True, normalize=False, - sample_weight=sample_weight) + sample_weight=sample_weight, return_mean=True) assert_array_almost_equal(X_mean, expected_X_mean) assert_array_almost_equal(y_mean, expected_y_mean) - assert_array_almost_equal(X_norm, np.ones(n_features)) - assert_array_almost_equal(Xt, X - expected_X_mean) + assert_array_almost_equal(X_scale, np.ones(n_features)) + if is_sparse: + assert_array_almost_equal(Xt.toarray(), X.toarray()) + else: + assert_array_almost_equal(Xt, X - expected_X_mean) assert_array_almost_equal(yt, y - expected_y_mean) - Xt, yt, X_mean, y_mean, X_norm = \ + # normalize is True + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=True, normalize=True, - sample_weight=sample_weight) + sample_weight=sample_weight, return_mean=True) + assert_array_almost_equal(X_mean, expected_X_mean) assert_array_almost_equal(y_mean, expected_y_mean) - assert_array_almost_equal(X_norm, expected_X_norm) - assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_norm) + assert_array_almost_equal(X_scale, expected_X_scale) + + if is_sparse: + # X is not centered + assert_array_almost_equal( + Xt.toarray(), X.toarray() / expected_X_scale + ) + else: + assert_array_almost_equal( + Xt, (X - expected_X_mean) / expected_X_scale + ) + + # _preprocess_data with normalize=True scales the data by the 
feature-wise + # euclidean norms while StandardScaler scales the data by the feature-wise + # standard deviations. + # The two are equivalent up to a ratio of np.sqrt(n_samples) + if is_sparse: + scaler = StandardScaler(with_mean=False).fit( + X, sample_weight=sample_weight) + + assert_array_almost_equal( + scaler.transform(X).toarray() / np.sqrt(n_samples), Xt.toarray() + ) + else: + scaler = StandardScaler(with_mean=True).fit( + X, sample_weight=sample_weight) + assert_array_almost_equal(scaler.mean_, X_mean) + assert_array_almost_equal(scaler.transform(X) / np.sqrt(n_samples), Xt) assert_array_almost_equal(yt, y - expected_y_mean) @@ -502,33 +562,33 @@ def test_sparse_preprocess_data_with_return_mean(): X = X.tolil() y = rng.rand(n_samples) XA = X.toarray() - expected_X_norm = np.std(XA, axis=0) * np.sqrt(X.shape[0]) + expected_X_scale = np.std(XA, axis=0) * np.sqrt(X.shape[0]) - Xt, yt, X_mean, y_mean, X_norm = \ + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=False, normalize=False, return_mean=True) assert_array_almost_equal(X_mean, np.zeros(n_features)) assert_array_almost_equal(y_mean, 0) - assert_array_almost_equal(X_norm, np.ones(n_features)) + assert_array_almost_equal(X_scale, np.ones(n_features)) assert_array_almost_equal(Xt.A, XA) assert_array_almost_equal(yt, y) - Xt, yt, X_mean, y_mean, X_norm = \ + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=True, normalize=False, return_mean=True) assert_array_almost_equal(X_mean, np.mean(XA, axis=0)) assert_array_almost_equal(y_mean, np.mean(y, axis=0)) - assert_array_almost_equal(X_norm, np.ones(n_features)) + assert_array_almost_equal(X_scale, np.ones(n_features)) assert_array_almost_equal(Xt.A, XA) assert_array_almost_equal(yt, y - np.mean(y, axis=0)) - Xt, yt, X_mean, y_mean, X_norm = \ + Xt, yt, X_mean, y_mean, X_scale = \ _preprocess_data(X, y, fit_intercept=True, normalize=True, return_mean=True) assert_array_almost_equal(X_mean, np.mean(XA, axis=0)) assert_array_almost_equal(y_mean, np.mean(y, axis=0)) - assert_array_almost_equal(X_norm, expected_X_norm) - assert_array_almost_equal(Xt.A, XA / expected_X_norm) + assert_array_almost_equal(X_scale, expected_X_scale) + assert_array_almost_equal(Xt.A, XA / expected_X_scale) assert_array_almost_equal(yt, y - np.mean(y, axis=0)) @@ -577,19 +637,19 @@ def test_dtype_preprocess_data(): for fit_intercept in [True, False]: for normalize in [True, False]: - Xt_32, yt_32, X_mean_32, y_mean_32, X_norm_32 = _preprocess_data( + Xt_32, yt_32, X_mean_32, y_mean_32, X_scale_32 = _preprocess_data( X_32, y_32, fit_intercept=fit_intercept, normalize=normalize, return_mean=True) - Xt_64, yt_64, X_mean_64, y_mean_64, X_norm_64 = _preprocess_data( + Xt_64, yt_64, X_mean_64, y_mean_64, X_scale_64 = _preprocess_data( X_64, y_64, fit_intercept=fit_intercept, normalize=normalize, return_mean=True) - Xt_3264, yt_3264, X_mean_3264, y_mean_3264, X_norm_3264 = ( + Xt_3264, yt_3264, X_mean_3264, y_mean_3264, X_scale_3264 = ( _preprocess_data(X_32, y_64, fit_intercept=fit_intercept, normalize=normalize, return_mean=True)) - Xt_6432, yt_6432, X_mean_6432, y_mean_6432, X_norm_6432 = ( + Xt_6432, yt_6432, X_mean_6432, y_mean_6432, X_scale_6432 = ( _preprocess_data(X_64, y_32, fit_intercept=fit_intercept, normalize=normalize, return_mean=True)) @@ -597,25 +657,25 @@ def test_dtype_preprocess_data(): assert yt_32.dtype == np.float32 assert X_mean_32.dtype == np.float32 assert y_mean_32.dtype == np.float32 - assert X_norm_32.dtype == np.float32 + assert 
X_scale_32.dtype == np.float32 assert Xt_64.dtype == np.float64 assert yt_64.dtype == np.float64 assert X_mean_64.dtype == np.float64 assert y_mean_64.dtype == np.float64 - assert X_norm_64.dtype == np.float64 + assert X_scale_64.dtype == np.float64 assert Xt_3264.dtype == np.float32 assert yt_3264.dtype == np.float32 assert X_mean_3264.dtype == np.float32 assert y_mean_3264.dtype == np.float32 - assert X_norm_3264.dtype == np.float32 + assert X_scale_3264.dtype == np.float32 assert Xt_6432.dtype == np.float64 assert yt_6432.dtype == np.float64 assert X_mean_6432.dtype == np.float64 assert y_mean_6432.dtype == np.float64 - assert X_norm_6432.dtype == np.float64 + assert X_scale_6432.dtype == np.float64 assert X_32.dtype == np.float32 assert y_32.dtype == np.float32 @@ -626,7 +686,7 @@ def test_dtype_preprocess_data(): assert_array_almost_equal(yt_32, yt_64) assert_array_almost_equal(X_mean_32, X_mean_64) assert_array_almost_equal(y_mean_32, y_mean_64) - assert_array_almost_equal(X_norm_32, X_norm_64) + assert_array_almost_equal(X_scale_32, X_scale_64) @pytest.mark.parametrize('n_targets', [None, 2]) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index b6acb78838a33..3eba535d70c89 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -13,6 +13,7 @@ from sklearn.datasets import make_regression from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline +from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler from sklearn.exceptions import ConvergenceWarning from sklearn.utils._testing import assert_allclose @@ -25,6 +26,7 @@ from sklearn.utils._testing import _convert_container from sklearn.utils._testing import TempMemmap from sklearn.utils.fixes import parse_version +from sklearn.utils.sparsefuncs import mean_variance_axis from sklearn.linear_model import ( ARDRegression, @@ -298,7 +300,33 @@ def test_lasso_cv_positive_constraint(): assert min(clf_constrained.coef_) >= 0 -# FIXME: 'normalize' to be removed in 1.2 +def _scale_alpha_inplace(estimator, n_samples): + """Rescale the parameter alpha from when the estimator is evoked with + normalize set to True to when it is evoked in a Pipeline with normalize set + to False and with a StandardScaler. 
+ """ + if 'alpha' not in estimator.get_params(): + return + + if isinstance(estimator, (Lasso, LassoLars, MultiTaskLasso)): + alpha = estimator.alpha * np.sqrt(n_samples) + if isinstance(estimator, (Ridge, RidgeClassifier)): + alpha = estimator.alpha * n_samples + if isinstance(estimator, (ElasticNet, MultiTaskElasticNet)): + if estimator.l1_ratio == 1: + alpha = estimator.alpha * np.sqrt(n_samples) + elif estimator.l1_ratio == 0: + alpha = estimator.alpha * n_samples + else: + # To avoid silent errors in case of refactoring + raise NotImplementedError + + estimator.set_params(alpha=alpha) + + +# FIXME: 'normalize' to be removed in 1.2 for all the models excluding: +# OrthogonalMatchingPursuit, Lars, LassoLars, LarsCV, LassoLarsCV +# for which it is to be removed in 1.4 @pytest.mark.filterwarnings("ignore:'normalize' was deprecated") @pytest.mark.parametrize( "LinearModel, params", @@ -324,7 +352,6 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): # in the pipeline and with normalize set to False # normalize is True - model_name = LinearModel.__name__ model_normalize = LinearModel(normalize=True, fit_intercept=True, **params) pipeline = make_pipeline( @@ -351,22 +378,7 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) - if 'alpha' in params: - model_normalize.set_params(alpha=params['alpha']) - if model_name in ['Lasso', 'LassoLars', 'MultiTaskLasso']: - new_params = dict( - alpha=params['alpha'] * np.sqrt(X_train.shape[0])) - if model_name in ['Ridge', 'RidgeClassifier']: - new_params = dict(alpha=params['alpha'] * X_train.shape[0]) - if model_name in ['ElasticNet', 'MultiTaskElasticNet']: - if params['l1_ratio'] == 1: - new_params = dict( - alpha=params['alpha'] * np.sqrt(X_train.shape[0])) - if params['l1_ratio'] == 0: - new_params = dict(alpha=params['alpha'] * X_train.shape[0]) - - if 'new_params' in locals(): - pipeline[1].set_params(**new_params) + _scale_alpha_inplace(pipeline[1], X_train.shape[0]) model_normalize.fit(X_train, y_train) y_pred_normalize = model_normalize.predict(X_test) @@ -386,24 +398,47 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): # FIXME: 'normalize' to be removed in 1.2 @pytest.mark.filterwarnings("ignore:'normalize' was deprecated") @pytest.mark.parametrize( - "estimator, is_sparse, with_mean", - [(LinearRegression, True, False), - (LinearRegression, False, True), - (LinearRegression, False, False)] + "estimator, params", + [ + (Lasso, {"tol": 1e-16, "alpha": 0.1}), + (RidgeClassifier, {"solver": 'sparse_cg', "alpha": 0.1}), + (ElasticNet, {"tol": 1e-16, 'l1_ratio': 1, "alpha": 0.1}), + (ElasticNet, {"tol": 1e-16, 'l1_ratio': 0, "alpha": 0.1}), + (Ridge, {"solver": 'sparse_cg', 'tol': 1e-12, "alpha": 0.1}), + (LinearRegression, {}), + ] +) +@pytest.mark.parametrize( + "is_sparse, with_mean", [ + (False, True), + (False, False), + (True, False) + # No need to test sparse and with_mean=True + ] ) def test_linear_model_sample_weights_normalize_in_pipeline( - estimator, is_sparse, with_mean + is_sparse, with_mean, estimator, params ): - # Test that the results for running linear regression LinearRegression with - # sample_weight set and with normalize set to True gives similar results as - # LinearRegression with no normalize in a pipeline with a StandardScaler - # and set sample_weight. 
+ # Test that the results for running linear model with sample_weight + # and with normalize set to True gives similar results as the same linear + # model with normalize set to False in a pipeline with + # a StandardScaler and sample_weight. + model_name = estimator.__name__ + + if model_name in ['Lasso', 'ElasticNet'] and is_sparse: + pytest.skip(f'{model_name} does not support sample_weight with sparse') + rng = np.random.RandomState(0) X, y = make_regression(n_samples=20, n_features=5, noise=1e-2, random_state=rng) + + if is_classifier(estimator): + y = np.sign(y) + # make sure the data is not centered to make the problem more - # difficult - X += 10 + # difficult + add 0s for the sparse case + X[X < 0] = 0 + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=rng) if is_sparse: @@ -412,27 +447,41 @@ def test_linear_model_sample_weights_normalize_in_pipeline( sample_weight = rng.rand(X_train.shape[0]) - # linear estimator with explicit sample_weight - reg_with_normalize = estimator(normalize=True) + # linear estimator with built-in feature normalization + reg_with_normalize = estimator(normalize=True, fit_intercept=True, + **params) reg_with_normalize.fit(X_train, y_train, sample_weight=sample_weight) - # linear estimator in a pipeline - reg_with_scaler = make_pipeline( - StandardScaler(with_mean=with_mean), - estimator(normalize=False) - ) - kwargs = {reg_with_scaler.steps[-1][0] + '__sample_weight': - sample_weight} - reg_with_scaler.fit(X_train, y_train, **kwargs) - - y_pred_norm = reg_with_normalize.predict(X_test) - y_pred_pip = reg_with_scaler.predict(X_test) - - assert_allclose( - reg_with_normalize.coef_ * reg_with_scaler[0].scale_, - reg_with_scaler[1].coef_ - ) - assert_allclose(y_pred_norm, y_pred_pip) + # linear estimator in a pipeline with a StandardScaler, normalize=False + linear_regressor = estimator(normalize=False, fit_intercept=True, **params) + _scale_alpha_inplace(linear_regressor, X_train.shape[0]) # rescale alpha + reg_with_scaler = Pipeline([ + ("scaler", StandardScaler(with_mean=with_mean)), + ("linear_regressor", linear_regressor) + ]) + + fit_params = { + "scaler__sample_weight": sample_weight, + "linear_regressor__sample_weight": sample_weight, + } + + reg_with_scaler.fit(X_train, y_train, **fit_params) + + # Check that the 2 regressions models are exactly equivalent in the + # sense that they predict exactly the same outcome. 
+ y_pred_normalize = reg_with_normalize.predict(X_test) + y_pred_scaler = reg_with_scaler.predict(X_test) + assert_allclose(y_pred_normalize, y_pred_scaler) + # Check intercept computation when normalize is True + y_train_mean = np.average(y_train, weights=sample_weight) + if is_sparse: + X_train_mean, _ = mean_variance_axis(X_train, axis=0, + weights=sample_weight) + else: + X_train_mean = np.average(X_train, weights=sample_weight, axis=0) + assert (reg_with_normalize.intercept_ == + pytest.approx(y_train_mean - + reg_with_normalize.coef_.dot(X_train_mean))) # FIXME: 'normalize' to be removed in 1.2 From 04534204f2125741505172ecd8dc3c92f9917698 Mon Sep 17 00:00:00 2001 From: putschblos <75161135+putschblos@users.noreply.github.com> Date: Mon, 22 Feb 2021 16:23:55 +0100 Subject: [PATCH 193/478] EXA Fix overlapping titles in clustering overview (#19506) --- examples/cluster/plot_cluster_comparison.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/cluster/plot_cluster_comparison.py b/examples/cluster/plot_cluster_comparison.py index 5791464f3dc67..0eea5ee1e27e1 100644 --- a/examples/cluster/plot_cluster_comparison.py +++ b/examples/cluster/plot_cluster_comparison.py @@ -63,8 +63,8 @@ # ============ # Set up cluster parameters # ============ -plt.figure(figsize=(9 * 2 + 3, 12.5)) -plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05, +plt.figure(figsize=(9 * 2 + 3, 13)) +plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.95, wspace=.05, hspace=.01) plot_num = 1 @@ -135,16 +135,16 @@ n_components=params['n_clusters'], covariance_type='full') clustering_algorithms = ( - ('MiniBatchKMeans', two_means), - ('AffinityPropagation', affinity_propagation), + ('MiniBatch\nKMeans', two_means), + ('Affinity\nPropagation', affinity_propagation), ('MeanShift', ms), - ('SpectralClustering', spectral), + ('Spectral\nClustering', spectral), ('Ward', ward), - ('AgglomerativeClustering', average_linkage), + ('Agglomerative\nClustering', average_linkage), ('DBSCAN', dbscan), ('OPTICS', optics), ('BIRCH', birch), - ('GaussianMixture', gmm) + ('Gaussian\nMixture', gmm) ) for name, algorithm in clustering_algorithms: From 1000d0a61be311542e01d56f6745178307406395 Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Mon, 22 Feb 2021 15:25:34 +0000 Subject: [PATCH 194/478] TST replace assert_raise with pytest.raises in test_base.py (#19500) Co-authored-by: Alihan Zihna --- sklearn/tests/test_base.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 7dd8d02f3c0bf..666df1499d7dc 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -7,7 +7,6 @@ import sklearn from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_raises from sklearn.utils._testing import assert_no_warnings from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import ignore_warnings @@ -145,16 +144,20 @@ def test_clone_buggy(): # Check that clone raises an error on buggy estimators. 
buggy = Buggy() buggy.a = 2 - assert_raises(RuntimeError, clone, buggy) + with pytest.raises(RuntimeError): + clone(buggy) no_estimator = NoEstimator() - assert_raises(TypeError, clone, no_estimator) + with pytest.raises(TypeError): + clone(no_estimator) varg_est = VargEstimator() - assert_raises(RuntimeError, clone, varg_est) + with pytest.raises(RuntimeError): + clone(varg_est) est = ModifyInitParams() - assert_raises(RuntimeError, clone, est) + with pytest.raises(RuntimeError): + clone(est) def test_clone_empty_array(): @@ -233,7 +236,9 @@ def test_get_params(): test.set_params(a__d=2) assert test.a.d == 2 - assert_raises(ValueError, test.set_params, a__a=2) + + with pytest.raises(ValueError): + test.set_params(a__a=2) def test_is_classifier(): @@ -248,10 +253,15 @@ def test_is_classifier(): def test_set_params(): # test nested estimator parameter setting clf = Pipeline([("svc", SVC())]) + # non-existing parameter in svc - assert_raises(ValueError, clf.set_params, svc__stupid_param=True) + with pytest.raises(ValueError): + clf.set_params(svc__stupid_param=True) + # non-existing parameter of pipeline - assert_raises(ValueError, clf.set_params, svm__stupid_param=True) + with pytest.raises(ValueError): + clf.set_params(svm__stupid_param=True) + # we don't currently catch if the things in pipeline are estimators # bad_pipeline = Pipeline([("bad", NoEstimator())]) # assert_raises(AttributeError, bad_pipeline.set_params, From 6852e31da88a94262c8a6a82a6ad00a34ddc385b Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 22 Feb 2021 21:18:21 +0100 Subject: [PATCH 195/478] ENH Adds n_features_in_ to ensemble module (#19326) Co-authored-by: Guillaume Lemaitre --- sklearn/ensemble/_bagging.py | 62 +++++++++---------- sklearn/ensemble/_forest.py | 42 ++++++++++--- sklearn/ensemble/_gb.py | 47 +++++++++----- .../gradient_boosting.py | 8 ++- sklearn/ensemble/_iforest.py | 15 +++-- sklearn/ensemble/_weight_boosting.py | 8 ++- sklearn/ensemble/tests/test_bagging.py | 22 ++++--- sklearn/ensemble/tests/test_forest.py | 18 ++++++ .../ensemble/tests/test_gradient_boosting.py | 16 +++++ sklearn/ensemble/tests/test_iforest.py | 12 ++++ sklearn/tests/test_common.py | 1 - 11 files changed, 174 insertions(+), 77 deletions(-) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 070bc374f3123..1ac309f00ad69 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -16,7 +16,7 @@ from ..base import ClassifierMixin, RegressorMixin from ..metrics import r2_score, accuracy_score from ..tree import DecisionTreeClassifier, DecisionTreeRegressor -from ..utils import check_random_state, check_array, column_or_1d +from ..utils import check_random_state, column_or_1d, deprecated from ..utils import indices_to_mask from ..utils.metaestimators import if_delegate_has_method from ..utils.multiclass import check_classification_targets @@ -287,7 +287,7 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): sample_weight = _check_sample_weight(sample_weight, X, dtype=None) # Remap output - n_samples, self.n_features_ = X.shape + n_samples = X.shape[0] self._n_samples = n_samples y = self._validate_y(y) @@ -313,11 +313,11 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): if isinstance(self.max_features, numbers.Integral): max_features = self.max_features elif isinstance(self.max_features, float): - max_features = self.max_features * self.n_features_ + max_features = self.max_features * self.n_features_in_ else: raise 
ValueError("max_features must be int or float") - if not (0 < max_features <= self.n_features_): + if not (0 < max_features <= self.n_features_in_): raise ValueError("max_features must be in (0, n_features]") max_features = max(1, int(max_features)) @@ -408,7 +408,7 @@ def _get_estimators_indices(self): # to those in `_parallel_build_estimators()` feature_indices, sample_indices = _generate_bagging_indices( seed, self.bootstrap_features, self.bootstrap, - self.n_features_, self._n_samples, self._max_features, + self.n_features_in_, self._n_samples, self._max_features, self._max_samples) yield feature_indices, sample_indices @@ -429,6 +429,16 @@ def estimators_samples_(self): return [sample_indices for _, sample_indices in self._get_estimators_indices()] + # TODO: Remove in 1.2 + # mypy error: Decorated property not supported + @deprecated( # type: ignore + "Attribute n_features_ was deprecated in version 1.0 and will be " + "removed in 1.2. Use 'n_features_in_' instead." + ) + @property + def n_features_(self): + return self.n_features_in_ + class BaggingClassifier(ClassifierMixin, BaseBagging): """A Bagging classifier. @@ -523,6 +533,10 @@ class BaggingClassifier(ClassifierMixin, BaseBagging): n_features_ : int The number of features when :meth:`fit` is performed. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + estimators_ : list of estimators The collection of fitted base estimators. @@ -702,17 +716,11 @@ def predict_proba(self, X): """ check_is_fitted(self) # Check data - X = check_array( + X = self._validate_data( X, accept_sparse=['csr', 'csc'], dtype=None, - force_all_finite=False + force_all_finite=False, reset=False ) - if self.n_features_ != X.shape[1]: - raise ValueError("Number of features of the model must " - "match the input. Model n_features is {0} and " - "input n_features is {1}." - "".format(self.n_features_, X.shape[1])) - # Parallel loop n_jobs, n_estimators, starts = _partition_estimators(self.n_estimators, self.n_jobs) @@ -753,17 +761,11 @@ def predict_log_proba(self, X): check_is_fitted(self) if hasattr(self.base_estimator_, "predict_log_proba"): # Check data - X = check_array( + X = self._validate_data( X, accept_sparse=['csr', 'csc'], dtype=None, - force_all_finite=False + force_all_finite=False, reset=False ) - if self.n_features_ != X.shape[1]: - raise ValueError("Number of features of the model must " - "match the input. Model n_features is {0} " - "and input n_features is {1} " - "".format(self.n_features_, X.shape[1])) - # Parallel loop n_jobs, n_estimators, starts = _partition_estimators( self.n_estimators, self.n_jobs) @@ -811,17 +813,11 @@ def decision_function(self, X): check_is_fitted(self) # Check data - X = check_array( + X = self._validate_data( X, accept_sparse=['csr', 'csc'], dtype=None, - force_all_finite=False + force_all_finite=False, reset=False ) - if self.n_features_ != X.shape[1]: - raise ValueError("Number of features of the model must " - "match the input. Model n_features is {0} and " - "input n_features is {1} " - "".format(self.n_features_, X.shape[1])) - # Parallel loop n_jobs, n_estimators, starts = _partition_estimators(self.n_estimators, self.n_jobs) @@ -929,6 +925,10 @@ class BaggingRegressor(RegressorMixin, BaseBagging): n_features_ : int The number of features when :meth:`fit` is performed. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. 
+ estimators_ : list of estimators The collection of fitted sub-estimators. @@ -1024,9 +1024,9 @@ def predict(self, X): """ check_is_fitted(self) # Check data - X = check_array( + X = self._validate_data( X, accept_sparse=['csr', 'csc'], dtype=None, - force_all_finite=False + force_all_finite=False, reset=False ) # Parallel loop diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index c0b190c60ef54..a93e9b7ee877e 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -57,7 +57,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from ..tree import (DecisionTreeClassifier, DecisionTreeRegressor, ExtraTreeClassifier, ExtraTreeRegressor) from ..tree._tree import DTYPE, DOUBLE -from ..utils import check_random_state, check_array, compute_sample_weight +from ..utils import check_random_state, compute_sample_weight, deprecated from ..exceptions import DataConversionWarning from ._base import BaseEnsemble, _partition_estimators from ..utils.fixes import delayed @@ -312,9 +312,6 @@ def fit(self, X, y, sample_weight=None): # ensemble sorts the indices. X.sort_indices() - # Remap output - self.n_features_ = X.shape[1] - y = np.atleast_1d(y) if y.ndim == 2 and y.shape[1] == 1: warn("A column-vector y was passed when a 1d array was" @@ -446,7 +443,8 @@ def _compute_oob_predictions(self, X, y): (n_samples, 1, n_outputs) The OOB predictions. """ - X = check_array(X, dtype=DTYPE, accept_sparse='csr') + X = self._validate_data(X, dtype=DTYPE, accept_sparse='csr', + reset=False) n_samples = y.shape[0] n_outputs = self.n_outputs_ @@ -530,12 +528,22 @@ def feature_importances_(self): for tree in self.estimators_ if tree.tree_.node_count > 1) if not all_importances: - return np.zeros(self.n_features_, dtype=np.float64) + return np.zeros(self.n_features_in_, dtype=np.float64) all_importances = np.mean(all_importances, axis=0, dtype=np.float64) return all_importances / np.sum(all_importances) + # TODO: Remove in 1.2 + # mypy error: Decorated property not supported + @deprecated( # type: ignore + "Attribute n_features_ was deprecated in version 1.0 and will be " + "removed in 1.2. Use 'n_features_in_' instead." + ) + @property + def n_features_(self): + return self.n_features_in_ + def _accumulate_prediction(predict, X, out, lock): """ @@ -1164,6 +1172,10 @@ class labels (multi-output problem). n_features_ : int The number of features when ``fit`` is performed. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + n_outputs_ : int The number of outputs when ``fit`` is performed. @@ -1465,6 +1477,10 @@ class RandomForestRegressor(ForestRegressor): n_features_ : int The number of features when ``fit`` is performed. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + n_outputs_ : int The number of outputs when ``fit`` is performed. @@ -1786,6 +1802,10 @@ class labels (multi-output problem). n_features_ : int The number of features when ``fit`` is performed. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + n_outputs_ : int The number of outputs when ``fit`` is performed. @@ -2072,6 +2092,10 @@ class ExtraTreesRegressor(ForestRegressor): n_features_ : int The number of features. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. 
Use `n_features_in_` instead. + n_outputs_ : int The number of outputs. @@ -2296,6 +2320,10 @@ class RandomTreesEmbedding(BaseForest): n_features_ : int The number of features when ``fit`` is performed. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + n_outputs_ : int The number of outputs when ``fit`` is performed. @@ -2425,7 +2453,7 @@ def fit_transform(self, X, y=None, sample_weight=None): X_transformed : sparse matrix of shape (n_samples, n_out) Transformed dataset. """ - X = check_array(X, accept_sparse=['csc']) + X = self._validate_data(X, accept_sparse=['csc']) if issparse(X): # Pre-sort indices to avoid that each individual tree of the # ensemble sorts the indices. diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 15f5404f4701c..e9f7402188860 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -273,25 +273,25 @@ def _check_params(self): if isinstance(self.max_features, str): if self.max_features == "auto": if is_classifier(self): - max_features = max(1, int(np.sqrt(self.n_features_))) + max_features = max(1, int(np.sqrt(self.n_features_in_))) else: - max_features = self.n_features_ + max_features = self.n_features_in_ elif self.max_features == "sqrt": - max_features = max(1, int(np.sqrt(self.n_features_))) + max_features = max(1, int(np.sqrt(self.n_features_in_))) elif self.max_features == "log2": - max_features = max(1, int(np.log2(self.n_features_))) + max_features = max(1, int(np.log2(self.n_features_in_))) else: raise ValueError("Invalid value for max_features: %r. " "Allowed string values are 'auto', 'sqrt' " "or 'log2'." % self.max_features) elif self.max_features is None: - max_features = self.n_features_ + max_features = self.n_features_in_ elif isinstance(self.max_features, numbers.Integral): max_features = self.max_features else: # float if 0. < self.max_features <= 1.: max_features = max(int(self.max_features * - self.n_features_), 1) + self.n_features_in_), 1) else: raise ValueError("max_features must be in (0, n_features]") @@ -411,7 +411,6 @@ def fit(self, X, y, sample_weight=None, monitor=None): X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=DTYPE, multi_output=True) - n_samples, self.n_features_ = X.shape sample_weight_is_none = sample_weight is None @@ -608,9 +607,6 @@ def _raw_predict_init(self, X): """Check input and compute raw predictions of the init estimator.""" self._check_initialized() X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True) - if X.shape[1] != self.n_features_: - raise ValueError("X.shape[1] should be {0:d}, not {1:d}.".format( - self.n_features_, X.shape[1])) if self.init_ == 'zero': raw_predictions = np.zeros(shape=(X.shape[0], self.loss_.K), dtype=np.float64) @@ -647,7 +643,8 @@ def _staged_raw_predict(self, X): Regression and binary classification are special cases with ``k == 1``, otherwise ``k==n_classes``. 
""" - X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr') + X = self._validate_data(X, dtype=DTYPE, order="C", accept_sparse='csr', + reset=False) raw_predictions = self._raw_predict_init(X) for i in range(self.estimators_.shape[0]): predict_stage(self.estimators_, i, X, self.learning_rate, @@ -681,7 +678,7 @@ def feature_importances_(self): if tree.tree_.node_count > 1] if not relevant_trees: # degenerate case where all trees have only one node - return np.zeros(shape=self.n_features_, dtype=np.float64) + return np.zeros(shape=self.n_features_in_, dtype=np.float64) relevant_feature_importances = [ tree.tree_.compute_feature_importances(normalize=False) @@ -764,6 +761,16 @@ def apply(self, X): return leaves + # TODO: Remove in 1.2 + # mypy error: Decorated property not supported + @deprecated( # type: ignore + "Attribute n_features_ was deprecated in version 1.0 and will be " + "removed in 1.2. Use 'n_features_in_' instead." + ) + @property + def n_features_(self): + return self.n_features_in_ + class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): """Gradient Boosting for classification. @@ -1005,7 +1012,7 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): Set via the ``init`` argument or ``loss.init_estimator``. estimators_ : ndarray of DecisionTreeRegressor of \ -shape (n_estimators, ``loss_.K``) + shape (n_estimators, ``loss_.K``) The collection of fitted sub-estimators. ``loss_.K`` is 1 for binary classification, otherwise n_classes. @@ -1015,6 +1022,10 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): n_features_ : int The number of data features. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + n_classes_ : int The number of classes. @@ -1140,7 +1151,8 @@ def decision_function(self, X): :term:`classes_`. Regression and binary classification produce an array of shape (n_samples,). """ - X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr') + X = self._validate_data(X, dtype=DTYPE, order="C", accept_sparse='csr', + reset=False) raw_predictions = self._raw_predict(X) if raw_predictions.shape[1] == 1: return raw_predictions.ravel() @@ -1548,6 +1560,10 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): n_features_ : int The number of data features. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + max_features_ : int The inferred value of max_features. @@ -1647,7 +1663,8 @@ def predict(self, X): y : ndarray of shape (n_samples,) The predicted values. """ - X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr') + X = self._validate_data(X, dtype=DTYPE, order="C", accept_sparse='csr', + reset=False) # In regression we can directly return the raw value from the trees. 
return self._raw_predict(X).ravel() diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 15b4c95f8cd54..4fff6030b0d5a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -8,7 +8,7 @@ from timeit import default_timer as time from ...base import (BaseEstimator, RegressorMixin, ClassifierMixin, is_classifier) -from ...utils import check_random_state, check_array, resample +from ...utils import check_random_state, resample from ...utils.validation import (check_is_fitted, check_consistent_length, _check_sample_weight, @@ -733,7 +733,8 @@ def _raw_predict(self, X): """ is_binned = getattr(self, '_in_fit', False) dtype = X_BINNED_DTYPE if is_binned else X_DTYPE - X = check_array(X, dtype=dtype, force_all_finite=False) + X = self._validate_data(X, dtype=dtype, force_all_finite=False, + reset=False) check_is_fitted(self) if X.shape[1] != self._n_features: raise ValueError( @@ -789,7 +790,8 @@ def _staged_raw_predict(self, X): The raw predictions of the input samples. The order of the classes corresponds to that in the attribute :term:`classes_`. """ - X = check_array(X, dtype=X_DTYPE, force_all_finite=False) + X = self._validate_data(X, dtype=X_DTYPE, force_all_finite=False, + reset=False) check_is_fitted(self) if X.shape[1] != self._n_features: raise ValueError( diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index e607342456cd4..588b1bbef299c 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -144,6 +144,10 @@ class IsolationForest(OutlierMixin, BaseBagging): n_features_ : int The number of features when ``fit`` is performed. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + Notes ----- The implementation is based on an ensemble of ExtraTreeRegressor. The @@ -238,7 +242,7 @@ def fit(self, X, y=None, sample_weight=None): self : object Fitted estimator. """ - X = check_array(X, accept_sparse=['csc']) + X = self._validate_data(X, accept_sparse=['csc']) if issparse(X): # Pre-sort indices to avoid that each individual tree of the # ensemble sorts the indices. @@ -314,7 +318,7 @@ def predict(self, X): be considered as an inlier according to the fitted model. """ check_is_fitted(self) - X = check_array(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr', reset=False) is_inlier = np.ones(X.shape[0], dtype=int) is_inlier[self.decision_function(X) < 0] = -1 return is_inlier @@ -380,12 +384,7 @@ def score_samples(self, X): check_is_fitted(self) # Check data - X = check_array(X, accept_sparse='csr') - if self.n_features_ != X.shape[1]: - raise ValueError("Number of features of the model must " - "match the input. Model n_features is {0} and " - "input n_features is {1}." 
- "".format(self.n_features_, X.shape[1])) + X = self._validate_data(X, accept_sparse='csr', reset=False) # Take the opposite of the scores as bigger is better (here less # abnormal) diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index 3ea94cff7da53..d5354232a4385 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -33,7 +33,7 @@ from ..base import ClassifierMixin, RegressorMixin, is_classifier, is_regressor from ..tree import DecisionTreeClassifier, DecisionTreeRegressor -from ..utils import check_array, check_random_state, _safe_indexing +from ..utils import check_random_state, _safe_indexing from ..utils.extmath import softmax from ..utils.extmath import stable_cumsum from ..metrics import accuracy_score, r2_score @@ -73,8 +73,10 @@ def __init__(self, self.random_state = random_state def _check_X(self, X): - return check_array(X, accept_sparse=['csr', 'csc'], ensure_2d=True, - allow_nd=True, dtype=None) + # Only called to validate X in non-fit methods, therefore reset=False + return self._validate_data( + X, accept_sparse=['csr', 'csc'], ensure_2d=True, allow_nd=True, + dtype=None, reset=False) def fit(self, X, y, sample_weight=None): """Build a boosted classifier/regressor from the training set (X, y). diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index e7cb11185fa5c..b17cbf7c147ac 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -480,15 +480,6 @@ def test_parallel_classification(): decisions2 = ensemble.decision_function(X_test) assert_array_almost_equal(decisions1, decisions2) - X_err = np.hstack((X_test, np.zeros((X_test.shape[0], 1)))) - err_msg = ( - f"Number of features of the model must match the input. Model " - f"n_features is {X_test.shape[1]} and input n_features is " - f"{X_err.shape[1]} " - ) - with pytest.raises(ValueError, match=err_msg): - ensemble.decision_function(X_err) - ensemble = BaggingClassifier(SVC(decision_function_shape='ovr'), n_jobs=1, random_state=0).fit(X_train, y_train) @@ -921,3 +912,16 @@ def fit(self, X, y): assert_array_equal(clf.estimators_[0]._sample_indices, clf.estimators_samples_[0]) + + +# FIXME: remove in 1.2 +@pytest.mark.parametrize("Estimator", [BaggingClassifier, BaggingRegressor]) +def test_n_features_deprecation(Estimator): + # Check that we raise the proper deprecation warning if accessing + # `n_features_`. + X = np.array([[1, 2], [3, 4]]) + y = np.array([1, 0]) + est = Estimator().fit(X, y) + + with pytest.warns(FutureWarning, match="n_features_ was deprecated"): + est.n_features_ diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 89ded326d21aa..c05cad26708b4 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -1476,3 +1476,21 @@ def test_little_tree_with_small_max_samples(ForestClass): msg = "Tree without `max_samples` restriction should have more nodes" assert tree1.node_count > tree2.node_count, msg + + +# FIXME: remove in 1.2 +@pytest.mark.parametrize( + "Estimator", + [ExtraTreesClassifier, ExtraTreesRegressor, + RandomForestClassifier, RandomForestRegressor, + RandomTreesEmbedding] +) +def test_n_features_deprecation(Estimator): + # Check that we raise the proper deprecation warning if accessing + # `n_features_`. 
+ X = np.array([[1, 2], [3, 4]]) + y = np.array([1, 0]) + est = Estimator().fit(X, y) + + with pytest.warns(FutureWarning, match="n_features_ was deprecated"): + est.n_features_ diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 57ac93f52d0d3..63d4e668e674f 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -1353,3 +1353,19 @@ def test_criterion_mae_deprecation(estimator): "will be removed in version 1.1") with pytest.warns(FutureWarning, match=msg): estimator.fit(X, y) + + +# FIXME: remove in 1.2 +@pytest.mark.parametrize( + "Estimator", + [GradientBoostingClassifier, GradientBoostingRegressor] +) +def test_n_features_deprecation(Estimator): + # Check that we raise the proper deprecation warning if accessing + # `n_features_`. + X = np.array([[1, 2], [3, 4]]) + y = np.array([1, 0]) + est = Estimator().fit(X, y) + + with pytest.warns(FutureWarning, match="n_features_ was deprecated"): + est.n_features_ diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py index de0c56fff793b..0b3a521346b30 100644 --- a/sklearn/ensemble/tests/test_iforest.py +++ b/sklearn/ensemble/tests/test_iforest.py @@ -345,3 +345,15 @@ def test_iforest_with_uniform_data(): assert all(iforest.predict(X) == 1) assert all(iforest.predict(rng.randn(100, 10)) == 1) assert all(iforest.predict(np.ones((100, 10))) == 1) + + +# FIXME: remove in 1.2 +def test_n_features_deprecation(): + # Check that we raise the proper deprecation warning if accessing + # `n_features_`. + X = np.array([[1, 2], [3, 4]]) + y = np.array([1, 0]) + est = IsolationForest().fit(X, y) + + with pytest.warns(FutureWarning, match="n_features_ was deprecated"): + est.n_features_ diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index dbac492d5efb9..4cdae851f9b9c 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -266,7 +266,6 @@ def test_search_cv(estimator, check, request): N_FEATURES_IN_AFTER_FIT_MODULES_TO_IGNORE = { 'calibration', 'compose', - 'ensemble', 'feature_extraction', 'isotonic', 'manifold', From 8b71a677004f5ffe665b0eb6ee3341ce17573ec3 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Mon, 22 Feb 2021 15:21:20 -0500 Subject: [PATCH 196/478] CI Fixes twitter workflow (#19525) --- .github/workflows/twitter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/twitter.yml b/.github/workflows/twitter.yml index 7c219b13ec28c..96b32ec902efa 100644 --- a/.github/workflows/twitter.yml +++ b/.github/workflows/twitter.yml @@ -16,7 +16,7 @@ jobs: steps: - name: Tweet URL of last commit as @sklearn_commits if: github.repository == 'scikit-learn/scikit-learn' - uses: xorilog/twitter-action@0.1 + uses: docker://thomasjpfan/twitter-action:0.3 with: args: "-message \"https://github.com/scikit-learn/scikit-learn/commit/${{ github.sha }}\"" env: From 26c5530e792c1319ddd3335e23d1f36cf90f6c3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Carlos=20Alfaro=20Jim=C3=A9nez?= Date: Mon, 22 Feb 2021 21:22:49 +0100 Subject: [PATCH 197/478] MNT Set non-interactive installation mode for the pypy job (#19461) --- .circleci/config.yml | 3 +++ sklearn/gaussian_process/tests/test_kernels.py | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4ca26a110f28c..bc4acd8a35fcb 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -104,6 +104,9 @@ jobs: pypy3: docker: - image: condaforge/miniforge3 + environment: + # Avoid the interactive dialog when installing tzdata + - DEBIAN_FRONTEND: noninteractive steps: - restore_cache: keys: diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py index 4627117677c8b..1f8e196104e75 100644 --- a/sklearn/gaussian_process/tests/test_kernels.py +++ b/sklearn/gaussian_process/tests/test_kernels.py @@ -20,7 +20,8 @@ from sklearn.utils._testing import (assert_almost_equal, assert_array_equal, assert_array_almost_equal, assert_allclose, - assert_raise_message) + assert_raise_message, + fails_if_pypy) X = np.random.RandomState(0).normal(0, 1, (5, 2)) @@ -49,6 +50,8 @@ kernels.append(PairwiseKernel(gamma=1.0, metric=metric)) +# Numerical precisions errors in PyPy +@fails_if_pypy @pytest.mark.parametrize('kernel', kernels) def test_kernel_gradient(kernel): # Compare analytic and numeric gradient of kernels. 
From 5c246225ddf130f1eee398e889e4c2a19b5f1791 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 23 Feb 2021 17:40:18 +0100 Subject: [PATCH 198/478] ENH Adds n_features_in_ to naive_bayes (#19485) --- sklearn/naive_bayes.py | 92 ++++++++++++++++++++----------- sklearn/tests/test_common.py | 1 - sklearn/tests/test_naive_bayes.py | 75 +++++++++++-------------- 3 files changed, 91 insertions(+), 77 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index d32e0756f2907..70f5993f98b1a 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -27,10 +27,10 @@ from .preprocessing import binarize from .preprocessing import LabelBinarizer from .preprocessing import label_binarize -from .utils import check_X_y, check_array, deprecated +from .utils import deprecated from .utils.extmath import safe_sparse_dot from .utils.multiclass import _check_partial_fit_first_call -from .utils.validation import check_is_fitted, check_non_negative, column_or_1d +from .utils.validation import check_is_fitted, check_non_negative from .utils.validation import _check_sample_weight from .utils.validation import _deprecate_positional_args @@ -55,7 +55,10 @@ def _joint_log_likelihood(self, X): @abstractmethod def _check_X(self, X): - """To be overridden in subclasses with the actual checks.""" + """To be overridden in subclasses with the actual checks. + + Only used in predict* methods. + """ def predict(self, X): """ @@ -214,12 +217,12 @@ def fit(self, X, y, sample_weight=None): self : object """ X, y = self._validate_data(X, y) - y = column_or_1d(y, warn=True) return self._partial_fit(X, y, np.unique(y), _refit=True, sample_weight=sample_weight) def _check_X(self, X): - return check_array(X) + """Validate X, used only in predict* methods.""" + return self._validate_data(X, reset=False) @staticmethod def _update_mean_variance(n_past, mu, var, X, sample_weight=None): @@ -367,7 +370,11 @@ def _partial_fit(self, X, y, classes=None, _refit=False, ------- self : object """ - X, y = check_X_y(X, y) + if _refit: + self.classes_ = None + + first_call = _check_partial_fit_first_call(self, classes) + X, y = self._validate_data(X, y, reset=first_call) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) @@ -377,10 +384,7 @@ def _partial_fit(self, X, y, classes=None, _refit=False, # deviation of the largest dimension. self.epsilon_ = self.var_smoothing * np.var(X, axis=0).max() - if _refit: - self.classes_ = None - - if _check_partial_fit_first_call(self, classes): + if first_call: # This is the first call to partial_fit: # initialize various cumulative counters n_features = X.shape[1] @@ -488,10 +492,12 @@ class _BaseDiscreteNB(_BaseNB): """ def _check_X(self, X): - return check_array(X, accept_sparse='csr') + """Validate X, used only in predict* methods.""" + return self._validate_data(X, accept_sparse='csr', reset=False) - def _check_X_y(self, X, y): - return self._validate_data(X, y, accept_sparse='csr') + def _check_X_y(self, X, y, reset=True): + """Validate X and y in fit methods.""" + return self._validate_data(X, y, accept_sparse='csr', reset=reset) def _update_class_log_prior(self, class_prior=None): n_classes = len(self.classes_) @@ -518,7 +524,7 @@ def _check_alpha(self): raise ValueError('Smoothing parameter alpha = %.1e. ' 'alpha should be > 0.' 
% np.min(self.alpha)) if isinstance(self.alpha, np.ndarray): - if not self.alpha.shape[0] == self.n_features_: + if not self.alpha.shape[0] == self.n_features_in_: raise ValueError("alpha should be a scalar or a numpy array " "with shape [n_features]") if np.min(self.alpha) < _ALPHA_MIN: @@ -563,7 +569,8 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): ------- self : object """ - X, y = self._check_X_y(X, y) + first_call = not hasattr(self, "classes_") + X, y = self._check_X_y(X, y, reset=first_call) _, n_features = X.shape if _check_partial_fit_first_call(self, classes): @@ -571,10 +578,6 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): # initialize various cumulative counters n_classes = len(classes) self._init_counters(n_classes, n_features) - self.n_features_ = n_features - elif n_features != self.n_features_: - msg = "Number of features %d does not match previous data %d." - raise ValueError(msg % (n_features, self.n_features_)) Y = label_binarize(y, classes=self.classes_) if Y.shape[1] == 1: @@ -631,7 +634,6 @@ def fit(self, X, y, sample_weight=None): """ X, y = self._check_X_y(X, y) _, n_features = X.shape - self.n_features_ = n_features labelbin = LabelBinarizer() Y = labelbin.fit_transform(y) @@ -687,6 +689,16 @@ def intercept_(self): def _more_tags(self): return {'poor_score': True} + # TODO: Remove in 1.2 + # mypy error: Decorated property not supported + @deprecated( # type: ignore + "Attribute n_features_ was deprecated in version 1.0 and will be " + "removed in 1.2. Use 'n_features_in_' instead." + ) + @property + def n_features_(self): + return self.n_features_in_ + class MultinomialNB(_BaseDiscreteNB): """ @@ -753,6 +765,10 @@ class MultinomialNB(_BaseDiscreteNB): n_features_ : int Number of features of each sample. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + Examples -------- >>> import numpy as np @@ -879,6 +895,10 @@ class ComplementNB(_BaseDiscreteNB): n_features_ : int Number of features of each sample. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + Examples -------- >>> import numpy as np @@ -996,6 +1016,10 @@ class BernoulliNB(_BaseDiscreteNB): n_features_ : int Number of features of each sample. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + Examples -------- >>> import numpy as np @@ -1032,13 +1056,14 @@ def __init__(self, *, alpha=1.0, binarize=.0, fit_prior=True, self.class_prior = class_prior def _check_X(self, X): + """Validate X, used only in predict* methods.""" X = super()._check_X(X) if self.binarize is not None: X = binarize(X, threshold=self.binarize) return X - def _check_X_y(self, X, y): - X, y = super()._check_X_y(X, y) + def _check_X_y(self, X, y, reset=True): + X, y = super()._check_X_y(X, y, reset=reset) if self.binarize is not None: X = binarize(X, threshold=self.binarize) return X, y @@ -1133,6 +1158,10 @@ class CategoricalNB(_BaseDiscreteNB): n_features_ : int Number of features of each sample. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + n_categories_ : ndarray of shape (n_features,), dtype=np.int64 Number of categories for each feature. This value is inferred from the data or set by the minimum number of categories. 
@@ -1235,14 +1264,15 @@ def _more_tags(self): return {'requires_positive_X': True} def _check_X(self, X): - X = check_array(X, dtype='int', accept_sparse=False, - force_all_finite=True) + """Validate X, used only in predict* methods.""" + X = self._validate_data(X, dtype='int', accept_sparse=False, + force_all_finite=True, reset=False) check_non_negative(X, "CategoricalNB (input X)") return X - def _check_X_y(self, X, y): + def _check_X_y(self, X, y, reset=True): X, y = self._validate_data(X, y, dtype='int', accept_sparse=False, - force_all_finite=True) + force_all_finite=True, reset=reset) check_non_negative(X, "CategoricalNB (input X)") return X, y @@ -1297,7 +1327,7 @@ def _update_cat_count(X_feature, Y, cat_count, n_classes): self.class_count_ += Y.sum(axis=0) self.n_categories_ = self._validate_n_categories( X, self.min_categories) - for i in range(self.n_features_): + for i in range(self.n_features_in_): X_feature = X[:, i] self.category_count_[i] = _update_cat_count_dims( self.category_count_[i], self.n_categories_[i] - 1) @@ -1307,7 +1337,7 @@ def _update_cat_count(X_feature, Y, cat_count, n_classes): def _update_feature_log_prob(self, alpha): feature_log_prob = [] - for i in range(self.n_features_): + for i in range(self.n_features_in_): smoothed_cat_count = self.category_count_[i] + alpha smoothed_class_count = smoothed_cat_count.sum(axis=1) feature_log_prob.append( @@ -1316,11 +1346,9 @@ def _update_feature_log_prob(self, alpha): self.feature_log_prob_ = feature_log_prob def _joint_log_likelihood(self, X): - if not X.shape[1] == self.n_features_: - raise ValueError("Expected input with %d features, got %d instead" - % (self.n_features_, X.shape[1])) + self._check_n_features(X, reset=False) jll = np.zeros((X.shape[0], self.class_count_.shape[0])) - for i in range(self.n_features_): + for i in range(self.n_features_in_): indices = X[:, i] jll += self.feature_log_prob_[i][:, indices].T total_ll = jll + self.class_log_prior_ diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 4cdae851f9b9c..4f6f232a8f716 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -273,7 +273,6 @@ def test_search_cv(estimator, check, request): 'model_selection', 'multiclass', 'multioutput', - 'naive_bayes', 'pipeline', 'random_projection', } diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index 02b83e51ac8b6..dcd4b07712357 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -57,7 +57,11 @@ def test_gnb(): # Test whether label mismatch between target y and classes raises # an Error # FIXME Remove this test once the more general partial_fit tests are merged - assert_raises(ValueError, GaussianNB().partial_fit, X, y, classes=[0, 1]) + with pytest.raises( + ValueError, + match="The target label.* in y do not exist in the initial classes" + ): + GaussianNB().partial_fit(X, y, classes=[0, 1]) # TODO remove in 1.2 once sigma_ attribute is removed (GH #18842) @@ -74,7 +78,7 @@ def test_gnb_prior(): clf = GaussianNB().fit(X, y) assert_array_almost_equal(np.array([3, 3]) / 6.0, clf.class_prior_, 8) - clf.fit(X1, y1) + clf = GaussianNB().fit(X1, y1) # Check that the class priors sum to 1 assert_array_almost_equal(clf.class_prior_.sum(), 1) @@ -171,16 +175,6 @@ def test_gnb_check_update_with_no_data(): assert tvar == var -def test_gnb_pfit_wrong_nb_features(): - """Test whether an error is raised when the number of feature changes - between two partial fit""" - clf = GaussianNB() - # Fit for 
the first time the GNB - clf.fit(X, y) - # Partial fit a second time with an incoherent X - assert_raises(ValueError, clf.partial_fit, np.hstack((X, X)), y) - - def test_gnb_partial_fit(): clf = GaussianNB().fit(X, y) clf_pf = GaussianNB().partial_fit(X, y, np.unique(y)) @@ -272,37 +266,22 @@ def test_discretenb_partial_fit(DiscreteNaiveBayes): @pytest.mark.parametrize('NaiveBayes', ALL_NAIVE_BAYES_CLASSES) -def test_naive_bayes_input_check_fit(NaiveBayes): - # Test input checks for the fit method - - # check shape consistency for number of samples at fit time - assert_raises(ValueError, NaiveBayes().fit, X2, y2[:-1]) - - # check shape consistency for number of input features at predict time - clf = NaiveBayes().fit(X2, y2) - assert_raises(ValueError, clf.predict, X2[:, :-1]) - - -@pytest.mark.parametrize('DiscreteNaiveBayes', DISCRETE_NAIVE_BAYES_CLASSES) -def test_discretenb_input_check_partial_fit(DiscreteNaiveBayes): - # check shape consistency - assert_raises(ValueError, DiscreteNaiveBayes().partial_fit, X2, y2[:-1], - classes=np.unique(y2)) - +def test_NB_partial_fit_no_first_classes(NaiveBayes): # classes is required for first call to partial fit - assert_raises(ValueError, DiscreteNaiveBayes().partial_fit, X2, y2) + with pytest.raises( + ValueError, + match="classes must be passed on the first call to partial_fit." + ): + NaiveBayes().partial_fit(X2, y2) # check consistency of consecutive classes values - clf = DiscreteNaiveBayes() + clf = NaiveBayes() clf.partial_fit(X2, y2, classes=np.unique(y2)) - assert_raises(ValueError, clf.partial_fit, X2, y2, - classes=np.arange(42)) - - # check consistency of input shape for partial_fit - assert_raises(ValueError, clf.partial_fit, X2[:, :-1], y2) - - # check consistency of input shape for predict - assert_raises(ValueError, clf.predict, X2[:, :-1]) + with pytest.raises( + ValueError, + match="is not the same as on last call to partial_fit" + ): + clf.partial_fit(X2, y2, classes=np.arange(42)) # TODO: Remove in version 1.1 @@ -725,11 +704,6 @@ def test_categoricalnb(): assert_raise_message(ValueError, error_msg, clf.predict, X) assert_raise_message(ValueError, error_msg, clf.fit, X, y) - # Check error is raised for incorrect X - X = np.array([[1, 4, 1], [2, 5, 6]]) - msg = "Expected input with 2 features, got 3 instead" - assert_raise_message(ValueError, msg, clf.predict, X) - # Test alpha X3_test = np.array([[2, 5]]) # alpha=1 increases the count of all categories by one so the final @@ -941,3 +915,16 @@ def test_check_accuracy_on_digits(): scores = cross_val_score(GaussianNB(), X_3v8, y_3v8, cv=10) assert scores.mean() > 0.86 + + +# FIXME: remove in 1.2 +@pytest.mark.parametrize("Estimator", DISCRETE_NAIVE_BAYES_CLASSES) +def test_n_features_deprecation(Estimator): + # Check that we raise the proper deprecation warning if accessing + # `n_features_`. + X = np.array([[1, 2], [3, 4]]) + y = np.array([1, 0]) + est = Estimator().fit(X, y) + + with pytest.warns(FutureWarning, match="n_features_ was deprecated"): + est.n_features_ From fab739c480ed8641cb7e1c6fb2cc30f9346056e5 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 23 Feb 2021 17:55:20 +0100 Subject: [PATCH 199/478] CI Add workflow to check Changelog entry. 
(#19155) Co-authored-by: Nicolas Hug --- .github/workflows/check-changelog.yml | 54 +++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 .github/workflows/check-changelog.yml diff --git a/.github/workflows/check-changelog.yml b/.github/workflows/check-changelog.yml new file mode 100644 index 0000000000000..7d954c530cff8 --- /dev/null +++ b/.github/workflows/check-changelog.yml @@ -0,0 +1,54 @@ +name: Check Changelog +# This check makes sure that the changelog is properly updated +# when a PR introduces a change in a test file. +# To bypass this check, label the PR with "No Changelog Needed". +on: + pull_request: + +jobs: + check: + runs-on: ubuntu-latest + if: ${{ contains(github.event.pull_request.labels.*.name, 'No Changelog Needed') == 0 }} + steps: + - name: Get PR number and milestone + run: | + echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV + echo "TAGGED_MILESTONE=${{ github.event.pull_request.milestone.title }} >> $GITHUB_ENV + - uses: actions/checkout@v2 + with: + fetch-depth: '0' + - name: Check the changelog + run: | + set -xe + changed_files=$(git diff --name-only origin/master) + # Changelog should be updated only if tests have been modified + if [[ ! "$changed_files" =~ tests ]] + then + exit 0 + fi + all_changelogs=$(cat ./doc/whats_new/v*.rst) + if [[ "$all_changelogs" =~ :pr:\`$PR_NUMBER\` ]] + then + echo "Changelog has been updated." + # If the pull request is milestoned check the correspondent changelog + if exist -f ./doc/whats_new/v${TAGGED_MILESTONE:0:4}.rst + then + expected_changelog=$(cat ./doc/whats_new/v${TAGGED_MILESTONE:0:4}.rst) + if [[ "$expected_changelog" =~ :pr:\`$PR_NUMBER\` ]] + then + echo "Changelog and milestone correspond." + else + echo "Changelog and milestone do not correspond." + echo "If you see this error make sure that the tagged milestone for the PR" + echo "and the changelog name properly match." + exit 1 + fi + fi + else + echo "Changelog entry is missing." + echo "If you see this error and there is already a changelog entry then make sure that" + echo "the PR number is correct. If no changelog entry is required for this PR," + echo "label the PR with 'No Changelog Needed' to bypass this check." + exit 1 + fi + From e23dd851476ef54c2153d6178500a3e2345f95b4 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Tue, 23 Feb 2021 13:18:41 -0500 Subject: [PATCH 200/478] TST Does not use cache in openml test (#19534) --- sklearn/datasets/tests/test_openml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index a84e705b0db68..9f55909c6643b 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -528,12 +528,12 @@ def test_fetch_openml_as_frame_auto(monkeypatch): data_id = 61 # iris dataset version 1 _monkey_patch_webbased_functions(monkeypatch, data_id, True) - data = fetch_openml(data_id=data_id, as_frame='auto') + data = fetch_openml(data_id=data_id, as_frame='auto', cache=False) assert isinstance(data.data, pd.DataFrame) data_id = 292 # Australian dataset version 1 _monkey_patch_webbased_functions(monkeypatch, data_id, True) - data = fetch_openml(data_id=data_id, as_frame='auto') + data = fetch_openml(data_id=data_id, as_frame='auto', cache=False) assert isinstance(data.data, scipy.sparse.csr_matrix) From f2943c6d10f68dd2144a80ad4b12475a25ff635a Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 24 Feb 2021 14:44:08 +1100 Subject: [PATCH 201/478] CI Fix shell syntax introduced in #19155 --- .github/workflows/check-changelog.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-changelog.yml b/.github/workflows/check-changelog.yml index 7d954c530cff8..9560e4cb9d680 100644 --- a/.github/workflows/check-changelog.yml +++ b/.github/workflows/check-changelog.yml @@ -13,7 +13,7 @@ jobs: - name: Get PR number and milestone run: | echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV - echo "TAGGED_MILESTONE=${{ github.event.pull_request.milestone.title }} >> $GITHUB_ENV + echo "TAGGED_MILESTONE=${{ github.event.pull_request.milestone.title }}" >> $GITHUB_ENV - uses: actions/checkout@v2 with: fetch-depth: '0' From 86445abc8086305e6723993beb1a55ed7344ef19 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 24 Feb 2021 17:16:44 +1100 Subject: [PATCH 202/478] CI Fix origin/master -> origin/main in check-changelog --- .github/workflows/check-changelog.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-changelog.yml b/.github/workflows/check-changelog.yml index 9560e4cb9d680..5957744d907c7 100644 --- a/.github/workflows/check-changelog.yml +++ b/.github/workflows/check-changelog.yml @@ -20,7 +20,7 @@ jobs: - name: Check the changelog run: | set -xe - changed_files=$(git diff --name-only origin/master) + changed_files=$(git diff --name-only origin/main) # Changelog should be updated only if tests have been modified if [[ ! "$changed_files" =~ tests ]] then From 638b7689bbbfae4bcc4592c6f8a43ce86b571f0b Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 24 Feb 2021 01:23:44 -0500 Subject: [PATCH 203/478] ENH Adds nan passthrough in OrdinalEncoder (#19069) --- doc/modules/preprocessing.rst | 11 ++ doc/whats_new/v1.0.rst | 6 + sklearn/preprocessing/_encoders.py | 35 ++++- sklearn/preprocessing/tests/test_encoders.py | 137 ++++++++++++++++--- 4 files changed, 167 insertions(+), 22 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index e1b4c5599c3b5..b87971ec4ae5a 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -482,6 +482,17 @@ scikit-learn estimators, as these expect continuous input, and would interpret the categories as being ordered, which is often not desired (i.e. 
the set of browsers was ordered arbitrarily). +:class:`OrdinalEncoder` will also passthrough missing values that are +indicated by `np.nan`. + + >>> enc = preprocessing.OrdinalEncoder() + >>> X = [['male'], ['female'], [np.nan], ['female']] + >>> enc.fit_transform(X) + array([[ 1.], + [ 0.], + [nan], + [ 0.]]) + Another possibility to convert categorical features to features that can be used with scikit-learn estimators is to use a one-of-K, also known as one-hot or dummy encoding. diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 25e0b369bebd3..6a565b8d5e21b 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -123,6 +123,12 @@ Changelog not corresponding to their objective. :pr:`19172` by :user:`Mathurin Massias ` +:mod:`sklearn.preprocessing` +............................ + +- |Feature| :class:`preprocessing.OrdinalEncoder` supports passing through + missing values by default. :pr:`19069` by `Thomas Fan`_. + - |API|: The parameter ``normalize`` of :class:`linear_model.LinearRegression` is deprecated and will be removed in 1.2. Motivation for this deprecation: ``normalize`` parameter did not take any diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 342b730ba91ed..043f9fc40ef53 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -10,6 +10,7 @@ from ..utils import check_array, is_scalar_nan from ..utils.validation import check_is_fitted from ..utils.validation import _deprecate_positional_args +from ..utils._mask import _get_mask from ..utils._encode import _encode, _check_unknown, _unique @@ -752,7 +753,7 @@ def fit(self, X, y=None): if np.dtype(self.dtype).kind != 'f': raise ValueError( f"When unknown_value is np.nan, the dtype " - "parameter should be " + f"parameter should be " f"a float dtype. Got {self.dtype}." ) elif not isinstance(self.unknown_value, numbers.Integral): @@ -765,7 +766,7 @@ def fit(self, X, y=None): f"handle_unknown is 'use_encoded_value', " f"got {self.unknown_value}.") - self._fit(X) + self._fit(X, force_all_finite='allow-nan') if self.handle_unknown == 'use_encoded_value': for feature_cats in self.categories_: @@ -775,6 +776,21 @@ def fit(self, X, y=None): f"values already used for encoding the " f"seen categories.") + # stores the missing indices per category + self._missing_indices = {} + for cat_idx, categories_for_idx in enumerate(self.categories_): + for i, cat in enumerate(categories_for_idx): + if is_scalar_nan(cat): + self._missing_indices[cat_idx] = i + continue + + if np.dtype(self.dtype).kind != 'f' and self._missing_indices: + raise ValueError( + "There are missing values in features " + f"{list(self._missing_indices)}. For OrdinalEncoder to " + "passthrough missing values, the dtype parameter must be a " + "float") + return self def transform(self, X): @@ -791,9 +807,14 @@ def transform(self, X): X_out : sparse matrix or a 2-d array Transformed input. """ - X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) + X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown, + force_all_finite='allow-nan') X_trans = X_int.astype(self.dtype, copy=False) + for cat_idx, missing_idx in self._missing_indices.items(): + X_missing_mask = X_int[:, cat_idx] == missing_idx + X_trans[X_missing_mask, cat_idx] = np.nan + # create separate category for unknown values if self.handle_unknown == 'use_encoded_value': X_trans[~X_mask] = self.unknown_value @@ -814,7 +835,7 @@ def inverse_transform(self, X): Inverse transformed array. 
""" check_is_fitted(self) - X = check_array(X, accept_sparse='csr') + X = check_array(X, accept_sparse='csr', force_all_finite='allow-nan') n_samples, _ = X.shape n_features = len(self.categories_) @@ -833,6 +854,12 @@ def inverse_transform(self, X): for i in range(n_features): labels = X[:, i].astype('int64', copy=False) + + # replace values of X[:, i] that were nan with actual indices + if i in self._missing_indices: + X_i_mask = _get_mask(X[:, i], np.nan) + labels[X_i_mask] = self._missing_indices[i] + if self.handle_unknown == 'use_encoded_value': unknown_labels = labels == self.unknown_value X_tr[:, i] = self.categories_[i][np.where( diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index fd28d8c40b46c..b1eff0cad21e0 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -574,24 +574,6 @@ def test_ordinal_encoder_inverse(): enc.inverse_transform(X_tr) -@pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T, - np.array([['a', np.nan]], dtype=object).T], - ids=['numeric', 'object']) -def test_ordinal_encoder_raise_missing(X): - ohe = OrdinalEncoder() - - with pytest.raises(ValueError, match="Input contains NaN"): - ohe.fit(X) - - with pytest.raises(ValueError, match="Input contains NaN"): - ohe.fit_transform(X) - - ohe.fit(X[:1, :]) - - with pytest.raises(ValueError, match="Input contains NaN"): - ohe.transform(X) - - def test_ordinal_encoder_handle_unknowns_string(): enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-2) X_fit = np.array([['a', 'x'], ['b', 'y'], ['c', 'z']], dtype=object) @@ -930,3 +912,122 @@ def test_ohe_missing_value_support_pandas_categorical(pd_nan_type): assert len(ohe.categories_) == 1 assert_array_equal(ohe.categories_[0][:-1], ['a', 'b', 'c']) assert np.isnan(ohe.categories_[0][-1]) + + +def test_ordinal_encoder_passthrough_missing_values_float_errors_dtype(): + """Test ordinal encoder with nan passthrough fails when dtype=np.int32.""" + + X = np.array([[np.nan, 3.0, 1.0, 3.0]]).T + oe = OrdinalEncoder(dtype=np.int32) + + msg = (r"There are missing values in features \[0\]. 
For OrdinalEncoder " + "to passthrough missing values, the dtype parameter must be a " + "float") + with pytest.raises(ValueError, match=msg): + oe.fit(X) + + +def test_ordinal_encoder_passthrough_missing_values_float(): + """Test ordinal encoder with nan on float dtypes.""" + + X = np.array([[np.nan, 3.0, 1.0, 3.0]], dtype=np.float64).T + oe = OrdinalEncoder().fit(X) + + assert len(oe.categories_) == 1 + assert_allclose(oe.categories_[0], [1.0, 3.0, np.nan]) + + X_trans = oe.transform(X) + assert_allclose(X_trans, [[np.nan], [1.0], [0.0], [1.0]]) + + X_inverse = oe.inverse_transform(X_trans) + assert_allclose(X_inverse, X) + + +@pytest.mark.parametrize('pd_nan_type', ['pd.NA', 'np.nan']) +def test_ordinal_encoder_missing_value_support_pandas_categorical(pd_nan_type): + """Check ordinal encoder is compatible with pandas.""" + # checks pandas dataframe with categorical features + if pd_nan_type == 'pd.NA': + # pd.NA is in pandas 1.0 + pd = pytest.importorskip('pandas', minversion="1.0") + pd_missing_value = pd.NA + else: # np.nan + pd = pytest.importorskip('pandas') + pd_missing_value = np.nan + + df = pd.DataFrame({ + 'col1': pd.Series(['c', 'a', pd_missing_value, 'b', 'a'], + dtype='category'), + }) + + oe = OrdinalEncoder().fit(df) + assert len(oe.categories_) == 1 + assert_array_equal(oe.categories_[0][:3], ['a', 'b', 'c']) + assert np.isnan(oe.categories_[0][-1]) + + df_trans = oe.transform(df) + + assert_allclose(df_trans, [[2.0], [0.0], [np.nan], [1.0], [0.0]]) + + X_inverse = oe.inverse_transform(df_trans) + assert X_inverse.shape == (5, 1) + assert_array_equal(X_inverse[:2, 0], ['c', 'a']) + assert_array_equal(X_inverse[3:, 0], ['b', 'a']) + assert np.isnan(X_inverse[2, 0]) + + +@pytest.mark.parametrize("X, X2, cats, cat_dtype", [ + ((np.array([['a', np.nan]], dtype=object).T, + np.array([['a', 'b']], dtype=object).T, + [np.array(['a', np.nan, 'd'], dtype=object)], np.object_)), + ((np.array([['a', np.nan]], dtype=object).T, + np.array([['a', 'b']], dtype=object).T, + [np.array(['a', np.nan, 'd'], dtype=object)], np.object_)), + ((np.array([[2.0, np.nan]], dtype=np.float64).T, + np.array([[3.0]], dtype=np.float64).T, + [np.array([2.0, 4.0, np.nan])], np.float64)), + ], ids=['object-None-missing-value', 'object-nan-missing_value', + 'numeric-missing-value']) +def test_ordinal_encoder_specified_categories_missing_passthrough( + X, X2, cats, cat_dtype): + """Test ordinal encoder for specified categories.""" + oe = OrdinalEncoder(categories=cats) + exp = np.array([[0.], [np.nan]]) + assert_array_equal(oe.fit_transform(X), exp) + # manually specified categories should have same dtype as + # the data when coerced from lists + assert oe.categories_[0].dtype == cat_dtype + + # when specifying categories manually, unknown categories should already + # raise when fitting + oe = OrdinalEncoder(categories=cats) + with pytest.raises(ValueError, match="Found unknown categories"): + oe.fit(X2) + + +@pytest.mark.parametrize("X, expected_X_trans, X_test", [ + (np.array([[1.0, np.nan, 3.0]]).T, + np.array([[0.0, np.nan, 1.0]]).T, + np.array([[4.0]])), + (np.array([[1.0, 4.0, 3.0]]).T, + np.array([[0.0, 2.0, 1.0]]).T, + np.array([[np.nan]])), + (np.array([['c', np.nan, 'b']], dtype=object).T, + np.array([[1.0, np.nan, 0.0]]).T, + np.array([['d']], dtype=object)), + (np.array([['c', 'a', 'b']], dtype=object).T, + np.array([[2.0, 0.0, 1.0]]).T, + np.array([[np.nan]], dtype=object)), +]) +def test_ordinal_encoder_handle_missing_and_unknown( + X, expected_X_trans, X_test +): + """Test the interaction 
between missing values and handle_unknown""" + + oe = OrdinalEncoder(handle_unknown="use_encoded_value", + unknown_value=-1) + + X_trans = oe.fit_transform(X) + assert_allclose(X_trans, expected_X_trans) + + assert_allclose(oe.transform(X_test), [[-1.0]]) From c748e465c76c43a173ad5ab2fd82639210f8e895 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 25 Feb 2021 10:21:52 +0100 Subject: [PATCH 204/478] FIX Don't scale near-constant features to large values (#19527) --- doc/whats_new/v1.0.rst | 7 +++ sklearn/linear_model/_base.py | 6 ++- sklearn/linear_model/tests/test_base.py | 24 ++++++--- sklearn/preprocessing/_data.py | 34 ++++++++++--- sklearn/preprocessing/tests/test_data.py | 63 ++++++++++++++++++++++-- 5 files changed, 113 insertions(+), 21 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 6a565b8d5e21b..3e36438dda095 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -193,6 +193,13 @@ Changelog positioning strategy ``knots``. :pr:`18368` by :user:`Christian Lorentzen `. +- |Fix| :func:`preprocessing.scale`, :class:`preprocessing.StandardScaler` + and similar scalers detect near-constant features to avoid scaling them to + very large values. This problem happens in particular when using a scaler on + sparse data with a constant column with sample weights, in which case + centering is typically disabled. :pr:`19527` by :user:`Oliver Grisel + ` and :user:`Maria Telenczuk `. + :mod:`sklearn.tree` ................... diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 61005cb4b5d4a..28cc386b4ecda 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -246,9 +246,13 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, X_var = X_var.astype(X.dtype, copy=False) if normalize: + # Detect constant features on the computed variance, before taking + # the np.sqrt. Otherwise constant features cannot be detected with + # sample_weights. + constant_mask = X_var < 10 * np.finfo(X.dtype).eps X_var *= X.shape[0] X_scale = np.sqrt(X_var, out=X_var) - X_scale[X_scale < 10 * np.finfo(X_scale.dtype).eps] = 1. + X_scale[constant_mask] = 1. if sp.issparse(X): inplace_column_scale(X, 1. / X_scale) else: diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index 56ee18f5f0d06..bf7a2696fcda2 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -478,10 +478,8 @@ def test_preprocess_data_weighted(is_sparse): # better check the impact of feature scaling. X[:, 0] *= 10 - # Constant non-zero feature: this edge-case is currently not handled - # correctly for sparse data, see: - # https://github.com/scikit-learn/scikit-learn/issues/19450 - # X[:, 2] = 1. + # Constant non-zero feature. + X[:, 2] = 1. # Constant zero feature (non-materialized in the sparse case) X[:, 3] = 0. 
@@ -495,10 +493,12 @@ def test_preprocess_data_weighted(is_sparse): X_sample_weight_var = np.average((X - X_sample_weight_avg)**2, weights=sample_weight, axis=0) + constant_mask = X_sample_weight_var < 10 * np.finfo(X.dtype).eps + assert_array_equal(constant_mask, [0, 0, 1, 1]) expected_X_scale = np.sqrt(X_sample_weight_var) * np.sqrt(n_samples) # near constant features should not be scaled - expected_X_scale[expected_X_scale < 10 * np.finfo(np.float64).eps] = 1 + expected_X_scale[constant_mask] = 1 if is_sparse: X = sparse.csr_matrix(X) @@ -538,14 +538,22 @@ def test_preprocess_data_weighted(is_sparse): # _preprocess_data with normalize=True scales the data by the feature-wise # euclidean norms while StandardScaler scales the data by the feature-wise # standard deviations. - # The two are equivalent up to a ratio of np.sqrt(n_samples) + # The two are equivalent up to a ratio of np.sqrt(n_samples). if is_sparse: scaler = StandardScaler(with_mean=False).fit( X, sample_weight=sample_weight) + # Non-constant features are scaled similarly with np.sqrt(n_samples) assert_array_almost_equal( - scaler.transform(X).toarray() / np.sqrt(n_samples), Xt.toarray() - ) + scaler.transform(X).toarray()[:, :2] / np.sqrt(n_samples), + Xt.toarray()[:, :2] + ) + + # Constant features go through un-scaled. + assert_array_almost_equal( + scaler.transform(X).toarray()[:, 2:], + Xt.toarray()[:, 2:] + ) else: scaler = StandardScaler(with_mean=True).fit( X, sample_weight=sample_weight) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 92a4135147b87..29190dd6e2b67 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -60,22 +60,36 @@ ] -def _handle_zeros_in_scale(scale, copy=True): - """Makes sure that whenever scale is zero, we handle it correctly. +def _handle_zeros_in_scale(scale, copy=True, constant_mask=None): + """Set scales of near constant features to 1. - This happens in most scalers when we have constant features. - """ + The goal is to avoid division by very small or zero values. + + Near constant features are detected automatically by identifying + scales close to machine precision unless they are precomputed by + the caller and passed with the `constant_mask` kwarg. + Typically for standard scaling, the scales are the standard + deviation while near constant features are better detected on the + computed variances which are closer to machine precision by + construction. + """ # if we are fitting on 1D arrays, scale might be a scalar if np.isscalar(scale): if scale == .0: scale = 1. return scale elif isinstance(scale, np.ndarray): + if constant_mask is None: + # Detect near constant values to avoid dividing by a very small + # value that could lead to suprising results and numerical + # stability issues. 
+ constant_mask = scale < 10 * np.finfo(scale.dtype).eps + if copy: # New array to avoid side-effects scale = scale.copy() - scale[scale == 0.0] = 1.0 + scale[constant_mask] = 1.0 return scale @@ -408,7 +422,7 @@ def partial_fit(self, X, y=None): data_range = data_max - data_min self.scale_ = ((feature_range[1] - feature_range[0]) / - _handle_zeros_in_scale(data_range)) + _handle_zeros_in_scale(data_range, copy=True)) self.min_ = feature_range[0] - data_min * self.scale_ self.data_min_ = data_min self.data_max_ = data_max @@ -850,7 +864,11 @@ def partial_fit(self, X, y=None, sample_weight=None): self.n_samples_seen_ = self.n_samples_seen_[0] if self.with_std: - self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_)) + # Extract the list of near constant features on the raw variances, + # before taking the square root. + constant_mask = self.var_ < 10 * np.finfo(X.dtype).eps + self.scale_ = _handle_zeros_in_scale( + np.sqrt(self.var_), copy=False, constant_mask=constant_mask) else: self.scale_ = None @@ -1078,7 +1096,7 @@ def partial_fit(self, X, y=None): self.n_samples_seen_ += X.shape[0] self.max_abs_ = max_abs - self.scale_ = _handle_zeros_in_scale(max_abs) + self.scale_ = _handle_zeros_in_scale(max_abs, copy=True) return self def transform(self, X): diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 974dad31258eb..fdd88be0ccff4 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -414,6 +414,62 @@ def test_standard_scaler_dtype(add_sample_weight, sparse_constructor): assert scaler.scale_.dtype == np.float64 +@pytest.mark.parametrize("scaler", [ + StandardScaler(with_mean=False), + RobustScaler(with_centering=False), +]) +@pytest.mark.parametrize("sparse_constructor", + [np.asarray, sparse.csc_matrix, sparse.csr_matrix]) +@pytest.mark.parametrize("add_sample_weight", [False, True]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("constant", [0, 1., 100.]) +def test_standard_scaler_constant_features( + scaler, add_sample_weight, sparse_constructor, dtype, constant): + if (isinstance(scaler, StandardScaler) + and constant > 1 + and sparse_constructor is not np.asarray + and add_sample_weight): + # https://github.com/scikit-learn/scikit-learn/issues/19546 + pytest.xfail("Computation of weighted variance is numerically unstable" + " for sparse data. See: #19546.") + + if isinstance(scaler, RobustScaler) and add_sample_weight: + pytest.skip(f"{scaler.__class__.__name__} does not yet support" + f" sample_weight") + + rng = np.random.RandomState(0) + n_samples = 100 + n_features = 1 + if add_sample_weight: + fit_params = dict(sample_weight=rng.uniform(size=n_samples) * 2) + else: + fit_params = {} + X_array = np.full(shape=(n_samples, n_features), fill_value=constant, + dtype=dtype) + X = sparse_constructor(X_array) + X_scaled = scaler.fit(X, **fit_params).transform(X) + + if isinstance(scaler, StandardScaler): + # The variance info should be close to zero for constant features. + assert_allclose(scaler.var_, np.zeros(X.shape[1]), atol=1e-7) + + # Constant features should not be scaled (scale of 1.): + assert_allclose(scaler.scale_, np.ones(X.shape[1])) + + if hasattr(X_scaled, "toarray"): + assert_allclose(X_scaled.toarray(), X_array) + else: + assert_allclose(X_scaled, X) + + if isinstance(scaler, StandardScaler) and not add_sample_weight: + # Also check consistency with the standard scale function. 
+ X_scaled_2 = scale(X, with_mean=scaler.with_mean) + if hasattr(X_scaled_2, "toarray"): + assert_allclose(X_scaled_2.toarray(), X_scaled_2.toarray()) + else: + assert_allclose(X_scaled_2, X_scaled_2) + + def test_scale_1d(): # 1-d inputs X_list = [1., 3., 5., 0.] @@ -538,12 +594,11 @@ def test_scaler_float16_overflow(): def test_handle_zeros_in_scale(): - s1 = np.array([0, 1, 2, 3]) + s1 = np.array([0, 1e-16, 1, 2, 3]) s2 = _handle_zeros_in_scale(s1, copy=True) - assert not s1[0] == s2[0] - assert_array_equal(s1, np.array([0, 1, 2, 3])) - assert_array_equal(s2, np.array([1, 1, 2, 3])) + assert_allclose(s1, np.array([0, 1e-16, 1, 2, 3])) + assert_allclose(s2, np.array([1, 1, 1, 2, 3])) def test_minmax_scaler_partial_fit(): From 15d2df47b12d0bb3243d76d71401f9f4cc71caeb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Carlos=20Alfaro=20Jim=C3=A9nez?= Date: Thu, 25 Feb 2021 10:22:30 +0100 Subject: [PATCH 205/478] MNT Clear travis installation script (#19532) --- build_tools/travis/install_main.sh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/build_tools/travis/install_main.sh b/build_tools/travis/install_main.sh index 383fefa5bd1a3..c0795139859bb 100755 --- a/build_tools/travis/install_main.sh +++ b/build_tools/travis/install_main.sh @@ -32,12 +32,7 @@ ccache --max-size 100M --show-stats # to setup a conda-based environment instead deactivate -if [[ $TRAVIS_CPU_ARCH == arm64 ]]; then - # Different Miniconda URL for ARM64 architectures - MINICONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-aarch64.sh" -else - MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh" -fi +MINICONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-aarch64.sh" # Install Miniconda wget $MINICONDA_URL -O miniconda.sh From 70af34c4afd34dbb604ef888846b2d62e93cf225 Mon Sep 17 00:00:00 2001 From: DS_anas <32871888+anashas@users.noreply.github.com> Date: Thu, 25 Feb 2021 10:23:41 +0100 Subject: [PATCH 206/478] TST Use pytest.warns in sklearn.semi_supervised tests (#19510) --- .../tests/test_label_propagation.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py index 652f83b90a3d6..9f355281d9881 100644 --- a/sklearn/semi_supervised/tests/test_label_propagation.py +++ b/sklearn/semi_supervised/tests/test_label_propagation.py @@ -4,8 +4,6 @@ import pytest from scipy.sparse import issparse -from sklearn.utils._testing import assert_warns -from sklearn.utils._testing import assert_no_warnings from sklearn.semi_supervised import _label_propagation as label_propagation from sklearn.metrics.pairwise import rbf_kernel from sklearn.model_selection import train_test_split @@ -143,18 +141,25 @@ def test_convergence_warning(): X = np.array([[1., 0.], [0., 1.], [1., 2.5]]) y = np.array([0, 1, -1]) mdl = label_propagation.LabelSpreading(kernel='rbf', max_iter=1) - assert_warns(ConvergenceWarning, mdl.fit, X, y) + warn_msg = ('max_iter=1 was reached without convergence.') + with pytest.warns(ConvergenceWarning, match=warn_msg): + mdl.fit(X, y) assert mdl.n_iter_ == mdl.max_iter mdl = label_propagation.LabelPropagation(kernel='rbf', max_iter=1) - assert_warns(ConvergenceWarning, mdl.fit, X, y) + with pytest.warns(ConvergenceWarning, match=warn_msg): + mdl.fit(X, y) assert mdl.n_iter_ == mdl.max_iter mdl = 
label_propagation.LabelSpreading(kernel='rbf', max_iter=500) - assert_no_warnings(mdl.fit, X, y) + with pytest.warns(None) as record: + mdl.fit(X, y) + assert len(record) == 0 mdl = label_propagation.LabelPropagation(kernel='rbf', max_iter=500) - assert_no_warnings(mdl.fit, X, y) + with pytest.warns(None) as record: + mdl.fit(X, y) + assert len(record) == 0 @pytest.mark.parametrize("LabelPropagationCls", @@ -170,7 +175,9 @@ def test_label_propagation_non_zero_normalizer(LabelPropagationCls): mdl = LabelPropagationCls(kernel='knn', max_iter=100, n_neighbors=1) - assert_no_warnings(mdl.fit, X, y) + with pytest.warns(None) as record: + mdl.fit(X, y) + assert len(record) == 0 def test_predict_sparse_callable_kernel(): From 97fbf4eb2e162fc1bedbfd7fa4b65bc70af9f6a4 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Thu, 25 Feb 2021 11:02:54 +0100 Subject: [PATCH 207/478] [CI] Add trigging events to check-changelog workflow. (#19545) Co-authored-by: Joel Nothman --- .github/workflows/check-changelog.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-changelog.yml b/.github/workflows/check-changelog.yml index 5957744d907c7..753f473354131 100644 --- a/.github/workflows/check-changelog.yml +++ b/.github/workflows/check-changelog.yml @@ -4,6 +4,7 @@ name: Check Changelog # To bypass this check, label the PR with "No Changelog Needed". on: pull_request: + types: [opened, edited, labeled, unlabeled, synchronize] jobs: check: @@ -51,4 +52,3 @@ jobs: echo "label the PR with 'No Changelog Needed' to bypass this check." exit 1 fi - From 052efae8916080bd26722e7027cbfdf9296077f2 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 25 Feb 2021 06:02:08 -0500 Subject: [PATCH 208/478] TST Allows isotonic and manifold (#19539) --- sklearn/tests/test_common.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 4f6f232a8f716..6a4702aefa34c 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -267,8 +267,6 @@ def test_search_cv(estimator, check, request): 'calibration', 'compose', 'feature_extraction', - 'isotonic', - 'manifold', 'mixture', 'model_selection', 'multiclass', From 12db86ee1ba602f9352a49cab8d731b4dc55cd08 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 25 Feb 2021 08:59:27 -0500 Subject: [PATCH 209/478] ENH Checks n_features_in_ after fitting in random_projection (#19541) --- sklearn/random_projection.py | 5 ++--- sklearn/tests/test_common.py | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 33dc108a59a4e..8e968088e8141 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -38,7 +38,7 @@ from .utils import check_random_state from .utils.extmath import safe_sparse_dot from .utils.random import sample_without_replacement -from .utils.validation import check_array, check_is_fitted +from .utils.validation import check_is_fitted from .utils.validation import _deprecate_positional_args from .exceptions import DataDimensionalityWarning @@ -402,9 +402,8 @@ def transform(self, X): X_new : {ndarray, sparse matrix} of shape (n_samples, n_components) Projected array. 
""" - X = check_array(X, accept_sparse=['csr', 'csc']) - check_is_fitted(self) + X = self._validate_data(X, accept_sparse=['csr', 'csc'], reset=False) if X.shape[1] != self.components_.shape[1]: raise ValueError( diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 6a4702aefa34c..859335843fd76 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -272,7 +272,6 @@ def test_search_cv(estimator, check, request): 'multiclass', 'multioutput', 'pipeline', - 'random_projection', } N_FEATURES_IN_AFTER_FIT_ESTIMATORS = [ From 139d75148ee22f1aa4f44ca561a47b62b4864801 Mon Sep 17 00:00:00 2001 From: Vangelis Gkiastas <50487017+egkiastas@users.noreply.github.com> Date: Thu, 25 Feb 2021 16:11:20 +0200 Subject: [PATCH 210/478] DOC Update calibration.rst (#19557) --- doc/modules/calibration.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/calibration.rst b/doc/modules/calibration.rst index 146601d70765e..d0a9737dac612 100644 --- a/doc/modules/calibration.rst +++ b/doc/modules/calibration.rst @@ -181,7 +181,7 @@ common kernel functions on various benchmark datasets in section 2.1 of Platt 1999 [3]_ but does not necessarily hold in general. Additionally, the logistic model works best if the calibration error is symmetrical, meaning the classifier output for each binary class is normally distributed with -the same variance [6]_. This is can be a problem for highly imbalanced +the same variance [6]_. This can be a problem for highly imbalanced classification problems, where outputs do not have equal variance. In general this method is most effective when the un-calibrated model is From 94abe05b4b96de2ca30d998fb9adb2fbd3eb1bde Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 25 Feb 2021 09:16:14 -0500 Subject: [PATCH 211/478] ENH Enables common test for bicluster (#19542) --- sklearn/cluster/_bicluster.py | 13 +++++++++++++ sklearn/tests/test_common.py | 3 --- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index 6d293206bddd8..3bde33399a8e0 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -178,6 +178,19 @@ def _k_means(self, data, n_clusters): labels = model.labels_ return centroid, labels + def _more_tags(self): + return { + "_xfail_checks": { + "check_estimators_dtypes": "raises nan error", + "check_fit2d_1sample": "_scale_normalize fails", + "check_fit2d_1feature": "raises apply_along_axis error", + "check_estimator_sparse_data": "does not fail gracefully", + "check_methods_subset_invariance": "empty array passed inside", + "check_dont_overwrite_parameters": "empty array passed inside", + "check_fit2d_predict1d": "emptry array passed inside", + } + } + class SpectralCoclustering(BaseSpectral): """Spectral Co-Clustering algorithm (Dhillon, 2001). 
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 859335843fd76..bfd7f98268350 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -23,7 +23,6 @@ from sklearn.utils.estimator_checks import check_estimator import sklearn -from sklearn.base import BiclusterMixin from sklearn.decomposition import PCA from sklearn.linear_model._base import LinearClassifierMixin @@ -73,8 +72,6 @@ def test_get_check_estimator_ids(val, expected): def _tested_estimators(): for name, Estimator in all_estimators(): - if issubclass(Estimator, BiclusterMixin): - continue try: estimator = _construct_instance(Estimator) except SkipTest: From e0f0c7f8533550dc73822c93837bf1c609659096 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 27 Feb 2021 04:02:58 +0100 Subject: [PATCH 212/478] DOC Fix documentation on pickle portability (#19561) Co-authored-by: Olivier Grisel --- doc/modules/model_persistence.rst | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/doc/modules/model_persistence.rst b/doc/modules/model_persistence.rst index 19d3e12205c12..e00212d80fd10 100644 --- a/doc/modules/model_persistence.rst +++ b/doc/modules/model_persistence.rst @@ -59,10 +59,10 @@ Security & maintainability limitations pickle (and joblib by extension), has some issues regarding maintainability and security. Because of this, -* Never unpickle untrusted data as it could lead to malicious code being +* Never unpickle untrusted data as it could lead to malicious code being executed upon loading. -* While models saved using one version of scikit-learn might load in - other versions, this is entirely unsupported and inadvisable. It should +* While models saved using one version of scikit-learn might load in + other versions, this is entirely unsupported and inadvisable. It should also be kept in mind that operations performed on such data could give different and unexpected results. @@ -77,12 +77,11 @@ additional metadata should be saved along the pickled model: This should make it possible to check that the cross-validation score is in the same range as before. -Since a model internal representation may be different on two different -architectures, dumping a model on one architecture and loading it on -another architecture is not a supported behaviour, even if it might work -on some cases. -To overcome the issue of portability, pickle models are often deployed in -production using containers, like docker. +Aside for a few exceptions, pickled models should be portable across +architectures assuming the same versions of dependencies and Python are used. +If you encounter an estimator that is not portable please open an issue on +GitHub. Pickled models are often deployed in production using containers, like +Docker, in order to freeze the environment and dependencies. If you want to know more about these issues and explore other possible serialization methods, please refer to this @@ -108,7 +107,7 @@ models between different machine learning frameworks, and to improve their portability on different computing architectures. More details are available from the `ONNX tutorial `_. To convert scikit-learn model to ONNX a specific tool `sklearn-onnx -`_ has been developed. +`_ has been developed. 
PMML is an implementation of the `XML `_ document standard From f0a6f054e03bbdba96219b9698760583b3e5037e Mon Sep 17 00:00:00 2001 From: mlondschien <61679398+mlondschien@users.noreply.github.com> Date: Sat, 27 Feb 2021 13:09:43 +0100 Subject: [PATCH 213/478] FIX bug in SplineTransformer.n_features_out_ (#19577) --- sklearn/preprocessing/_polynomial.py | 4 ++-- sklearn/preprocessing/tests/test_polynomial.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 47ab90be2ebcd..26587e7f05823 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -307,7 +307,7 @@ def fit(self, X, y=None): ] self.bsplines_ = bsplines - self.n_features_out_ = n_out - n_features * self.include_bias + self.n_features_out_ = n_out - n_features * (1 - self.include_bias) return self def transform(self, X): @@ -336,7 +336,7 @@ def transform(self, X): # Note that scipy BSpline returns float64 arrays and converts input # x=X[:, i] to c-contiguous float64. - n_out = self.n_features_out_ + n_features * self.include_bias + n_out = self.n_features_out_ + n_features * (1 - self.include_bias) if X.dtype in FLOAT_DTYPES: dtype = X.dtype else: diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py index 9dd65c44d8bba..2ca3260f7c05e 100644 --- a/sklearn/preprocessing/tests/test_polynomial.py +++ b/sklearn/preprocessing/tests/test_polynomial.py @@ -243,3 +243,19 @@ def test_spline_transformer_kbindiscretizer(): # Though they should be exactly equal, we test approximately with high # accuracy. assert_allclose(splines, kbins, rtol=1e-13) + + +@pytest.mark.parametrize("n_knots", [5, 10]) +@pytest.mark.parametrize("include_bias", [True, False]) +@pytest.mark.parametrize("degree", [3, 5]) +def test_spline_transformer_n_features_out(n_knots, include_bias, degree): + """Test that transform results in n_features_out_ features.""" + splt = SplineTransformer( + n_knots=n_knots, + degree=degree, + include_bias=include_bias + ) + X = np.linspace(0, 1, 10)[:, None] + splt.fit(X) + + assert splt.transform(X).shape[1] == splt.n_features_out_ From 0df7abfc87fd7aa875d0ee5ad133c455b2ded423 Mon Sep 17 00:00:00 2001 From: Steven Kolawole <45284829+SteveKola@users.noreply.github.com> Date: Sat, 27 Feb 2021 16:01:46 +0100 Subject: [PATCH 214/478] TST replace asert_warns by pytest.warns in compose/tests (#19492) Co-authored-by: Olivier Grisel Co-authored-by: Olivier Grisel --- sklearn/compose/tests/test_target.py | 7 +++---- sklearn/neighbors/tests/test_nearest_centroid.py | 1 - 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/sklearn/compose/tests/test_target.py b/sklearn/compose/tests/test_target.py index 573518b3fa43a..dc5d8d95743ef 100644 --- a/sklearn/compose/tests/test_target.py +++ b/sklearn/compose/tests/test_target.py @@ -8,7 +8,6 @@ from sklearn.dummy import DummyRegressor from sklearn.utils._testing import assert_allclose -from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import assert_no_warnings from sklearn.preprocessing import FunctionTransformer @@ -54,9 +53,9 @@ def test_transform_target_regressor_invertible(): regr = TransformedTargetRegressor(regressor=LinearRegression(), func=np.sqrt, inverse_func=np.log, check_inverse=True) - assert_warns_message(UserWarning, "The provided functions or transformer" - " are not strictly inverse of each other.", - regr.fit, X, y) + with 
pytest.warns(UserWarning, match="The provided functions or" + " transformer are not strictly inverse of each other."): + regr.fit(X, y) regr = TransformedTargetRegressor(regressor=LinearRegression(), func=np.sqrt, inverse_func=np.log) regr.set_params(check_inverse=False) diff --git a/sklearn/neighbors/tests/test_nearest_centroid.py b/sklearn/neighbors/tests/test_nearest_centroid.py index f91cae74b0585..9af02b07e2a96 100644 --- a/sklearn/neighbors/tests/test_nearest_centroid.py +++ b/sklearn/neighbors/tests/test_nearest_centroid.py @@ -1,7 +1,6 @@ """ Testing for the nearest centroid module. """ - import numpy as np import pytest from scipy import sparse as sp From 15c2c72e27c6ea18566f4e786506c7a3aef8a5de Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 27 Feb 2021 12:35:40 -0500 Subject: [PATCH 215/478] FIX Do not call get_feature_names for empty column selections (#19579) --- doc/whats_new/v0.24.rst | 7 +++++++ sklearn/compose/_column_transformer.py | 3 +-- sklearn/compose/tests/test_column_transformer.py | 10 ++++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 6f2584dccdd10..5ac6f74f3d7df 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -12,6 +12,13 @@ Version 0.24.2 Changelog --------- +:mod:`sklearn.compose` +...................... + +- |Fix| :meth:`compose.ColumnTransformer.get_feature_names` does not call + :term:`get_feature_names` on transformers with an empty column selection. + :pr:`19579` by `Thomas Fan`_. + :mod:`sklearn.ensemble` ....................... diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 3d71c1e5abbf5..c0444fe2d6cda 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -351,8 +351,7 @@ def get_feature_names(self): check_is_fitted(self) feature_names = [] for name, trans, column, _ in self._iter(fitted=True): - if trans == 'drop' or ( - hasattr(column, '__len__') and not len(column)): + if trans == 'drop' or _is_empty_column_selection(column): continue if trans == 'passthrough': if self._feature_names_in is not None: diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index f2a32d6f065f4..ae2e25b68210f 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1420,3 +1420,13 @@ def test_sk_visual_block_remainder_fitted_numpy(remainder): assert visual_block.names == ('scale', 'remainder') assert visual_block.name_details == ([0, 2], [1]) assert visual_block.estimators == (scaler, remainder) + + +@pytest.mark.parametrize("selector", [[], [False, False]]) +def test_get_feature_names_empty_selection(selector): + """Test that get_feature_names is only called for transformers that + were selected. Non-regression test for #19550. 
+ """ + ct = ColumnTransformer([('ohe', OneHotEncoder(drop='first'), selector)]) + ct.fit([[1, 2], [3, 4]]) + assert ct.get_feature_names() == [] From c00c4bd7e441fbe181302a74d24f8b08c67abff3 Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Mon, 1 Mar 2021 09:36:59 +0000 Subject: [PATCH 216/478] Change assert_raises to pytest_raises (#19509) Co-authored-by: Alihan Zihna --- sklearn/tests/test_calibration.py | 17 +++++++++-------- sklearn/tests/test_check_build.py | 7 ++++--- sklearn/tests/test_metaestimators.py | 12 ++++++------ 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py index 7c3ccd06815b3..4ba1599eba3e6 100644 --- a/sklearn/tests/test_calibration.py +++ b/sklearn/tests/test_calibration.py @@ -13,7 +13,7 @@ from sklearn.utils._testing import (assert_array_almost_equal, assert_almost_equal, assert_array_equal, - assert_raises, ignore_warnings) + ignore_warnings) from sklearn.utils.extmath import softmax from sklearn.exceptions import NotFittedError from sklearn.datasets import make_classification, make_blobs @@ -60,7 +60,8 @@ def test_calibration(data, method, ensemble): prob_pos_clf = clf.predict_proba(X_test)[:, 1] cal_clf = CalibratedClassifierCV(clf, cv=y.size + 1, ensemble=ensemble) - assert_raises(ValueError, cal_clf.fit, X, y) + with pytest.raises(ValueError): + cal_clf.fit(X, y) # Naive Bayes with calibration for this_X_train, this_X_test in [(X_train, X_test), @@ -386,8 +387,8 @@ def test_sigmoid_calibration(): # check that _SigmoidCalibration().fit only accepts 1d array or 2d column # arrays - assert_raises(ValueError, _SigmoidCalibration().fit, - np.vstack((exF, exF)), exY) + with pytest.raises(ValueError): + _SigmoidCalibration().fit(np.vstack((exF, exF)), exY) def test_calibration_curve(): @@ -406,8 +407,8 @@ def test_calibration_curve(): # probabilities outside [0, 1] should not be accepted when normalize # is set to False - assert_raises(ValueError, calibration_curve, [1.1], [-0.1], - normalize=False) + with pytest.raises(ValueError): + calibration_curve([1.1], [-0.1], normalize=False) # test that quantiles work as expected y_true2 = np.array([0, 0, 0, 0, 1, 1]) @@ -421,8 +422,8 @@ def test_calibration_curve(): assert_almost_equal(prob_pred_quantile, [0.1, 0.8]) # Check that error is raised when invalid strategy is selected - assert_raises(ValueError, calibration_curve, y_true2, y_pred2, - strategy='percentile') + with pytest.raises(ValueError): + calibration_curve(y_true2, y_pred2, strategy='percentile') @pytest.mark.parametrize('ensemble', [True, False]) diff --git a/sklearn/tests/test_check_build.py b/sklearn/tests/test_check_build.py index a7799ad1b3789..3c8e64e1ba906 100644 --- a/sklearn/tests/test_check_build.py +++ b/sklearn/tests/test_check_build.py @@ -5,10 +5,11 @@ # Author: G Varoquaux # License: BSD 3 clause -from sklearn.__check_build import raise_build_error +import pytest -from sklearn.utils._testing import assert_raises +from sklearn.__check_build import raise_build_error def test_raise_build_error(): - assert_raises(ImportError, raise_build_error, ImportError()) + with pytest.raises(ImportError): + raise_build_error(ImportError()) diff --git a/sklearn/tests/test_metaestimators.py b/sklearn/tests/test_metaestimators.py index 69a994c1b5fc0..2caa01d71c444 100644 --- a/sklearn/tests/test_metaestimators.py +++ b/sklearn/tests/test_metaestimators.py @@ -2,11 +2,11 @@ import functools import numpy as np +import pytest from sklearn.base import BaseEstimator from 
sklearn.datasets import make_classification -from sklearn.utils._testing import assert_raises from sklearn.utils.validation import check_is_fitted from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV, RandomizedSearchCV @@ -124,12 +124,12 @@ def score(self, X, y, *args, **kwargs): % (delegator_data.name, method)) # delegation before fit raises a NotFittedError if method == 'score': - assert_raises(NotFittedError, getattr(delegator, method), - delegator_data.fit_args[0], - delegator_data.fit_args[1]) + with pytest.raises(NotFittedError): + getattr(delegator, method)(delegator_data.fit_args[0], + delegator_data.fit_args[1]) else: - assert_raises(NotFittedError, getattr(delegator, method), - delegator_data.fit_args[0]) + with pytest.raises(NotFittedError): + getattr(delegator, method)(delegator_data.fit_args[0]) delegator.fit(*delegator_data.fit_args) for method in methods: From 8d3b4241120a1290c0477e77beb7d2fff454462e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 1 Mar 2021 14:45:01 +0100 Subject: [PATCH 217/478] FIX race condition in get_data_home causing FileExistsError (#19560) --- sklearn/datasets/_base.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index adcad1474550a..17d2db9f2075b 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -12,7 +12,7 @@ import shutil from collections import namedtuple from os import environ, listdir, makedirs -from os.path import dirname, exists, expanduser, isdir, join, splitext +from os.path import dirname, expanduser, isdir, join, splitext from ..utils import Bunch from ..utils import check_random_state @@ -52,8 +52,7 @@ def get_data_home(data_home=None) -> str: data_home = environ.get('SCIKIT_LEARN_DATA', join('~', 'scikit_learn_data')) data_home = expanduser(data_home) - if not exists(data_home): - makedirs(data_home) + makedirs(data_home, exist_ok=True) return data_home From 72db93cc40884f42e05e4290d6ab63713d0075c9 Mon Sep 17 00:00:00 2001 From: Mohamed Haseeb Date: Mon, 1 Mar 2021 18:07:56 +0100 Subject: [PATCH 218/478] TST replaces assert_raises* by pytest.raises in model_selection/tests/test_split.py (#19585) Co-authored-by: Cycks --- sklearn/model_selection/tests/test_split.py | 79 +++++++++++++-------- 1 file changed, 48 insertions(+), 31 deletions(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 183a2eab84b63..e6900c90e7a87 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -10,8 +10,6 @@ from itertools import permutations from sklearn.utils._testing import assert_allclose -from sklearn.utils._testing import assert_raises -from sklearn.utils._testing import assert_raises_regexp from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_raise_message @@ -206,11 +204,14 @@ def test_kfold_valueerrors(): # classes are less than n_splits. 
y = np.array([3, 3, -1, -1, 2]) - assert_raises(ValueError, next, skf_3.split(X2, y)) + with pytest.raises(ValueError): + next(skf_3.split(X2, y)) # Error when number of folds is <= 1 - assert_raises(ValueError, KFold, 0) - assert_raises(ValueError, KFold, 1) + with pytest.raises(ValueError): + KFold(0) + with pytest.raises(ValueError): + KFold(1) error_string = ("k-fold cross-validation requires at least one" " train/test split") assert_raise_message(ValueError, error_string, @@ -219,13 +220,18 @@ def test_kfold_valueerrors(): StratifiedKFold, 1) # When n_splits is not integer: - assert_raises(ValueError, KFold, 1.5) - assert_raises(ValueError, KFold, 2.0) - assert_raises(ValueError, StratifiedKFold, 1.5) - assert_raises(ValueError, StratifiedKFold, 2.0) + with pytest.raises(ValueError): + KFold(1.5) + with pytest.raises(ValueError): + KFold(2.0) + with pytest.raises(ValueError): + StratifiedKFold(1.5) + with pytest.raises(ValueError): + StratifiedKFold(2.0) # When shuffle is not a bool: - assert_raises(TypeError, KFold, n_splits=4, shuffle=None) + with pytest.raises(TypeError): + KFold(n_splits=4, shuffle=None) def test_kfold_indices(): @@ -565,24 +571,25 @@ def test_stratified_shuffle_split_init(): X = np.arange(7) y = np.asarray([0, 1, 1, 1, 2, 2, 2]) # Check that error is raised if there is a class with only one sample - assert_raises(ValueError, next, - StratifiedShuffleSplit(3, 0.2).split(X, y)) + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(3, 0.2).split(X, y)) # Check that error is raised if the test set size is smaller than n_classes - assert_raises(ValueError, next, StratifiedShuffleSplit(3, 2).split(X, y)) + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(3, 2).split(X, y)) # Check that error is raised if the train set size is smaller than # n_classes - assert_raises(ValueError, next, - StratifiedShuffleSplit(3, 3, 2).split(X, y)) + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(3, 3, 2).split(X, y)) X = np.arange(9) y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2]) # Train size or test size too small - assert_raises(ValueError, next, - StratifiedShuffleSplit(train_size=2).split(X, y)) - assert_raises(ValueError, next, - StratifiedShuffleSplit(test_size=2).split(X, y)) + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(train_size=2).split(X, y)) + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(test_size=2).split(X, y)) def test_stratified_shuffle_split_respects_test_size(): @@ -845,9 +852,9 @@ def test_leave_one_p_group_out(): assert lpgo_1.get_n_splits(groups=np.arange(4)) == 4 # raise ValueError if a `groups` parameter is illegal - with assert_raises(ValueError): + with pytest.raises(ValueError): logo.get_n_splits(None, None, [0.0, np.nan, 0.0]) - with assert_raises(ValueError): + with pytest.raises(ValueError): lpgo_2.get_n_splits(None, None, [0.0, np.inf, 0.0]) msg = "The 'groups' parameter should not be None." 
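The test-suite patches in this series replace the old `assert_raises` / `assert_warns` helpers with plain pytest context managers. A self-contained sketch of the two idioms being adopted (note that the `pytest.warns(None)` form used in some of the patches above was later deprecated by pytest itself):

import warnings

import pytest


def strictly_positive(x):
    if x <= 0:
        raise ValueError("x must be strictly positive")
    return x


def legacy_api():
    warnings.warn("legacy_api is deprecated", FutureWarning)


def test_raises_with_match():
    # The `match` argument is a regular expression searched in str(error).
    with pytest.raises(ValueError, match="strictly positive"):
        strictly_positive(-1)


def test_warns_with_match():
    with pytest.warns(FutureWarning, match="deprecated"):
        legacy_api()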
@@ -911,8 +918,10 @@ def test_leave_one_p_group_out_error_on_fewer_number_of_groups(): def test_repeated_cv_value_errors(): # n_repeats is not integer or <= 0 for cv in (RepeatedKFold, RepeatedStratifiedKFold): - assert_raises(ValueError, cv, n_repeats=0) - assert_raises(ValueError, cv, n_repeats=1.5) + with pytest.raises(ValueError): + cv(n_repeats=0) + with pytest.raises(ValueError): + cv(n_repeats=1.5) @pytest.mark.parametrize( @@ -954,7 +963,8 @@ def test_repeated_kfold_determinstic_split(): assert_array_equal(train, [2, 3, 4]) assert_array_equal(test, [0, 1]) - assert_raises(StopIteration, next, splits) + with pytest.raises(StopIteration): + next(splits) def test_get_n_splits_for_repeated_kfold(): @@ -1002,7 +1012,8 @@ def test_repeated_stratified_kfold_determinstic_split(): assert_array_equal(train, [0, 1, 4]) assert_array_equal(test, [2, 3]) - assert_raises(StopIteration, next, splits) + with pytest.raises(StopIteration): + next(splits) def test_train_test_split_errors(): @@ -1258,7 +1269,8 @@ def test_check_cv(): cv = check_cv(3, y_multioutput, classifier=True) np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X))) - assert_raises(ValueError, check_cv, cv="lolo") + with pytest.raises(ValueError): + check_cv(cv="lolo") def test_cv_iterable_wrapper(): @@ -1375,17 +1387,22 @@ def test_group_kfold(): # Should fail if there are more folds than groups groups = np.array([1, 1, 1, 2, 2]) X = y = np.ones(len(groups)) - assert_raises_regexp(ValueError, "Cannot have number of splits.*greater", - next, GroupKFold(n_splits=3).split(X, y, groups)) + with pytest.raises( + ValueError, + match="Cannot have number of splits.*greater" + ): + next(GroupKFold(n_splits=3).split(X, y, groups)) def test_time_series_cv(): X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]] # Should fail if there are more folds than samples - assert_raises_regexp(ValueError, "Cannot have number of folds.*greater", - next, - TimeSeriesSplit(n_splits=7).split(X)) + with pytest.raises( + ValueError, + match="Cannot have number of folds.*greater" + ): + next(TimeSeriesSplit(n_splits=7).split(X)) tscv = TimeSeriesSplit(2) From 192952affa8d7db7902d3dd3bba6062bb296d294 Mon Sep 17 00:00:00 2001 From: Samuel Brice <7470577+samdbrice@users.noreply.github.com> Date: Mon, 1 Mar 2021 20:04:58 -0500 Subject: [PATCH 219/478] FIX Deep copy criterion in trees to fix concurrency bug (#19580) Co-authored-by: Samuel Brice --- doc/whats_new/v0.24.rst | 10 ++++++++++ sklearn/ensemble/tests/test_forest.py | 18 ++++++++++++++++++ sklearn/tree/_classes.py | 5 +++++ 3 files changed, 33 insertions(+) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 5ac6f74f3d7df..84e712c05ea79 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -48,6 +48,16 @@ Changelog :class:`~sklearn.semi_supervised.LabelPropagation`. :pr:`19271` by :user:`Zhaowei Wang `. +:mod:`sklearn.tree` +....................... + +- |Fix| Fix a bug in `fit` of :class:`tree.BaseDecisionTree` that caused + segmentation faults under certain conditions. `fit` now deep copies the + `Criterion` object to prevent shared concurrent accesses. + :pr:`19580` by :user:`Samuel Brice ` and + :user:`Alex Adamson ` and + :user:`Wil Yegelwel `. + :mod:`sklearn.utils` .................... 
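A rough sketch of the idea behind this fix, reusing the private `MSE` criterion exactly as the smoke test below does (assumptions: `MSE(n_outputs, n_samples)` is internal API and may change; with the patch applied, `fit` performs the deep copy itself, so the explicit `copy.deepcopy` here only illustrates why per-tree copies of a shared mutable criterion avoid concurrent-state problems):

import copy

from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree._criterion import MSE   # private API, see smoke test below

X, y = make_regression(n_samples=50, n_features=4, random_state=0)
shared_criterion = MSE(1, X.shape[0])     # a single mutable criterion object

# Each tree gets its own deep copy, so two fits running in parallel cannot
# trample each other's internal buffers.
trees = [
    DecisionTreeRegressor(criterion=copy.deepcopy(shared_criterion),
                          random_state=i).fit(X, y)
    for i in range(2)
]
print([tree.get_depth() for tree in trees])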
diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index c05cad26708b4..efb1a645842bc 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -1494,3 +1494,21 @@ def test_n_features_deprecation(Estimator): with pytest.warns(FutureWarning, match="n_features_ was deprecated"): est.n_features_ + + +@pytest.mark.parametrize('Forest', FOREST_REGRESSORS) +def test_mse_criterion_object_segfault_smoke_test(Forest): + # This is a smoke test to ensure that passing a mutable criterion + # does not cause a segfault when fitting with concurrent threads. + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/12623 + from sklearn.tree._criterion import MSE + + y = y_reg.reshape(-1, 1) + n_samples, n_outputs = y.shape + mse_criterion = MSE(n_outputs, n_samples) + est = FOREST_REGRESSORS[Forest]( + n_estimators=2, n_jobs=2, criterion=mse_criterion + ) + + est.fit(X_reg, y) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index c09ebe388aa5d..f7ae823c0070f 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -16,6 +16,7 @@ import numbers import warnings +import copy from abc import ABCMeta from abc import abstractmethod from math import ceil @@ -349,6 +350,10 @@ def fit(self, X, y, sample_weight=None, check_input=True, else: criterion = CRITERIA_REG[self.criterion](self.n_outputs_, n_samples) + else: + # Make a deepcopy in case the criterion has mutable attributes that + # might be shared and modified concurrently during parallel fitting + criterion = copy.deepcopy(criterion) SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS From 482a7781bd7ab01ab2afe1682e6bfa64c93611f5 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 2 Mar 2021 11:41:34 +0100 Subject: [PATCH 220/478] MAINT Update _arff.py (#19597) --- sklearn/externals/_arff.py | 175 +++++++++++++++++-------------------- 1 file changed, 79 insertions(+), 96 deletions(-) diff --git a/sklearn/externals/_arff.py b/sklearn/externals/_arff.py index 8330eec8adb87..ccfbbc5e5e971 100644 --- a/sklearn/externals/_arff.py +++ b/sklearn/externals/_arff.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # ============================================================================= # Federal University of Rio Grande do Sul (UFRGS) # Connectionist Artificial Intelligence Laboratory (LIAC) @@ -98,7 +97,7 @@ The above keys must follow the case which were described, i.e., the keys are case sensitive. The attribute type ``attribute_type`` must be one of these strings (they are not case sensitive): ``NUMERIC``, ``INTEGER``, ``REAL`` or -``STRING``. For nominal attributes, the ``attribute_type`` must be a list of +``STRING``. For nominal attributes, the ``atribute_type`` must be a list of strings. 
In this format, the XOR dataset presented above can be represented as a python @@ -138,7 +137,7 @@ - Supports read and write the descriptions of files; - Supports missing values and names with spaces; - Supports unicode values and names; -- Fully compatible with Python 2.7+, Python 3.3+, pypy and pypy3; +- Fully compatible with Python 2.7+, Python 3.5+, pypy and pypy3; - Under `MIT License `_ ''' @@ -148,12 +147,11 @@ 'joel.nothman@gmail.com') __version__ = '2.4.0' -from typing import TYPE_CHECKING -from typing import Optional, List, Dict, Any, Iterator, Union, Tuple - import re import sys import csv +import typing +from typing import Optional, List, Dict, Any, Iterator, Union, Tuple # CONSTANTS =================================================================== _SIMPLE_TYPES = ['NUMERIC', 'REAL', 'INTEGER', 'STRING'] @@ -166,7 +164,6 @@ _RE_RELATION = re.compile(r'^([^\{\}%,\s]*|\".*\"|\'.*\')$', re.UNICODE) _RE_ATTRIBUTE = re.compile(r'^(\".*\"|\'.*\'|[^\{\}%,\s]*)\s+(.+)$', re.UNICODE) -_RE_TYPE_NOMINAL = re.compile(r'^\{\s*((\".*\"|\'.*\'|\S*)\s*,\s*)*(\".*\"|\'.*\'|\S*)\s*\}$', re.UNICODE) _RE_QUOTE_CHARS = re.compile(r'["\'\\\s%,\000-\031]', re.UNICODE) _RE_ESCAPE_CHARS = re.compile(r'(?=["\'\\%])|[\n\r\t\000-\031]') _RE_SPARSE_LINE = re.compile(r'^\s*\{.*\}\s*$', re.UNICODE) @@ -176,7 +173,7 @@ ArffSparseDataType = Tuple[List, ...] -if TYPE_CHECKING: +if typing.TYPE_CHECKING: # typing_extensions is available when mypy is installed from typing_extensions import TypedDict @@ -218,10 +215,10 @@ def _build_re_values(): dense = re.compile(r'''(?x) , # may follow ',' \s* - ((?=,)|$|%(value_re)s) # empty or value + ((?=,)|$|{value_re}) # empty or value | (\S.*) # error - ''' % {'value_re': value_re}) + '''.format(value_re=value_re)) # This captures (key, value) groups and will have an empty key/value # in case of syntax errors. 
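The module being refactored here converts ARFF files to and from the nested dict layout described in its docstring. A small round-trip sketch against the vendored helpers (attribute names mimic the docstring's XOR example; `ArffEncoder` / `ArffDecoder` are internal to `sklearn.externals._arff`, while the standalone `liac-arff` package exposes the same object layout):

from sklearn.externals._arff import ArffDecoder, ArffEncoder

xor_dataset = {
    'description': 'XOR Dataset',
    'relation': 'XOR',
    'attributes': [
        ('input1', 'REAL'),
        ('input2', 'REAL'),
        ('y', 'REAL'),
    ],
    'data': [
        [0.0, 0.0, 0.0],
        [0.0, 1.0, 1.0],
        [1.0, 0.0, 1.0],
        [1.0, 1.0, 0.0],
    ],
}

text = ArffEncoder().encode(xor_dataset)   # ARFF file as a single string
decoded = ArffDecoder().decode(text)       # back to the dict layout (DENSE)

assert decoded['relation'] == 'XOR'
assert decoded['data'] == xor_dataset['data']
print(text.splitlines()[:3])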
@@ -240,6 +237,7 @@ def _build_re_values(): return dense, sparse + _RE_DENSE_VALUES, _RE_SPARSE_KEY_VALUES = _build_re_values() @@ -265,10 +263,10 @@ def _escape_sub_callback(match): if len(s) == 2: try: return _ESCAPE_SUB_MAP[s] - except KeyError as e: - raise ValueError('Unsupported escape sequence: %s' % s) from e + except KeyError: + raise ValueError('Unsupported escape sequence: %s' % s) if s[1] == 'u': - return unichr(int(s[2:], 16)) + return chr(int(s[2:], 16)) else: return chr(int(s[1:], 8)) @@ -303,8 +301,8 @@ def _parse_values(s): # an ARFF syntax error in sparse data for match in _RE_SPARSE_KEY_VALUES.finditer(s): if not match.group(1): - raise BadLayout('Error parsing %r' % match.group()) from exc - raise BadLayout('Unknown parsing error') from exc + raise BadLayout('Error parsing %r' % match.group()) + raise BadLayout('Unknown parsing error') else: # an ARFF syntax error for match in _RE_DENSE_VALUES.finditer(s): @@ -321,24 +319,10 @@ def _parse_values(s): LOD_GEN = 4 # Generator of dictionaries _SUPPORTED_DATA_STRUCTURES = [DENSE, COO, LOD, DENSE_GEN, LOD_GEN] -# ============================================================================= - -# COMPATIBILITY WITH PYTHON 3 ================================================= -PY3 = sys.version_info[0] == 3 -if PY3: - unicode = str - basestring = str - xrange = range - unichr = chr -# COMPABILITY WITH PYTHON 2 =================================================== -# ============================================================================= -PY2 = sys.version_info[0] == 2 -if PY2: - from itertools import izip as zip # EXCEPTIONS ================================================================== class ArffException(Exception): - message : Optional[str] = None + message: Optional[str] = None def __init__(self): self.line = -1 @@ -357,7 +341,7 @@ class BadAttributeFormat(ArffException): class BadDataFormat(ArffException): '''Error raised when some data instance is in an invalid format.''' def __init__(self, value): - super(BadDataFormat, self).__init__() + super().__init__() self.message = ( 'Bad @DATA instance format in line %d: ' + ('%s' % value) @@ -373,7 +357,7 @@ class BadAttributeName(ArffException): declaration.''' def __init__(self, value, value2): - super(BadAttributeName, self).__init__() + super().__init__() self.message = ( ('Bad @ATTRIBUTE name %s at line' % value) + ' %d, this name is already in use in line' + @@ -385,7 +369,7 @@ class BadNominalValue(ArffException): declared into it respective attribute declaration.''' def __init__(self, value): - super(BadNominalValue, self).__init__() + super().__init__() self.message = ( ('Data value %s not found in nominal declaration, ' % value) + 'at line %d.' @@ -394,7 +378,7 @@ def __init__(self, value): class BadNominalFormatting(ArffException): '''Error raised when a nominal value with space is not properly quoted.''' def __init__(self, value): - super(BadNominalFormatting, self).__init__() + super().__init__() self.message = ( ('Nominal data value "%s" not properly quoted in line ' % value) + '%d.' @@ -414,7 +398,7 @@ class BadLayout(ArffException): message = 'Invalid layout of the ARFF file, at line %d.' 
def __init__(self, msg=''): - super(BadLayout, self).__init__() + super().__init__() if msg: self.message = BadLayout.message + ' ' + msg.replace('%', '%%') @@ -437,11 +421,11 @@ def _unescape_sub_callback(match): def encode_string(s): if _RE_QUOTE_CHARS.search(s): - return u"'%s'" % _RE_ESCAPE_CHARS.sub(_unescape_sub_callback, s) + return "'%s'" % _RE_ESCAPE_CHARS.sub(_unescape_sub_callback, s) return s -class EncodedNominalConversor(object): +class EncodedNominalConversor: def __init__(self, values): self.values = {v: i for i, v in enumerate(values)} self.values[0] = 0 @@ -449,11 +433,11 @@ def __init__(self, values): def __call__(self, value): try: return self.values[value] - except KeyError as e: - raise BadNominalValue(value) from e + except KeyError: + raise BadNominalValue(value) -class NominalConversor(object): +class NominalConversor: def __init__(self, values): self.values = set(values) self.zero_value = values[0] @@ -467,10 +451,10 @@ def __call__(self, value): # with EncodedNominalConversor. return self.zero_value raise BadNominalValue(value) - return unicode(value) + return str(value) -class DenseGeneratorData(object): +class DenseGeneratorData: '''Internal helper class to allow for different matrix types without making the code a huge collection of if statements.''' @@ -483,7 +467,7 @@ def decode_rows(self, stream, conversors): raise BadDataFormat(row) # XXX: int 0 is used for implicit values, not '0' values = [values[i] if i in values else 0 for i in - xrange(len(conversors))] + range(len(conversors))] else: if len(values) != len(conversors): raise BadDataFormat(row) @@ -498,7 +482,7 @@ def _decode_values(values, conversors): in zip(conversors, values)] except ValueError as exc: if 'float: ' in str(exc): - raise BadNumericalValue from exc + raise BadNumericalValue() return values def encode_data(self, data, attributes): @@ -522,27 +506,27 @@ def encode_data(self, data, attributes): new_data = [] for value in inst: - if value is None or value == u'' or value != value: + if value is None or value == '' or value != value: s = '?' 
else: - s = encode_string(unicode(value)) + s = encode_string(str(value)) new_data.append(s) current_row += 1 - yield u','.join(new_data) + yield ','.join(new_data) -class _DataListMixin(object): +class _DataListMixin: """Mixin to return a list from decode_rows instead of a generator""" def decode_rows(self, stream, conversors): - return list(super(_DataListMixin, self).decode_rows(stream, conversors)) + return list(super().decode_rows(stream, conversors)) class Data(_DataListMixin, DenseGeneratorData): pass -class COOData(object): +class COOData: def decode_rows(self, stream, conversors): data, rows, cols = [], [], [] for i, row in enumerate(stream): @@ -557,11 +541,11 @@ def decode_rows(self, stream, conversors): for key, value in zip(row_cols, values)] except ValueError as exc: if 'float: ' in str(exc): - raise BadNumericalValue from exc + raise BadNumericalValue() raise - except IndexError as e: + except IndexError: # conversor out of range - raise BadDataFormat(row) from e + raise BadDataFormat(row) data.extend(values) rows.extend([i] * len(values)) @@ -579,7 +563,7 @@ def encode_data(self, data, attributes): data = data.data # Check if the rows are sorted - if not all(row[i] <= row[i + 1] for i in xrange(len(row) - 1)): + if not all(row[i] <= row[i + 1] for i in range(len(row) - 1)): raise ValueError("liac-arff can only output COO matrices with " "sorted rows.") @@ -587,7 +571,7 @@ def encode_data(self, data, attributes): if row > current_row: # Add empty rows if necessary while current_row < row: - yield " ".join([u"{", u','.join(new_data), u"}"]) + yield " ".join(["{", ','.join(new_data), "}"]) new_data = [] current_row += 1 @@ -597,15 +581,15 @@ def encode_data(self, data, attributes): (current_row, col + 1, num_attributes) ) - if v is None or v == u'' or v != v: + if v is None or v == '' or v != v: s = '?' else: - s = encode_string(unicode(v)) + s = encode_string(str(v)) new_data.append("%d %s" % (col, s)) - yield " ".join([u"{", u','.join(new_data), u"}"]) + yield " ".join(["{", ','.join(new_data), "}"]) -class LODGeneratorData(object): +class LODGeneratorData: def decode_rows(self, stream, conversors): for row in stream: values = _parse_values(row) @@ -617,11 +601,11 @@ def decode_rows(self, stream, conversors): for key, value in values.items()} except ValueError as exc: if 'float: ' in str(exc): - raise BadNumericalValue from exc + raise BadNumericalValue() raise - except IndexError as e: + except IndexError: # conversor out of range - raise BadDataFormat(row) from e + raise BadDataFormat(row) def encode_data(self, data, attributes): current_row = 0 @@ -638,14 +622,14 @@ def encode_data(self, data, attributes): for col in sorted(row): v = row[col] - if v is None or v == u'' or v != v: + if v is None or v == '' or v != v: s = '?' 
else: - s = encode_string(unicode(v)) + s = encode_string(str(v)) new_data.append("%d %s" % (col, s)) current_row += 1 - yield " ".join([u"{", u','.join(new_data), u"}"]) + yield " ".join(["{", ','.join(new_data), "}"]) class LODData(_DataListMixin, LODGeneratorData): pass @@ -680,7 +664,7 @@ def _get_data_object_for_encoding(matrix): # ============================================================================= # ADVANCED INTERFACE ========================================================== -class ArffDecoder(object): +class ArffDecoder: '''An ARFF decoder.''' def __init__(self): @@ -724,7 +708,7 @@ def _decode_relation(self, s): if not _RE_RELATION.match(v): raise BadRelationFormat() - res = unicode(v.strip('"\'')) + res = str(v.strip('"\'')) return res def _decode_attribute(self, s): @@ -766,20 +750,20 @@ def _decode_attribute(self, s): name, type_ = m.groups() # Extracts the final name - name = unicode(name.strip('"\'')) + name = str(name.strip('"\'')) # Extracts the final type - if _RE_TYPE_NOMINAL.match(type_): + if type_[:1] == "{" and type_[-1:] == "}": try: type_ = _parse_values(type_.strip('{} ')) - except Exception as e: - raise BadAttributeType from e + except Exception: + raise BadAttributeType() if isinstance(type_, dict): raise BadAttributeType() else: # If not nominal, verify the type name - type_ = unicode(type_).upper() + type_ = str(type_).upper() if type_ not in ['NUMERIC', 'REAL', 'INTEGER', 'STRING']: raise BadAttributeType() @@ -792,15 +776,15 @@ def _decode(self, s, encode_nominal=False, matrix_type=DENSE): self._current_line = 0 # If string, convert to a list of lines - if isinstance(s, basestring): + if isinstance(s, str): s = s.strip('\r\n ').replace('\r\n', '\n').split('\n') # Create the return object obj: ArffContainerType = { - u'description': u'', - u'relation': u'', - u'attributes': [], - u'data': [] + 'description': '', + 'relation': '', + 'attributes': [], + 'data': [] } attribute_names = {} @@ -852,7 +836,7 @@ def _decode(self, s, encode_nominal=False, matrix_type=DENSE): else: conversor = NominalConversor(attr[1]) else: - CONVERSOR_MAP = {'STRING': unicode, + CONVERSOR_MAP = {'STRING': str, 'INTEGER': lambda x: int(float(x)), 'NUMERIC': float, 'REAL': float} @@ -915,7 +899,7 @@ def decode(self, s, encode_nominal=False, return_type=DENSE): raise e -class ArffEncoder(object): +class ArffEncoder: '''An ARFF encoder.''' def _encode_comment(self, s=''): @@ -931,9 +915,9 @@ def _encode_comment(self, s=''): :return: a string with the encoded comment line. ''' if s: - return u'%s %s'%(_TK_COMMENT, s) + return '%s %s'%(_TK_COMMENT, s) else: - return u'%s' % _TK_COMMENT + return '%s' % _TK_COMMENT def _encode_relation(self, name): '''(INTERNAL) Decodes a relation line. @@ -949,7 +933,7 @@ def _encode_relation(self, name): name = '"%s"'%name break - return u'%s %s'%(_TK_RELATION, name) + return '%s %s'%(_TK_RELATION, name) def _encode_attribute(self, name, type_): '''(INTERNAL) Encodes an attribute line. @@ -980,20 +964,20 @@ def _encode_attribute(self, name, type_): break if isinstance(type_, (tuple, list)): - type_tmp = [u'%s' % encode_string(type_k) for type_k in type_] - type_ = u'{%s}'%(u', '.join(type_tmp)) + type_tmp = ['%s' % encode_string(type_k) for type_k in type_] + type_ = '{%s}'%(', '.join(type_tmp)) - return u'%s %s %s'%(_TK_ATTRIBUTE, name, type_) + return '%s %s %s'%(_TK_ATTRIBUTE, name, type_) def encode(self, obj): '''Encodes a given object to an ARFF file. :param obj: the object containing the ARFF information. 
- :return: the ARFF file as an unicode string. + :return: the ARFF file as an string. ''' data = [row for row in self.iter_encode(obj)] - return u'\n'.join(data) + return '\n'.join(data) def iter_encode(self, obj): '''The iterative version of `arff.ArffEncoder.encode`. @@ -1002,7 +986,7 @@ def iter_encode(self, obj): lines of the ARFF file. :param obj: the object containing the ARFF information. - :return: (yields) the ARFF file as unicode strings. + :return: (yields) the ARFF file as strings. ''' # DESCRIPTION if obj.get('description', None): @@ -1014,7 +998,7 @@ def iter_encode(self, obj): raise BadObject('Relation name not found or with invalid value.') yield self._encode_relation(obj['relation']) - yield u'' + yield '' # ATTRIBUTES if not obj.get('attributes'): @@ -1025,10 +1009,10 @@ def iter_encode(self, obj): # Verify for bad object format if not isinstance(attr, (tuple, list)) or \ len(attr) != 2 or \ - not isinstance(attr[0], basestring): + not isinstance(attr[0], str): raise BadObject('Invalid attribute declaration "%s"'%str(attr)) - if isinstance(attr[1], basestring): + if isinstance(attr[1], str): # Verify for invalid types if attr[1] not in _SIMPLE_TYPES: raise BadObject('Invalid attribute type "%s"'%str(attr)) @@ -1045,17 +1029,16 @@ def iter_encode(self, obj): attribute_names.add(attr[0]) yield self._encode_attribute(attr[0], attr[1]) - yield u'' + yield '' attributes = obj['attributes'] # DATA yield _TK_DATA if 'data' in obj: data = _get_data_object_for_encoding(obj.get('data')) - for line in data.encode_data(obj.get('data'), attributes): - yield line + yield from data.encode_data(obj.get('data'), attributes) - yield u'' + yield '' # ============================================================================= @@ -1108,7 +1091,7 @@ def dump(obj, fp): last_row = next(generator) for row in generator: - fp.write(last_row + u'\n') + fp.write(last_row + '\n') last_row = row fp.write(last_row) From 5c0bbb0a4a4e674ba8017e3cdc664e0b7c7c8dc0 Mon Sep 17 00:00:00 2001 From: Mohamed Haseeb Date: Tue, 2 Mar 2021 12:14:26 +0100 Subject: [PATCH 221/478] TST replaces assert_raise* by pytest.raises in model_selection (#19592) Co-authored-by: Cycks --- .../model_selection/tests/test_validation.py | 184 +++++++++--------- 1 file changed, 95 insertions(+), 89 deletions(-) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 8bb853bcd51b4..c280d1e8ef140 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -15,9 +15,6 @@ from sklearn.model_selection.tests.test_search import FailingClassifier from sklearn.utils._testing import assert_almost_equal -from sklearn.utils._testing import assert_raises -from sklearn.utils._testing import assert_raise_message -from sklearn.utils._testing import assert_raises_regex from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_allclose @@ -125,7 +122,6 @@ def _is_training_data(self, X): def partial_fit(self, X, y=None, **params): self.train_sizes += X.shape[0] self.x = X[0] - if self.expected_fit_params: missing = set(self.expected_fit_params) - set(params) if missing: @@ -281,7 +277,8 @@ def test_cross_val_score(): clf = CheckingClassifier(check_y=list_check) scores = cross_val_score(clf, X, y2.tolist(), cv=3) - assert_raises(ValueError, cross_val_score, clf, X, y2, scoring="sklearn") + with pytest.raises(ValueError): + 
cross_val_score(clf, X, y2, scoring="sklearn") # test with 3d X and X_3d = X[:, :, np.newaxis] @@ -289,8 +286,8 @@ def test_cross_val_score(): scores = cross_val_score(clf, X_3d, y2) clf = MockClassifier(allow_nd=False) - assert_raises(ValueError, cross_val_score, clf, X_3d, y2, - error_score='raise') + with pytest.raises(ValueError): + cross_val_score(clf, X_3d, y2, error_score='raise') def test_cross_validate_many_jobs(): @@ -312,38 +309,39 @@ def test_cross_validate_invalid_scoring_param(): # List/tuple of callables should raise a message advising users to use # dict of names to callables mapping - assert_raises_regex(ValueError, error_message_regexp, - cross_validate, estimator, X, y, - scoring=(make_scorer(precision_score), - make_scorer(accuracy_score))) - assert_raises_regex(ValueError, error_message_regexp, - cross_validate, estimator, X, y, - scoring=(make_scorer(precision_score),)) + with pytest.raises(ValueError, match=error_message_regexp): + cross_validate(estimator, X, y, scoring=(make_scorer(precision_score), + make_scorer(accuracy_score))) + with pytest.raises(ValueError, match=error_message_regexp): + cross_validate(estimator, X, y, + scoring=(make_scorer(precision_score),)) # So should empty lists/tuples - assert_raises_regex(ValueError, error_message_regexp + "Empty list.*", - cross_validate, estimator, X, y, scoring=()) + with pytest.raises( + ValueError, + match=error_message_regexp + "Empty list.*" + ): + cross_validate(estimator, X, y, scoring=()) # So should duplicated entries - assert_raises_regex(ValueError, error_message_regexp + "Duplicate.*", - cross_validate, estimator, X, y, - scoring=('f1_micro', 'f1_micro')) + with pytest.raises(ValueError, match=error_message_regexp + "Duplicate.*"): + cross_validate(estimator, X, y, scoring=('f1_micro', 'f1_micro')) # Nested Lists should raise a generic error message - assert_raises_regex(ValueError, error_message_regexp, - cross_validate, estimator, X, y, - scoring=[[make_scorer(precision_score)]]) + with pytest.raises(ValueError, match=error_message_regexp): + cross_validate(estimator, X, y, + scoring=[[make_scorer(precision_score)]]) error_message_regexp = (".*scoring is invalid.*Refer to the scoring " "glossary for details:.*") # Empty dict should raise invalid scoring error - assert_raises_regex(ValueError, "An empty dict", - cross_validate, estimator, X, y, scoring=(dict())) + with pytest.raises(ValueError, match="An empty dict"): + cross_validate(estimator, X, y, scoring=(dict())) # And so should any other invalid entry - assert_raises_regex(ValueError, error_message_regexp, - cross_validate, estimator, X, y, scoring=5) + with pytest.raises(ValueError, match=error_message_regexp): + cross_validate(estimator, X, y, scoring=5) multiclass_scorer = make_scorer(precision_recall_fscore_support) @@ -359,8 +357,11 @@ def test_cross_validate_invalid_scoring_param(): with pytest.warns(UserWarning, match=warning_message): cross_validate(estimator, X, y, scoring={"foo": multiclass_scorer}) - assert_raises_regex(ValueError, "'mse' is not a valid scoring value.", - cross_validate, SVC(), X, y, scoring="mse") + with pytest.raises( + ValueError, + match="'mse' is not a valid scoring value." + ): + cross_validate(SVC(), X, y, scoring="mse") def test_cross_validate_nested_estimator(): @@ -532,13 +533,12 @@ def test_cross_val_score_predict_groups(): group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(), GroupShuffleSplit()] + error_message = "The 'groups' parameter should not be None." 
for cv in group_cvs: - assert_raise_message(ValueError, - "The 'groups' parameter should not be None.", - cross_val_score, estimator=clf, X=X, y=y, cv=cv) - assert_raise_message(ValueError, - "The 'groups' parameter should not be None.", - cross_val_predict, estimator=clf, X=X, y=y, cv=cv) + with pytest.raises(ValueError, match=error_message): + cross_val_score(estimator=clf, X=X, y=y, cv=cv) + with pytest.raises(ValueError, match=error_message): + cross_val_predict(estimator=clf, X=X, y=y, cv=cv) @pytest.mark.filterwarnings('ignore: Using or importing the ABCs from') @@ -597,12 +597,13 @@ def test_cross_val_score_precomputed(): # Error raised for non-square X svm = SVC(kernel="precomputed") - assert_raises(ValueError, cross_val_score, svm, X, y) + with pytest.raises(ValueError): + cross_val_score(svm, X, y) # test error is raised when the precomputed kernel is not array-like # or sparse - assert_raises(ValueError, cross_val_score, svm, - linear_kernel.tolist(), y) + with pytest.raises(ValueError): + cross_val_score(svm, linear_kernel.tolist(), y) def test_cross_val_score_fit_params(): @@ -657,7 +658,8 @@ def test_cross_val_score_errors(): class BrokenEstimator: pass - assert_raises(TypeError, cross_val_score, BrokenEstimator(), X) + with pytest.raises(TypeError): + cross_val_score(BrokenEstimator(), X) def test_cross_val_score_with_score_func_classification(): @@ -851,7 +853,8 @@ def split(self, X, y=None, groups=None): for i in range(4): yield np.array([0, 1, 2, 3]), np.array([4, 5, 6, 7, 8]) - assert_raises(ValueError, cross_val_predict, est, X, y, cv=BadCV()) + with pytest.raises(ValueError): + cross_val_predict(est, X, y, cv=BadCV()) X, y = load_iris(return_X_y=True) @@ -882,15 +885,15 @@ def test_cross_val_predict_decision_function_shape(): # class. X = X[:100] y = y[:100] - assert_raise_message(ValueError, - 'Only 1 class/es in training fold,' - ' but 2 in overall dataset. This' - ' is not supported for decision_function' - ' with imbalanced folds. To fix ' - 'this, use a cross-validation technique ' - 'resulting in properly stratified folds', - cross_val_predict, RidgeClassifier(), X, y, - method='decision_function', cv=KFold(2)) + error_message = 'Only 1 class/es in training fold,'\ + ' but 2 in overall dataset. This'\ + ' is not supported for decision_function'\ + ' with imbalanced folds. To fix '\ + 'this, use a cross-validation technique '\ + 'resulting in properly stratified folds' + with pytest.raises(ValueError, match=error_message): + cross_val_predict(RidgeClassifier(), X, y, method='decision_function', + cv=KFold(2)) X, y = load_digits(return_X_y=True) est = SVC(kernel='linear', decision_function_shape='ovo') @@ -902,12 +905,13 @@ def test_cross_val_predict_decision_function_shape(): ind = np.argsort(y) X, y = X[ind], y[ind] - assert_raises_regex(ValueError, - r'Output shape \(599L?, 21L?\) of decision_function ' - r'does not match number of classes \(7\) in fold. ' - 'Irregular decision_function .*', - cross_val_predict, est, X, y, - cv=KFold(n_splits=3), method='decision_function') + error_message_regexp = r'Output shape \(599L?, 21L?\) of ' \ + 'decision_function does not match number of ' \ + r'classes \(7\) in fold. 
Irregular ' \ + 'decision_function .*' + with pytest.raises(ValueError, match=error_message_regexp): + cross_val_predict(est, X, y, cv=KFold(n_splits=3), + method='decision_function') def test_cross_val_predict_predict_proba_shape(): @@ -1126,8 +1130,8 @@ def test_learning_curve_incremental_learning_not_possible(): n_clusters_per_class=1, random_state=0) # The mockup does not have partial_fit() estimator = MockImprovingEstimator(1) - assert_raises(ValueError, learning_curve, estimator, X, y, - exploit_incremental_learning=True) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, exploit_incremental_learning=True) def test_learning_curve_incremental_learning(): @@ -1190,16 +1194,16 @@ def test_learning_curve_n_sample_range_out_of_bounds(): n_redundant=0, n_classes=2, n_clusters_per_class=1, random_state=0) estimator = MockImprovingEstimator(20) - assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - train_sizes=[0, 1]) - assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - train_sizes=[0.0, 1.0]) - assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - train_sizes=[0.1, 1.1]) - assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - train_sizes=[0, 20]) - assert_raises(ValueError, learning_curve, estimator, X, y, cv=3, - train_sizes=[1, 21]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0, 1]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0.0, 1.0]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0.1, 1.1]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0, 20]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[1, 21]) def test_learning_curve_remove_duplicate_sample_sizes(): @@ -1253,9 +1257,10 @@ def test_learning_curve_with_shuffle(): np.array([0.75, 0.3, 0.36111111])) assert_array_almost_equal(test_scores_batch.mean(axis=1), np.array([0.36111111, 0.25, 0.25])) - assert_raises(ValueError, learning_curve, estimator, X, y, cv=cv, n_jobs=1, - train_sizes=np.linspace(0.3, 1.0, 3), groups=groups, - error_score='raise') + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=cv, n_jobs=1, + train_sizes=np.linspace(0.3, 1.0, 3), groups=groups, + error_score='raise') train_sizes_inc, train_scores_inc, test_scores_inc = learning_curve( estimator, X, y, cv=cv, n_jobs=1, train_sizes=np.linspace(0.3, 1.0, 3), @@ -1709,8 +1714,8 @@ def test_score_memmap(): score = np.memmap(tf.name, shape=(), mode='r', dtype=np.float64) try: cross_val_score(clf, X, y, scoring=lambda est, X, y: score) - assert_raises(ValueError, cross_val_score, clf, X, y, - scoring=lambda est, X, y: scores) + with pytest.raises(ValueError): + cross_val_score(clf, X, y, scoring=lambda est, X, y: scores) finally: # Best effort to release the mmap file handles before deleting the # backing file under Windows @@ -1785,26 +1790,28 @@ def test_warn_trace(msg): fit_and_score_kwargs = {'error_score': 'raise'} # check if exception was raised, with default error_score='raise' - assert_raise_message(ValueError, "Failing classifier failed as required", - _fit_and_score, *fit_and_score_args, - **fit_and_score_kwargs) + with pytest.raises( + ValueError, + match="Failing classifier failed as required" + ): + _fit_and_score(*fit_and_score_args, **fit_and_score_kwargs) # check that functions upstream pass error_score param to _fit_and_score - error_message = ("error_score must 
be the string 'raise' or a" - " numeric value. (Hint: if using 'raise', please" - " make sure that it has been spelled correctly.)") - - assert_raise_message(ValueError, error_message, cross_validate, - failing_clf, X, cv=3, error_score='unvalid-string') + error_message = re.escape( + "error_score must be the string 'raise' or a numeric value. (Hint: if " + "using 'raise', please make sure that it has been spelled correctly.)" + ) + with pytest.raises(ValueError, match=error_message): + cross_validate(failing_clf, X, cv=3, error_score='unvalid-string') - assert_raise_message(ValueError, error_message, cross_val_score, - failing_clf, X, cv=3, error_score='unvalid-string') + with pytest.raises(ValueError, match=error_message): + cross_val_score(failing_clf, X, cv=3, error_score='unvalid-string') - assert_raise_message(ValueError, error_message, learning_curve, - failing_clf, X, y, cv=3, error_score='unvalid-string') + with pytest.raises(ValueError, match=error_message): + learning_curve(failing_clf, X, y, cv=3, error_score='unvalid-string') - assert_raise_message(ValueError, error_message, validation_curve, - failing_clf, X, y, param_name='parameter', + with pytest.raises(ValueError, match=error_message): + validation_curve(failing_clf, X, y, param_name='parameter', param_range=[FailingClassifier.FAILING_PARAMETER], cv=3, error_score='unvalid-string') @@ -1907,7 +1914,6 @@ def test_cross_validate_failing_scorer( assert_allclose(results[key], error_score) - def three_params_scorer(i, j, k): return 3.4213 @@ -1952,8 +1958,8 @@ def test_score(): def two_params_scorer(estimator, X_test): return None fit_and_score_args = [None, None, None, two_params_scorer] - assert_raise_message(ValueError, error_message, - _score, *fit_and_score_args, error_score=np.nan) + with pytest.raises(ValueError, match=error_message): + _score(*fit_and_score_args, error_score=np.nan) def test_callable_multimetric_confusion_matrix_cross_validate(): From 28ee486b44f8e7e6440f3439e7315ba1e6d35e43 Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Tue, 2 Mar 2021 11:19:56 +0000 Subject: [PATCH 222/478] TST Change assert to pytest style in tests/test_discriminant.py (#19558) Co-authored-by: Alihan Zihna Co-authored-by: Olivier Grisel --- sklearn/tests/test_discriminant_analysis.py | 71 ++++++++++++++------- 1 file changed, 49 insertions(+), 22 deletions(-) diff --git a/sklearn/tests/test_discriminant_analysis.py b/sklearn/tests/test_discriminant_analysis.py index 18364ce156f87..3dd22e2154400 100644 --- a/sklearn/tests/test_discriminant_analysis.py +++ b/sklearn/tests/test_discriminant_analysis.py @@ -5,14 +5,10 @@ from scipy import linalg from sklearn.utils import check_random_state -from sklearn.utils._testing import assert_array_equal, assert_no_warnings +from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_almost_equal -from sklearn.utils._testing import assert_raises -from sklearn.utils._testing import assert_raise_message -from sklearn.utils._testing import assert_warns -from sklearn.utils._testing import ignore_warnings from sklearn.datasets import make_blobs from sklearn.discriminant_analysis import LinearDiscriminantAnalysis @@ -89,15 +85,22 @@ def test_lda_predict(): # Test invalid shrinkages clf = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=-0.2231) - assert_raises(ValueError, clf.fit, X, y) + with pytest.raises(ValueError): + clf.fit(X, y) + clf = 
LinearDiscriminantAnalysis(solver="eigen", shrinkage="dummy") - assert_raises(ValueError, clf.fit, X, y) + with pytest.raises(ValueError): + clf.fit(X, y) + clf = LinearDiscriminantAnalysis(solver="svd", shrinkage="auto") - assert_raises(NotImplementedError, clf.fit, X, y) + with pytest.raises(NotImplementedError): + clf.fit(X, y) + clf = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=np.array([1, 2])) with pytest.raises(TypeError, match="shrinkage must be a float or a string"): clf.fit(X, y) + clf = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=0.1, covariance_estimator=ShrunkCovariance()) @@ -106,9 +109,11 @@ def test_lda_predict(): "parameters are not None. " "Only one of the two can be set.")): clf.fit(X, y) + # Test unknown solver clf = LinearDiscriminantAnalysis(solver="dummy") - assert_raises(ValueError, clf.fit, X, y) + with pytest.raises(ValueError): + clf.fit(X, y) # test bad solver with covariance_estimator clf = LinearDiscriminantAnalysis(solver="svd", @@ -199,7 +204,9 @@ def test_lda_priors(): priors = np.array([0.5, -0.5]) clf = LinearDiscriminantAnalysis(priors=priors) msg = "priors must be non-negative" - assert_raise_message(ValueError, msg, clf.fit, X, y) + + with pytest.raises(ValueError, match=msg): + clf.fit(X, y) # Test that priors passed as a list are correctly handled (run to see if # failure) @@ -210,7 +217,10 @@ def test_lda_priors(): priors = np.array([0.5, 0.6]) prior_norm = np.array([0.45, 0.55]) clf = LinearDiscriminantAnalysis(priors=priors) - assert_warns(UserWarning, clf.fit, X, y) + + with pytest.warns(UserWarning): + clf.fit(X, y) + assert_array_almost_equal(clf.priors_, prior_norm, 2) @@ -247,7 +257,9 @@ def test_lda_transform(): clf = LinearDiscriminantAnalysis(solver="lsqr", n_components=1) clf.fit(X, y) msg = "transform not implemented for 'lsqr'" - assert_raise_message(NotImplementedError, msg, clf.transform, X) + + with pytest.raises(NotImplementedError, match=msg): + clf.transform(X) def test_lda_explained_variance_ratio(): @@ -424,7 +436,8 @@ def test_lda_dimension_warning(n_classes, n_features): for n_components in [max_components - 1, None, max_components]: # if n_components <= min(n_classes - 1, n_features), no warning lda = LinearDiscriminantAnalysis(n_components=n_components) - assert_no_warnings(lda.fit, X, y) + with pytest.warns(None): + lda.fit(X, y) for n_components in [max_components + 1, max(n_features, n_classes - 1) + 1]: @@ -486,7 +499,8 @@ def test_qda(): assert np.any(y_pred3 != y7) # Classes should have at least 2 elements - assert_raises(ValueError, clf.fit, X6, y4) + with pytest.raises(ValueError): + clf.fit(X6, y4) def test_qda_priors(): @@ -523,23 +537,36 @@ def test_qda_store_covariance(): def test_qda_regularization(): - # the default is reg_param=0. and will cause issues - # when there is a constant variable + # The default is reg_param=0. and will cause issues when there is a + # constant variable. + + # Fitting on data with constant variable triggers an UserWarning. + collinear_msg = "Variables are collinear" clf = QuadraticDiscriminantAnalysis() - with ignore_warnings(): - y_pred = clf.fit(X2, y6).predict(X2) + with pytest.warns(UserWarning, match=collinear_msg): + y_pred = clf.fit(X2, y6) + + # XXX: RuntimeWarning is also raised at predict time because of divisions + # by zero when the model is fit with a constant feature and without + # regularization: should this be considered a bug? 
Either by the fit-time + # message more informative, raising and exception instead of a warning in + # this case or somehow changing predict to avoid division by zero. + with pytest.warns(RuntimeWarning, match="divide by zero"): + y_pred = clf.predict(X2) assert np.any(y_pred != y6) - # adding a little regularization fixes the problem + # Adding a little regularization fixes the division by zero at predict + # time. But UserWarning will persist at fit time. clf = QuadraticDiscriminantAnalysis(reg_param=0.01) - with ignore_warnings(): + with pytest.warns(UserWarning, match=collinear_msg): clf.fit(X2, y6) y_pred = clf.predict(X2) assert_array_equal(y_pred, y6) - # Case n_samples_in_a_class < n_features + # UserWarning should also be there for the n_samples_in_a_class < + # n_features case. clf = QuadraticDiscriminantAnalysis(reg_param=0.1) - with ignore_warnings(): + with pytest.warns(UserWarning, match=collinear_msg): clf.fit(X5, y5) y_pred5 = clf.predict(X5) assert_array_equal(y_pred5, y5) From bd53f54fb666459dc54af4de52032a65fca551be Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Wed, 3 Mar 2021 17:01:07 +0000 Subject: [PATCH 223/478] TST Change assert from sklearn to pytest style in tests/test_multiclass.py (#19593) --- sklearn/tests/test_multiclass.py | 95 ++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 36 deletions(-) diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index 96bd1b807a95f..74b380505e45a 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -6,10 +6,6 @@ from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_almost_equal -from sklearn.utils._testing import assert_raises -from sklearn.utils._testing import assert_warns -from sklearn.utils._testing import assert_raise_message -from sklearn.utils._testing import assert_raises_regexp from sklearn.utils._testing import ignore_warnings from sklearn.utils._mocking import CheckingClassifier from sklearn.multiclass import OneVsRestClassifier @@ -35,6 +31,7 @@ from sklearn.pipeline import Pipeline, make_pipeline from sklearn.impute import SimpleImputer from sklearn import svm +from sklearn.exceptions import NotFittedError from sklearn import datasets iris = datasets.load_iris() @@ -47,22 +44,30 @@ def test_ovr_exceptions(): ovr = OneVsRestClassifier(LinearSVC(random_state=0)) - assert_raises(ValueError, ovr.predict, []) + + # test predicting without fitting + with pytest.raises(NotFittedError): + ovr.predict([]) # Fail on multioutput data - assert_raises(ValueError, OneVsRestClassifier(MultinomialNB()).fit, - np.array([[1, 0], [0, 1]]), - np.array([[1, 2], [3, 1]])) - assert_raises(ValueError, OneVsRestClassifier(MultinomialNB()).fit, - np.array([[1, 0], [0, 1]]), - np.array([[1.5, 2.4], [3.1, 0.8]])) + msg = "Multioutput target data is not supported with label binarization" + with pytest.raises(ValueError, match=msg): + X = np.array([[1, 0], [0, 1]]) + y = np.array([[1, 2], [3, 1]]) + OneVsRestClassifier(MultinomialNB()).fit(X, y) + + with pytest.raises(ValueError, match=msg): + X = np.array([[1, 0], [0, 1]]) + y = np.array([[1.5, 2.4], [3.1, 0.8]]) + OneVsRestClassifier(MultinomialNB()).fit(X, y) def test_check_classification_targets(): # Test that check_classification_target return correct type. 
#5782 y = np.array([0.0, 1.1, 2.0, 3.0]) msg = type_of_target(y) - assert_raise_message(ValueError, msg, check_classification_targets, y) + with pytest.raises(ValueError, match=msg): + check_classification_targets(y) def test_ovr_fit_predict(): @@ -120,12 +125,12 @@ def test_ovr_partial_fit_exceptions(): X = np.abs(np.random.randn(14, 2)) y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3] ovr.partial_fit(X[:7], y[:7], np.unique(y)) - # A new class value which was not in the first call of partial_fit - # It should raise ValueError + # If a new class that was not in the first call of partial fit is seen + # it should raise ValueError y1 = [5] + y[7:-1] - assert_raises_regexp(ValueError, r"Mini-batch contains \[.+\] while " - r"classes must be subset of \[.+\]", - ovr.partial_fit, X=X[7:], y=y1) + msg = r"Mini-batch contains \[.+\] while classes must be subset of \[.+\]" + with pytest.raises(ValueError, match=msg): + ovr.partial_fit(X=X[7:], y=y1) def test_ovr_ovo_regressor(): @@ -201,7 +206,9 @@ def test_ovr_always_present(): y[:, 2] = 1 ovr = OneVsRestClassifier(LogisticRegression()) - assert_warns(UserWarning, ovr.fit, X, y) + msg = r'Label .+ is present in all training examples' + with pytest.warns(UserWarning, match=msg): + ovr.fit(X, y) y_pred = ovr.predict(X) assert_array_equal(np.array(y_pred), np.array(y)) y_pred = ovr.decision_function(X) @@ -213,7 +220,10 @@ def test_ovr_always_present(): y = np.zeros((10, 2)) y[5:, 0] = 1 # variable label ovr = OneVsRestClassifier(LogisticRegression()) - assert_warns(UserWarning, ovr.fit, X, y) + + msg = r'Label not 1 is present in all training examples' + with pytest.warns(UserWarning, match=msg): + ovr.fit(X, y) y_pred = ovr.predict_proba(X) assert_array_equal(y_pred[:, -1], np.zeros(X.shape[0])) @@ -266,7 +276,7 @@ def conduct_test(base_clf, test_predict_proba=False): probabilities = clf.predict_proba(X_test) assert 2 == len(probabilities[0]) assert (clf.classes_[np.argmax(probabilities, axis=1)] == - clf.predict(X_test)) + clf.predict(X_test)) # test input as label indicator matrix clf = OneVsRestClassifier(base_clf).fit(X, Y) @@ -389,8 +399,8 @@ def test_ovr_single_label_predict_proba(): assert_almost_equal(Y_proba.sum(axis=1), 1.0) # predict assigns a label if the probability that the - # sample has the label is greater than 0.5. - pred = np.array([l.argmax() for l in Y_proba]) + # sample has the label with the greatest predictive probability. + pred = Y_proba.argmax(axis=1) assert not (pred - Y_pred).any() @@ -458,7 +468,7 @@ def test_ovr_coef_(): assert shape[1] == iris.data.shape[1] # don't densify sparse coefficients assert (sp.issparse(ovr.estimators_[0].coef_) == - sp.issparse(ovr.coef_)) + sp.issparse(ovr.coef_)) # TODO: Remove this test in version 1.1 @@ -467,13 +477,16 @@ def test_ovr_coef_(): def test_ovr_coef_exceptions(): # Not fitted exception! ovr = OneVsRestClassifier(LinearSVC(random_state=0)) - # lambda is needed because we don't want coef_ to be evaluated right away - assert_raises(ValueError, lambda x: ovr.coef_, None) + + with pytest.raises(NotFittedError): + ovr.coef_ # Doesn't have coef_ exception! 
ovr = OneVsRestClassifier(DecisionTreeClassifier()) ovr.fit(iris.data, iris.target) - assert_raises(AttributeError, lambda x: ovr.coef_, None) + msg = "Base estimator doesn't have a coef_ attribute" + with pytest.raises(AttributeError, match=msg): + ovr.coef_ # TODO: Remove this test in version 1.1 when @@ -494,7 +507,8 @@ def test_ovr_deprecated_coef_intercept(): def test_ovo_exceptions(): ovo = OneVsOneClassifier(LinearSVC(random_state=0)) - assert_raises(ValueError, ovo.predict, []) + with pytest.raises(NotFittedError): + ovo.predict([]) def test_ovo_fit_on_list(): @@ -563,8 +577,8 @@ def test_ovo_partial_fit_predict(): message_re = escape("Mini-batch contains {0} while " "it must be subset of {1}".format(np.unique(error_y), np.unique(y))) - assert_raises_regexp(ValueError, message_re, ovo.partial_fit, X[:7], - error_y, np.unique(y)) + with pytest.raises(ValueError, match=message_re): + ovo.partial_fit(X[:7], error_y, np.unique(y)) # test partial_fit only exists if estimator has it: ovr = OneVsOneClassifier(SVC()) @@ -682,7 +696,9 @@ def test_ovo_one_class(): y = np.array(['a'] * 4) ovo = OneVsOneClassifier(LinearSVC()) - assert_raise_message(ValueError, "when only one class", ovo.fit, X, y) + msg = "when only one class" + with pytest.raises(ValueError, match=msg): + ovo.fit(X, y) def test_ovo_float_y(): @@ -691,12 +707,15 @@ def test_ovo_float_y(): y = iris.data[:, 0] ovo = OneVsOneClassifier(LinearSVC()) - assert_raise_message(ValueError, "Unknown label type", ovo.fit, X, y) + msg = "Unknown label type" + with pytest.raises(ValueError, match=msg): + ovo.fit(X, y) def test_ecoc_exceptions(): ecoc = OutputCodeClassifier(LinearSVC(random_state=0)) - assert_raises(ValueError, ecoc.predict, []) + with pytest.raises(NotFittedError): + ecoc.predict([]) def test_ecoc_fit_predict(): @@ -728,10 +747,14 @@ def test_ecoc_float_y(): y = iris.data[:, 0] ovo = OutputCodeClassifier(LinearSVC()) - assert_raise_message(ValueError, "Unknown label type", ovo.fit, X, y) + msg = "Unknown label type" + with pytest.raises(ValueError, match=msg): + ovo.fit(X, y) + ovo = OutputCodeClassifier(LinearSVC(), code_size=-1) - assert_raise_message(ValueError, "code_size should be greater than 0," - " got -1", ovo.fit, X, y) + msg = "code_size should be greater than 0, got -1" + with pytest.raises(ValueError, match=msg): + ovo.fit(X, y) def test_ecoc_delegate_sparse_base_estimator(): @@ -773,7 +796,7 @@ def test_pairwise_indices(): for idx in precomputed_indices: assert (idx.shape[0] * n_estimators / (n_estimators - 1) == - linear_kernel.shape[0]) + linear_kernel.shape[0]) @ignore_warnings(category=FutureWarning) From 4e732f893c04c45e1cdb287abaef67cf83b731eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Wed, 3 Mar 2021 18:14:45 +0100 Subject: [PATCH 224/478] Add BNP Paribas Cardif testimonial. 
(#19586)
---
 .../images/bnp_paribas_cardif.png             | Bin 0 -> 65058 bytes
 doc/testimonials/testimonials.rst             |  41 ++++++++++++++++++
 2 files changed, 41 insertions(+)
 create mode 100644 doc/testimonials/images/bnp_paribas_cardif.png

diff --git a/doc/testimonials/images/bnp_paribas_cardif.png b/doc/testimonials/images/bnp_paribas_cardif.png
new file mode 100644
index 0000000000000000000000000000000000000000..0c7a040bae329d65086384fd1ba90d70bfaaf0a5
GIT binary patch
literal 65058
[base85-encoded PNG data (65058 bytes) omitted]
zhI;G$;6>*$lBU@~bwaM>egntMpdznibi{`ZL{eWl@nHj(zpZK%ix%G#YVGrTjV3e? zAljuBT!lEAG|z8JNcz0rwUDQzi#h4B$UVi0(EGLCNcrC4XpHN#enp9e4R=L{Ztin3 ztp_v7XXFiVEU-GX+_d8iH+oVG>g-HY z@B3g3BVR1lX{I1LJwe`5L{NG{fQ$pv;$r|&J^@6bRJ6J|CD8LfIj&xp=UIG;*4YPI z$=sB}uw(K;KF}Nj%(TaxP3BQXYX{P!t)rx)>hav80W@;T9rzKnCz-eJ<)D1kbZ_SF@IwV3;T~&vW|`*!G}1(yom(3_GxRkLrjT>pyfrG zcunbwo_ym+tG18xmCF2xwFJdzgypR54p-8Fg7iByMiOAB0my9=FF@8Z9Zb9Lmcqb_ z@Gp>(d(vhya;aFBzUqYVDfL%)kxa;iznzLh?8}>cU;&?)**O7$u%j!z2o>l1qE0g%F*$k_MrK;Leo z8euPYcuA~XBlZ&zv(TXMy@gC!Dj)#}Al%(I3*F!8Z;$9jJlOiboP!}&*n6&+wJbCp z3CQ{{f~MzbK5{4p5oSrnofVla?qdJ)nLkh+5&I~|p5boYx(HzMe*wyvV0Z(_4B(Iz zCuO|sIuO!gyTy-odpKeC$k%LDX3N(u9Y(K6ift5H7kx+d9Va1y2pijny%E>L{{A0n z$1*C@RXWiHJc;aBS<bHl2)*TE;m<{v^ux;#bH#c!}I|J=@hY@rgW)Ct6V0fEqlD9K9c@ZaQV z>jviFTiFf>SOAQ>z<_jYr*iLp<1-otMJ!}wl6CP0+}E&(IVNc}l35ETr^lRzAVvhm zK(9&q_c{26T|!8DSxB&X#%o`t&SJS)IEQ1H59H@Rd!?>Fb$Pxr8GJRlz4Mj8+v3}$ z#*ZJ1UO>#s>x#;V3CV+r5s1x$R7{a}Ghv zt(Kc@!`66c23BH&6Bk@Nuoqg)kg*Rbi;)IZvC<5ZxVKNQh0T0#5F^}Ak`yi2AtM5t z?tZlia_L2nwXiGtvt0Pwp=R{p1^?b8;&DAlh=H4x5g>~tMV6H3g|D^T^bAgx^!yod zD6^hMc98Ocfp$BZ>toC zT>!vu0zkl10ER%bVO1=%k_vahGa4Vdk0(kJhem>&m!ZS~5OjbBx!|1Q4W}peQExA1 z0=-0nYtTu~#dK>IQocCUsAl>+ck=gX?O;9YRR%(+=CvP^{G5%4#C5pjE- zbG}lFVdwq;=r#(?f^V6Ptj51my z?HlF$u$JC&_)gphh=KCsPTDFRO&WLZV##Gcbk{QyU2)CoVh-o{-kj^Xv zqY>31vigG|)~pa)ZBlFoLi&DH1?~EAyJcWc*DrSdGaS$a*8n|Nr!Y}-U4jYUmsD(pmUN(&iVUOYAomi2sqj+}ygon4)fRkLO8+XL;UV&R1^juY*y6SH zOu-|-YR@XFz?jCW82mzDlS5N!G1%9))*JG&ZbOTN8F-wP_JwLORHngY^7x}9uL+M; zht=}4p;t>JibIGH1hyT1Z;D1zF*&DFZ}9CA@x^KA&4;ZNrAg|OcPf(wC1Koh6a^#$ zKS&INj7gBJquI9rJSq`>eRhLS7qBaFxU;L&G54m=I9}M+2tO+}$Djo3nMRg@MEvm<)MAew8)ZA@ zoK*4W@&lhDgz+R=)?2;s)XXOMD?%1ex=5#t>Z8$?NTgOp1c3df)?q^u#m=b2L0?ct*~TeU2o=1Rwtk(KAhNgE6aR(}DkNw(9Br`l};|KfDU z+||#37@rmI4B2T`XXWSjpmotZXZz->g@?s?Mj%*CxgRb|)py33V)emwM()2n|BjXP zg(#JZm8RvhGLIHo>)VHR?z>Yrz^O8v%-~RKOx~Zbf@|q3@uo5oYAkWn@+0 ze#{7NYK!I3w6W{`SVrSc8}i($PNBqU)?jacp_#`w;BJa5jP$2DIhs}6hbN#t!v zaM0G0+ih3yXBqWd%xiP;sMZXMF@n&N2k?s5ippg6LRjgTO}b$$=@t!(($!?NX3YsT zTOa%v{}>;lI!Q}hM4}RE2(RTi7-l*nMm>vQDb>e5Z4p}mJ+;4_S{FcTnw6U?+h#I= zcFmy?^&wekbsgOPQ6As^nNq$$Cg$g`l3rLA!OE*w)=FEnPQ>g& zY|5&rk_BGRbSA2OR0OJCw^OeQ36>`KQkEl^m%ac>0r)f7`91OS#r4>9_tcH!YxSzoWfAD{kYIX>C2>7BS`RRd zSW+RN;bTzM{?XkE7_$_o^(n)@H<~k*OLO^N@qo|C8f8NjJTm$Hpk*9G_-&y6HXS`Z zPQlWNC7zq8s3tdd{A3@N&$tPk{s zDq}>!o+nGEG0!WKmU9)1OB-;>V3URF6jZC%GFV?a(DLT3@I1~CR#GI8o32WS8X=UtS{^AO0yx0 z*9hOGp#d&#pdr5h-kFslV6>7_0=Sq)x}WVDrj~C}1c-taMpOq zIE7x_2cTO5@H@9RA5N!@<242DQO_6{vLgYl_%1+B_ULzCP32F$FIUTZu%m!aAMV`l ztdI1xB&vG`4OPt%ummr3n z_H6)B-R%sHjp@hfh_>IS!eIH78Az1nR9`ghB%oQNhh|RE0+I5lxe+4m`LbSV zD2?z-D7XXH^?}-@$A{NHavU(?f}ee=9VE7+n8V>fOE+1||DAYpK|s@2^6M z#FTebCm|oTp4`p`smWx|q4EPCKf7sTzjb!{9V3@TRe87R}iY5Z5c15%;epDj&r;Tc~{1Fau4 zg9k)E;ORmHoTT{s`@2HgWm9At{u+H;97z_ve%~7r(^n+f_4Nyb%~;oqNA>mf7Th~7 zQTIZ=u)XhF2Ua^&)YMD`tNBjFsLp0XW?{!CK3Oi22xhu5*asWC`T{mV=dI ziH1f+R}Jf|vU;gj&(_wvHXoPyCfpnW=V6u$e$vJ-DYRi$Yg#j`ffzXO`mMGw;=bSFU)rlJB;eL82yFYu1 zZcc>mSopE316ps0CiZ<3h921fP}XQE16uICuFd>UD%`2eJwR6X(}_JwabAMckjqwG z)`fWJAe;Ki3q^hR%tY2&c*X6sUG+HjlkS|Te%;#p1&-saqBLxORV@JVZl5sGBPLFq z3Ln$kg3g<3kY;za4++OA!e0($K%tnX52YrOSTG*ClH4bS@mgwF$yp7tB>otNjWX3> zlLunsp<XJ1yzj+X&oZ3{Ngm>b!VIF3eLaJ zbpDiPpLVL^4t;BezjkD?NtF^}=n_RlaVC%UF{7HzuAicCL+urCa6_1}U@&_TI240r zc1_cPUcr>lN()c#&PAAGsyOvN_Kcpj3SZijU#x*0H8-=|^$=kWqSO{%f`+Ch>s`mn zDTmt8x|&S_Eu7fidX*Rx?1lWVgglhZ3~K?(tUfJXah*I6p&wgPE1G7lrX_t2)TG$~ z8jR9yCh{K_1@eqPMD>jvty|T9uf)j{x$bQA+}rh<^NnrM+RoSa@7nkz(7t&Tz08$x`j9-e%-RgC4Y^}Ik7V!>brf_pA78ayIk|Tyv2JcL4{&-De z{g-X)N^A1ce>jPVu<_qrh*W54P(^6lt5A~qlid^@{Z!?0H)K`%oUaiU8Dk8C1NnQ2 
zu6krtl2*AQwxqm<5@pFBZ7xf5>p6mkr%t6jkfgKZe^`Wichk4jsr=(xl|I4@#mmO< z$_pTvxFwuY&@dQgL^TU-jEH76>jz4e^EN4~*ns(;m>5LK$DbGVNcVbUkB*K^YAwh8 z?j}FT$!hz(^Q2bM`Bh?}zDScrluLWMSoo};WHjoYazN!8@Cr94$;LFKuV5rcN zXX$c7%djJ*BX7p_rMu;=apS|#$UN7q;Mfk;?j)b_lV4V#R_+YX$2FHtvvMv|=OQI@ zQO78}Wy)r9=kU*FX!b}K6X+is+54))p~`v!pU7QT=0A5VzSK2%Sal09-`|ZiTl1aMDkvs+ zL(Yy>X4GNzsAht5P-EL@b?WuT`YQ@{@iVz8Zb7%6_3wH$F+-H77fS4!Ra4(SznXP( zA3yqo|Gj;%Ipw5*Os?9_*jTLDlUazANvDFfIPG2=Y-o{h#Egi< z{DAE-Pa}CuSWF@C2>UvHEagRBlOBN%BF~?>IhkVe|DFk!tKqGuX;W8@wtwrABZa5g zGSC|*NJEo5=^+tVQuiFsae5u(#*xlKlp-R*V|ozh*}@-1Z9>7Z4_`Txwf+>a_zHh6 zH_C$w__L5{YSCz-8;whY;=tt1doJiSya21H?+9N0i!eI#4-(&`+icqy(VdL$4+`Ku zL#==d&ydNVb<=Ve!d!BLqbrYxOGbi$MGCoGK8 z$|*;_UaTu`8hn+?syxA1Cj8jio(ZC4mG3kVUH$%U@HV?Dg|-w_u^>}|(WY#M8`D}S zOsa+G1pD_lnZ{>?u#%aBO~=i|hLbSgt=9$E1%z%_@GaJ9Y)8DI{I^NNDs9PEnPIHr z!m{O2pKfaeW@9D`_G0qaP>T*K=a>tl$D08?tYD_`DtxkQ!)2lB-Se?r?_Z2b60LpT z3}wkO!x^WVX1q~GDBPmRL4O=<@o+6_)rZm?EZug$u_3fOj!6`m@?8_)# zcRXYFjrvY8wUi(;iS)2W7akHUcIF5jl9E{E5TY+5JMlOaZ5<14;@mz5>qt+Q;R)r@ z9#EW|KROc399~u)?8+SNj*L{mxX+A0FfylwDZW}#jJ7c9l&W&Vh~(ujf=iNA3)=X4 zOJ<~Iuq$5Hvr0v)o!`^1;9wLg+v?%vnINy5d-H^JGD2PG1P`8sxAU!M&G$Htr%gPn zgQxX@u!tR2hSDpS__KdCr-wn`VF9^w|4hl%Uzu+2j`}`n<)KVD4z-cWPRJrPOKS9NtVvXSJWF=wucyk zUu!z?c*uX0Ms>!gRHpKC-^a=z5%T$mGBd)9m?m1XA%|GdZ;5!m6!x z95kWUx`5UdC?{kuMxn9pyP?rVVb(!H7Nn>{=HhYgeEAYo@vIps>xBU(jT2yd;RpCdX2L=^w9!EkB_ z8B}R`wZT2t1kW2j$pWWMnf^YY!s6URWSWA&o6TkF_v#s`C~kT>zO8Q~Lh$n_ zytifFZxPS_X8ll}Gkm(vS~Dx)(?!_Lvqlq->V`B#RD?z)^0vIuWJmat?)0D+A- zwEo}qw`%TNC)n91!-L~S$rkFGe+sZ7Euvv0pSy+qtOfSPGF@lWru9-EyfXclk&w)R`?uP!X>Lj_``iiEvg6UZ z-M~Wzspwg+obkw@Znq(*;-s)T^ovcKl71^AiQgVLqCRsGc49+sK4KdQV}$A;B}FKx zk})ZD2YEFsESWjsj14;k$XgA(@G`Vc~`S-qYm{mWHiQOfARE>(bDjsam_JI>O5;a zy9fgdp-ZA%2$L_+v8>jazI=Trap=F>+PAhCl_V06O`;LQu!=RMBgu}8A{$aP-)W1D zJSy_1-YKA!_pXY@(IsAB;4`_dvFv^K_5STyZ-4j-hhI3J;a%FLBG3XZP zKv)7xGo2?scNNM~A_W!V6r&UBO!qH$&4^}a80tQH(z5>ZbfS)+TN-2l literal 0 HcmV?d00001 diff --git a/doc/testimonials/testimonials.rst b/doc/testimonials/testimonials.rst index cac1292d92fa7..88997285e347e 100644 --- a/doc/testimonials/testimonials.rst +++ b/doc/testimonials/testimonials.rst @@ -1108,3 +1108,44 @@ Michael Fitzke Next Generation Technologies Sr Leader, Mars Inc.
+
+`BNP Paribas Cardif <https://www.bnpparibascardif.com/>`_
+---------------------------------------------------------
+
+.. raw:: html
+
+
+
+
+BNP Paribas Cardif uses scikit-learn for several of its machine learning models
+in production. Our internal community of developers and data scientists has
+been using scikit-learn since 2015, for several reasons: the quality of the
+developments, documentation and contribution governance, and the sheer size of
+the contributing community. We even explicitly mention the use of
+scikit-learn's pipelines in our internal model risk governance as one of our
+good practices to decrease operational risks and overfitting risk. As a way to
+support open source software development and in particular scikit-learn
+project, we decided to participate to scikit-learn's consortium at La Fondation
+Inria since its creation in 2018.
+
+.. raw:: html
+
+
+
+Sébastien Conort, Chief Data Scientist, BNP Paribas Cardif
+
+.. raw:: html
+
+
+
+
+.. image:: images/bnp_paribas_cardif.png
+   :width: 120pt
+   :align: center
+   :target: https://www.bnpparibascardif.com/
+
+.. raw:: html
+
+
+
From 1045d16ec13b1cab7878e7555538573d1884aad3 Mon Sep 17 00:00:00 2001 From: Maria Telenczuk Date: Wed, 3 Mar 2021 18:43:17 +0100 Subject: [PATCH 225/478] PERF don't compute variance when normalize is False in linear models (#19606) --- sklearn/linear_model/_base.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 28cc386b4ecda..1842620dfa105 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -235,17 +235,19 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, if not return_mean: X_offset[:] = X.dtype.type(0) else: - X_offset, X_var, _ = _incremental_mean_and_var( - X, last_mean=0., last_variance=0., last_sample_count=0., - sample_weight=sample_weight - ) + if normalize: + X_offset, X_var, _ = _incremental_mean_and_var( + X, last_mean=0., last_variance=0., last_sample_count=0., + sample_weight=sample_weight + ) + else: + X_offset = np.average(X, axis=0, weights=sample_weight) - X_offset = X_offset.astype(X.dtype) + X_offset = X_offset.astype(X.dtype, copy=False) X -= X_offset - X_var = X_var.astype(X.dtype, copy=False) - if normalize: + X_var = X_var.astype(X.dtype, copy=False) # Detect constant features on the computed variance, before taking # the np.sqrt. Otherwise constant features cannot be detected with # sample_weights. From 42e90e9ba28fb37c2c9bd3e8aed1ac2387f1d5d5 Mon Sep 17 00:00:00 2001 From: RichardScottOZ <72196131+RichardScottOZ@users.noreply.github.com> Date: Mon, 8 Mar 2021 01:24:16 +1030 Subject: [PATCH 226/478] DOC Fixes spelling mistake in _kmeans.py (#19634) Compatibility typo --- sklearn/cluster/_kmeans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index c1d889b37db2d..1c54a5c9ff9e0 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -264,7 +264,7 @@ def k_means(X, n_clusters, *, sample_weight=None, init='k-means++', intensive due to the allocation of an extra array of shape (n_samples, n_clusters). - For now "auto" (kept for backward compatibiliy) chooses "elkan" but it + For now "auto" (kept for backward compatibility) chooses "elkan" but it might change in the future for a better heuristic. 
return_n_iter : bool, default=False From f2773e840a0fcc9dd673cdd0da82dc43299a713b Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Mon, 8 Mar 2021 18:03:56 +0000 Subject: [PATCH 227/478] TST replace assert_raise_* by pytest.raises in tests/test_multioutput.py (#19618) Co-authored-by: Alihan Zihna --- sklearn/tests/test_multioutput.py | 36 ++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/sklearn/tests/test_multioutput.py b/sklearn/tests/test_multioutput.py index edfcdef1bf89c..87e5218e08e22 100644 --- a/sklearn/tests/test_multioutput.py +++ b/sklearn/tests/test_multioutput.py @@ -5,9 +5,6 @@ from joblib import cpu_count from sklearn.utils._testing import assert_almost_equal -from sklearn.utils._testing import assert_raises -from sklearn.utils._testing import assert_raises_regex -from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn import datasets @@ -80,7 +77,9 @@ def test_multi_target_regression_one_target(): # Test multi target regression raises X, y = datasets.make_regression(n_targets=1) rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0)) - assert_raises(ValueError, rgr.fit, X, y) + msg = 'at least two dimensions' + with pytest.raises(ValueError, match=msg): + rgr.fit(X, y) def test_multi_target_sparse_regression(): @@ -106,8 +105,9 @@ def test_multi_target_sample_weights_api(): w = [0.8, 0.6] rgr = MultiOutputRegressor(OrthogonalMatchingPursuit()) - assert_raises_regex(ValueError, "does not support sample weights", - rgr.fit, X, y, w) + msg = "does not support sample weights" + with pytest.raises(ValueError, match=msg): + rgr.fit(X, y, w) # no exception should be raised if the base estimator supports weights rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0)) @@ -252,9 +252,9 @@ def test_multi_output_classification_partial_fit(): def test_multi_output_classification_partial_fit_no_first_classes_exception(): sgd_linear_clf = SGDClassifier(loss='log', random_state=1, max_iter=5) multi_target_linear = MultiOutputClassifier(sgd_linear_clf) - assert_raises_regex(ValueError, "classes must be passed on the first call " - "to partial_fit.", - multi_target_linear.partial_fit, X, y) + msg = "classes must be passed on the first call to partial_fit." 
+ with pytest.raises(ValueError, match=msg): + multi_target_linear.partial_fit(X, y) def test_multi_output_classification(): @@ -386,17 +386,27 @@ def test_multi_output_exceptions(): # NotFittedError when fit is not done but score, predict and # and predict_proba are called moc = MultiOutputClassifier(LinearSVC(random_state=0)) - assert_raises(NotFittedError, moc.predict, y) + + with pytest.raises(NotFittedError): + moc.predict(y) + with pytest.raises(NotFittedError): moc.predict_proba - assert_raises(NotFittedError, moc.score, X, y) + + with pytest.raises(NotFittedError): + moc.score(X, y) + # ValueError when number of outputs is different # for fit and score y_new = np.column_stack((y1, y2)) moc.fit(X, y) - assert_raises(ValueError, moc.score, X, y_new) + with pytest.raises(ValueError): + moc.score(X, y_new) + # ValueError when y is continuous - assert_raise_message(ValueError, "Unknown label type", moc.fit, X, X[:, 1]) + msg = "Unknown label type" + with pytest.raises(ValueError, match=msg): + moc.fit(X, X[:, 1]) def generate_multilabel_dataset_with_correlations(): From ae3d955c90d03479d4b6a8a3b359fba10826dc2a Mon Sep 17 00:00:00 2001 From: Mohamed Haseeb Date: Tue, 9 Mar 2021 17:45:57 +0100 Subject: [PATCH 228/478] TST Uses pytest.raises in model_selection/tests (#19621) --- sklearn/model_selection/tests/test_search.py | 72 +++++++++++--------- sklearn/model_selection/tests/test_split.py | 72 +++++++++++--------- 2 files changed, 80 insertions(+), 64 deletions(-) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index f9e0babebe3ad..c71c812b3368f 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -14,7 +14,6 @@ import pytest from sklearn.utils._testing import ( - assert_raise_message, assert_array_equal, assert_array_almost_equal, assert_allclose, @@ -270,8 +269,8 @@ def test_grid_search_no_score(): # giving no scoring function raises an error grid_search_no_score = GridSearchCV(clf_no_score, {'C': Cs}) - assert_raise_message(TypeError, "no scoring", grid_search_no_score.fit, - [[1]]) + with pytest.raises(TypeError, match="no scoring"): + grid_search_no_score.fit([[1]]) def test_grid_search_score_method(): @@ -316,11 +315,11 @@ def test_grid_search_groups(): group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(n_splits=3), GroupShuffleSplit()] + error_msg = "The 'groups' parameter should not be None." for cv in group_cvs: gs = GridSearchCV(clf, grid, cv=cv) - assert_raise_message(ValueError, - "The 'groups' parameter should not be None.", - gs.fit, X, y) + with pytest.raises(ValueError, match=error_msg): + gs.fit(X, y) gs.fit(X, y, groups=groups) non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit()] @@ -385,20 +384,21 @@ def test_no_refit(): # error messages for fn_name in ('predict', 'predict_proba', 'predict_log_proba', 'transform', 'inverse_transform'): - assert_raise_message(NotFittedError, - ('refit=False. %s is available only after ' - 'refitting on the best parameters' - % fn_name), getattr(grid_search, fn_name), X) + error_msg = (f"refit=False. 
{fn_name} is available only after " + f"refitting on the best parameters") + with pytest.raises(NotFittedError, match=error_msg): + getattr(grid_search, fn_name)(X) # Test that an invalid refit param raises appropriate error messages + error_msg = ("For multi-metric scoring, the parameter refit must be set to" + " a scorer key") for refit in ["", 5, True, 'recall', 'accuracy']: - assert_raise_message(ValueError, "For multi-metric scoring, the " - "parameter refit must be set to a scorer key", - GridSearchCV(clf, {}, refit=refit, - scoring={'acc': 'accuracy', - 'prec': 'precision'} - ).fit, - X, y) + with pytest.raises(ValueError, match=error_msg): + GridSearchCV( + clf, {}, + refit=refit, + scoring={'acc': 'accuracy', 'prec': 'precision'} + ).fit(X, y) def test_grid_search_error(): @@ -437,30 +437,33 @@ def test_grid_search_when_param_grid_includes_range(): def test_grid_search_bad_param_grid(): param_dict = {"C": 1} clf = SVC(gamma='auto') - assert_raise_message( - ValueError, + error_msg = re.escape( "Parameter grid for parameter (C) needs to" " be a list or numpy array, but got ()." " Single values need to be wrapped in a list" - " with one element.", - GridSearchCV, clf, param_dict) + " with one element." + ) + with pytest.raises(ValueError, match=error_msg): + GridSearchCV(clf, param_dict) param_dict = {"C": []} clf = SVC() - assert_raise_message( - ValueError, - "Parameter values for parameter (C) need to be a non-empty sequence.", - GridSearchCV, clf, param_dict) + error_msg = re.escape( + "Parameter values for parameter (C) need to be a non-empty sequence." + ) + with pytest.raises(ValueError, match=error_msg): + GridSearchCV(clf, param_dict) param_dict = {"C": "1,2,3"} clf = SVC(gamma='auto') - assert_raise_message( - ValueError, + error_msg = re.escape( "Parameter grid for parameter (C) needs to" " be a list or numpy array, but got ()." " Single values need to be wrapped in a list" - " with one element.", - GridSearchCV, clf, param_dict) + " with one element." 
+ ) + with pytest.raises(ValueError, match=error_msg): + GridSearchCV(clf, param_dict) param_dict = {"C": np.ones((3, 2))} clf = SVC() @@ -1293,10 +1296,13 @@ def test_fit_grid_point(): assert n_test_samples == test.size # Should raise an error upon multimetric scorer - assert_raise_message(ValueError, "For evaluating multiple scores, use " - "sklearn.model_selection.cross_validate instead.", - fit_grid_point, X, y, svc, params, train, test, - {'score': scorer}, verbose=True) + error_msg = ("For evaluating multiple scores, use " + "sklearn.model_selection.cross_validate instead.") + with pytest.raises(ValueError, match=error_msg): + fit_grid_point( + X, y, svc, params, train, test, {'score': scorer}, + verbose=True + ) # FIXME remove test_fit_grid_point_deprecated as diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index e6900c90e7a87..80c19c7f2e08c 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -1,6 +1,7 @@ """Test the split module""" import warnings import pytest +import re import numpy as np from scipy.sparse import coo_matrix, csc_matrix, csr_matrix from scipy import stats @@ -12,7 +13,6 @@ from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import ignore_warnings from sklearn.utils.validation import _num_samples from sklearn.utils._mocking import MockDataFrame @@ -116,10 +116,10 @@ def test_cross_validator_with_default_params(): # ValueError for get_n_splits methods msg = "The 'X' parameter should not be None." - assert_raise_message(ValueError, msg, - loo.get_n_splits, None, y, groups) - assert_raise_message(ValueError, msg, - lpo.get_n_splits, None, y, groups) + with pytest.raises(ValueError, match=msg): + loo.get_n_splits(None, y, groups) + with pytest.raises(ValueError, match=msg): + lpo.get_n_splits(None, y, groups) def test_2d_y(): @@ -214,10 +214,10 @@ def test_kfold_valueerrors(): KFold(1) error_string = ("k-fold cross-validation requires at least one" " train/test split") - assert_raise_message(ValueError, error_string, - StratifiedKFold, 0) - assert_raise_message(ValueError, error_string, - StratifiedKFold, 1) + with pytest.raises(ValueError, match=error_string): + StratifiedKFold(0) + with pytest.raises(ValueError, match=error_string): + StratifiedKFold(1) # When n_splits is not integer: with pytest.raises(ValueError): @@ -858,10 +858,10 @@ def test_leave_one_p_group_out(): lpgo_2.get_n_splits(None, None, [0.0, np.inf, 0.0]) msg = "The 'groups' parameter should not be None." 
- assert_raise_message(ValueError, msg, - logo.get_n_splits, None, None, None) - assert_raise_message(ValueError, msg, - lpgo_1.get_n_splits, None, None, None) + with pytest.raises(ValueError, match=msg): + logo.get_n_splits(None, None, None) + with pytest.raises(ValueError, match=msg): + lpgo_1.get_n_splits(None, None, None) def test_leave_group_out_changing_groups(): @@ -891,27 +891,37 @@ def test_leave_group_out_changing_groups(): def test_leave_one_p_group_out_error_on_fewer_number_of_groups(): X = y = groups = np.ones(0) - assert_raise_message(ValueError, "Found array with 0 sample(s)", next, - LeaveOneGroupOut().split(X, y, groups)) + msg = re.escape("Found array with 0 sample(s)") + with pytest.raises(ValueError, match=msg): + next(LeaveOneGroupOut().split(X, y, groups)) + X = y = groups = np.ones(1) - msg = ("The groups parameter contains fewer than 2 unique groups ({}). " - "LeaveOneGroupOut expects at least 2.").format(groups) - assert_raise_message(ValueError, msg, next, - LeaveOneGroupOut().split(X, y, groups)) + msg = re.escape( + f"The groups parameter contains fewer than 2 unique groups ({groups})." + f" LeaveOneGroupOut expects at least 2." + ) + with pytest.raises(ValueError, match=msg): + next(LeaveOneGroupOut().split(X, y, groups)) + X = y = groups = np.ones(1) - msg = ("The groups parameter contains fewer than (or equal to) n_groups " - "(3) numbers of unique groups ({}). LeavePGroupsOut expects " - "that at least n_groups + 1 (4) unique groups " - "be present").format(groups) - assert_raise_message(ValueError, msg, next, - LeavePGroupsOut(n_groups=3).split(X, y, groups)) + msg = re.escape( + f"The groups parameter contains fewer than (or equal to) n_groups " + f"(3) numbers of unique groups ({groups}). LeavePGroupsOut expects " + f"that at least n_groups + 1 (4) unique groups " + f"be present" + ) + with pytest.raises(ValueError, match=msg): + next(LeavePGroupsOut(n_groups=3).split(X, y, groups)) + X = y = groups = np.arange(3) - msg = ("The groups parameter contains fewer than (or equal to) n_groups " - "(3) numbers of unique groups ({}). LeavePGroupsOut expects " - "that at least n_groups + 1 (4) unique groups " - "be present").format(groups) - assert_raise_message(ValueError, msg, next, - LeavePGroupsOut(n_groups=3).split(X, y, groups)) + msg = re.escape( + f"The groups parameter contains fewer than (or equal to) n_groups " + f"(3) numbers of unique groups ({groups}). LeavePGroupsOut expects " + f"that at least n_groups + 1 (4) unique groups " + f"be present" + ) + with pytest.raises(ValueError, match=msg): + next(LeavePGroupsOut(n_groups=3).split(X, y, groups)) @ignore_warnings From ad7c316d9d05b937aba98d6601db99b4dadf2e52 Mon Sep 17 00:00:00 2001 From: Alek Lefebvre Date: Wed, 10 Mar 2021 05:27:23 -0500 Subject: [PATCH 229/478] Fix Calibrated classifier cv predictions with pipeline (#19641) Co-authored-by: Alek Lefebvre Co-authored-by: Olivier Grisel --- doc/whats_new/v1.0.rst | 7 +++++++ sklearn/calibration.py | 17 +++++++---------- sklearn/tests/test_calibration.py | 15 ++++++++++++--- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 3e36438dda095..a566d03ae1bbc 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -207,6 +207,13 @@ Changelog for non-English characters. :pr:`18959` by :user:`Zero ` and :user:`wstates `. +:mod:`sklearn.calibration` +............................ 
+ +- |Fix| The predict and predict_proba methods of + :class:`calibration.CalibratedClassifierCV can now properly be used on + prefitted pipelines. :pr:`19641` by :user:`Alek Lefebvre ` + Code and Documentation Contributors ----------------------------------- diff --git a/sklearn/calibration.py b/sklearn/calibration.py index bff7f6c03502f..b60a415b4419b 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -24,15 +24,15 @@ MetaEstimatorMixin) from .preprocessing import label_binarize, LabelEncoder from .utils import ( - check_array, column_or_1d, deprecated, indexable, ) + from .utils.multiclass import check_classification_targets from .utils.fixes import delayed from .utils.validation import check_is_fitted, check_consistent_length -from .utils.validation import _check_sample_weight +from .utils.validation import _check_sample_weight, _num_samples from .pipeline import Pipeline from .isotonic import IsotonicRegression from .svm import LinearSVC @@ -344,8 +344,7 @@ def predict_proba(self, X): Parameters ---------- - X : array-like of shape (n_samples, n_features) - The samples. + X : The samples, as accepted by base_estimator.predict_proba Returns ------- @@ -353,11 +352,10 @@ def predict_proba(self, X): The predicted probas. """ check_is_fitted(self) - X = check_array(X, accept_sparse=['csc', 'csr', 'coo'], - force_all_finite=False) + # Compute the arithmetic mean of the predictions of the calibrated # classifiers - mean_proba = np.zeros((X.shape[0], len(self.classes_))) + mean_proba = np.zeros((_num_samples(X), len(self.classes_))) for calibrated_classifier in self.calibrated_classifiers_: proba = calibrated_classifier.predict_proba(X) mean_proba += proba @@ -373,8 +371,7 @@ class that has the highest probability, and can thus be different Parameters ---------- - X : array-like of shape (n_samples, n_features) - The samples. + X : The samples, as accepted by base_estimator.predict Returns ------- @@ -643,7 +640,7 @@ def predict_proba(self, X): self.base_estimator.classes_ ) - proba = np.zeros((X.shape[0], n_classes)) + proba = np.zeros((_num_samples(X), n_classes)) for class_idx, this_pred, calibrator in \ zip(pos_class_indices, predictions.T, self.calibrators): if n_classes == 2: diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py index 4ba1599eba3e6..86a638c4a7679 100644 --- a/sklearn/tests/test_calibration.py +++ b/sklearn/tests/test_calibration.py @@ -533,9 +533,14 @@ def text_data_pipeline(text_data): def test_calibration_pipeline(text_data, text_data_pipeline): - # Test that calibration works in prefit pipeline with transformer, - # where `X` is not array-like, sparse matrix or dataframe at the start. - # See https://github.com/scikit-learn/scikit-learn/issues/8710 + """Test that calibration works in prefit pipeline with transformer + + `X` is not array-like, sparse matrix or dataframe at the start. + See https://github.com/scikit-learn/scikit-learn/issues/8710 + + Also test it can predict without running into validation errors. 
+ See https://github.com/scikit-learn/scikit-learn/issues/19637 + """ X, y = text_data clf = text_data_pipeline calib_clf = CalibratedClassifierCV(clf, cv='prefit') @@ -546,6 +551,10 @@ def test_calibration_pipeline(text_data, text_data_pipeline): with pytest.raises(AttributeError, match=msg): calib_clf.n_features_in_ + # Ensure that no error is thrown with predict and predict_proba + calib_clf.predict(X) + calib_clf.predict_proba(X) + @pytest.mark.parametrize('clf, cv', [ pytest.param(LinearSVC(C=1), 2), From 4beb0c27fc0439c12dad244fe4063e96f8983a52 Mon Sep 17 00:00:00 2001 From: Geoffrey Thomas Date: Wed, 10 Mar 2021 08:26:22 -0500 Subject: [PATCH 230/478] MNT Make setup.py command parsing more robust (#19650) Separate arguments into options and commands, and use setuptools.setup if all the requested commands can / should run without NumPy installed, even if there are options present. This fixes a bug where `setup.py --no-user-site egg_info` wants NumPy to be installed, even though `setup.py egg_info` works fine. --- setup.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index f2d832a459d89..e44f941e0a114 100755 --- a/setup.py +++ b/setup.py @@ -266,14 +266,9 @@ def setup_package(): package_data={'': ['*.pxd']}, **extra_setuptools_args) - if len(sys.argv) == 1 or ( - len(sys.argv) >= 2 and ('--help' in sys.argv[1:] or - sys.argv[1] in ('--help-commands', - 'egg_info', - 'dist_info', - '--version', - 'clean', - 'check'))): + commands = [arg for arg in sys.argv[1:] if not arg.startswith('-')] + if all(command in ('egg_info', 'dist_info', 'clean', 'check') + for command in commands): # These actions are required to succeed without Numpy for example when # pip is used to install Scikit-learn when Numpy is not yet present in # the system. From 58af0196acc96603c9669b5bbc0c18cf118a150e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 11 Mar 2021 15:16:45 +0100 Subject: [PATCH 231/478] DOC Use term 'black people' instead of 'blacks' in Boston descr (#19661) --- sklearn/datasets/descr/boston_house_prices.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/descr/boston_house_prices.rst b/sklearn/datasets/descr/boston_house_prices.rst index dec9b999cd592..948bccf080c82 100644 --- a/sklearn/datasets/descr/boston_house_prices.rst +++ b/sklearn/datasets/descr/boston_house_prices.rst @@ -21,7 +21,7 @@ Boston house prices dataset - RAD index of accessibility to radial highways - TAX full-value property-tax rate per $10,000 - PTRATIO pupil-teacher ratio by town - - B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town + - B 1000(Bk - 0.63)^2 where Bk is the proportion of black people by town - LSTAT % lower status of the population - MEDV Median value of owner-occupied homes in $1000's From 598045569c8f96fb345059f5316ea8903d374ff4 Mon Sep 17 00:00:00 2001 From: Jon Crall Date: Thu, 11 Mar 2021 10:25:13 -0500 Subject: [PATCH 232/478] ENH Speedup confusion_matrix (#9843) Co-authored-by: Guillaume Lemaitre --- doc/whats_new/v1.0.rst | 4 ++++ sklearn/metrics/_classification.py | 26 +++++++++++++++++--------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index a566d03ae1bbc..0f4882f1b2970 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -169,6 +169,10 @@ Changelog quantile regression. :pr:`19415` by :user:`Xavier Dupré ` and :user:`Oliver Grisel `. 
+- |Efficiency| Improved speed of :func:`metrics.confusion_matrix` when labels + are integral. + :pr:`9843` by :user:`Jon Crall `. + :mod:`sklearn.naive_bayes` .......................... diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 708bde662e765..b4ab145d80937 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -309,7 +309,7 @@ def confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, raise ValueError("'labels' should contains at least one label.") elif y_true.size == 0: return np.zeros((n_labels, n_labels), dtype=int) - elif np.all([l not in y_true for l in labels]): + elif len(np.intersect1d(y_true, labels)) == 0: raise ValueError("At least one label specified must be in y_true") if sample_weight is None: @@ -324,17 +324,25 @@ def confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, "'all', None}") n_labels = labels.size - label_to_ind = {y: x for x, y in enumerate(labels)} - # convert yt, yp into index - y_pred = np.array([label_to_ind.get(x, n_labels + 1) for x in y_pred]) - y_true = np.array([label_to_ind.get(x, n_labels + 1) for x in y_true]) + # If labels are not consecutive integers starting from zero, then + # y_true and y_pred must be converted into index form + need_index_conversion = not ( + labels.dtype.kind in {'i', 'u', 'b'} and + np.all(labels == np.arange(n_labels)) and + y_true.min() >= 0 and y_pred.min() >= 0 + ) + if need_index_conversion: + label_to_ind = {y: x for x, y in enumerate(labels)} + y_pred = np.array([label_to_ind.get(x, n_labels + 1) for x in y_pred]) + y_true = np.array([label_to_ind.get(x, n_labels + 1) for x in y_true]) # intersect y_pred, y_true with labels, eliminate items not in labels ind = np.logical_and(y_pred < n_labels, y_true < n_labels) - y_pred = y_pred[ind] - y_true = y_true[ind] - # also eliminate weights of eliminated items - sample_weight = sample_weight[ind] + if not np.all(ind): + y_pred = y_pred[ind] + y_true = y_true[ind] + # also eliminate weights of eliminated items + sample_weight = sample_weight[ind] # Choose the accumulator dtype to always have high precision if sample_weight.dtype.kind in {'i', 'u', 'b'}: From 6f180d79f58b42a3fa06055c489b1edf857399ff Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 11 Mar 2021 12:34:04 -0500 Subject: [PATCH 233/478] BUG Fixes verbose > 2 for grid search (#19659) --- doc/whats_new/v0.24.rst | 7 +++++++ sklearn/model_selection/_validation.py | 20 ++++++++++++++------ sklearn/model_selection/tests/test_search.py | 19 +++++++++++++++++++ 3 files changed, 40 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 84e712c05ea79..68ea8ba0f7a72 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -33,6 +33,13 @@ Changelog sample_weight object is not modified anymore. :pr:`19182` by :user:`Yosuke KOBAYASHI `. +:mod:`sklearn.model_selection` +.............................. + +- |Fix| :class:`model_selection.RandomizedSearchCV` and + :class:`model_selection.GridSearchCV` now correctly shows the score for + single metrics and verbose > 2. :pr:`19659` by `Thomas Fan`_. + :mod:`sklearn.preprocessing` ............................ 
diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 63f9a53fcf91f..e61e693b2fa74 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -631,13 +631,21 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, total_time = score_time + fit_time end_msg = f"[CV{progress_msg}] END " result_msg = params_msg + (";" if params_msg else "") - if verbose > 2 and isinstance(test_scores, dict): - for scorer_name in sorted(test_scores): - result_msg += f" {scorer_name}: (" + if verbose > 2: + if isinstance(test_scores, dict): + for scorer_name in sorted(test_scores): + result_msg += f" {scorer_name}: (" + if return_train_score: + scorer_scores = train_scores[scorer_name] + result_msg += f"train={scorer_scores:.3f}, " + result_msg += f"test={test_scores[scorer_name]:.3f})" + else: + result_msg += ", score=" if return_train_score: - scorer_scores = train_scores[scorer_name] - result_msg += f"train={scorer_scores:.3f}, " - result_msg += f"test={test_scores[scorer_name]:.3f})" + result_msg += (f"(train={train_scores:.3f}, " + f"test={test_scores:.3f})") + else: + result_msg += f"{test_scores:.3f}" result_msg += f" total time={logger.short_format_time(total_time)}" # Right align the result_msg diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index c71c812b3368f..25c4ce8cc22f7 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -2137,3 +2137,22 @@ def test_search_cv_using_minimal_compatible_estimator(SearchCV, Predictor): else: assert_allclose(y_pred, y.mean()) assert search.score(X, y) == pytest.approx(r2_score(y, y_pred)) + + +@pytest.mark.parametrize("return_train_score", [True, False]) +def test_search_cv_verbose_3(capsys, return_train_score): + """Check that search cv with verbose>2 shows the score for single + metrics. non-regression test fo #19658.""" + X, y = make_classification(n_samples=100, n_classes=2, flip_y=.2, + random_state=0) + clf = LinearSVC(random_state=0) + grid = {'C': [.1]} + + GridSearchCV(clf, grid, scoring='accuracy', verbose=3, cv=3, + return_train_score=return_train_score).fit(X, y) + captured = capsys.readouterr().out + if return_train_score: + match = re.findall(r"score=\(train=[\d\.]+, test=[\d.]+\)", captured) + else: + match = re.findall(r"score=[\d\.]+", captured) + assert len(match) == 3 From bfd7b58c1d0d459257687da25419edb052443528 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 12 Mar 2021 08:48:24 -0500 Subject: [PATCH 234/478] TST Do not use cache in test_fetch_openml_iris (#19594) --- sklearn/datasets/tests/test_openml.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 9f55909c6643b..dac0762eb2160 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -772,15 +772,12 @@ def test_fetch_openml_iris(monkeypatch, gzip_response): data_name = 'iris' _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) - assert_warns_message( - UserWarning, - "Multiple active versions of the dataset matching the name" - " iris exist. Versions may be fundamentally different, " - "returning version 1.", - fetch_openml, - name=data_name, - as_frame=False - ) + + msg = ("Multiple active versions of the dataset matching the name" + " iris exist. 
Versions may be fundamentally different, " + "returning version 1.") + with pytest.warns(UserWarning, match=msg): + fetch_openml(name=data_name, as_frame=False, cache=False) def test_decode_iris(monkeypatch): From 5ccfabf08d13f50dc3f5b8a8e38dd362ab594c6e Mon Sep 17 00:00:00 2001 From: shivamgargsya Date: Fri, 12 Mar 2021 20:16:50 +0530 Subject: [PATCH 235/478] TST Change assert from sklearn to pytest style in module linear_model/tests (#19565) --- sklearn/linear_model/_omp.py | 8 ++- .../tests/test_coordinate_descent.py | 36 +++++++--- .../linear_model/tests/test_least_angle.py | 7 +- sklearn/linear_model/tests/test_logistic.py | 70 +++++++++++-------- sklearn/linear_model/tests/test_omp.py | 26 ++++--- sklearn/linear_model/tests/test_ransac.py | 11 ++- sklearn/linear_model/tests/test_ridge.py | 13 ++-- sklearn/linear_model/tests/test_sgd.py | 9 ++- .../tests/test_sparse_coordinate_descent.py | 9 ++- sklearn/linear_model/tests/test_theil_sen.py | 10 ++- 10 files changed, 133 insertions(+), 66 deletions(-) diff --git a/sklearn/linear_model/_omp.py b/sklearn/linear_model/_omp.py index e100cdef04fdb..3f995f0f34318 100644 --- a/sklearn/linear_model/_omp.py +++ b/sklearn/linear_model/_omp.py @@ -20,9 +20,11 @@ from ..utils.fixes import delayed from ..model_selection import check_cv -premature = """ Orthogonal matching pursuit ended prematurely due to linear -dependence in the dictionary. The requested precision might not have been met. -""" +premature = ( + "Orthogonal matching pursuit ended prematurely due to linear" + " dependence in the dictionary. The requested precision might" + " not have been met." +) def _cholesky_omp(X, y, n_nonzero_coefs, tol=None, copy_X=True, diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index 3eba535d70c89..ebddb6a7e47c6 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -19,8 +19,6 @@ from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_almost_equal -from sklearn.utils._testing import assert_warns -from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import ignore_warnings from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import _convert_container @@ -646,7 +644,13 @@ def test_lasso_alpha_warning(): Y = [-1, 0, 1] # just a straight line clf = Lasso(alpha=0) - assert_warns(UserWarning, clf.fit, X, Y) + warning_message = ( + "With alpha=0, this algorithm does not " + "converge well. You are advised to use the " + "LinearRegression estimator" + ) + with pytest.warns(UserWarning, match=warning_message): + clf.fit(X, Y) def test_lasso_positive_constraint(): @@ -733,7 +737,12 @@ def test_multi_task_lasso_and_enet(): assert_array_almost_equal(clf.coef_[0], clf.coef_[1]) clf = MultiTaskElasticNet(alpha=1.0, tol=1e-8, max_iter=1) - assert_warns_message(ConvergenceWarning, 'did not converge', clf.fit, X, Y) + warning_message = ( + "Objective did not converge. You might want to " + "increase the number of iterations." 
+ ) + with pytest.warns(ConvergenceWarning, match=warning_message): + clf.fit(X, Y) def test_lasso_readonly_data(): @@ -1075,11 +1084,13 @@ def test_overrided_gram_matrix(): X, y, _, _ = build_dataset(n_samples=20, n_features=10) Gram = X.T.dot(X) clf = ElasticNet(selection='cyclic', tol=1e-8, precompute=Gram) - assert_warns_message(UserWarning, - "Gram matrix was provided but X was centered" - " to fit intercept, " - "or X was normalized : recomputing Gram matrix.", - clf.fit, X, y) + warning_message = ( + "Gram matrix was provided but X was centered" + " to fit intercept, " + "or X was normalized : recomputing Gram matrix." + ) + with pytest.warns(UserWarning, match=warning_message): + clf.fit(X, y) @pytest.mark.parametrize('model', [ElasticNet, Lasso]) @@ -1214,7 +1225,12 @@ def test_enet_coordinate_descent(klass, n_classes, kwargs): y = np.ones((n_samples, n_classes)) if klass == Lasso: y = y.ravel() - assert_warns(ConvergenceWarning, clf.fit, X, y) + warning_message = ( + "Objective did not converge. You might want to" + " increase the number of iterations." + ) + with pytest.warns(ConvergenceWarning, match=warning_message): + clf.fit(X, y) def test_convergence_warnings(): diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index 96c5a8fedbf14..a8b0e939c080d 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -10,7 +10,6 @@ from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_raises from sklearn.utils._testing import ignore_warnings -from sklearn.utils._testing import assert_warns from sklearn.utils._testing import TempMemmap from sklearn.utils.fixes import np_version, parse_version from sklearn.exceptions import ConvergenceWarning @@ -372,7 +371,11 @@ def objective_function(coef): + alpha * linalg.norm(coef, 1)) lars = linear_model.LassoLars(alpha=alpha, normalize=False) - assert_warns(ConvergenceWarning, lars.fit, X, y) + warning_message = ( + "Regressors in active set degenerate." + ) + with pytest.warns(ConvergenceWarning, match=warning_message): + lars.fit(X, y) lars_coef_ = lars.coef_ lars_obj = objective_function(lars_coef_) diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 329f4f72f935b..bdc9a4a24914b 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -19,9 +19,7 @@ from sklearn.model_selection import cross_val_score from sklearn.preprocessing import LabelEncoder, StandardScaler from sklearn.utils import compute_class_weight, _IS_32BIT -from sklearn.utils._testing import assert_warns from sklearn.utils._testing import ignore_warnings -from sklearn.utils._testing import assert_warns_message from sklearn.utils import shuffle from sklearn.linear_model import SGDClassifier from sklearn.preprocessing import scale @@ -155,11 +153,13 @@ def test_lr_liblinear_warning(): target = iris.target_names[iris.target] lr = LogisticRegression(solver='liblinear', n_jobs=2) - assert_warns_message(UserWarning, - "'n_jobs' > 1 does not have any effect when" - " 'solver' is set to 'liblinear'. Got 'n_jobs'" - " = 2.", - lr.fit, iris.data, target) + warning_message = ( + "'n_jobs' > 1 does not have any effect when" + " 'solver' is set to 'liblinear'. Got 'n_jobs'" + " = 2." 
+ ) + with pytest.warns(UserWarning, match=warning_message): + lr.fit(iris.data, target) def test_predict_3_classes(): @@ -1188,23 +1188,34 @@ def test_logreg_predict_proba_multinomial(): assert clf_wrong_loss > clf_multi_loss -def test_max_iter(): +@pytest.mark.parametrize("max_iter", np.arange(1, 5)) +@pytest.mark.parametrize("multi_class", ['ovr', 'multinomial']) +@pytest.mark.parametrize( + "solver, message", + [("newton-cg", "newton-cg failed to converge. Increase the " + "number of iterations."), + ("liblinear", "Liblinear failed to converge, increase the " + "number of iterations."), + ("sag", "The max_iter was reached which means the " + "coef_ did not converge"), + ("saga", "The max_iter was reached which means the " + "coef_ did not converge"), + ("lbfgs", "lbfgs failed to converge")]) +def test_max_iter(max_iter, multi_class, solver, message): # Test that the maximum number of iteration is reached X, y_bin = iris.data, iris.target.copy() y_bin[y_bin == 2] = 0 - solvers = ['newton-cg', 'liblinear', 'sag', 'saga', 'lbfgs'] + if solver == 'liblinear' and multi_class == 'multinomial': + pytest.skip("'multinomial' is unavailable when solver='liblinear'") + + lr = LogisticRegression(max_iter=max_iter, tol=1e-15, + multi_class=multi_class, + random_state=0, solver=solver) + with pytest.warns(ConvergenceWarning, match=message): + lr.fit(X, y_bin) - for max_iter in range(1, 5): - for solver in solvers: - for multi_class in ['ovr', 'multinomial']: - if solver == 'liblinear' and multi_class == 'multinomial': - continue - lr = LogisticRegression(max_iter=max_iter, tol=1e-15, - multi_class=multi_class, - random_state=0, solver=solver) - assert_warns(ConvergenceWarning, lr.fit, X, y_bin) - assert lr.n_iter_[0] == max_iter + assert lr.n_iter_[0] == max_iter @pytest.mark.parametrize('solver', @@ -1644,12 +1655,11 @@ def test_l1_ratio_param(l1_ratio): l1_ratio=l1_ratio).fit(X, Y1) if l1_ratio is not None: - msg = ("l1_ratio parameter is only used when penalty is 'elasticnet'." - " Got (penalty=l1)") - - assert_warns_message(UserWarning, msg, - LogisticRegression(penalty='l1', solver='saga', - l1_ratio=l1_ratio).fit, X, Y1) + msg = (r"l1_ratio parameter is only used when penalty is" + r" 'elasticnet'\. Got \(penalty=l1\)") + with pytest.warns(UserWarning, match=msg): + LogisticRegression(penalty='l1', solver='saga', + l1_ratio=l1_ratio).fit(X, Y1) @pytest.mark.parametrize('l1_ratios', ([], [.5, 2], None, 'something_wrong')) @@ -1664,11 +1674,12 @@ def test_l1_ratios_param(l1_ratios): l1_ratios=l1_ratios, cv=2).fit(X, Y1) if l1_ratios is not None: - msg = ("l1_ratios parameter is only used when penalty is " - "'elasticnet'. Got (penalty=l1)") + msg = (r"l1_ratios parameter is only used when penalty" + r" is 'elasticnet'. 
Got \(penalty=l1\)") function = LogisticRegressionCV(penalty='l1', solver='saga', l1_ratios=l1_ratios, cv=2).fit - assert_warns_message(UserWarning, msg, function, X, Y1) + with pytest.warns(UserWarning, match=msg): + function(X, Y1) @pytest.mark.parametrize('C', np.logspace(-3, 2, 4)) @@ -1769,7 +1780,8 @@ def test_penalty_none(solver): msg = "Setting penalty='none' will ignore the C" lr = LogisticRegression(penalty='none', solver=solver, C=4) - assert_warns_message(UserWarning, msg, lr.fit, X, y) + with pytest.warns(UserWarning, match=msg): + lr.fit(X, y) lr_none = LogisticRegression(penalty='none', solver=solver, random_state=0) diff --git a/sklearn/linear_model/tests/test_omp.py b/sklearn/linear_model/tests/test_omp.py index f3f3080aebe66..3cbda003f0148 100644 --- a/sklearn/linear_model/tests/test_omp.py +++ b/sklearn/linear_model/tests/test_omp.py @@ -2,11 +2,11 @@ # License: BSD 3 clause import numpy as np +import pytest from sklearn.utils._testing import assert_raises from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_warns from sklearn.utils._testing import ignore_warnings @@ -76,12 +76,16 @@ def test_unreachable_accuracy(): assert_array_almost_equal( orthogonal_mp(X, y, tol=0), orthogonal_mp(X, y, n_nonzero_coefs=n_features)) - - assert_array_almost_equal( - assert_warns(RuntimeWarning, orthogonal_mp, X, y, tol=0, - precompute=True), - orthogonal_mp(X, y, precompute=True, - n_nonzero_coefs=n_features)) + warning_message = ( + "Orthogonal matching pursuit ended prematurely " + "due to linear dependence in the dictionary. " + "The requested precision might not have been met." + ) + with pytest.warns(RuntimeWarning, match=warning_message): + assert_array_almost_equal( + orthogonal_mp(X, y, tol=0, precompute=True), + orthogonal_mp(X, y, precompute=True, + n_nonzero_coefs=n_features)) def test_bad_input(): @@ -155,7 +159,13 @@ def test_identical_regressors(): gamma = np.zeros(n_features) gamma[0] = gamma[1] = 1. newy = np.dot(newX, gamma) - assert_warns(RuntimeWarning, orthogonal_mp, newX, newy, 2) + warning_message = ( + "Orthogonal matching pursuit ended prematurely " + "due to linear dependence in the dictionary. " + "The requested precision might not have been met." + ) + with pytest.warns(RuntimeWarning, match=warning_message): + orthogonal_mp(newX, newy, 2) def test_swapped_regressors(): diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index 757faacd2d67f..f631199a5d268 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -6,7 +6,6 @@ from numpy.testing import assert_array_equal from sklearn.utils import check_random_state -from sklearn.utils._testing import assert_warns from sklearn.utils._testing import assert_raises_regexp from sklearn.utils._testing import assert_allclose from sklearn.datasets import make_regression @@ -232,8 +231,14 @@ def is_data_valid(X, y): is_data_valid=is_data_valid, max_skips=3, max_trials=5) - - assert_warns(ConvergenceWarning, ransac_estimator.fit, X, y) + warning_message = ( + "RANSAC found a valid consensus set but exited " + "early due to skipping more iterations than " + "`max_skips`. See estimator attributes for " + "diagnostics." 
+ ) + with pytest.warns(ConvergenceWarning, match=warning_message): + ransac_estimator.fit(X, y) assert ransac_estimator.n_skips_no_inliers_ == 0 assert ransac_estimator.n_skips_invalid_data_ == 4 assert ransac_estimator.n_skips_invalid_model_ == 0 diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index 8e33514af83f9..01839fe0ba457 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -10,7 +10,6 @@ from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import ignore_warnings -from sklearn.utils._testing import assert_warns from sklearn.exceptions import ConvergenceWarning @@ -162,10 +161,14 @@ def test_ridge_regression_convergence_fail(): rng = np.random.RandomState(0) y = rng.randn(5) X = rng.randn(5, 10) - - assert_warns(ConvergenceWarning, ridge_regression, - X, y, alpha=1.0, solver="sparse_cg", - tol=0., max_iter=None, verbose=1) + warning_message = ( + r"sparse_cg did not converge after" + r" [0-9]+ iterations." + ) + with pytest.warns(ConvergenceWarning, match=warning_message): + ridge_regression(X, y, + alpha=1.0, solver="sparse_cg", + tol=0., max_iter=None, verbose=1) def test_ridge_sample_weights(): diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 0ac7ce779f5a7..217249631390d 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -9,7 +9,6 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_raises_regexp -from sklearn.utils._testing import assert_warns from sklearn.utils._testing import ignore_warnings from sklearn.utils.fixes import parse_version @@ -1446,7 +1445,13 @@ def test_tol_parameter(): # Strict tolerance and small max_iter should trigger a warning model_3 = SGDClassifier(max_iter=3, tol=1e-3, random_state=0) - model_3 = assert_warns(ConvergenceWarning, model_3.fit, X, y) + warning_message = ( + "Maximum number of iteration reached before " + "convergence. Consider increasing max_iter to " + "improve the fit." + ) + with pytest.warns(ConvergenceWarning, match=warning_message): + model_3.fit(X, y) assert model_3.n_iter_ == 3 diff --git a/sklearn/linear_model/tests/test_sparse_coordinate_descent.py b/sklearn/linear_model/tests/test_sparse_coordinate_descent.py index 5f131209c1547..23b57a699a655 100644 --- a/sklearn/linear_model/tests/test_sparse_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_sparse_coordinate_descent.py @@ -1,11 +1,11 @@ import numpy as np +import pytest import scipy.sparse as sp from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import ignore_warnings -from sklearn.utils._testing import assert_warns from sklearn.exceptions import ConvergenceWarning from sklearn.linear_model import Lasso, ElasticNet, LassoCV, ElasticNetCV @@ -297,4 +297,9 @@ def test_sparse_enet_coordinate_descent(): n_features = 2 X = sp.csc_matrix((n_samples, n_features)) * 1e50 y = np.ones(n_samples) - assert_warns(ConvergenceWarning, clf.fit, X, y) + warning_message = ( + "Objective did not converge. You might want " + "to increase the number of iterations." 
+ ) + with pytest.warns(ConvergenceWarning, match=warning_message): + clf.fit(X, y) diff --git a/sklearn/linear_model/tests/test_theil_sen.py b/sklearn/linear_model/tests/test_theil_sen.py index bd17298492ca0..c670fc3979b80 100644 --- a/sklearn/linear_model/tests/test_theil_sen.py +++ b/sklearn/linear_model/tests/test_theil_sen.py @@ -8,8 +8,9 @@ import sys from contextlib import contextmanager import numpy as np +import pytest from numpy.testing import assert_array_equal, assert_array_less -from numpy.testing import assert_array_almost_equal, assert_warns +from numpy.testing import assert_array_almost_equal from scipy.linalg import norm from scipy.optimize import fmin_bfgs from sklearn.exceptions import ConvergenceWarning @@ -154,7 +155,12 @@ def cost_func(y): fermat_weber = fmin_bfgs(cost_func, median, disp=False) assert_array_almost_equal(median, fermat_weber) # Check when maximum iteration is exceeded a warning is emitted - assert_warns(ConvergenceWarning, _spatial_median, X, max_iter=30, tol=0.) + warning_message = ( + "Maximum number of iterations 30 reached" + " in spatial median." + ) + with pytest.warns(ConvergenceWarning, match=warning_message): + _spatial_median(X, max_iter=30, tol=0.) def test_theil_sen_1d(): From 579e7de7f38f9f514ff2b2be049e67b14e723d17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Fri, 12 Mar 2021 16:08:36 +0100 Subject: [PATCH 236/478] move kmpp public next to kmpp private (#19666) --- sklearn/cluster/_kmeans.py | 187 ++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 94 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 1c54a5c9ff9e0..17272858ae476 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -47,6 +47,99 @@ ############################################################################### # Initialization heuristic +def kmeans_plusplus(X, n_clusters, *, x_squared_norms=None, + random_state=None, n_local_trials=None): + """Init n_clusters seeds according to k-means++ + + .. versionadded:: 0.24 + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to pick seeds from. + + n_clusters : int + The number of centroids to initialize + + x_squared_norms : array-like of shape (n_samples,), default=None + Squared Euclidean norm of each data point. + + random_state : int or RandomState instance, default=None + Determines random number generation for centroid initialization. Pass + an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + n_local_trials : int, default=None + The number of seeding trials for each center (except the first), + of which the one reducing inertia the most is greedily chosen. + Set to None to make the number of trials depend logarithmically + on the number of seeds (2+log(k)). + + Returns + ------- + centers : ndarray of shape (n_clusters, n_features) + The inital centers for k-means. + + indices : ndarray of shape (n_clusters,) + The index location of the chosen centers in the data array X. For a + given index and center, X[index] = center. + + Notes + ----- + Selects initial cluster centers for k-mean clustering in a smart way + to speed up convergence. see: Arthur, D. and Vassilvitskii, S. + "k-means++: the advantages of careful seeding". ACM-SIAM symposium + on Discrete algorithms. 
2007 + + Examples + -------- + + >>> from sklearn.cluster import kmeans_plusplus + >>> import numpy as np + >>> X = np.array([[1, 2], [1, 4], [1, 0], + ... [10, 2], [10, 4], [10, 0]]) + >>> centers, indices = kmeans_plusplus(X, n_clusters=2, random_state=0) + >>> centers + array([[10, 4], + [ 1, 0]]) + >>> indices + array([4, 2]) + """ + + # Check data + check_array(X, accept_sparse='csr', + dtype=[np.float64, np.float32]) + + if X.shape[0] < n_clusters: + raise ValueError(f"n_samples={X.shape[0]} should be >= " + f"n_clusters={n_clusters}.") + + # Check parameters + if x_squared_norms is None: + x_squared_norms = row_norms(X, squared=True) + else: + x_squared_norms = check_array(x_squared_norms, + dtype=X.dtype, + ensure_2d=False) + + if x_squared_norms.shape[0] != X.shape[0]: + raise ValueError( + f"The length of x_squared_norms {x_squared_norms.shape[0]} should " + f"be equal to the length of n_samples {X.shape[0]}.") + + if n_local_trials is not None and n_local_trials < 1: + raise ValueError( + f"n_local_trials is set to {n_local_trials} but should be an " + f"integer value greater than zero.") + + random_state = check_random_state(random_state) + + # Call private k-means++ + centers, indices = _kmeans_plusplus(X, n_clusters, x_squared_norms, + random_state, n_local_trials) + + return centers, indices + def _kmeans_plusplus(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): @@ -1924,97 +2017,3 @@ def _more_tags(self): 'zero sample_weight is not equivalent to removing samples', } } - - -def kmeans_plusplus(X, n_clusters, *, x_squared_norms=None, - random_state=None, n_local_trials=None): - """Init n_clusters seeds according to k-means++ - - .. versionadded:: 0.24 - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The data to pick seeds from. - - n_clusters : int - The number of centroids to initialize - - x_squared_norms : array-like of shape (n_samples,), default=None - Squared Euclidean norm of each data point. - - random_state : int or RandomState instance, default=None - Determines random number generation for centroid initialization. Pass - an int for reproducible output across multiple function calls. - See :term:`Glossary `. - - n_local_trials : int, default=None - The number of seeding trials for each center (except the first), - of which the one reducing inertia the most is greedily chosen. - Set to None to make the number of trials depend logarithmically - on the number of seeds (2+log(k)). - - Returns - ------- - centers : ndarray of shape (n_clusters, n_features) - The inital centers for k-means. - - indices : ndarray of shape (n_clusters,) - The index location of the chosen centers in the data array X. For a - given index and center, X[index] = center. - - Notes - ----- - Selects initial cluster centers for k-mean clustering in a smart way - to speed up convergence. see: Arthur, D. and Vassilvitskii, S. - "k-means++: the advantages of careful seeding". ACM-SIAM symposium - on Discrete algorithms. 2007 - - Examples - -------- - - >>> from sklearn.cluster import kmeans_plusplus - >>> import numpy as np - >>> X = np.array([[1, 2], [1, 4], [1, 0], - ... 
[10, 2], [10, 4], [10, 0]]) - >>> centers, indices = kmeans_plusplus(X, n_clusters=2, random_state=0) - >>> centers - array([[10, 4], - [ 1, 0]]) - >>> indices - array([4, 2]) - """ - - # Check data - check_array(X, accept_sparse='csr', - dtype=[np.float64, np.float32]) - - if X.shape[0] < n_clusters: - raise ValueError(f"n_samples={X.shape[0]} should be >= " - f"n_clusters={n_clusters}.") - - # Check parameters - if x_squared_norms is None: - x_squared_norms = row_norms(X, squared=True) - else: - x_squared_norms = check_array(x_squared_norms, - dtype=X.dtype, - ensure_2d=False) - - if x_squared_norms.shape[0] != X.shape[0]: - raise ValueError( - f"The length of x_squared_norms {x_squared_norms.shape[0]} should " - f"be equal to the length of n_samples {X.shape[0]}.") - - if n_local_trials is not None and n_local_trials < 1: - raise ValueError( - f"n_local_trials is set to {n_local_trials} but should be an " - f"integer value greater than zero.") - - random_state = check_random_state(random_state) - - # Call private k-means++ - centers, indices = _kmeans_plusplus(X, n_clusters, x_squared_norms, - random_state, n_local_trials) - - return centers, indices From 15fd026963be233d37752f322b5dd484c58e09a8 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Sat, 13 Mar 2021 00:02:29 +0100 Subject: [PATCH 237/478] RFC Make non_negative_factorization call NMF instead of the opposite (#19607) --- sklearn/decomposition/_nmf.py | 217 +++++++++++++++++++++------------- 1 file changed, 132 insertions(+), 85 deletions(-) diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 6d42fecb885a2..b978f1a33d3af 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -1021,74 +1021,14 @@ def non_negative_factorization(X, W=None, H=None, n_components=None, *, """ X = check_array(X, accept_sparse=('csr', 'csc'), dtype=[np.float64, np.float32]) - check_non_negative(X, "NMF (input X)") - beta_loss = _check_string_param(solver, regularization, beta_loss, init) - if X.min() == 0 and beta_loss <= 0: - raise ValueError("When beta_loss <= 0 and X contains zeros, " - "the solver may diverge. Please add small values to " - "X, or use a positive beta_loss.") + est = NMF(n_components=n_components, init=init, solver=solver, + beta_loss=beta_loss, tol=tol, max_iter=max_iter, + random_state=random_state, alpha=alpha, l1_ratio=l1_ratio, + verbose=verbose, shuffle=shuffle, regularization=regularization) - n_samples, n_features = X.shape - if n_components is None: - n_components = n_features - - if not isinstance(n_components, numbers.Integral) or n_components <= 0: - raise ValueError("Number of components must be a positive integer;" - " got (n_components=%r)" % n_components) - if not isinstance(max_iter, numbers.Integral) or max_iter < 0: - raise ValueError("Maximum number of iterations must be a positive " - "integer; got (max_iter=%r)" % max_iter) - if not isinstance(tol, numbers.Number) or tol < 0: - raise ValueError("Tolerance for stopping criteria must be " - "positive; got (tol=%r)" % tol) - - # check W and H, or initialize them - if init == 'custom' and update_H: - _check_init(H, (n_components, n_features), "NMF (input H)") - _check_init(W, (n_samples, n_components), "NMF (input W)") - if H.dtype != X.dtype or W.dtype != X.dtype: - raise TypeError("H and W should have the same dtype as X. Got " - "H.dtype = {} and W.dtype = {}." 
- .format(H.dtype, W.dtype)) - elif not update_H: - _check_init(H, (n_components, n_features), "NMF (input H)") - if H.dtype != X.dtype: - raise TypeError("H should have the same dtype as X. Got H.dtype = " - "{}.".format(H.dtype)) - # 'mu' solver should not be initialized by zeros - if solver == 'mu': - avg = np.sqrt(X.mean() / n_components) - W = np.full((n_samples, n_components), avg, dtype=X.dtype) - else: - W = np.zeros((n_samples, n_components), dtype=X.dtype) - else: - W, H = _initialize_nmf(X, n_components, init=init, - random_state=random_state) - - l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H = _compute_regularization( - alpha, l1_ratio, regularization) - - if solver == 'cd': - W, H, n_iter = _fit_coordinate_descent(X, W, H, tol, max_iter, - l1_reg_W, l1_reg_H, - l2_reg_W, l2_reg_H, - update_H=update_H, - verbose=verbose, - shuffle=shuffle, - random_state=random_state) - elif solver == 'mu': - W, H, n_iter = _fit_multiplicative_update(X, W, H, beta_loss, max_iter, - tol, l1_reg_W, l1_reg_H, - l2_reg_W, l2_reg_H, update_H, - verbose) - - else: - raise ValueError("Invalid solver parameter '%s'." % solver) - - if n_iter == max_iter and tol > 0: - warnings.warn("Maximum number of iterations %d reached. Increase it to" - " improve convergence." % max_iter, ConvergenceWarning) + with config_context(assume_finite=True): + W, H, n_iter = est._fit_transform(X, W=W, H=H, update_H=update_H) return W, H, n_iter @@ -1281,6 +1221,52 @@ def __init__(self, n_components=None, *, init='warn', solver='cd', def _more_tags(self): return {'requires_positive_X': True} + def _check_params(self, X): + self._n_components = self.n_components + if self._n_components is None: + self._n_components = X.shape[1] + if not isinstance( + self._n_components, numbers.Integral + ) or self._n_components <= 0: + raise ValueError("Number of components must be a positive integer;" + " got (n_components=%r)" % self._n_components) + if not isinstance( + self.max_iter, numbers.Integral + ) or self.max_iter < 0: + raise ValueError("Maximum number of iterations must be a positive " + "integer; got (max_iter=%r)" % self.max_iter) + if not isinstance(self.tol, numbers.Number) or self.tol < 0: + raise ValueError("Tolerance for stopping criteria must be " + "positive; got (tol=%r)" % self.tol) + return self + + def _check_w_h(self, X, W, H, update_H): + # check W and H, or initialize them + n_samples, n_features = X.shape + if self.init == 'custom' and update_H: + _check_init(H, (self._n_components, n_features), "NMF (input H)") + _check_init(W, (n_samples, self._n_components), "NMF (input W)") + if H.dtype != X.dtype or W.dtype != X.dtype: + raise TypeError("H and W should have the same dtype as X. Got " + "H.dtype = {} and W.dtype = {}." + .format(H.dtype, W.dtype)) + elif not update_H: + _check_init(H, (self._n_components, n_features), "NMF (input H)") + if H.dtype != X.dtype: + raise TypeError("H should have the same dtype as X. Got " + "H.dtype = {}.".format(H.dtype)) + # 'mu' solver should not be initialized by zeros + if self.solver == 'mu': + avg = np.sqrt(X.mean() / self._n_components) + W = np.full((n_samples, self._n_components), + avg, dtype=X.dtype) + else: + W = np.zeros((n_samples, self._n_components), dtype=X.dtype) + else: + W, H = _initialize_nmf(X, self._n_components, init=self.init, + random_state=self.random_state) + return W, H + def fit_transform(self, X, y=None, W=None, H=None): """Learn a NMF model for the data X and returns the transformed data. 
@@ -1308,23 +1294,92 @@ def fit_transform(self, X, y=None, W=None, H=None): dtype=[np.float64, np.float32]) with config_context(assume_finite=True): - W, H, n_iter_ = non_negative_factorization( - X=X, W=W, H=H, n_components=self.n_components, init=self.init, - update_H=True, solver=self.solver, beta_loss=self.beta_loss, - tol=self.tol, max_iter=self.max_iter, alpha=self.alpha, - l1_ratio=self.l1_ratio, regularization=self.regularization, - random_state=self.random_state, verbose=self.verbose, - shuffle=self.shuffle) - - self.reconstruction_err_ = _beta_divergence(X, W, H, self.beta_loss, + W, H, n_iter = self._fit_transform(X, W=W, H=H) + + self.reconstruction_err_ = _beta_divergence(X, W, H, self._beta_loss, square_root=True) self.n_components_ = H.shape[0] self.components_ = H - self.n_iter_ = n_iter_ + self.n_iter_ = n_iter return W + def _fit_transform(self, X, y=None, W=None, H=None, update_H=True): + """Learn a NMF model for the data X and returns the transformed data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Data matrix to be decomposed + + y : Ignored + + W : array-like of shape (n_samples, n_components) + If init='custom', it is used as initial guess for the solution. + + H : array-like of shape (n_components, n_features) + If init='custom', it is used as initial guess for the solution. + If update_H=False, it is used as a constant, to solve for W only. + + update_H : bool, default=True + If True, both W and H will be estimated from initial guesses, + this corresponds to a call to the 'fit_transform' method. + If False, only W will be estimated, this corresponds to a call + to the 'transform' method. + + Returns + ------- + W : ndarray of shape (n_samples, n_components) + Transformed data. + + H : ndarray of shape (n_components, n_features) + Factorization matrix, sometimes called 'dictionary'. + + n_iter_ : int + Actual number of iterations. + """ + check_non_negative(X, "NMF (input X)") + self._beta_loss = _check_string_param(self.solver, self.regularization, + self.beta_loss, self.init) + + if X.min() == 0 and self._beta_loss <= 0: + raise ValueError("When beta_loss <= 0 and X contains zeros, " + "the solver may diverge. Please add small values " + "to X, or use a positive beta_loss.") + + n_samples, n_features = X.shape + + # check parameters + self._check_params(X) + + # initialize or check W and H + W, H = self._check_w_h(X, W, H, update_H) + + l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H = _compute_regularization( + self.alpha, self.l1_ratio, self.regularization) + + if self.solver == 'cd': + W, H, n_iter = _fit_coordinate_descent( + X, W, H, self.tol, self.max_iter, l1_reg_W, l1_reg_H, + l2_reg_W, l2_reg_H, update_H=update_H, + verbose=self.verbose, shuffle=self.shuffle, + random_state=self.random_state) + elif self.solver == 'mu': + W, H, n_iter = _fit_multiplicative_update( + X, W, H, self._beta_loss, self.max_iter, self.tol, + l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H, + update_H=update_H, verbose=self.verbose) + else: + raise ValueError("Invalid solver parameter '%s'." % self.solver) + + if n_iter == self.max_iter and self.tol > 0: + warnings.warn("Maximum number of iterations %d reached. Increase " + "it to improve convergence." % self.max_iter, + ConvergenceWarning) + + return W, H, n_iter + def fit(self, X, y=None, **params): """Learn a NMF model for the data X. 
@@ -1361,15 +1416,7 @@ def transform(self, X): reset=False) with config_context(assume_finite=True): - W, _, n_iter_ = non_negative_factorization( - X=X, W=None, H=self.components_, - n_components=self.n_components_, - init=self.init, update_H=False, solver=self.solver, - beta_loss=self.beta_loss, tol=self.tol, max_iter=self.max_iter, - alpha=self.alpha, l1_ratio=self.l1_ratio, - regularization=self.regularization, - random_state=self.random_state, - verbose=self.verbose, shuffle=self.shuffle) + W, *_ = self._fit_transform(X, H=self.components_, update_H=False) return W From f4e692c0876425ef6afb6f514b54696f3e071c35 Mon Sep 17 00:00:00 2001 From: PierreAttard Date: Sat, 13 Mar 2021 00:45:00 +0100 Subject: [PATCH 238/478] ENH Raises error in hinge_loss when 'pred_decision' is invalid (#19643) Co-authored-by: Olivier Grisel --- doc/whats_new/v1.0.rst | 5 ++++ sklearn/metrics/_classification.py | 26 +++++++++++++++++--- sklearn/metrics/tests/test_classification.py | 26 ++++++++++++++++++++ 3 files changed, 53 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 0f4882f1b2970..a1f21723bac28 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -165,6 +165,11 @@ Changelog class methods and will be removed in 1.2. :pr:`18543` by `Guillaume Lemaitre`_. +- |Enhancement| A fix to raise an error in :func:`metrics.hinge_loss` when + ``pred_decision`` is 1d whereas it is a multiclass classification or when + ``pred_decision`` parameter is not consistent with the ``labels`` parameter. + :pr:`19643` by :user:`Pierre Attard `. + - |Feature| :func:`metrics.mean_pinball_loss` exposes the pinball loss for quantile regression. :pr:`19415` by :user:`Xavier Dupré ` and :user:`Oliver Grisel `. diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index b4ab145d80937..a68e17656a73b 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -2378,11 +2378,29 @@ def hinge_loss(y_true, pred_decision, *, labels=None, sample_weight=None): pred_decision = check_array(pred_decision, ensure_2d=False) y_true = column_or_1d(y_true) y_true_unique = np.unique(labels if labels is not None else y_true) + if y_true_unique.size > 2: - if (labels is None and pred_decision.ndim > 1 and - (np.size(y_true_unique) != pred_decision.shape[1])): - raise ValueError("Please include all labels in y_true " - "or pass labels as third argument") + + if pred_decision.ndim <= 1: + raise ValueError("The shape of pred_decision cannot be 1d array" + "with a multiclass target. pred_decision shape " + "must be (n_samples, n_classes), that is " + f"({y_true.shape[0]}, {y_true_unique.size})." + f" Got: {pred_decision.shape}") + + # pred_decision.ndim > 1 is true + if y_true_unique.size != pred_decision.shape[1]: + if labels is None: + raise ValueError("Please include all labels in y_true " + "or pass labels as third argument") + else: + raise ValueError("The shape of pred_decision is not " + "consistent with the number of classes. " + "With a multiclass target, pred_decision " + "shape must be " + "(n_samples, n_classes), that is " + f"({y_true.shape[0]}, {y_true_unique.size}). 
" + f"Got: {pred_decision.shape}") if labels is None: labels = y_true_unique le = LabelEncoder() diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index c32e9c89ada47..7b634e88f2275 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -4,6 +4,7 @@ from itertools import chain from itertools import permutations import warnings +import re import numpy as np from scipy import linalg @@ -2135,6 +2136,31 @@ def test_hinge_loss_multiclass_missing_labels_with_labels_none(): hinge_loss(y_true, pred_decision) +def test_hinge_loss_multiclass_no_consistent_pred_decision_shape(): + # test for inconsistency between multiclass problem and pred_decision + # argument + y_true = np.array([2, 1, 0, 1, 0, 1, 1]) + pred_decision = np.array([0, 1, 2, 1, 0, 2, 1]) + error_message = ("The shape of pred_decision cannot be 1d array" + "with a multiclass target. pred_decision shape " + "must be (n_samples, n_classes), that is " + "(7, 3). Got: (7,)") + with pytest.raises(ValueError, match=re.escape(error_message)): + hinge_loss(y_true=y_true, pred_decision=pred_decision) + + # test for inconsistency between pred_decision shape and labels number + pred_decision = np.array([[0, 1], [0, 1], [0, 1], [0, 1], + [2, 0], [0, 1], [1, 0]]) + labels = [0, 1, 2] + error_message = ("The shape of pred_decision is not " + "consistent with the number of classes. " + "With a multiclass target, pred_decision " + "shape must be (n_samples, n_classes), that is " + "(7, 3). Got: (7, 2)") + with pytest.raises(ValueError, match=re.escape(error_message)): + hinge_loss(y_true=y_true, pred_decision=pred_decision, labels=labels) + + def test_hinge_loss_multiclass_with_missing_labels(): pred_decision = np.array([ [+0.36, -0.17, -0.58, -0.99], From e3e4a778d3a39e17a21db596d89b3357277cc3dc Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sun, 14 Mar 2021 11:36:49 +0000 Subject: [PATCH 239/478] MNT Remove absolute imports (#19668) --- sklearn/gaussian_process/kernels.py | 2 +- sklearn/inspection/_partial_dependence.py | 2 +- sklearn/metrics/_plot/base.py | 2 +- sklearn/metrics/pairwise.py | 2 +- sklearn/utils/_estimator_html_repr.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py index c731dcac347cd..008c24f294737 100644 --- a/sklearn/gaussian_process/kernels.py +++ b/sklearn/gaussian_process/kernels.py @@ -31,9 +31,9 @@ from ..metrics.pairwise import pairwise_kernels from ..base import clone from ..utils.validation import _num_samples +from ..exceptions import ConvergenceWarning import warnings -from sklearn.exceptions import ConvergenceWarning def _check_length_scale(X, length_scale): diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index 1e9c0c9718a51..0736130f41524 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -27,7 +27,7 @@ from ..ensemble import RandomForestRegressor from ..exceptions import NotFittedError from ..ensemble._gb import BaseGradientBoosting -from sklearn.ensemble._hist_gradient_boosting.gradient_boosting import ( +from ..ensemble._hist_gradient_boosting.gradient_boosting import ( BaseHistGradientBoosting) diff --git a/sklearn/metrics/_plot/base.py b/sklearn/metrics/_plot/base.py index 0e44a7715a1ed..4ac561f6d3dfa 100644 --- a/sklearn/metrics/_plot/base.py +++ b/sklearn/metrics/_plot/base.py @@ -1,6 +1,6 @@ import 
numpy as np -from sklearn.base import is_classifier +from ...base import is_classifier def _check_classifier_response_method(estimator, response_method): diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index a3cf7f4bf1d72..45eb256d59f67 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -748,7 +748,7 @@ def haversine_distances(X, Y=None): array([[ 0. , 11099.54035582], [11099.54035582, 0. ]]) """ - from sklearn.neighbors import DistanceMetric + from ..neighbors import DistanceMetric return DistanceMetric.get_metric('haversine').pairwise(X, Y) diff --git a/sklearn/utils/_estimator_html_repr.py b/sklearn/utils/_estimator_html_repr.py index a593a6507371f..52fb779bee4d3 100644 --- a/sklearn/utils/_estimator_html_repr.py +++ b/sklearn/utils/_estimator_html_repr.py @@ -5,7 +5,7 @@ import uuid import html -from sklearn import config_context +from .. import config_context class _VisualBlock: From 0c74b8b7d5cdb60dc3a3240cdb36af40b9f40288 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Sun, 14 Mar 2021 15:43:51 +0100 Subject: [PATCH 240/478] ENH Optimize dot product order for LogisticRegression for dense matrices (#19571) * Use multi_dot for Hessian and gradient product. np.linalg.multi_dot quickly chooses the best order for the multiplication of three matrices. --- doc/whats_new/v1.0.rst | 5 +++++ sklearn/linear_model/_logistic.py | 5 ++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index a1f21723bac28..4698657c9a82e 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -113,6 +113,11 @@ Changelog :mod:`sklearn.linear_model` ........................... +- |Efficiency| The implementation of :class:`linear_model.LogisticRegression` + has been optimised for dense matrices when using `solver='newton-cg'` and + `multi_class!='multinomial'`. + :pr:`19571` by :user:`Julien Jerphanion `. + - |Enhancement| Validate user-supplied gram matrix passed to linear models via the `precompute` argument. :pr:`19004` by :user:`Adam Midvidy `. diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 2b8b6a716cbf7..be28c5806ede5 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -233,7 +233,10 @@ def _logistic_grad_hess(w, X, y, alpha, sample_weight=None): def Hs(s): ret = np.empty_like(s) - ret[:n_features] = X.T.dot(dX.dot(s[:n_features])) + if sparse.issparse(X): + ret[:n_features] = X.T.dot(dX.dot(s[:n_features])) + else: + ret[:n_features] = np.linalg.multi_dot([X.T, dX, s[:n_features]]) ret[:n_features] += alpha * s[:n_features] # For the fit intercept case. 
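The `newton-cg` change above relies on `np.linalg.multi_dot`, which evaluates a chain of matrix
products while automatically choosing the cheapest association order for the given shapes. A
minimal, self-contained sketch of the equivalence follows; the sizes are purely illustrative and
the `dX` array merely stands in for the reweighted design matrix used in the Hessian-vector
product, neither is taken from the patch itself.

import numpy as np

rng = np.random.RandomState(0)
n_samples, n_features = 5000, 20        # illustrative sizes only
X = rng.randn(n_samples, n_features)
dX = rng.randn(n_samples, n_features)   # placeholder for the reweighted X
s = rng.randn(n_features)

# Fixed right-to-left evaluation, as in the previous implementation:
hs_old = X.T.dot(dX.dot(s))

# Same product, but multi_dot picks the parenthesization with the lowest
# estimated cost for these shapes:
hs_new = np.linalg.multi_dot([X.T, dX, s])

np.testing.assert_allclose(hs_old, hs_new)

Both evaluations return the same Hessian-vector product; only the order in which the intermediate
products are formed differs.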
From 95c3c762fbc39799639279a1ad35716375a7a6e8 Mon Sep 17 00:00:00 2001 From: Alessia Marcolini <98marcolini@gmail.com> Date: Mon, 15 Mar 2021 10:46:22 +0100 Subject: [PATCH 241/478] DOC Fix typo in plot_multi_metric_evaluation example (#19675) --- examples/model_selection/plot_multi_metric_evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/model_selection/plot_multi_metric_evaluation.py b/examples/model_selection/plot_multi_metric_evaluation.py index 775d0af9817a8..4f03f1b19462d 100644 --- a/examples/model_selection/plot_multi_metric_evaluation.py +++ b/examples/model_selection/plot_multi_metric_evaluation.py @@ -36,7 +36,7 @@ X, y = make_hastie_10_2(n_samples=8000, random_state=42) -# The scorers can be either be one of the predefined metric strings or a scorer +# The scorers can be either one of the predefined metric strings or a scorer # callable, like the one returned by make_scorer scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)} From 77e998db353206e45e0d15ba6f8ab0fc412a7077 Mon Sep 17 00:00:00 2001 From: cliffordEmmanuel <45907515+cliffordEmmanuel@users.noreply.github.com> Date: Mon, 15 Mar 2021 11:22:16 +0000 Subject: [PATCH 242/478] ENH Deprecated the default random_state=0 in randomized_svd (#19670) Co-authored-by: Thomas J. Fan Co-authored-by: cinbez --- doc/whats_new/v1.0.rst | 9 ++++++++ sklearn/utils/extmath.py | 34 ++++++++++++++++++++++------- sklearn/utils/tests/test_extmath.py | 17 ++++++++++----- 3 files changed, 46 insertions(+), 14 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 4698657c9a82e..89280c7f01d0d 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -221,6 +221,15 @@ Changelog for non-English characters. :pr:`18959` by :user:`Zero ` and :user:`wstates `. +:mod:`sklearn.utils` +.................... + +- |Enhancement| Deprecated the default value of the `random_state=0` in + :func:`~sklearn.utils.extmath.randomized_svd`. Starting in 1.2, + the default value of `random_state` will be set to `None`. + :pr:`19459` by :user:`Cindy Bezuidenhout ` and + :user:`Clifford Akai-Nettey`. + :mod:`sklearn.calibration` ............................ diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 31ac63c42eb69..42a014dcd8ade 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -245,7 +245,7 @@ def randomized_range_finder(A, *, size, n_iter, @_deprecate_positional_args def randomized_svd(M, n_components, *, n_oversamples=10, n_iter='auto', power_iteration_normalizer='auto', transpose='auto', - flip_sign=True, random_state=0): + flip_sign=True, random_state='warn'): """Computes a truncated randomized SVD. Parameters @@ -296,11 +296,17 @@ def randomized_svd(M, n_components, *, n_oversamples=10, n_iter='auto', set to `True`, the sign ambiguity is resolved by making the largest loadings for each component in the left singular vectors positive. - random_state : int, RandomState instance or None, default=0 - The seed of the pseudo random number generator to use when shuffling - the data, i.e. getting the random vectors to initialize the algorithm. - Pass an int for reproducible results across multiple function calls. - See :term:`Glossary `. + random_state : int, RandomState instance or None, default='warn' + The seed of the pseudo random number generator to use when + shuffling the data, i.e. getting the random vectors to initialize + the algorithm. Pass an int for reproducible results across multiple + function calls. 
See :term:`Glossary `. + + .. versionchanged:: 1.2 + The previous behavior (`random_state=0`) is deprecated, and + from v1.2 the default value will be `random_state=None`. Set + the value of `random_state` explicitly to suppress the deprecation + warning. Notes ----- @@ -326,10 +332,22 @@ def randomized_svd(M, n_components, *, n_oversamples=10, n_iter='auto', """ if isinstance(M, (sparse.lil_matrix, sparse.dok_matrix)): warnings.warn("Calculating SVD of a {} is expensive. " - "csr_matrix is more efficient.".format( - type(M).__name__), + "csr_matrix is more efficient.".format(type(M).__name__), sparse.SparseEfficiencyWarning) + if random_state == 'warn': + warnings.warn( + "If 'random_state' is not supplied, the current default " + "is to use 0 as a fixed seed. This will change to " + "None in version 1.2 leading to non-deterministic results " + "that better reflect nature of the randomized_svd solver. " + "If you want to silence this warning, set 'random_state' " + "to an integer seed or to None explicitly depending " + "if you want your code to be deterministic or not.", + FutureWarning + ) + random_state = 0 + random_state = check_random_state(random_state) n_random = n_components + n_oversamples n_samples, n_features = M.shape diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index cee4870b087c2..8e53d94d911f0 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -249,7 +249,8 @@ def test_randomized_svd_infinite_rank(): # compute the singular values of X using the fast approximate method # without the iterated power method _, sa, _ = randomized_svd(X, k, n_iter=0, - power_iteration_normalizer=normalizer) + power_iteration_normalizer=normalizer, + random_state=0) # the approximation does not tolerate the noise: assert np.abs(s[:k] - sa).max() > 0.1 @@ -257,7 +258,8 @@ def test_randomized_svd_infinite_rank(): # compute the singular values of X using the fast approximate method # with iterated power method _, sap, _ = randomized_svd(X, k, n_iter=5, - power_iteration_normalizer=normalizer) + power_iteration_normalizer=normalizer, + random_state=0) # the iterated power method is still managing to get most of the # structure at the requested rank @@ -307,11 +309,13 @@ def test_randomized_svd_power_iteration_normalizer(): # Check that it diverges with many (non-normalized) power iterations U, s, Vt = randomized_svd(X, n_components, n_iter=2, - power_iteration_normalizer='none') + power_iteration_normalizer='none', + random_state=0) A = X - U.dot(np.diag(s).dot(Vt)) error_2 = linalg.norm(A, ord='fro') U, s, Vt = randomized_svd(X, n_components, n_iter=20, - power_iteration_normalizer='none') + power_iteration_normalizer='none', + random_state=0) A = X - U.dot(np.diag(s).dot(Vt)) error_20 = linalg.norm(A, ord='fro') assert np.abs(error_2 - error_20) > 100 @@ -401,14 +405,15 @@ def max_loading_is_positive(u, v): mat = np.arange(10 * 8).reshape(10, -1) # Without transpose - u_flipped, _, v_flipped = randomized_svd(mat, 3, flip_sign=True) + u_flipped, _, v_flipped = randomized_svd(mat, 3, flip_sign=True, + random_state=0) u_based, v_based = max_loading_is_positive(u_flipped, v_flipped) assert u_based assert not v_based # With transpose u_flipped_with_transpose, _, v_flipped_with_transpose = randomized_svd( - mat, 3, flip_sign=True, transpose=True) + mat, 3, flip_sign=True, transpose=True, random_state=0) u_based, v_based = max_loading_is_positive( u_flipped_with_transpose, v_flipped_with_transpose) assert u_based From 
ac6dea5b7ebcd7a6b8d8b0d499d9c57d6a7d8939 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vil=C3=A9m=20Zouhar?= Date: Mon, 15 Mar 2021 12:25:20 +0100 Subject: [PATCH 243/478] MNT Fix error message for Minkowski metric parameter (#19671) Co-authored-by: Thomas J. Fan --- sklearn/neighbors/_base.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 820b83eca1845..eb14e8ef0a900 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -355,7 +355,8 @@ def _check_algorithm_metric(self): effective_p = self.p if self.metric in ['wminkowski', 'minkowski'] and effective_p < 1: - raise ValueError("p must be greater than one for minkowski metric") + raise ValueError("p must be greater or equal to one for " + "minkowski metric") def _fit(self, X, y=None): if self._get_tags()["requires_y"]: @@ -411,8 +412,8 @@ def _fit(self, X, y=None): if self.metric == 'minkowski': p = self.effective_metric_params_.pop('p', 2) if p < 1: - raise ValueError("p must be greater than one " - "for minkowski metric") + raise ValueError("p must be greater or equal to one for " + "minkowski metric") elif p == 1: self.effective_metric_ = 'manhattan' elif p == 2: From 302106bcac4476ecdd76b8c03fddb454edbcad96 Mon Sep 17 00:00:00 2001 From: LSturtew <56136443+LSturtew@users.noreply.github.com> Date: Mon, 15 Mar 2021 20:00:14 +0100 Subject: [PATCH 244/478] FIX RuntimeWarning by dividing by zero in test_iforest_with_uniform_data (#19622) --- sklearn/ensemble/_iforest.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 588b1bbef299c..9c3f547f23459 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -450,11 +450,14 @@ def _compute_score_samples(self, X, subsample_features): + _average_path_length(n_samples_leaf) - 1.0 ) - + denominator = ( + len(self.estimators_) * _average_path_length([self.max_samples_]) + ) scores = 2 ** ( - -depths - / (len(self.estimators_) - * _average_path_length([self.max_samples_])) + # For a single training sample, denominator and depth are 0. + # Therefore, we set the score manually to 1. + -np.divide(depths, denominator, out=np.ones_like(depths), + where=denominator != 0) ) return scores From d996eaf088eda47a57aa64ae457d37b8fdfb499e Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 16 Mar 2021 10:14:25 -0400 Subject: [PATCH 245/478] ENH Adds _num_features for array-likes (#19633) Co-authored-by: Olivier Grisel Co-authored-by: Christian Lorentzen --- sklearn/base.py | 14 ++++++- sklearn/tests/test_base.py | 26 ++++++++++++ sklearn/utils/tests/test_validation.py | 57 ++++++++++++++++++++++++++ sklearn/utils/validation.py | 57 ++++++++++++++++++++++++++ 4 files changed, 153 insertions(+), 1 deletion(-) diff --git a/sklearn/base.py b/sklearn/base.py index 3626e931aa9cf..ec264b0cf5edc 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -21,6 +21,7 @@ ) from .utils.validation import check_X_y from .utils.validation import check_array +from .utils.validation import _num_features from .utils._estimator_html_repr import estimator_html_repr from .utils.validation import _deprecate_positional_args @@ -349,7 +350,18 @@ def _check_n_features(self, X, reset): call to `partial_fit`. All other methods that validate `X` should set `reset=False`. 
""" - n_features = X.shape[1] + try: + n_features = _num_features(X) + except TypeError as e: + if not reset and hasattr(self, "n_features_in_"): + raise ValueError( + "X does not contain any features, but " + f"{self.__class__.__name__} is expecting " + f"{self.n_features_in_} features" + ) from e + # If the number of features is not defined and reset=True, + # then we skip this check + return if reset: self.n_features_in_ = n_features diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 666df1499d7dc..c91419bf10a0e 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -581,3 +581,29 @@ class TruePairwise(BaseEstimator): with pytest.warns(None) as record: assert not _is_pairwise(est) assert not record + + +def test_n_features_in_validation(): + """Check that `_check_n_features` validates data when reset=False""" + est = MyEstimator() + X_train = [[1, 2, 3], [4, 5, 6]] + est._check_n_features(X_train, reset=True) + + assert est.n_features_in_ == 3 + + msg = ("X does not contain any features, but MyEstimator is expecting " + "3 features") + with pytest.raises(ValueError, match=msg): + est._check_n_features("invalid X", reset=False) + + +def test_n_features_in_no_validation(): + """Check that `_check_n_features` does not validate data when + n_features_in_ is not defined.""" + est = MyEstimator() + est._check_n_features("invalid X", reset=True) + + assert not hasattr(est, "n_features_in_") + + # does not raise + est._check_n_features("invalid X", reset=False) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index f05bd4656cbd9..f3db51e694b52 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -2,6 +2,7 @@ import warnings import os +import re from tempfile import NamedTemporaryFile from itertools import product @@ -18,6 +19,7 @@ from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_allclose_dense_sparse from sklearn.utils._testing import assert_allclose +from sklearn.utils._testing import _convert_container from sklearn.utils import as_float_array, check_array, check_symmetric from sklearn.utils import check_X_y from sklearn.utils import deprecated @@ -44,6 +46,7 @@ _deprecate_positional_args, _check_sample_weight, _allclose_dense_sparse, + _num_features, FLOAT_DTYPES) from sklearn.utils.validation import _check_fit_params from sklearn.utils.fixes import parse_version @@ -1324,3 +1327,57 @@ def test_check_pandas_sparse_valid(ntype1, ntype2, expected_subtype): dtype=ntype2)}) arr = check_array(df, accept_sparse=['csr', 'csc']) assert np.issubdtype(arr.dtype, expected_subtype) + + +@pytest.mark.parametrize("constructor_name", [ + "list", "tuple", "array", "dataframe", "sparse_csr", "sparse_csc" +]) +def test_num_features(constructor_name): + """Check _num_features for array-likes.""" + X = [[1, 2, 3], [4, 5, 6]] + X = _convert_container(X, constructor_name) + assert _num_features(X) == 3 + + +@pytest.mark.parametrize( + "X", + [ + [1, 2, 3], + ["a", "b", "c"], + [False, True, False], + [1.0, 3.4, 4.0] + ], + ids=["int", "str", "bool", "float"] +) +@pytest.mark.parametrize("constructor_name", [ + "list", "tuple", "array", "series" +]) +def test_num_features_errors_1d_containers(X, constructor_name): + X = _convert_container(X, constructor_name) + if constructor_name == "array": + expected_type_name = "numpy.ndarray" + elif constructor_name == "series": + expected_type_name = "pandas.core.series.Series" + else: + 
expected_type_name = constructor_name + message = ( + "Unable to find the number of features from X of type " + f"{expected_type_name}" + ) + if hasattr(X, "shape"): + message += " with shape (3,)" + elif isinstance(X[0], str): + message += " where the samples are of type str" + with pytest.raises(TypeError, match=re.escape(message)): + _num_features(X) + + +@pytest.mark.parametrize("X", [1, 'b', False, 3.0], + ids=["int", "str", "bool", "float"]) +def test_num_features_errors_scalars(X): + msg = ( + "Unable to find the number of features from X of type " + f"{type(X).__qualname__}" + ) + with pytest.raises(TypeError, match=msg): + _num_features(X) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 273a0cb2ab04c..d0f410dd7f5d8 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -184,6 +184,63 @@ def _is_arraylike(x): hasattr(x, '__array__')) +def _num_features(X): + """Return the number of features in an array-like X. + + This helper function tries hard to avoid to materialize an array version + of X unless necessary. For instance, if X is a list of lists, + this function will return the length of the first element, assuming + that subsequent elements are all lists of the same length without + checking. + Parameters + ---------- + X : array-like + array-like to get the number of features. + + Returns + ------- + features : int + Number of features + """ + type_ = type(X) + if type_.__module__ == "builtins": + type_name = type_.__qualname__ + else: + type_name = f"{type_.__module__}.{type_.__qualname__}" + message = ( + "Unable to find the number of features from X of type " + f"{type_name}" + ) + if not hasattr(X, '__len__') and not hasattr(X, 'shape'): + if not hasattr(X, '__array__'): + raise TypeError(message) + # Only convert X to a numpy array if there is no cheaper, heuristic + # option. + X = np.asarray(X) + + if hasattr(X, 'shape'): + if not hasattr(X.shape, '__len__') or len(X.shape) <= 1: + message += f" with shape {X.shape}" + raise TypeError(message) + return X.shape[1] + + first_sample = X[0] + + # Do not consider an array-like of strings to be a 2D array + if isinstance(first_sample, (str, bytes)): + message += (f" where the samples are of type " + f"{type(first_sample).__qualname__}") + raise TypeError(message) + + try: + # If X is a list of lists, for instance, we assume that all nested + # lists have the same length without checking or converting to + # a numpy array to keep this function call as cheap as possible. 
+ return len(first_sample) + except Exception as err: + raise TypeError(message) from err + + def _num_samples(x): """Return number of samples in array-like x.""" message = 'Expected sequence or array-like, got %s' % type(x) From edc4f15f0d46b4d26c107894b80548474f25931b Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Tue, 16 Mar 2021 20:21:22 +0000 Subject: [PATCH 246/478] TST Change assert from sklearn to pytest style in tests/test_pipeline.py (#19678) --- sklearn/tests/test_pipeline.py | 215 +++++++++++++++++++-------------- 1 file changed, 124 insertions(+), 91 deletions(-) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 7989394d0a65e..85d2f7b6e07ca 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -14,13 +14,9 @@ from sklearn.utils.fixes import parse_version from sklearn.utils._testing import ( - assert_raises, - assert_raises_regex, - assert_raise_message, assert_allclose, assert_array_equal, assert_array_almost_equal, - assert_no_warnings, MinimalClassifier, MinimalRegressor, MinimalTransformer, @@ -167,20 +163,23 @@ def predict(self, X, got_attribute=False): def test_pipeline_init(): # Test the various init parameters of the pipeline. - assert_raises(TypeError, Pipeline) + with pytest.raises(TypeError): + Pipeline() + # Check that we can't instantiate pipelines with objects without fit # method - assert_raises_regex(TypeError, - 'Last step of Pipeline should implement fit ' - 'or be the string \'passthrough\'' - '.*NoFit.*', - Pipeline, [('clf', NoFit())]) + msg = ('Last step of Pipeline should implement fit ' + 'or be the string \'passthrough\'' + '.*NoFit.*') + with pytest.raises(TypeError, match=msg): + Pipeline([('clf', NoFit())]) + # Smoke test with only an estimator clf = NoTrans() pipe = Pipeline([('svc', clf)]) assert (pipe.get_params(deep=True) == - dict(svc__a=None, svc__b=None, svc=clf, - **pipe.get_params(deep=False))) + dict(svc__a=None, svc__b=None, svc=clf, + **pipe.get_params(deep=False))) # Check that params are set pipe.set_params(svc__a=0.1) @@ -200,10 +199,9 @@ def test_pipeline_init(): # Check that we can't instantiate with non-transformers on the way # Note that NoTrans implements fit, but not transform - assert_raises_regex(TypeError, - 'All intermediate steps should be transformers' - '.*\\bNoTrans\\b.*', - Pipeline, [('t', NoTrans()), ('svc', clf)]) + msg = 'All intermediate steps should be transformers.*\\bNoTrans\\b.*' + with pytest.raises(TypeError, match=msg): + Pipeline([('t', NoTrans()), ('svc', clf)]) # Check that params are set pipe.set_params(svc__C=0.1) @@ -212,10 +210,13 @@ def test_pipeline_init(): repr(pipe) # Check that params are not set when naming them wrong - assert_raises(ValueError, pipe.set_params, anova__C=0.1) + msg = 'Invalid parameter C for estimator SelectKBest' + with pytest.raises(ValueError, match=msg): + pipe.set_params(anova__C=0.1) # Test clone - pipe2 = assert_no_warnings(clone, pipe) + with pytest.warns(None): + pipe2 = clone(pipe) assert not pipe.named_steps['svc'] is pipe2.named_steps['svc'] # Check that apart from estimators, the parameters are the same @@ -273,11 +274,10 @@ def test_pipeline_fit_params(): assert pipe.named_steps['transf'].a is None assert pipe.named_steps['transf'].b is None # invalid parameters should raise an error message - assert_raise_message( - TypeError, - "fit() got an unexpected keyword argument 'bad'", - pipe.fit, None, None, clf__bad=True - ) + + msg = re.escape("fit() got an unexpected keyword argument 'bad'") + with 
pytest.raises(TypeError, match=msg): + pipe.fit(None, None, clf__bad=True) def test_pipeline_sample_weight_supported(): @@ -298,11 +298,12 @@ def test_pipeline_sample_weight_unsupported(): pipe.fit(X, y=None) assert pipe.score(X) == 3 assert pipe.score(X, sample_weight=None) == 3 - assert_raise_message( - TypeError, - "score() got an unexpected keyword argument 'sample_weight'", - pipe.score, X, sample_weight=np.array([2, 3]) + + msg = re.escape( + "score() got an unexpected keyword argument 'sample_weight'" ) + with pytest.raises(TypeError, match=msg): + pipe.score(X, sample_weight=np.array([2, 3])) def test_pipeline_raise_set_params_error(): @@ -310,20 +311,18 @@ def test_pipeline_raise_set_params_error(): pipe = Pipeline([('cls', LinearRegression())]) # expected error message - error_msg = ('Invalid parameter %s for estimator %s. ' - 'Check the list of available parameters ' - 'with `estimator.get_params().keys()`.') + error_msg = re.escape( + f"Invalid parameter fake for estimator {pipe}. " + 'Check the list of available parameters ' + 'with `estimator.get_params().keys()`.' + ) - assert_raise_message(ValueError, - error_msg % ('fake', pipe), - pipe.set_params, - fake='nope') + with pytest.raises(ValueError, match=error_msg): + pipe.set_params(fake='nope') # nested model check - assert_raise_message(ValueError, - error_msg % ("fake", pipe), - pipe.set_params, - fake__estimator='nope') + with pytest.raises(ValueError, match=error_msg): + pipe.set_params(fake__estimator='nope') def test_pipeline_methods_pca_svm(): @@ -431,9 +430,10 @@ def test_fit_predict_on_pipeline_without_fit_predict(): scaler = StandardScaler() pca = PCA(svd_solver='full') pipe = Pipeline([('scaler', scaler), ('pca', pca)]) - assert_raises_regex(AttributeError, - "'PCA' object has no attribute 'fit_predict'", - getattr, pipe, 'fit_predict') + + msg = "'PCA' object has no attribute 'fit_predict'" + with pytest.raises(AttributeError, match=msg): + getattr(pipe, 'fit_predict') def test_fit_predict_with_intermediate_fit_params(): @@ -484,7 +484,8 @@ def test_feature_union(): assert_array_almost_equal(X_transformed, X_sp_transformed.toarray()) # Test clone - fs2 = assert_no_warnings(clone, fs) + with pytest.warns(None): + fs2 = clone(fs) assert fs.transformer_list[0][1] is not fs2.transformer_list[0][1] # test setting parameters @@ -497,11 +498,9 @@ def test_feature_union(): assert X_transformed.shape == (X.shape[0], 8) # test error if some elements do not support transform - assert_raises_regex(TypeError, - 'All estimators should implement fit and ' - 'transform.*\\bNoTrans\\b', - FeatureUnion, - [("transform", Transf()), ("no_transform", NoTrans())]) + msg = 'All estimators should implement fit and transform.*\\bNoTrans\\b' + with pytest.raises(TypeError, match=msg): + FeatureUnion([("transform", Transf()), ("no_transform", NoTrans())]) # test that init accepts tuples fs = FeatureUnion((("svd", svd), ("select", select))) @@ -523,13 +522,13 @@ def test_make_union_kwargs(): fu = make_union(pca, mock, n_jobs=3) assert fu.transformer_list == make_union(pca, mock).transformer_list assert 3 == fu.n_jobs + # invalid keyword parameters should raise an error message - assert_raise_message( - TypeError, - "make_union() got an unexpected " - "keyword argument 'transformer_weights'", - make_union, pca, mock, transformer_weights={'pca': 10, 'Transf': 1} + msg = re.escape( + "make_union() got an unexpected keyword argument 'transformer_weights'" ) + with pytest.raises(TypeError, match=msg): + make_union(pca, mock, 
transformer_weights={'pca': 10, 'Transf': 1}) def test_pipeline_transform(): @@ -600,8 +599,14 @@ def test_pipeline_index(): assert pipe['transf'] == transf assert pipe[-1] == clf assert pipe['clf'] == clf - assert_raises(IndexError, lambda: pipe[3]) - assert_raises(KeyError, lambda: pipe['foobar']) + + # should raise an error if slicing out of range + with pytest.raises(IndexError): + pipe[3] + + # should raise an error if indexing with wrong element name + with pytest.raises(KeyError): + pipe['foobar'] def test_set_pipeline_steps(): @@ -626,8 +631,15 @@ def test_set_pipeline_steps(): # With invalid data pipeline.set_params(steps=[('junk', ())]) - assert_raises(TypeError, pipeline.fit, [[1]], [1]) - assert_raises(TypeError, pipeline.fit_transform, [[1]], [1]) + msg = re.escape( + "Last step of Pipeline should implement fit or be the " + "string 'passthrough'." + ) + with pytest.raises(TypeError, match=msg): + pipeline.fit([[1]], [1]) + + with pytest.raises(TypeError, match=msg): + pipeline.fit_transform([[1]], [1]) def test_pipeline_named_steps(): @@ -692,15 +704,15 @@ def make(): assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) assert (pipeline.get_params(deep=True) == - {'steps': pipeline.steps, - 'm2': mult2, - 'm3': passthrough, - 'last': mult5, - 'memory': None, - 'm2__mult': 2, - 'last__mult': 5, - 'verbose': False - }) + {'steps': pipeline.steps, + 'm2': mult2, + 'm3': passthrough, + 'last': mult5, + 'memory': None, + 'm2__mult': 2, + 'last__mult': 5, + 'verbose': False + }) pipeline.set_params(m2=passthrough) exp = 5 @@ -727,9 +739,10 @@ def make(): assert_array_equal([[exp]], pipeline.fit(X, y).transform(X)) assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) - assert_raise_message(AttributeError, - "'str' object has no attribute 'predict'", - getattr, pipeline, 'predict') + + msg = "'str' object has no attribute 'predict'" + with pytest.raises(AttributeError, match=msg): + getattr(pipeline, 'predict') # Check 'passthrough' step at construction time exp = 2 * 5 @@ -872,9 +885,12 @@ def test_feature_union_feature_names(): assert len(feature_names) == 35 ft = FeatureUnion([("tr1", Transf())]).fit([[1]]) - assert_raise_message(AttributeError, - 'Transformer tr1 (type Transf) does not provide ' - 'get_feature_names', ft.get_feature_names) + + msg = re.escape( + 'Transformer tr1 (type Transf) does not provide get_feature_names' + ) + with pytest.raises(AttributeError, match=msg): + ft.get_feature_names() def test_classes_property(): @@ -883,10 +899,12 @@ def test_classes_property(): reg = make_pipeline(SelectKBest(k=1), LinearRegression()) reg.fit(X, y) - assert_raises(AttributeError, getattr, reg, "classes_") + with pytest.raises(AttributeError): + getattr(reg, 'classes_') clf = make_pipeline(SelectKBest(k=1), LogisticRegression(random_state=0)) - assert_raises(AttributeError, getattr, clf, "classes_") + with pytest.raises(AttributeError): + getattr(clf, 'classes_') clf.fit(X, y) assert_array_equal(clf.classes_, np.unique(y)) @@ -961,6 +979,11 @@ def test_set_feature_union_step_drop(): def test_step_name_validation(): + error_message_1 = r"Estimator names must not contain __: got \['a__q'\]" + error_message_2 = r"Names provided are not unique: \['a', 'a'\]" + error_message_3 = ( + r"Estimator names conflict with constructor arguments: \['%s'\]" + ) bad_steps1 = [('a__q', Mult(2)), ('b', Mult(3))] bad_steps2 = [('a', Mult(2)), ('a', Mult(3))] for cls, param 
in [(Pipeline, 'steps'), @@ -968,29 +991,32 @@ def test_step_name_validation(): # we validate in construction (despite scikit-learn convention) bad_steps3 = [('a', Mult(2)), (param, Mult(3))] for bad_steps, message in [ - (bad_steps1, "Estimator names must not contain __: got ['a__q']"), - (bad_steps2, "Names provided are not unique: ['a', 'a']"), - (bad_steps3, "Estimator names conflict with constructor " - "arguments: ['%s']" % param), + (bad_steps1, error_message_1), + (bad_steps2, error_message_2), + (bad_steps3, error_message_3 % param), ]: # three ways to make invalid: # - construction - assert_raise_message(ValueError, message, cls, - **{param: bad_steps}) + with pytest.raises(ValueError, match=message): + cls(**{param: bad_steps}) # - setattr est = cls(**{param: [('a', Mult(1))]}) setattr(est, param, bad_steps) - assert_raise_message(ValueError, message, est.fit, [[1]], [1]) - assert_raise_message(ValueError, message, est.fit_transform, - [[1]], [1]) + with pytest.raises(ValueError, match=message): + est.fit([[1]], [1]) + + with pytest.raises(ValueError, match=message): + est.fit_transform([[1]], [1]) # - set_params est = cls(**{param: [('a', Mult(1))]}) est.set_params(**{param: bad_steps}) - assert_raise_message(ValueError, message, est.fit, [[1]], [1]) - assert_raise_message(ValueError, message, est.fit_transform, - [[1]], [1]) + with pytest.raises(ValueError, match=message): + est.fit([[1]], [1]) + + with pytest.raises(ValueError, match=message): + est.fit_transform([[1]], [1]) def test_set_params_nested_pipeline(): @@ -1012,9 +1038,13 @@ def test_pipeline_wrong_memory(): memory = 1 cached_pipe = Pipeline([('transf', DummyTransf()), ('svc', SVC())], memory=memory) - assert_raises_regex(ValueError, "'memory' should be None, a string or" - " have the same interface as joblib.Memory." - " Got memory='1' instead.", cached_pipe.fit, X, y) + + msg = re.escape( + "'memory' should be None, a string or have the same interface " + "as joblib.Memory. Got memory='1' instead." + ) + with pytest.raises(ValueError, match=msg): + cached_pipe.fit(X, y) class DummyMemory: @@ -1034,9 +1064,12 @@ def test_pipeline_with_cache_attribute(): dummy = WrongDummyMemory() pipe = Pipeline([('transf', Transf()), ('clf', Mult())], memory=dummy) - assert_raises_regex(ValueError, "'memory' should be None, a string or" - " have the same interface as joblib.Memory." - " Got memory='{}' instead.".format(dummy), pipe.fit, X) + msg = re.escape( + "'memory' should be None, a string or have the same interface " + f"as joblib.Memory. Got memory='{dummy}' instead." 
+ ) + with pytest.raises(ValueError, match=msg): + pipe.fit(X) def test_pipeline_memory(): From b7b510f9dbc87500e79301873852c6247c440a3e Mon Sep 17 00:00:00 2001 From: Mathis Batoul Date: Tue, 16 Mar 2021 22:35:46 +0100 Subject: [PATCH 247/478] FIX RuntimeWarning division by zero in check_classifiers_one_label (#19690) --- sklearn/discriminant_analysis.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index c5c18ac9136d2..2e80f94404175 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -476,8 +476,12 @@ def _solve_svd(self, X, y): # (n_classes) centers _, S, Vt = linalg.svd(X, full_matrices=0) - self.explained_variance_ratio_ = (S**2 / np.sum( - S**2))[:self._max_components] + if self._max_components == 0: + self.explained_variance_ratio_ = np.empty((0,), dtype=S.dtype) + else: + self.explained_variance_ratio_ = (S**2 / np.sum( + S**2))[:self._max_components] + rank = np.sum(S > self.tol * S[0]) self.scalings_ = np.dot(scalings, Vt.T[:, :rank]) coef = np.dot(self.means_ - self.xbar_, self.scalings_) From fcf4740b4538657997b0f4b8015728d64e2d563e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 17 Mar 2021 02:58:43 +0100 Subject: [PATCH 248/478] TST Add a test to check the consistency of the Ridge and ElasticNet(l1_ratio=0) solutions (#19620) --- sklearn/linear_model/_cd_fast.pyx | 18 +++++-- .../tests/test_coordinate_descent.py | 50 +++++++++++++++++++ 2 files changed, 64 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_cd_fast.pyx b/sklearn/linear_model/_cd_fast.pyx index 84e4b4a49df01..4841809ac7aa7 100644 --- a/sklearn/linear_model/_cd_fast.pyx +++ b/sklearn/linear_model/_cd_fast.pyx @@ -244,10 +244,20 @@ def enet_coordinate_descent(floating[::1] w, else: # for/else, runs if for doesn't end with a `break` with gil: - warnings.warn("Objective did not converge. You might want to " - "increase the number of iterations. Duality " - "gap: {}, tolerance: {}".format(gap, tol), - ConvergenceWarning) + message = ( + "Objective did not converge. You might want to increase " + "the number of iterations, check the scale of the " + "features or consider increasing regularisation. " + f"Duality gap: {gap:.3e}, tolerance: {tol:.3e}" + ) + if alpha < np.finfo(np.float64).eps: + message += ( + " Linear regression models with null weight for the " + "l1 regularization term are more efficiently fitted " + "using one of the solvers implemented in " + "sklearn.linear_model.Ridge/RidgeCV instead." + ) + warnings.warn(message, ConvergenceWarning) return w, gap, tol, n_iter + 1 diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index ebddb6a7e47c6..d63211d6050bc 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -1419,3 +1419,53 @@ def test_enet_sample_weight_does_not_overwrite_sample_weight(check_input): reg.fit(X, y, sample_weight=sample_weight, check_input=check_input) assert_array_equal(sample_weight, sample_weight_1_25) + + +@pytest.mark.parametrize("ridge_alpha", [1e-1, 1., 1e6]) +@pytest.mark.parametrize("normalize", [True, False]) +def test_enet_ridge_consistency(normalize, ridge_alpha): + # Check that ElasticNet(l1_ratio=0) converges to the same solution as Ridge + # provided that the value of alpha is adapted. 
+ # + # XXX: this test does not pass for weaker regularization (lower values of + # ridge_alpha): it could be either a problem of ElasticNet or Ridge (less + # likely) and depends on the dataset statistics: lower values for + # effective_rank are more problematic in particular. + + rng = np.random.RandomState(42) + X, y = make_regression( + n_samples=100, + n_features=300, + effective_rank=100, + n_informative=50, + random_state=rng, + ) + sw = rng.uniform(low=0.01, high=2, size=X.shape[0]) + + ridge = Ridge( + alpha=ridge_alpha, + normalize=normalize, + ).fit(X, y, sample_weight=sw) + + enet = ElasticNet( + alpha=ridge_alpha / sw.sum(), + normalize=normalize, + l1_ratio=0., + max_iter=1000, + ) + # Even when the ElasticNet model has actually converged, the duality gap + # convergence criterion is never met when l1_ratio is 0 and for any value + # of the `tol` parameter. The convergence message should point the user to + # Ridge instead: + expected_msg = ( + r"Objective did not converge\. .* " + r"Linear regression models with null weight for the " + r"l1 regularization term are more efficiently fitted " + r"using one of the solvers implemented in " + r"sklearn\.linear_model\.Ridge/RidgeCV instead\." + ) + with pytest.warns(ConvergenceWarning, match=expected_msg): + enet.fit(X, y, sample_weight=sw) + + assert_allclose(ridge.coef_, enet.coef_) + assert_allclose(ridge.intercept_, enet.intercept_) From 36e43582c03f5933da15d833b71dc37eaafb436e Mon Sep 17 00:00:00 2001 From: Steve Stagg Date: Wed, 17 Mar 2021 05:21:07 +0000 Subject: [PATCH 249/478] [MRG] Fix documentation for russelrao formula (#19695) --- sklearn/neighbors/_dist_metrics.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_dist_metrics.pyx b/sklearn/neighbors/_dist_metrics.pyx index 8bee948eeaeba..4cc41d7136586 100755 --- a/sklearn/neighbors/_dist_metrics.pyx +++ b/sklearn/neighbors/_dist_metrics.pyx @@ -183,7 +183,7 @@ cdef class DistanceMetric: "dice" DiceDistance NNEQ / (NTT + NNZ) "kulsinski" KulsinskiDistance (NNEQ + N - NTT) / (NNEQ + N) "rogerstanimoto" RogersTanimotoDistance 2 * NNEQ / (N + NNEQ) - "russellrao" RussellRaoDistance NNZ / N + "russellrao" RussellRaoDistance (N - NTT) / N "sokalmichener" SokalMichenerDistance 2 * NNEQ / (N + NNEQ) "sokalsneath" SokalSneathDistance NNEQ / (NNEQ + 0.5 * NTT) ================= ======================= =============================== From 95b7c680ab027fcd23bcbf47ebae58ee3e130ec9 Mon Sep 17 00:00:00 2001 From: mlondschien <61679398+mlondschien@users.noreply.github.com> Date: Wed, 17 Mar 2021 13:46:45 +0100 Subject: [PATCH 250/478] ENH Add periodic extrapolation to SplineTransformer (#19483) Co-authored-by: Olivier Grisel Co-authored-by: Christian Lorentzen --- doc/whats_new/v1.0.rst | 3 + .../plot_polynomial_interpolation.py | 68 ++++++ sklearn/preprocessing/_polynomial.py | 135 +++++++---- .../preprocessing/tests/test_polynomial.py | 209 +++++++++++++++++- 4 files changed, 363 insertions(+), 52 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 89280c7f01d0d..34f39ca48f20a 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -206,6 +206,9 @@ Changelog polynomial ``degree`` of the splines, number of knots ``n_knots`` and knot positioning strategy ``knots``. :pr:`18368` by :user:`Christian Lorentzen `. + :class:`preprocessing.SplineTransformer` also supports periodic + splines via the ``extrapolation`` argument. + :pr:`19483` by :user:`Malte Londschien `. 
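A minimal usage sketch of this new option (the day-of-year feature and its period below are illustrative assumptions)::

    import numpy as np
    from sklearn.preprocessing import SplineTransformer

    # Encode a naturally periodic "day of year" feature with period 365.
    day_of_year = np.arange(365, dtype=float)[:, None]
    spline = SplineTransformer(
        degree=3,
        knots=np.linspace(0, 365, 13)[:, None],  # period = last - first knot
        extrapolation="periodic",
    )
    features = spline.fit_transform(day_of_year)
    print(features.shape)  # (365, 12): n_knots - 1 basis functions per feature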
- |Fix| :func:`preprocessing.scale`, :class:`preprocessing.StandardScaler` and similar scalers detect near-constant features to avoid scaling them to diff --git a/examples/linear_model/plot_polynomial_interpolation.py b/examples/linear_model/plot_polynomial_interpolation.py index cfa684ffd79ca..34972b9522c68 100644 --- a/examples/linear_model/plot_polynomial_interpolation.py +++ b/examples/linear_model/plot_polynomial_interpolation.py @@ -39,6 +39,7 @@ # Author: Mathieu Blondel # Jake Vanderplas # Christian Lorentzen +# Malte Londschien # License: BSD 3 clause import numpy as np @@ -145,3 +146,70 @@ def f(x): # function has local support and is continued as a constant beyond the fitted # range. This extrapolating behaviour could be changed by the argument # ``extrapolation``. + +# %% +# Periodic Splines +# ---------------- +# In the previous example we saw the limitations of polynomials and splines for +# extrapolation beyond the range of the training observations. In some +# settings, e.g. with seasonal effects, we expect a periodic continuation of +# the underlying signal. Such effects can be modelled using periodic splines, +# which have equal function value and equal derivatives at the first and last +# knot. In the following case we show how periodic splines provide a better fit +# both within and outside of the range of training data given the additional +# information of periodicity. The splines period is the distance between +# the first and last knot, which we specify manually. +# +# Periodic splines can also be useful for naturally periodic features (such as +# day of the year), as the smoothness at the boundary knots prevents a jump in +# the transformed values (e.g. from Dec 31st to Jan 1st). For such naturally +# periodic features or more generally features where the period is known, it is +# advised to explicitly pass this information to the `SplineTransformer` by +# setting the knots manually. + + +# %% +def g(x): + """Function to be approximated by periodic spline interpolation.""" + return np.sin(x) - 0.7 * np.cos(x * 3) + + +y_train = g(x_train) + +# Extend the test data into the future: +x_plot_ext = np.linspace(-1, 21, 200) +X_plot_ext = x_plot_ext[:, np.newaxis] + +lw = 2 +fig, ax = plt.subplots() +ax.set_prop_cycle(color=["black", "tomato", "teal"]) +ax.plot(x_plot_ext, g(x_plot_ext), linewidth=lw, label="ground truth") +ax.scatter(x_train, y_train, label="training points") + +for transformer, label in [ + (SplineTransformer(degree=3, n_knots=10), "spline"), + (SplineTransformer( + degree=3, + knots=np.linspace(0, 2 * np.pi, 10)[:, None], + extrapolation="periodic" + ), "periodic spline") +]: + model = make_pipeline(transformer, Ridge(alpha=1e-3)) + model.fit(X_train, y_train) + y_plot_ext = model.predict(X_plot_ext) + ax.plot(x_plot_ext, y_plot_ext, label=label) + +ax.legend() +fig.show() + +# %% We again plot the underlying splines. +fig, ax = plt.subplots() +knots = np.linspace(0, 2 * np.pi, 4) +splt = SplineTransformer( + knots=knots[:, None], + degree=3, + extrapolation="periodic" +).fit(X_train) +ax.plot(x_plot_ext, splt.transform(X_plot_ext)) +ax.legend(ax.lines, [f"spline {n}" for n in range(3)]) +plt.show() diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 26587e7f05823..ad358e50c4681 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -19,13 +19,13 @@ # TODO: # - sparse support (either scipy or own cython solution)? 
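A quick check of the periodicity described in the example above, reusing its knot values (this snippet is only an illustration): shifting the input by one period leaves the transformed features unchanged.

    import numpy as np
    from sklearn.preprocessing import SplineTransformer

    X = np.linspace(0.0, 1.0, 20)[:, None]
    period = 2 * np.pi  # distance between the first and last knot below
    spline = SplineTransformer(
        degree=3,
        knots=np.linspace(0, 2 * np.pi, 10)[:, None],
        extrapolation="periodic",
    ).fit(X)
    print(np.allclose(spline.transform(X), spline.transform(X + period)))  # True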
-# - extrapolation (cyclic) class SplineTransformer(TransformerMixin, BaseEstimator): """Generate univariate B-spline bases for features. Generate a new feature matrix consisting of - `n_splines=n_knots + degree - 1` spline basis functions (B-splines) of - polynomial order=`degree` for each feature. + `n_splines=n_knots + degree - 1` (`n_knots - 1` for + `extrapolation="periodic"`) spline basis functions + (B-splines) of polynomial order=`degree` for each feature. Read more in the :ref:`User Guide `. @@ -54,14 +54,21 @@ class SplineTransformer(TransformerMixin, BaseEstimator): `degree` number of knots are added before the first knot, the same after the last knot. - extrapolation : {'error', 'constant', 'linear', 'continue'}, \ + extrapolation : {'error', 'constant', 'linear', 'continue', 'periodic'}, \ default='constant' If 'error', values outside the min and max values of the training features raises a `ValueError`. If 'constant', the value of the splines at minimum and maximum value of the features is used as constant extrapolation. If 'linear', a linear extrapolation is used. If 'continue', the splines are extrapolated as is, i.e. option - `extrapolate=True` in :class:`scipy.interpolate.BSpline`. + `extrapolate=True` in :class:`scipy.interpolate.BSpline`. If + 'periodic', periodic splines with a periodicity equal to the distance + between the first and last knot are used. Periodic splines enforce + equal function values and derivatives at the first and last knot. + For example, this makes it possible to avoid introducing an arbitrary + jump between Dec 31st and Jan 1st in spline features derived from a + naturally periodic "day-of-year" input feature. In this case it is + recommended to manually set the knot values to control the period. include_bias : bool, default=True If True (default), then the last spline element inside the data range @@ -84,7 +91,9 @@ class SplineTransformer(TransformerMixin, BaseEstimator): n_features_out_ : int The total number of output features, which is computed as `n_features * n_splines`, where `n_splines` is - the number of bases elements of the B-splines, `n_knots + degree - 1`. + the number of bases elements of the B-splines, + `n_knots + degree - 1` for non-periodic splines and + `n_knots - 1` for periodic ones. If `include_bias=False`, then it is only `n_features * (n_splines - 1)`. @@ -235,7 +244,7 @@ def fit(self, X, y=None): X, n_knots=self.n_knots, knots=self.knots ) else: - base_knots = check_array(self.knots) + base_knots = check_array(self.knots, dtype=np.float64) if base_knots.shape[0] < 2: raise ValueError( "Number of knots, knots.shape[0], must be >= " "2." @@ -250,10 +259,11 @@ def fit(self, X, y=None): "constant", "linear", "continue", + "periodic", ): raise ValueError( "extrapolation must be one of 'error', " - "'constant', 'linear' or 'continue'." + "'constant', 'linear', 'continue' or 'periodic'." ) if not isinstance(self.include_bias, (bool, np.bool_)): @@ -261,44 +271,74 @@ def fit(self, X, y=None): # number of knots for base interval n_knots = base_knots.shape[0] + + if self.extrapolation == "periodic" and n_knots <= self.degree: + raise ValueError( + "Periodic splines require degree < n_knots. Got n_knots=" + f"{n_knots} and degree={self.degree}." 
+ ) + # number of splines basis functions - n_splines = n_knots + self.degree - 1 + if self.extrapolation != "periodic": + n_splines = n_knots + self.degree - 1 + else: + # periodic splines have self.degree less degrees of freedom + n_splines = n_knots - 1 + degree = self.degree n_out = n_features * n_splines # We have to add degree number of knots below, and degree number knots # above the base knots in order to make the spline basis complete. - # Eilers & Marx in "Flexible smoothing with B-splines and penalties" - # https://doi.org/10.1214/ss/1038425655 advice against repeating first - # and last knot several times, which would have inferior behaviour at - # boundaries if combined with a penalty (hence P-Spline). We follow - # this advice even if our splines are unpenalized. - # Meaning we do not: - # knots = np.r_[np.tile(base_knots.min(axis=0), reps=[degree, 1]), - # base_knots, - # np.tile(base_knots.max(axis=0), reps=[degree, 1]) - # ] - # Instead, we reuse the distance of the 2 fist/last knots. - dist_min = base_knots[1] - base_knots[0] - dist_max = base_knots[-1] - base_knots[-2] - knots = np.r_[ - linspace( - base_knots[0] - degree * dist_min, - base_knots[0] - dist_min, - num=degree, - ), - base_knots, - linspace( - base_knots[-1] + dist_max, - base_knots[-1] + degree * dist_max, - num=degree, - ), - ] + if self.extrapolation == "periodic": + # For periodic splines the spacing of the first / last degree knots + # needs to be a continuation of the spacing of the last / first + # base knots. + period = base_knots[-1] - base_knots[0] + knots = np.r_[ + base_knots[-(degree + 1): -1] - period, + base_knots, + base_knots[1: (degree + 1)] + period + ] + + else: + # Eilers & Marx in "Flexible smoothing with B-splines and + # penalties" https://doi.org/10.1214/ss/1038425655 advice + # against repeating first and last knot several times, which + # would have inferior behaviour at boundaries if combined with + # a penalty (hence P-Spline). We follow this advice even if our + # splines are unpenalized. Meaning we do not: + # knots = np.r_[ + # np.tile(base_knots.min(axis=0), reps=[degree, 1]), + # base_knots, + # np.tile(base_knots.max(axis=0), reps=[degree, 1]) + # ] + # Instead, we reuse the distance of the 2 fist/last knots. + dist_min = base_knots[1] - base_knots[0] + dist_max = base_knots[-1] - base_knots[-2] + + knots = np.r_[ + linspace( + base_knots[0] - degree * dist_min, + base_knots[0] - dist_min, + num=degree, + ), + base_knots, + linspace( + base_knots[-1] + dist_max, + base_knots[-1] + degree * dist_max, + num=degree, + ), + ] # With a diagonal coefficient matrix, we get back the spline basis # elements, i.e. the design matrix of the spline. # Note, BSpline appreciates C-contiguous float64 arrays as c=coef. 
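A small numeric illustration of the periodic knot augmentation in the branch above, using the knot values that also appear in the new tests: the spacings before the first and after the last base knot repeat the spacings at the opposite end of the period.

    import numpy as np

    base_knots = np.array([0.0, 1.0, 3.0, 4.0, 5.0, 8.0])
    degree = 3
    period = base_knots[-1] - base_knots[0]  # 8.0
    knots = np.r_[
        base_knots[-(degree + 1):-1] - period,  # [-5., -4., -3.]
        base_knots,
        base_knots[1:degree + 1] + period,      # [ 9., 11., 12.]
    ]
    print(knots)  # [-5. -4. -3.  0.  1.  3.  4.  5.  8.  9. 11. 12.]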
- coef = np.eye(n_knots + self.degree - 1, dtype=np.float64) - extrapolate = self.extrapolation == "continue" + coef = np.eye(n_splines, dtype=np.float64) + if self.extrapolation == "periodic": + coef = np.concatenate((coef, coef[:degree, :])) + + extrapolate = self.extrapolation in ["periodic", "continue"] + bsplines = [ BSpline.construct_fast( knots[:, i], coef, self.degree, extrapolate=extrapolate @@ -331,7 +371,7 @@ def transform(self, X): ) n_samples, n_features = X.shape - n_splines = self.bsplines_[0].c.shape[0] + n_splines = self.bsplines_[0].c.shape[1] degree = self.degree # Note that scipy BSpline returns float64 arrays and converts input @@ -346,8 +386,23 @@ def transform(self, X): for i in range(n_features): spl = self.bsplines_[i] - if self.extrapolation in ("continue", "error"): - XBS[:, (i * n_splines):((i + 1) * n_splines)] = spl(X[:, i]) + if self.extrapolation in ("continue", "error", "periodic"): + + if self.extrapolation == "periodic": + # With periodic extrapolation we map x to the segment + # [spl.t[k], spl.t[n]]. + # This is equivalent to BSpline(.., extrapolate="periodic") + # for scipy>=1.0.0. + n = spl.t.size - spl.k - 1 + # Assign to new array to avoid inplace operation + x = spl.t[spl.k] + (X[:, i] - spl.t[spl.k]) % ( + spl.t[n] - spl.t[spl.k] + ) + else: + x = X[:, i] + + XBS[:, (i * n_splines):((i + 1) * n_splines)] = spl(x) + else: xmin = spl.t[degree] xmax = spl.t[-degree - 1] diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py index 2ca3260f7c05e..b1908bf9fe12a 100644 --- a/sklearn/preprocessing/tests/test_polynomial.py +++ b/sklearn/preprocessing/tests/test_polynomial.py @@ -1,10 +1,13 @@ import numpy as np -from numpy.testing import assert_allclose, assert_array_equal import pytest - +from numpy.testing import assert_allclose, assert_array_equal +from scipy.interpolate import BSpline from sklearn.linear_model import LinearRegression from sklearn.pipeline import Pipeline from sklearn.preprocessing import KBinsDiscretizer, SplineTransformer +from sklearn.utils.fixes import linspace, sp_version + +from pkg_resources import parse_version # TODO: add PolynomialFeatures if it moves to _polynomial.py @@ -31,7 +34,7 @@ def is_c_contiguous(a): ({"n_knots": 1}, "n_knots must be a positive integer >= 2."), ({"n_knots": 2.5}, "n_knots must be a positive integer >= 2."), ({"n_knots": "string"}, "n_knots must be a positive integer >= 2."), - ({"knots": "string"}, "Expected 2D array, got scalar array instead:"), + ({"knots": 1}, "Expected 2D array, got scalar array instead:"), ({"knots": [1, 2]}, "Expected 2D array, got 1D array instead:"), ( {"knots": [[1]]}, @@ -48,22 +51,32 @@ def is_c_contiguous(a): ({"knots": [[2], [1]]}, "knots must be sorted without duplicates."), ( {"extrapolation": None}, - "extrapolation must be one of 'error', 'constant', 'linear' or " - "'continue'.", + "extrapolation must be one of 'error', 'constant', 'linear', " + "'continue' or 'periodic'.", ), ( {"extrapolation": 1}, - "extrapolation must be one of 'error', 'constant', 'linear' or " - "'continue'.", + "extrapolation must be one of 'error', 'constant', 'linear', " + "'continue' or 'periodic'.", ), ( {"extrapolation": "string"}, - "extrapolation must be one of 'error', 'constant', 'linear' or " - "'continue'.", + "extrapolation must be one of 'error', 'constant', 'linear', " + "'continue' or 'periodic'.", ), ({"include_bias": None}, "include_bias must be bool."), ({"include_bias": 1}, "include_bias must be bool."), ({"include_bias": 
"string"}, "include_bias must be bool."), + ( + {"extrapolation": "periodic", "n_knots": 3, "degree": 3}, + "Periodic splines require degree < n_knots. Got n_knots=" + "3 and degree=3." + ), + ( + {"extrapolation": "periodic", "knots": [[0], [1]], "degree": 2}, + "Periodic splines require degree < n_knots. Got n_knots=2 and " + "degree=2." + ) ], ) def test_spline_transformer_input_validation(params, err_msg): @@ -75,7 +88,8 @@ def test_spline_transformer_input_validation(params, err_msg): def test_spline_transformer_manual_knot_input(): - """Test that array-like knot positions in SplineTransformer are accepted. + """ + Test that array-like knot positions in SplineTransformer are accepted. """ X = np.arange(20).reshape(10, 2) knots = [[0.5, 1], [1.5, 2], [5, 10]] @@ -86,6 +100,18 @@ def test_spline_transformer_manual_knot_input(): assert_allclose(st1.bsplines_[i].t, st2.bsplines_[i].t) +@pytest.mark.parametrize("extrapolation", ["continue", "periodic"]) +def test_spline_transformer_integer_knots(extrapolation): + """Test that SplineTransformer accepts integer value knot positions.""" + X = np.arange(20).reshape(10, 2) + knots = [[0, 1], [1, 2], [5, 5], [11, 10], [12, 11]] + _ = SplineTransformer( + degree=3, + knots=knots, + extrapolation=extrapolation + ).fit_transform(X) + + def test_spline_transformer_feature_names(): """Test that SplineTransformer generates correct features name.""" X = np.arange(20).reshape(10, 2) @@ -127,7 +153,13 @@ def test_spline_transformer_feature_names(): @pytest.mark.parametrize("degree", range(1, 5)) @pytest.mark.parametrize("n_knots", range(3, 5)) @pytest.mark.parametrize("knots", ["uniform", "quantile"]) -def test_spline_transformer_unity_decomposition(degree, n_knots, knots): +@pytest.mark.parametrize("extrapolation", ["constant", "periodic"]) +def test_spline_transformer_unity_decomposition( + degree, + n_knots, + knots, + extrapolation +): """Test that B-splines are indeed a decomposition of unity. Splines basis functions must sum up to 1 per row, if we stay in between @@ -137,8 +169,16 @@ def test_spline_transformer_unity_decomposition(degree, n_knots, knots): # make the boundaries 0 and 1 part of X_train, for sure. X_train = np.r_[[[0]], X[::2, :], [[1]]] X_test = X[1::2, :] + + if extrapolation == "periodic": + n_knots = n_knots + degree # periodic splines require degree < n_knots + splt = SplineTransformer( - n_knots=n_knots, degree=degree, knots=knots, include_bias=True + n_knots=n_knots, + degree=degree, + knots=knots, + include_bias=True, + extrapolation=extrapolation ) splt.fit(X_train) for X in [X_train, X_test]: @@ -168,6 +208,151 @@ def test_spline_transformer_linear_regression(bias, intercept): assert_allclose(pipe.predict(X), y, rtol=1e-3) +@pytest.mark.parametrize("knots, n_knots, degree", [ + ("uniform", 5, 3), + ("uniform", 12, 8), + ( + [[-1.0, 0.0], [0, 1.0], [0.1, 2.0], [0.2, 3.0], [0.3, 4.0], [1, 5.0]], + 100, # this gets ignored. 
+ 3 + ) +]) +def test_spline_transformer_periodicity_of_extrapolation( + knots, n_knots, degree +): + """Test that the SplineTransformer is periodic for multiple features.""" + X_1 = linspace((-1, 0), (1, 5), 10) + X_2 = linspace((1, 5), (3, 10), 10) + + splt = SplineTransformer( + knots=knots, + n_knots=n_knots, + degree=degree, + extrapolation="periodic" + ) + splt.fit(X_1) + + assert_allclose(splt.transform(X_1), splt.transform(X_2)) + + +@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)]) +def test_spline_transformer_periodic_linear_regression(bias, intercept): + """Test that B-splines fit a periodic curve pretty well.""" + # "+ 3" to avoid the value 0 in assert_allclose + def f(x): + return np.sin(2 * np.pi * x) - np.sin(8 * np.pi * x) + 3 + + X = np.linspace(0, 1, 101)[:, None] + pipe = Pipeline( + steps=[ + ( + "spline", + SplineTransformer( + n_knots=20, + degree=3, + include_bias=bias, + extrapolation="periodic", + ), + ), + ("ols", LinearRegression(fit_intercept=intercept)), + ] + ) + pipe.fit(X, f(X[:, 0])) + + # Generate larger array to check periodic extrapolation + X_ = np.linspace(-1, 2, 301)[:, None] + predictions = pipe.predict(X_) + assert_allclose(predictions, f(X_[:, 0]), atol=0.01, rtol=0.01) + assert_allclose(predictions[0:100], predictions[100:200], rtol=1e-3) + + +@pytest.mark.skipif( + sp_version < parse_version("1.0.0"), + reason="Periodic extrapolation not yet implemented for BSpline.", +) +def test_spline_transformer_periodic_spline_backport(): + """Test that the backport of extrapolate="periodic" works correctly""" + X = np.linspace(-2, 3.5, 10)[:, None] + degree = 2 + + # Use periodic extrapolation backport in SplineTransformer + transformer = SplineTransformer( + degree=degree, + extrapolation="periodic", + knots=[[-1.0], [0.0], [1.0]] + ) + Xt = transformer.fit_transform(X) + + # Use periodic extrapolation in BSpline + coef = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0]]) + spl = BSpline(np.arange(-3, 4), coef, degree, "periodic") + Xspl = spl(X[:, 0]) + assert_allclose(Xt, Xspl) + + +def test_spline_transformer_periodic_splines_periodicity(): + """ + Test if shifted knots result in the same transformation up to permutation. + """ + X = np.linspace(0, 10, 101)[:, None] + + transformer_1 = SplineTransformer( + degree=3, + extrapolation="periodic", + knots=[[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]] + ) + + transformer_2 = SplineTransformer( + degree=3, + extrapolation="periodic", + knots=[[1.0], [3.0], [4.0], [5.0], [8.0], [9.0]] + ) + + Xt_1 = transformer_1.fit_transform(X) + Xt_2 = transformer_2.fit_transform(X) + + assert_allclose(Xt_1, Xt_2[:, [4, 0, 1, 2, 3]]) + + +@pytest.mark.parametrize("degree", [3, 5]) +def test_spline_transformer_periodic_splines_smoothness(degree): + """Test that spline transformation is smooth at first / last knot.""" + X = np.linspace(-2, 10, 10_000)[:, None] + + transformer = SplineTransformer( + degree=degree, + extrapolation="periodic", + knots=[[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]] + ) + Xt = transformer.fit_transform(X) + + delta = (X.max() - X.min()) / len(X) + tol = 10 * delta + + dXt = Xt + # We expect splines of degree `degree` to be (`degree`-1) times + # continuously differentiable. I.e. for d = 0, ..., `degree` - 1 the d-th + # derivative should be continous. This is the case if the (d+1)-th + # numerical derivative is reasonably small (smaller than `tol` in absolute + # value). We thus compute d-th numeric derivatives for d = 1, ..., `degree` + # and compare them to `tol`. 
+ # + # Note that the 0-th derivative is the function itself, such that we are + # also checking its continuity. + for d in range(1, degree + 1): + # Check continuity of the (d-1)-th derivative + diff = np.diff(dXt, axis=0) + assert np.abs(diff).max() < tol + # Compute d-th numeric derivative + dXt = diff / delta + + # As degree `degree` splines are not `degree` times continously + # differentiable at the knots, the `degree + 1`-th numeric derivative + # should have spikes at the knots. + diff = np.diff(dXt, axis=0) + assert np.abs(diff).max() > 1 + + @pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)]) @pytest.mark.parametrize("degree", [1, 2, 3, 4, 5]) def test_spline_transformer_extrapolation(bias, intercept, degree): From edc69954771a6390b0e10be8309e0a47df5c0189 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Carlos=20Alfaro=20Jim=C3=A9nez?= Date: Wed, 17 Mar 2021 14:15:27 +0100 Subject: [PATCH 251/478] CI Use conda instead of pip to install anaconda-client (#19528) --- build_tools/github/upload_anaconda.sh | 6 +++++- build_tools/travis/after_success.sh | 11 ++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/build_tools/github/upload_anaconda.sh b/build_tools/github/upload_anaconda.sh index 7651576cf558e..13e8420e3cc5a 100644 --- a/build_tools/github/upload_anaconda.sh +++ b/build_tools/github/upload_anaconda.sh @@ -11,7 +11,11 @@ else ANACONDA_TOKEN="$SCIKIT_LEARN_STAGING_UPLOAD_TOKEN" fi -pip install git+https://github.com/Anaconda-Server/anaconda-client +# Install Python 3.8 because of a bug with Python 3.9 +export PATH=$CONDA/bin:$PATH +conda create -n upload -y python=3.8 +source activate upload +conda install -y anaconda-client # Force a replacement if the remote file already exists anaconda -t $ANACONDA_TOKEN upload --force -u $ANACONDA_ORG dist/artifact/* diff --git a/build_tools/travis/after_success.sh b/build_tools/travis/after_success.sh index 2123f7efafc22..a09a4013ed946 100755 --- a/build_tools/travis/after_success.sh +++ b/build_tools/travis/after_success.sh @@ -18,7 +18,16 @@ if [[ $BUILD_WHEEL == true && $TRAVIS_EVENT_TYPE != pull_request ]]; then ANACONDA_TOKEN="$SCIKIT_LEARN_STAGING_UPLOAD_TOKEN" fi - pip install git+https://github.com/Anaconda-Server/anaconda-client + MINICONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-aarch64.sh" + wget $MINICONDA_URL -O miniconda.sh + MINICONDA_PATH=$HOME/miniconda + chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH + + # Install Python 3.8 because of a bug with Python 3.9 + export PATH=$MINICONDA_PATH/bin:$PATH + conda create -n upload -y python=3.8 + source activate upload + conda install -y anaconda-client # Force a replacement if the remote file already exists anaconda -t $ANACONDA_TOKEN upload --force -u $ANACONDA_ORG wheelhouse/*.whl From 9a186a599d5b4f75f9798211a04b4dc88b4f926a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 17 Mar 2021 14:52:03 +0100 Subject: [PATCH 252/478] MAINT Python 3.9 in badge on README.rst (#19702) --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 68f9ffee17d03..ebc4339b2ab58 100644 --- a/README.rst +++ b/README.rst @@ -17,8 +17,8 @@ .. |Nightly wheels| image:: https://github.com/scikit-learn/scikit-learn/workflows/Wheel%20builder/badge.svg?event=schedule .. 
_`Nightly wheels`: https://github.com/scikit-learn/scikit-learn/actions?query=workflow%3A%22Wheel+builder%22+event%3Aschedule -.. |PythonVersion| image:: https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-blue -.. _PythonVersion: https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-blue +.. |PythonVersion| image:: https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9-blue +.. _PythonVersion: https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9-blue .. |PyPi| image:: https://badge.fury.io/py/scikit-learn.svg .. _PyPi: https://badge.fury.io/py/scikit-learn From 04f84c6d082864c208682d27256ff74b7b488734 Mon Sep 17 00:00:00 2001 From: Sean Benhur J <43300345+seanbenhur@users.noreply.github.com> Date: Wed, 17 Mar 2021 20:23:00 +0530 Subject: [PATCH 253/478] DOC Added utils.gen_batches in documentation (#19688) --- doc/developers/utilities.rst | 3 +++ doc/modules/classes.rst | 1 + sklearn/utils/__init__.py | 9 +++++++++ 3 files changed, 13 insertions(+) diff --git a/doc/developers/utilities.rst b/doc/developers/utilities.rst index 1ca36d473a925..39c0c889afc95 100644 --- a/doc/developers/utilities.rst +++ b/doc/developers/utilities.rst @@ -196,6 +196,9 @@ Helper Functions to ``n``. Used in :func:`~sklearn.decomposition.dict_learning` and :func:`~sklearn.cluster.k_means`. +- :class:`gen_batches`: generator to create slices containing batch size elements + from 0 to ``n`` + - :func:`safe_mask`: Helper function to convert a mask to the format expected by the numpy array or scipy sparse matrix on which to use it (sparse matrices support integer indices only while numpy arrays support both diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index c658bc6b12452..0cd5abb16829d 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1595,6 +1595,7 @@ Plotting utils.extmath.fast_logdet utils.extmath.density utils.extmath.weighted_mode + utils.gen_batches utils.gen_even_slices utils.graph.single_source_shortest_path_length utils.graph_shortest_path.graph_shortest_path diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index ca2be9d14fe29..972d56f66d900 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -691,6 +691,10 @@ def gen_batches(n, batch_size, *, min_batch_size=0): ------ slice of batch_size elements + See Also + -------- + gen_even_slices: Generator to create n_packs slices going up to n. + Examples -------- >>> from sklearn.utils import gen_batches @@ -740,6 +744,11 @@ def gen_even_slices(n, n_packs, *, n_samples=None): ------ slice + See Also + -------- + gen_batches: Generator to create slices containing batch_size elements + from 0 to n. + Examples -------- >>> from sklearn.utils import gen_even_slices From 2e7009bc69f6fec93e8f3c59dd76b082b473148d Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 18 Mar 2021 09:15:50 -0400 Subject: [PATCH 254/478] ENH Better error for corrupted files in fetch_kddcup99 (#19669) --- doc/whats_new/v1.0.rst | 3 +++ sklearn/conftest.py | 7 +++++-- sklearn/datasets/_kddcup99.py | 23 +++++++++++++---------- sklearn/datasets/tests/test_kddcup99.py | 16 ++++++++++++++++ 4 files changed, 37 insertions(+), 12 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 34f39ca48f20a..c7b786ea6d1bf 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -68,6 +68,9 @@ Changelog `Thomas Fan`_ and :user:`Amanda Dsouza ` and :user:`EL-ATEIF Sara `. 
+- |Enhancement| :func:`datasets.fetch_kddcup99` raises a better message + when the cached file is invalid. :pr:`19669` `Thomas Fan`_. + :mod:`sklearn.decomposition` ............................ diff --git a/sklearn/conftest.py b/sklearn/conftest.py index 2978115e3091c..70fec749b7c8e 100644 --- a/sklearn/conftest.py +++ b/sklearn/conftest.py @@ -35,8 +35,11 @@ def wrapped(*args, **kwargs): kwargs['download_if_missing'] = download_if_missing try: return f(*args, **kwargs) - except IOError: - pytest.skip("test is enabled when SKLEARN_SKIP_NETWORK_TESTS=0") + except IOError as e: + if str(e) != "Data not found and `download_if_missing` is False": + raise + pytest.skip("test is enabled when " + "SKLEARN_SKIP_NETWORK_TESTS=0") return pytest.fixture(lambda: wrapped) diff --git a/sklearn/datasets/_kddcup99.py b/sklearn/datasets/_kddcup99.py index 539b7ffaf862e..26fb14197a211 100644 --- a/sklearn/datasets/_kddcup99.py +++ b/sklearn/datasets/_kddcup99.py @@ -315,7 +315,17 @@ def _fetch_brute_kddcup99(data_home=None, column_names = [c[0] for c in dt] target_names = column_names[-1] feature_names = column_names[:-1] - if download_if_missing and not available: + + if available: + try: + X = joblib.load(samples_path) + y = joblib.load(targets_path) + except Exception as e: + raise IOError( + "The cache for fetch_kddcup99 is invalid, please delete " + f"{str(kddcup_dir)} and run the fetch_kddcup99 again") from e + + elif download_if_missing: _mkdirp(kddcup_dir) logger.info("Downloading %s" % archive.url) _fetch_remote(archive, dirname=kddcup_dir) @@ -343,15 +353,8 @@ def _fetch_brute_kddcup99(data_home=None, joblib.dump(X, samples_path, compress=0) joblib.dump(y, targets_path, compress=0) - elif not available: - if not download_if_missing: - raise IOError("Data not found and `download_if_missing` is False") - - try: - X, y - except NameError: - X = joblib.load(samples_path) - y = joblib.load(targets_path) + else: + raise IOError("Data not found and `download_if_missing` is False") return Bunch( data=X, diff --git a/sklearn/datasets/tests/test_kddcup99.py b/sklearn/datasets/tests/test_kddcup99.py index 5119d0cda13a2..08017298d20e8 100644 --- a/sklearn/datasets/tests/test_kddcup99.py +++ b/sklearn/datasets/tests/test_kddcup99.py @@ -58,3 +58,19 @@ def test_fetch_kddcup99_shuffle(fetch_kddcup99_fxt): def test_pandas_dependency_message(fetch_kddcup99_fxt, hide_available_pandas): check_pandas_dependency_message(fetch_kddcup99_fxt) + + +def test_corrupted_file_error_message(fetch_kddcup99_fxt, tmp_path): + """Check that a nice error message is raised when cache is corrupted.""" + kddcup99_dir = tmp_path / "kddcup99_10-py3" + kddcup99_dir.mkdir() + samples_path = kddcup99_dir / "samples" + + with samples_path.open("wb") as f: + f.write(b"THIS IS CORRUPTED") + + msg = (f"The cache for fetch_kddcup99 is invalid, please " + f"delete {str(kddcup99_dir)} and run the fetch_kddcup99 again") + + with pytest.raises(IOError, match=msg): + fetch_kddcup99_fxt(data_home=str(tmp_path)) From ca9618c0e228b67293c422e99a0f133c3384f7b2 Mon Sep 17 00:00:00 2001 From: Avi Gupta <33635739+avigupta2612@users.noreply.github.com> Date: Thu, 18 Mar 2021 19:28:51 +0530 Subject: [PATCH 255/478] MNT move PolynomialFeatures from _data.py to _polynomial.py (#19611) Co-authored-by: Roman Yurchak --- sklearn/preprocessing/__init__.py | 2 +- sklearn/preprocessing/_data.py | 290 ----------------- sklearn/preprocessing/_polynomial.py | 294 +++++++++++++++++- sklearn/preprocessing/tests/test_data.py | 199 ------------ 
.../preprocessing/tests/test_polynomial.py | 196 +++++++++++- 5 files changed, 487 insertions(+), 494 deletions(-) diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index 076b9e85e1150..6653088ba85a7 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -23,7 +23,6 @@ from ._data import quantile_transform from ._data import power_transform from ._data import PowerTransformer -from ._data import PolynomialFeatures from ._encoders import OneHotEncoder from ._encoders import OrdinalEncoder @@ -35,6 +34,7 @@ from ._discretization import KBinsDiscretizer +from ._polynomial import PolynomialFeatures from ._polynomial import SplineTransformer diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 29190dd6e2b67..5e85b932a1e39 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -8,9 +8,7 @@ # License: BSD 3 clause -from itertools import chain, combinations import warnings -from itertools import combinations_with_replacement as combinations_w_r import numpy as np from scipy import sparse @@ -31,7 +29,6 @@ from ..utils.validation import (check_is_fitted, check_random_state, _check_sample_weight, FLOAT_DTYPES, _deprecate_positional_args) -from ._csr_polynomial_expansion import _csr_polynomial_expansion from ._encoders import OneHotEncoder @@ -1570,293 +1567,6 @@ def robust_scale(X, *, axis=0, with_centering=True, with_scaling=True, return X -class PolynomialFeatures(TransformerMixin, BaseEstimator): - """Generate polynomial and interaction features. - - Generate a new feature matrix consisting of all polynomial combinations - of the features with degree less than or equal to the specified degree. - For example, if an input sample is two dimensional and of the form - [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2]. - - Parameters - ---------- - degree : int, default=2 - The degree of the polynomial features. - - interaction_only : bool, default=False - If true, only interaction features are produced: features that are - products of at most ``degree`` *distinct* input features (so not - ``x[1] ** 2``, ``x[0] * x[2] ** 3``, etc.). - - include_bias : bool, default=True - If True (default), then include a bias column, the feature in which - all polynomial powers are zero (i.e. a column of ones - acts as an - intercept term in a linear model). - - order : {'C', 'F'}, default='C' - Order of output array in the dense case. 'F' order is faster to - compute, but may slow down subsequent estimators. - - .. versionadded:: 0.21 - - Examples - -------- - >>> import numpy as np - >>> from sklearn.preprocessing import PolynomialFeatures - >>> X = np.arange(6).reshape(3, 2) - >>> X - array([[0, 1], - [2, 3], - [4, 5]]) - >>> poly = PolynomialFeatures(2) - >>> poly.fit_transform(X) - array([[ 1., 0., 1., 0., 0., 1.], - [ 1., 2., 3., 4., 6., 9.], - [ 1., 4., 5., 16., 20., 25.]]) - >>> poly = PolynomialFeatures(interaction_only=True) - >>> poly.fit_transform(X) - array([[ 1., 0., 1., 0.], - [ 1., 2., 3., 6.], - [ 1., 4., 5., 20.]]) - - Attributes - ---------- - powers_ : ndarray of shape (n_output_features, n_input_features) - powers_[i, j] is the exponent of the jth input in the ith output. - - n_input_features_ : int - The total number of input features. - - n_output_features_ : int - The total number of polynomial output features. The number of output - features is computed by iterating over all suitably sized combinations - of input features. 
- - See Also - -------- - SplineTransformer : Transformer that generates univariate B-spline bases - for features - - Notes - ----- - Be aware that the number of features in the output array scales - polynomially in the number of features of the input array, and - exponentially in the degree. High degrees can cause overfitting. - - See :ref:`examples/linear_model/plot_polynomial_interpolation.py - ` - """ - @_deprecate_positional_args - def __init__(self, degree=2, *, interaction_only=False, include_bias=True, - order='C'): - self.degree = degree - self.interaction_only = interaction_only - self.include_bias = include_bias - self.order = order - - @staticmethod - def _combinations(n_features, degree, interaction_only, include_bias): - comb = (combinations if interaction_only else combinations_w_r) - start = int(not include_bias) - return chain.from_iterable(comb(range(n_features), i) - for i in range(start, degree + 1)) - - @property - def powers_(self): - check_is_fitted(self) - - combinations = self._combinations(self.n_input_features_, self.degree, - self.interaction_only, - self.include_bias) - return np.vstack([np.bincount(c, minlength=self.n_input_features_) - for c in combinations]) - - def get_feature_names(self, input_features=None): - """ - Return feature names for output features - - Parameters - ---------- - input_features : list of str of shape (n_features,), default=None - String names for input features if available. By default, - "x0", "x1", ... "xn_features" is used. - - Returns - ------- - output_feature_names : list of str of shape (n_output_features,) - """ - powers = self.powers_ - if input_features is None: - input_features = ['x%d' % i for i in range(powers.shape[1])] - feature_names = [] - for row in powers: - inds = np.where(row)[0] - if len(inds): - name = " ".join("%s^%d" % (input_features[ind], exp) - if exp != 1 else input_features[ind] - for ind, exp in zip(inds, row[inds])) - else: - name = "1" - feature_names.append(name) - return feature_names - - def fit(self, X, y=None): - """ - Compute number of output features. - - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The data. - - y : None - Ignored. - - Returns - ------- - self : object - Fitted transformer. - """ - n_samples, n_features = self._validate_data( - X, accept_sparse=True).shape - combinations = self._combinations(n_features, self.degree, - self.interaction_only, - self.include_bias) - self.n_input_features_ = n_features - self.n_output_features_ = sum(1 for _ in combinations) - return self - - def transform(self, X): - """Transform data to polynomial features. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The data to transform, row by row. - - Prefer CSR over CSC for sparse input (for speed), but CSC is - required if the degree is 4 or higher. If the degree is less than - 4 and the input format is CSC, it will be converted to CSR, have - its polynomial features generated, then converted back to CSC. - - If the degree is 2 or 3, the method described in "Leveraging - Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices - Using K-Simplex Numbers" by Andrew Nystrom and John Hughes is - used, which is much faster than the method used on CSC input. For - this reason, a CSC input will be converted to CSR, and the output - will be converted back to CSC prior to being returned, hence the - preference of CSR. 
- - Returns - ------- - XP : {ndarray, sparse matrix} of shape (n_samples, NP) - The matrix of features, where NP is the number of polynomial - features generated from the combination of inputs. If a sparse - matrix is provided, it will be converted into a sparse - ``csr_matrix``. - """ - check_is_fitted(self) - - X = self._validate_data(X, order='F', dtype=FLOAT_DTYPES, reset=False, - accept_sparse=('csr', 'csc')) - - n_samples, n_features = X.shape - - if n_features != self.n_input_features_: - raise ValueError("X shape does not match training shape") - - if sparse.isspmatrix_csr(X): - if self.degree > 3: - return self.transform(X.tocsc()).tocsr() - to_stack = [] - if self.include_bias: - to_stack.append(np.ones(shape=(n_samples, 1), dtype=X.dtype)) - to_stack.append(X) - for deg in range(2, self.degree+1): - Xp_next = _csr_polynomial_expansion(X.data, X.indices, - X.indptr, X.shape[1], - self.interaction_only, - deg) - if Xp_next is None: - break - to_stack.append(Xp_next) - XP = sparse.hstack(to_stack, format='csr') - elif sparse.isspmatrix_csc(X) and self.degree < 4: - return self.transform(X.tocsr()).tocsc() - else: - if sparse.isspmatrix(X): - combinations = self._combinations(n_features, self.degree, - self.interaction_only, - self.include_bias) - columns = [] - for comb in combinations: - if comb: - out_col = 1 - for col_idx in comb: - out_col = X[:, col_idx].multiply(out_col) - columns.append(out_col) - else: - bias = sparse.csc_matrix(np.ones((X.shape[0], 1))) - columns.append(bias) - XP = sparse.hstack(columns, dtype=X.dtype).tocsc() - else: - XP = np.empty((n_samples, self.n_output_features_), - dtype=X.dtype, order=self.order) - - # What follows is a faster implementation of: - # for i, comb in enumerate(combinations): - # XP[:, i] = X[:, comb].prod(1) - # This implementation uses two optimisations. - # First one is broadcasting, - # multiply ([X1, ..., Xn], X1) -> [X1 X1, ..., Xn X1] - # multiply ([X2, ..., Xn], X2) -> [X2 X2, ..., Xn X2] - # ... - # multiply ([X[:, start:end], X[:, start]) -> ... - # Second optimisation happens for degrees >= 3. - # Xi^3 is computed reusing previous computation: - # Xi^3 = Xi^2 * Xi. - - if self.include_bias: - XP[:, 0] = 1 - current_col = 1 - else: - current_col = 0 - - # d = 0 - XP[:, current_col:current_col + n_features] = X - index = list(range(current_col, - current_col + n_features)) - current_col += n_features - index.append(current_col) - - # d >= 1 - for _ in range(1, self.degree): - new_index = [] - end = index[-1] - for feature_idx in range(n_features): - start = index[feature_idx] - new_index.append(current_col) - if self.interaction_only: - start += (index[feature_idx + 1] - - index[feature_idx]) - next_col = current_col + end - start - if next_col <= current_col: - break - # XP[:, start:end] are terms of degree d - 1 - # that exclude feature #feature_idx. - np.multiply(XP[:, start:end], - X[:, feature_idx:feature_idx + 1], - out=XP[:, current_col:next_col], - casting='no') - current_col = next_col - - new_index.append(current_col) - index = new_index - - return XP - - @_deprecate_positional_args def normalize(X, norm='l2', *, axis=1, copy=True, return_norm=False): """Scale input vectors individually to unit norm (vector length). diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index ad358e50c4681..3f4ccc2fa05d4 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -2,14 +2,19 @@ This file contains preprocessing tools based on polynomials. 
""" import numbers +from itertools import chain, combinations +from itertools import combinations_with_replacement as combinations_w_r import numpy as np +from scipy import sparse from scipy.interpolate import BSpline from ..base import BaseEstimator, TransformerMixin from ..utils import check_array from ..utils.fixes import linspace -from ..utils.validation import check_is_fitted, FLOAT_DTYPES +from ..utils.validation import (check_is_fitted, FLOAT_DTYPES, + _deprecate_positional_args) +from ._csr_polynomial_expansion import _csr_polynomial_expansion __all__ = [ @@ -17,6 +22,293 @@ ] +class PolynomialFeatures(TransformerMixin, BaseEstimator): + """Generate polynomial and interaction features. + + Generate a new feature matrix consisting of all polynomial combinations + of the features with degree less than or equal to the specified degree. + For example, if an input sample is two dimensional and of the form + [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2]. + + Parameters + ---------- + degree : int, default=2 + The degree of the polynomial features. + + interaction_only : bool, default=False + If true, only interaction features are produced: features that are + products of at most ``degree`` *distinct* input features (so not + ``x[1] ** 2``, ``x[0] * x[2] ** 3``, etc.). + + include_bias : bool, default=True + If True (default), then include a bias column, the feature in which + all polynomial powers are zero (i.e. a column of ones - acts as an + intercept term in a linear model). + + order : {'C', 'F'}, default='C' + Order of output array in the dense case. 'F' order is faster to + compute, but may slow down subsequent estimators. + + .. versionadded:: 0.21 + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import PolynomialFeatures + >>> X = np.arange(6).reshape(3, 2) + >>> X + array([[0, 1], + [2, 3], + [4, 5]]) + >>> poly = PolynomialFeatures(2) + >>> poly.fit_transform(X) + array([[ 1., 0., 1., 0., 0., 1.], + [ 1., 2., 3., 4., 6., 9.], + [ 1., 4., 5., 16., 20., 25.]]) + >>> poly = PolynomialFeatures(interaction_only=True) + >>> poly.fit_transform(X) + array([[ 1., 0., 1., 0.], + [ 1., 2., 3., 6.], + [ 1., 4., 5., 20.]]) + + Attributes + ---------- + powers_ : ndarray of shape (n_output_features, n_input_features) + powers_[i, j] is the exponent of the jth input in the ith output. + + n_input_features_ : int + The total number of input features. + + n_output_features_ : int + The total number of polynomial output features. The number of output + features is computed by iterating over all suitably sized combinations + of input features. + + See Also + -------- + SplineTransformer : Transformer that generates univariate B-spline bases + for features + + Notes + ----- + Be aware that the number of features in the output array scales + polynomially in the number of features of the input array, and + exponentially in the degree. High degrees can cause overfitting. 
+ + See :ref:`examples/linear_model/plot_polynomial_interpolation.py + ` + """ + @_deprecate_positional_args + def __init__(self, degree=2, *, interaction_only=False, include_bias=True, + order='C'): + self.degree = degree + self.interaction_only = interaction_only + self.include_bias = include_bias + self.order = order + + @staticmethod + def _combinations(n_features, degree, interaction_only, include_bias): + comb = (combinations if interaction_only else combinations_w_r) + start = int(not include_bias) + return chain.from_iterable(comb(range(n_features), i) + for i in range(start, degree + 1)) + + @property + def powers_(self): + check_is_fitted(self) + + combinations = self._combinations(self.n_input_features_, self.degree, + self.interaction_only, + self.include_bias) + return np.vstack([np.bincount(c, minlength=self.n_input_features_) + for c in combinations]) + + def get_feature_names(self, input_features=None): + """ + Return feature names for output features + + Parameters + ---------- + input_features : list of str of shape (n_features,), default=None + String names for input features if available. By default, + "x0", "x1", ... "xn_features" is used. + + Returns + ------- + output_feature_names : list of str of shape (n_output_features,) + """ + powers = self.powers_ + if input_features is None: + input_features = ['x%d' % i for i in range(powers.shape[1])] + feature_names = [] + for row in powers: + inds = np.where(row)[0] + if len(inds): + name = " ".join("%s^%d" % (input_features[ind], exp) + if exp != 1 else input_features[ind] + for ind, exp in zip(inds, row[inds])) + else: + name = "1" + feature_names.append(name) + return feature_names + + def fit(self, X, y=None): + """ + Compute number of output features. + + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted transformer. + """ + n_samples, n_features = self._validate_data( + X, accept_sparse=True).shape + combinations = self._combinations(n_features, self.degree, + self.interaction_only, + self.include_bias) + self.n_input_features_ = n_features + self.n_output_features_ = sum(1 for _ in combinations) + return self + + def transform(self, X): + """Transform data to polynomial features. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to transform, row by row. + + Prefer CSR over CSC for sparse input (for speed), but CSC is + required if the degree is 4 or higher. If the degree is less than + 4 and the input format is CSC, it will be converted to CSR, have + its polynomial features generated, then converted back to CSC. + + If the degree is 2 or 3, the method described in "Leveraging + Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices + Using K-Simplex Numbers" by Andrew Nystrom and John Hughes is + used, which is much faster than the method used on CSC input. For + this reason, a CSC input will be converted to CSR, and the output + will be converted back to CSC prior to being returned, hence the + preference of CSR. + + Returns + ------- + XP : {ndarray, sparse matrix} of shape (n_samples, NP) + The matrix of features, where NP is the number of polynomial + features generated from the combination of inputs. If a sparse + matrix is provided, it will be converted into a sparse + ``csr_matrix``. 
+ """ + check_is_fitted(self) + + X = self._validate_data(X, order='F', dtype=FLOAT_DTYPES, reset=False, + accept_sparse=('csr', 'csc')) + + n_samples, n_features = X.shape + + if n_features != self.n_input_features_: + raise ValueError("X shape does not match training shape") + + if sparse.isspmatrix_csr(X): + if self.degree > 3: + return self.transform(X.tocsc()).tocsr() + to_stack = [] + if self.include_bias: + to_stack.append(np.ones(shape=(n_samples, 1), dtype=X.dtype)) + to_stack.append(X) + for deg in range(2, self.degree+1): + Xp_next = _csr_polynomial_expansion(X.data, X.indices, + X.indptr, X.shape[1], + self.interaction_only, + deg) + if Xp_next is None: + break + to_stack.append(Xp_next) + XP = sparse.hstack(to_stack, format='csr') + elif sparse.isspmatrix_csc(X) and self.degree < 4: + return self.transform(X.tocsr()).tocsc() + else: + if sparse.isspmatrix(X): + combinations = self._combinations(n_features, self.degree, + self.interaction_only, + self.include_bias) + columns = [] + for comb in combinations: + if comb: + out_col = 1 + for col_idx in comb: + out_col = X[:, col_idx].multiply(out_col) + columns.append(out_col) + else: + bias = sparse.csc_matrix(np.ones((X.shape[0], 1))) + columns.append(bias) + XP = sparse.hstack(columns, dtype=X.dtype).tocsc() + else: + XP = np.empty((n_samples, self.n_output_features_), + dtype=X.dtype, order=self.order) + + # What follows is a faster implementation of: + # for i, comb in enumerate(combinations): + # XP[:, i] = X[:, comb].prod(1) + # This implementation uses two optimisations. + # First one is broadcasting, + # multiply ([X1, ..., Xn], X1) -> [X1 X1, ..., Xn X1] + # multiply ([X2, ..., Xn], X2) -> [X2 X2, ..., Xn X2] + # ... + # multiply ([X[:, start:end], X[:, start]) -> ... + # Second optimisation happens for degrees >= 3. + # Xi^3 is computed reusing previous computation: + # Xi^3 = Xi^2 * Xi. + + if self.include_bias: + XP[:, 0] = 1 + current_col = 1 + else: + current_col = 0 + + # d = 0 + XP[:, current_col:current_col + n_features] = X + index = list(range(current_col, + current_col + n_features)) + current_col += n_features + index.append(current_col) + + # d >= 1 + for _ in range(1, self.degree): + new_index = [] + end = index[-1] + for feature_idx in range(n_features): + start = index[feature_idx] + new_index.append(current_col) + if self.interaction_only: + start += (index[feature_idx + 1] - + index[feature_idx]) + next_col = current_col + end - start + if next_col <= current_col: + break + # XP[:, start:end] are terms of degree d - 1 + # that exclude feature #feature_idx. + np.multiply(XP[:, start:end], + X[:, feature_idx:feature_idx + 1], + out=XP[:, current_col:next_col], + casting='no') + current_col = next_col + + new_index.append(current_col) + index = new_index + + return XP + + # TODO: # - sparse support (either scipy or own cython solution)? 
class SplineTransformer(TransformerMixin, BaseEstimator): diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index fdd88be0ccff4..196060388ddd2 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -10,7 +10,6 @@ import numpy as np import numpy.linalg as la from scipy import sparse, stats -from scipy.sparse import random as sparse_random import pytest @@ -43,7 +42,6 @@ from sklearn.preprocessing import RobustScaler from sklearn.preprocessing import robust_scale from sklearn.preprocessing import add_dummy_feature -from sklearn.preprocessing import PolynomialFeatures from sklearn.preprocessing import PowerTransformer from sklearn.preprocessing import power_transform from sklearn.preprocessing._data import _handle_zeros_in_scale @@ -94,203 +92,6 @@ def assert_correct_incr(i, batch_start, batch_stop, n, chunk_size, n_samples_seen) -def test_polynomial_features(): - # Test Polynomial Features - X1 = np.arange(6)[:, np.newaxis] - P1 = np.hstack([np.ones_like(X1), - X1, X1 ** 2, X1 ** 3]) - deg1 = 3 - - X2 = np.arange(6).reshape((3, 2)) - x1 = X2[:, :1] - x2 = X2[:, 1:] - P2 = np.hstack([x1 ** 0 * x2 ** 0, - x1 ** 1 * x2 ** 0, - x1 ** 0 * x2 ** 1, - x1 ** 2 * x2 ** 0, - x1 ** 1 * x2 ** 1, - x1 ** 0 * x2 ** 2]) - deg2 = 2 - - for (deg, X, P) in [(deg1, X1, P1), (deg2, X2, P2)]: - P_test = PolynomialFeatures(deg, include_bias=True).fit_transform(X) - assert_array_almost_equal(P_test, P) - - P_test = PolynomialFeatures(deg, include_bias=False).fit_transform(X) - assert_array_almost_equal(P_test, P[:, 1:]) - - interact = PolynomialFeatures(2, interaction_only=True, include_bias=True) - X_poly = interact.fit_transform(X) - assert_array_almost_equal(X_poly, P2[:, [0, 1, 2, 4]]) - - assert interact.powers_.shape == (interact.n_output_features_, - interact.n_input_features_) - - -def test_polynomial_feature_names(): - X = np.arange(30).reshape(10, 3) - poly = PolynomialFeatures(degree=2, include_bias=True).fit(X) - feature_names = poly.get_feature_names() - assert_array_equal(['1', 'x0', 'x1', 'x2', 'x0^2', 'x0 x1', - 'x0 x2', 'x1^2', 'x1 x2', 'x2^2'], - feature_names) - - poly = PolynomialFeatures(degree=3, include_bias=False).fit(X) - feature_names = poly.get_feature_names(["a", "b", "c"]) - assert_array_equal(['a', 'b', 'c', 'a^2', 'a b', 'a c', 'b^2', - 'b c', 'c^2', 'a^3', 'a^2 b', 'a^2 c', - 'a b^2', 'a b c', 'a c^2', 'b^3', 'b^2 c', - 'b c^2', 'c^3'], feature_names) - # test some unicode - poly = PolynomialFeatures(degree=1, include_bias=True).fit(X) - feature_names = poly.get_feature_names( - ["\u0001F40D", "\u262E", "\u05D0"]) - assert_array_equal(["1", "\u0001F40D", "\u262E", "\u05D0"], - feature_names) - - -def test_polynomial_feature_array_order(): - """Test that output array has the given order.""" - X = np.arange(10).reshape(5, 2) - - def is_c_contiguous(a): - return np.isfortran(a.T) - - assert is_c_contiguous(PolynomialFeatures().fit_transform(X)) - assert is_c_contiguous(PolynomialFeatures(order='C').fit_transform(X)) - assert np.isfortran(PolynomialFeatures(order='F').fit_transform(X)) - - -@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], - [(1, True, False, int), - (2, True, False, int), - (2, True, False, np.float32), - (2, True, False, np.float64), - (3, False, False, np.float64), - (3, False, True, np.float64), - (4, False, False, np.float64), - (4, False, True, np.float64)]) -def test_polynomial_features_csc_X(deg, include_bias, interaction_only, dtype): - 
rng = np.random.RandomState(0) - X = rng.randint(0, 2, (100, 2)) - X_csc = sparse.csc_matrix(X) - - est = PolynomialFeatures(deg, include_bias=include_bias, - interaction_only=interaction_only) - Xt_csc = est.fit_transform(X_csc.astype(dtype)) - Xt_dense = est.fit_transform(X.astype(dtype)) - - assert isinstance(Xt_csc, sparse.csc_matrix) - assert Xt_csc.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csc.A, Xt_dense) - - -@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], - [(1, True, False, int), - (2, True, False, int), - (2, True, False, np.float32), - (2, True, False, np.float64), - (3, False, False, np.float64), - (3, False, True, np.float64)]) -def test_polynomial_features_csr_X(deg, include_bias, interaction_only, dtype): - rng = np.random.RandomState(0) - X = rng.randint(0, 2, (100, 2)) - X_csr = sparse.csr_matrix(X) - - est = PolynomialFeatures(deg, include_bias=include_bias, - interaction_only=interaction_only) - Xt_csr = est.fit_transform(X_csr.astype(dtype)) - Xt_dense = est.fit_transform(X.astype(dtype, copy=False)) - - assert isinstance(Xt_csr, sparse.csr_matrix) - assert Xt_csr.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csr.A, Xt_dense) - - -@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], - [(2, True, False, np.float32), - (2, True, False, np.float64), - (3, False, False, np.float64), - (3, False, True, np.float64)]) -def test_polynomial_features_csr_X_floats(deg, include_bias, - interaction_only, dtype): - X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr() - X = X_csr.toarray() - - est = PolynomialFeatures(deg, include_bias=include_bias, - interaction_only=interaction_only) - Xt_csr = est.fit_transform(X_csr.astype(dtype)) - Xt_dense = est.fit_transform(X.astype(dtype)) - - assert isinstance(Xt_csr, sparse.csr_matrix) - assert Xt_csr.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csr.A, Xt_dense) - - -@pytest.mark.parametrize(['zero_row_index', 'deg', 'interaction_only'], - [(0, 2, True), (1, 2, True), (2, 2, True), - (0, 3, True), (1, 3, True), (2, 3, True), - (0, 2, False), (1, 2, False), (2, 2, False), - (0, 3, False), (1, 3, False), (2, 3, False)]) -def test_polynomial_features_csr_X_zero_row(zero_row_index, deg, - interaction_only): - X_csr = sparse_random(3, 10, 1.0, random_state=0).tocsr() - X_csr[zero_row_index, :] = 0.0 - X = X_csr.toarray() - - est = PolynomialFeatures(deg, include_bias=False, - interaction_only=interaction_only) - Xt_csr = est.fit_transform(X_csr) - Xt_dense = est.fit_transform(X) - - assert isinstance(Xt_csr, sparse.csr_matrix) - assert Xt_csr.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csr.A, Xt_dense) - - -# This degree should always be one more than the highest degree supported by -# _csr_expansion. 
-@pytest.mark.parametrize(['include_bias', 'interaction_only'], - [(True, True), (True, False), - (False, True), (False, False)]) -def test_polynomial_features_csr_X_degree_4(include_bias, interaction_only): - X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr() - X = X_csr.toarray() - - est = PolynomialFeatures(4, include_bias=include_bias, - interaction_only=interaction_only) - Xt_csr = est.fit_transform(X_csr) - Xt_dense = est.fit_transform(X) - - assert isinstance(Xt_csr, sparse.csr_matrix) - assert Xt_csr.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csr.A, Xt_dense) - - -@pytest.mark.parametrize(['deg', 'dim', 'interaction_only'], - [(2, 1, True), - (2, 2, True), - (3, 1, True), - (3, 2, True), - (3, 3, True), - (2, 1, False), - (2, 2, False), - (3, 1, False), - (3, 2, False), - (3, 3, False)]) -def test_polynomial_features_csr_X_dim_edges(deg, dim, interaction_only): - X_csr = sparse_random(1000, dim, 0.5, random_state=0).tocsr() - X = X_csr.toarray() - - est = PolynomialFeatures(deg, interaction_only=interaction_only) - Xt_csr = est.fit_transform(X_csr) - Xt_dense = est.fit_transform(X) - - assert isinstance(Xt_csr, sparse.csr_matrix) - assert Xt_csr.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csr.A, Xt_dense) - - def test_raises_value_error_if_sample_weights_greater_than_1d(): # Sample weights must be either scalar or 1D diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py index b1908bf9fe12a..5068a8c7d8bdd 100644 --- a/sklearn/preprocessing/tests/test_polynomial.py +++ b/sklearn/preprocessing/tests/test_polynomial.py @@ -1,17 +1,22 @@ import numpy as np import pytest +from scipy import sparse +from scipy.sparse import random as sparse_random +from sklearn.utils._testing import assert_array_almost_equal + from numpy.testing import assert_allclose, assert_array_equal from scipy.interpolate import BSpline from sklearn.linear_model import LinearRegression from sklearn.pipeline import Pipeline -from sklearn.preprocessing import KBinsDiscretizer, SplineTransformer +from sklearn.preprocessing import ( + KBinsDiscretizer, PolynomialFeatures, SplineTransformer +) from sklearn.utils.fixes import linspace, sp_version from pkg_resources import parse_version -# TODO: add PolynomialFeatures if it moves to _polynomial.py -@pytest.mark.parametrize("est", (SplineTransformer,)) +@pytest.mark.parametrize("est", (PolynomialFeatures, SplineTransformer)) def test_polynomial_and_spline_array_order(est): """Test that output array has the given order.""" X = np.arange(10).reshape(5, 2) @@ -444,3 +449,188 @@ def test_spline_transformer_n_features_out(n_knots, include_bias, degree): splt.fit(X) assert splt.transform(X).shape[1] == splt.n_features_out_ + + +def test_polynomial_features(): + # Test Polynomial Features + X1 = np.arange(6)[:, np.newaxis] + P1 = np.hstack([np.ones_like(X1), + X1, X1 ** 2, X1 ** 3]) + deg1 = 3 + + X2 = np.arange(6).reshape((3, 2)) + x1 = X2[:, :1] + x2 = X2[:, 1:] + P2 = np.hstack([x1 ** 0 * x2 ** 0, + x1 ** 1 * x2 ** 0, + x1 ** 0 * x2 ** 1, + x1 ** 2 * x2 ** 0, + x1 ** 1 * x2 ** 1, + x1 ** 0 * x2 ** 2]) + deg2 = 2 + + for (deg, X, P) in [(deg1, X1, P1), (deg2, X2, P2)]: + P_test = PolynomialFeatures(deg, include_bias=True).fit_transform(X) + assert_array_almost_equal(P_test, P) + + P_test = PolynomialFeatures(deg, include_bias=False).fit_transform(X) + assert_array_almost_equal(P_test, P[:, 1:]) + + interact = PolynomialFeatures(2, interaction_only=True, include_bias=True) + X_poly = 
interact.fit_transform(X) + assert_array_almost_equal(X_poly, P2[:, [0, 1, 2, 4]]) + + assert interact.powers_.shape == (interact.n_output_features_, + interact.n_input_features_) + + +def test_polynomial_feature_names(): + X = np.arange(30).reshape(10, 3) + poly = PolynomialFeatures(degree=2, include_bias=True).fit(X) + feature_names = poly.get_feature_names() + assert_array_equal(['1', 'x0', 'x1', 'x2', 'x0^2', 'x0 x1', + 'x0 x2', 'x1^2', 'x1 x2', 'x2^2'], + feature_names) + + poly = PolynomialFeatures(degree=3, include_bias=False).fit(X) + feature_names = poly.get_feature_names(["a", "b", "c"]) + assert_array_equal(['a', 'b', 'c', 'a^2', 'a b', 'a c', 'b^2', + 'b c', 'c^2', 'a^3', 'a^2 b', 'a^2 c', + 'a b^2', 'a b c', 'a c^2', 'b^3', 'b^2 c', + 'b c^2', 'c^3'], feature_names) + # test some unicode + poly = PolynomialFeatures(degree=1, include_bias=True).fit(X) + feature_names = poly.get_feature_names( + ["\u0001F40D", "\u262E", "\u05D0"]) + assert_array_equal(["1", "\u0001F40D", "\u262E", "\u05D0"], + feature_names) + + +@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], + [(1, True, False, int), + (2, True, False, int), + (2, True, False, np.float32), + (2, True, False, np.float64), + (3, False, False, np.float64), + (3, False, True, np.float64), + (4, False, False, np.float64), + (4, False, True, np.float64)]) +def test_polynomial_features_csc_X(deg, include_bias, interaction_only, dtype): + rng = np.random.RandomState(0) + X = rng.randint(0, 2, (100, 2)) + X_csc = sparse.csc_matrix(X) + + est = PolynomialFeatures(deg, include_bias=include_bias, + interaction_only=interaction_only) + Xt_csc = est.fit_transform(X_csc.astype(dtype)) + Xt_dense = est.fit_transform(X.astype(dtype)) + + assert isinstance(Xt_csc, sparse.csc_matrix) + assert Xt_csc.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csc.A, Xt_dense) + + +@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], + [(1, True, False, int), + (2, True, False, int), + (2, True, False, np.float32), + (2, True, False, np.float64), + (3, False, False, np.float64), + (3, False, True, np.float64)]) +def test_polynomial_features_csr_X(deg, include_bias, interaction_only, dtype): + rng = np.random.RandomState(0) + X = rng.randint(0, 2, (100, 2)) + X_csr = sparse.csr_matrix(X) + + est = PolynomialFeatures(deg, include_bias=include_bias, + interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr.astype(dtype)) + Xt_dense = est.fit_transform(X.astype(dtype, copy=False)) + + assert isinstance(Xt_csr, sparse.csr_matrix) + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.A, Xt_dense) + + +@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], + [(2, True, False, np.float32), + (2, True, False, np.float64), + (3, False, False, np.float64), + (3, False, True, np.float64)]) +def test_polynomial_features_csr_X_floats(deg, include_bias, + interaction_only, dtype): + X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr() + X = X_csr.toarray() + + est = PolynomialFeatures(deg, include_bias=include_bias, + interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr.astype(dtype)) + Xt_dense = est.fit_transform(X.astype(dtype)) + + assert isinstance(Xt_csr, sparse.csr_matrix) + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.A, Xt_dense) + + +@pytest.mark.parametrize(['zero_row_index', 'deg', 'interaction_only'], + [(0, 2, True), (1, 2, True), (2, 2, True), + (0, 3, True), (1, 3, True), (2, 3, True), 
+ (0, 2, False), (1, 2, False), (2, 2, False), + (0, 3, False), (1, 3, False), (2, 3, False)]) +def test_polynomial_features_csr_X_zero_row(zero_row_index, deg, + interaction_only): + X_csr = sparse_random(3, 10, 1.0, random_state=0).tocsr() + X_csr[zero_row_index, :] = 0.0 + X = X_csr.toarray() + + est = PolynomialFeatures(deg, include_bias=False, + interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr) + Xt_dense = est.fit_transform(X) + + assert isinstance(Xt_csr, sparse.csr_matrix) + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.A, Xt_dense) + + +# This degree should always be one more than the highest degree supported by +# _csr_expansion. +@pytest.mark.parametrize(['include_bias', 'interaction_only'], + [(True, True), (True, False), + (False, True), (False, False)]) +def test_polynomial_features_csr_X_degree_4(include_bias, interaction_only): + X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr() + X = X_csr.toarray() + + est = PolynomialFeatures(4, include_bias=include_bias, + interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr) + Xt_dense = est.fit_transform(X) + + assert isinstance(Xt_csr, sparse.csr_matrix) + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.A, Xt_dense) + + +@pytest.mark.parametrize(['deg', 'dim', 'interaction_only'], + [(2, 1, True), + (2, 2, True), + (3, 1, True), + (3, 2, True), + (3, 3, True), + (2, 1, False), + (2, 2, False), + (3, 1, False), + (3, 2, False), + (3, 3, False)]) +def test_polynomial_features_csr_X_dim_edges(deg, dim, interaction_only): + X_csr = sparse_random(1000, dim, 0.5, random_state=0).tocsr() + X = X_csr.toarray() + + est = PolynomialFeatures(deg, interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr) + Xt_dense = est.fit_transform(X) + + assert isinstance(Xt_csr, sparse.csr_matrix) + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.A, Xt_dense) From 0d7d46f3bef0a2f943ee321f0f979ced165e0477 Mon Sep 17 00:00:00 2001 From: Mathieu Blondel Date: Thu, 18 Mar 2021 18:42:47 +0100 Subject: [PATCH 256/478] Fix typo in elastic net docstring. (#19711) * Fix typo in elastic net docstring. * Use norms more explicitly. --- sklearn/linear_model/_coordinate_descent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 4fdeb783db194..6a23fedd9902e 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -570,7 +570,7 @@ class ElasticNet(MultiOutputMixin, RegressorMixin, LinearModel): If you are interested in controlling the L1 and L2 penalty separately, keep in mind that this is equivalent to:: - a * L1 + b * L2 + a * ||w||_1 + 0.5 * b * ||w||_2^2 where:: From bf4049cbef568fa211ec155cb724001fff742dbd Mon Sep 17 00:00:00 2001 From: flyingdutchman23 Date: Fri, 19 Mar 2021 14:51:00 +0100 Subject: [PATCH 257/478] DOC Correct scorer documentation (#19720) Co-authored-by: Joris Clement --- sklearn/metrics/_scorer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index c686d3b7c0b34..8a814242cb6f1 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -215,7 +215,7 @@ def _score(self, method_caller, estimator, X, y_true, sample_weight=None): arguments, potentially caching results. estimator : object - Trained estimator to use for scoring. 
Must have a predict_proba + Trained estimator to use for scoring. Must have a `predict` method; the output of that is used to compute the score. X : {array-like, sparse matrix} @@ -254,7 +254,7 @@ def _score(self, method_caller, clf, X, y, sample_weight=None): arguments, potentially caching results. clf : object - Trained classifier to use for scoring. Must have a predict_proba + Trained classifier to use for scoring. Must have a `predict_proba` method; the output of that is used to compute the score. X : {array-like, sparse matrix} From b9d6db81ec2e75ec40404db49f97999a08f00c55 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 19 Mar 2021 15:21:34 +0100 Subject: [PATCH 258/478] [MRG] ENH Consistent loss name for squared error (#19310) --- benchmarks/bench_hist_gradient_boosting.py | 2 +- .../bench_hist_gradient_boosting_threading.py | 2 +- doc/modules/ensemble.rst | 15 ++-- doc/modules/sgd.rst | 8 +- doc/whats_new/v1.0.rst | 31 ++++++++ .../plot_model_complexity_influence.py | 2 +- .../plot_gradient_boosting_quantile.py | 24 +++--- .../plot_gradient_boosting_regression.py | 2 +- sklearn/ensemble/_base.py | 10 +++ sklearn/ensemble/_forest.py | 41 +++++++--- sklearn/ensemble/_gb.py | 79 +++++++++++++------ sklearn/ensemble/_gb_losses.py | 2 + .../gradient_boosting.py | 23 ++++-- .../ensemble/_hist_gradient_boosting/loss.py | 2 +- .../tests/test_gradient_boosting.py | 20 ++++- .../tests/test_loss.py | 18 ++--- .../_hist_gradient_boosting/utils.pyx | 6 +- sklearn/ensemble/tests/test_forest.py | 18 ++++- .../ensemble/tests/test_gradient_boosting.py | 37 ++++++++- .../tests/test_partial_dependence.py | 3 +- sklearn/linear_model/_ransac.py | 27 ++++--- sklearn/linear_model/_stochastic_gradient.py | 34 ++++++-- sklearn/linear_model/tests/test_ransac.py | 15 +++- sklearn/linear_model/tests/test_sgd.py | 45 ++++++++--- sklearn/neural_network/_base.py | 2 +- .../neural_network/_multilayer_perceptron.py | 4 +- sklearn/tree/_classes.py | 50 ++++++++---- sklearn/tree/_export.py | 3 + sklearn/tree/tests/test_export.py | 42 +++++----- sklearn/tree/tests/test_tree.py | 40 +++++++--- 30 files changed, 444 insertions(+), 163 deletions(-) diff --git a/benchmarks/bench_hist_gradient_boosting.py b/benchmarks/bench_hist_gradient_boosting.py index 158b6fbb22d2b..82eb64faeb462 100644 --- a/benchmarks/bench_hist_gradient_boosting.py +++ b/benchmarks/bench_hist_gradient_boosting.py @@ -110,7 +110,7 @@ def one_run(n_samples): else: # regression if loss == 'default': - loss = 'least_squares' + loss = 'squared_error' est.set_params(loss=loss) est.fit(X_train, y_train, sample_weight=sample_weight_train) sklearn_fit_duration = time() - tic diff --git a/benchmarks/bench_hist_gradient_boosting_threading.py b/benchmarks/bench_hist_gradient_boosting_threading.py index 3cc6afa3871c6..61803fb5cb9cc 100644 --- a/benchmarks/bench_hist_gradient_boosting_threading.py +++ b/benchmarks/bench_hist_gradient_boosting_threading.py @@ -112,7 +112,7 @@ def get_estimator_and_data(): else: # regression if loss == 'default': - loss = 'least_squares' + loss = 'squared_error' sklearn_est.set_params(loss=loss) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 0e0aaaafaffba..c891b4d275b9a 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -537,7 +537,8 @@ Regression :class:`GradientBoostingRegressor` supports a number of :ref:`different loss functions ` for regression which can be specified via the argument -``loss``; the default loss function for regression is least squares (``'ls'``). 
+``loss``; the default loss function for regression is squared error +(``'squared_error'``). :: @@ -549,8 +550,10 @@ for regression which can be specified via the argument >>> X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0) >>> X_train, X_test = X[:200], X[200:] >>> y_train, y_test = y[:200], y[200:] - >>> est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, - ... max_depth=1, random_state=0, loss='ls').fit(X_train, y_train) + >>> est = GradientBoostingRegressor( + ... n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0, + ... loss='squared_error' + ... ).fit(X_train, y_train) >>> mean_squared_error(y_test, est.predict(X_test)) 5.00... @@ -741,8 +744,8 @@ the parameter ``loss``: * Regression - * Least squares (``'ls'``): The natural choice for regression due - to its superior computational properties. The initial model is + * Squared error (``'squared_error'``): The natural choice for regression + due to its superior computational properties. The initial model is given by the mean of the target values. * Least absolute deviation (``'lad'``): A robust loss function for regression. The initial model is given by the median of the @@ -950,7 +953,7 @@ controls the number of iterations of the boosting process:: >>> clf.score(X_test, y_test) 0.8965 -Available losses for regression are 'least_squares', +Available losses for regression are 'squared_error', 'least_absolute_deviation', which is less sensitive to outliers, and 'poisson', which is well suited to model counts and frequencies. For classification, 'binary_crossentropy' is used for binary classification and diff --git a/doc/modules/sgd.rst b/doc/modules/sgd.rst index 95a5111747509..1376947540e78 100644 --- a/doc/modules/sgd.rst +++ b/doc/modules/sgd.rst @@ -30,7 +30,7 @@ For example, using `SGDClassifier(loss='log')` results in logistic regression, i.e. a model equivalent to :class:`~sklearn.linear_model.LogisticRegression` which is fitted via SGD instead of being fitted by one of the other solvers in :class:`~sklearn.linear_model.LogisticRegression`. Similarly, -`SGDRegressor(loss='squared_loss', penalty='l2')` and +`SGDRegressor(loss='squared_error', penalty='l2')` and :class:`~sklearn.linear_model.Ridge` solve the same optimization problem, via different means. @@ -211,7 +211,7 @@ samples (> 10.000), for other problems we recommend :class:`Ridge`, The concrete loss function can be set via the ``loss`` parameter. :class:`SGDRegressor` supports the following loss functions: - * ``loss="squared_loss"``: Ordinary least squares, + * ``loss="squared_error"``: Ordinary least squares, * ``loss="huber"``: Huber loss for robust regression, * ``loss="epsilon_insensitive"``: linear Support Vector Regression. @@ -362,9 +362,9 @@ Different choices for :math:`L` entail different classifiers or regressors: - Hinge (soft-margin): equivalent to Support Vector Classification. :math:`L(y_i, f(x_i)) = \max(0, 1 - y_i f(x_i))`. -- Perceptron: +- Perceptron: :math:`L(y_i, f(x_i)) = \max(0, - y_i f(x_i))`. -- Modified Huber: +- Modified Huber: :math:`L(y_i, f(x_i)) = \max(0, 1 - y_i f(x_i))^2` if :math:`y_i f(x_i) > 1`, and :math:`L(y_i, f(x_i)) = -4 y_i f(x_i)` otherwise. - Log: equivalent to Logistic Regression. diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index c7b786ea6d1bf..b4ee0c57b97fc 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -45,6 +45,37 @@ Changelog :pr:`123456` by :user:`Joe Bloggs `. where 123456 is the *pull request* number, not the issue number. 
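A minimal sketch of the new spellings introduced for the squared error loss; the old names are still valid and produce the same models, but are deprecated (see the entries below)::

    >>> from sklearn.linear_model import SGDRegressor
    >>> from sklearn.ensemble import RandomForestRegressor
    >>> reg = SGDRegressor(loss="squared_error", penalty="l2")     # was loss="squared_loss"
    >>> forest = RandomForestRegressor(criterion="squared_error")  # was criterion="mse"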
+- |API| The option for using the squared error via ``loss`` and + ``criterion`` parameters was made more consistent. The preferred way is by + setting the value to `"squared_error"`. Old option names are still valid, + produce the same models, but are deprecated and will be removed in version + 1.2. + :pr:`19310` by :user:`Christian Lorentzen `. + + - For :class:`ensemble.ExtraTreesRegressor`, `criterion="mse"` is deprecated, + use `"squared_error"` instead which is now the default. + + - For :class:`ensemble.GradientBoostingRegressor`, `loss="ls"` is deprecated, + use `"squared_error"` instead which is now the default. + + - For :class:`ensemble.RandomForestRegressor`, `criterion="mse"` is deprecated, + use `"squared_error"` instead which is now the default. + + - For :class:`ensemble.HistGradientBoostingRegressor`, `loss="least_squares"` + is deprecated, use `"squared_error"` instead which is now the default. + + - For :class:`linear_model.RANSACRegressor`, `loss="squared_loss"` is + deprecated, use `"squared_error"` instead. + + - For :class:`linear_model.SGDRegressor`, `loss="squared_loss"` is + deprecated, use `"squared_error"` instead which is now the default. + + - For :class:`tree.DecisionTreeRegressor`, `criterion="mse"` is deprecated, + use `"squared_error"` instead which is now the default. + + - For :class:`tree.ExtraTreeRegressor`, `criterion="mse"` is deprecated, + use `"squared_error"` instead which is now the default. + :mod:`sklearn.cluster` ...................... diff --git a/examples/applications/plot_model_complexity_influence.py b/examples/applications/plot_model_complexity_influence.py index 927fcd8e85e15..5748a546bdaad 100644 --- a/examples/applications/plot_model_complexity_influence.py +++ b/examples/applications/plot_model_complexity_influence.py @@ -177,7 +177,7 @@ def _count_nonzero_coefficients(estimator): 'prediction_performance_label': 'MSE', 'n_samples': 30}, {'estimator': GradientBoostingRegressor, - 'tuned_params': {'loss': 'ls'}, + 'tuned_params': {'loss': 'squared_error'}, 'changing_param': 'n_estimators', 'changing_param_values': [10, 50, 100, 200, 500], 'complexity_label': 'n_trees', diff --git a/examples/ensemble/plot_gradient_boosting_quantile.py b/examples/ensemble/plot_gradient_boosting_quantile.py index f29a87fe6cff7..00be70721c1da 100644 --- a/examples/ensemble/plot_gradient_boosting_quantile.py +++ b/examples/ensemble/plot_gradient_boosting_quantile.py @@ -71,10 +71,10 @@ def f(x): all_models["q %1.2f" % alpha] = gbr.fit(X_train, y_train) # %% -# For the sake of comparison, also fit a baseline model trained with the usual -# least squares loss (ls), also known as the mean squared error (MSE). -gbr_ls = GradientBoostingRegressor(loss='ls', **common_params) -all_models["ls"] = gbr_ls.fit(X_train, y_train) +# For the sake of comparison, we also fit a baseline model trained with the +# usual (mean) squared error (MSE). +gbr_ls = GradientBoostingRegressor(loss='squared_error', **common_params) +all_models["mse"] = gbr_ls.fit(X_train, y_train) # %% # Create an evenly spaced evaluation set of input values spanning the [0, 10] @@ -82,13 +82,13 @@ def f(x): xx = np.atleast_2d(np.linspace(0, 10, 1000)).T # %% -# Plot the true conditional mean function f, the prediction of the conditional -# mean (least squares loss), the conditional median and the conditional 90% -# interval (from 5th to 95th conditional percentiles). 
+# Plot the true conditional mean function f, the predictions of the conditional +# mean (loss equals squared error), the conditional median and the conditional +# 90% interval (from 5th to 95th conditional percentiles). import matplotlib.pyplot as plt -y_pred = all_models['ls'].predict(xx) +y_pred = all_models['mse'].predict(xx) y_lower = all_models['q 0.05'].predict(xx) y_upper = all_models['q 0.95'].predict(xx) y_med = all_models['q 0.50'].predict(xx) @@ -153,7 +153,7 @@ def highlight_min(x): # # Note that because the target distribution is asymmetric, the expected # conditional mean and conditional median are signficiantly different and -# therefore one could not use the least squares model get a good estimation of +# therefore one could not use the squared error model get a good estimation of # the conditional median nor the converse. # # If the target distribution were symmetric and had no outliers (e.g. with a @@ -179,9 +179,9 @@ def highlight_min(x): # shows that the best test metric is obtained when the model is trained by # minimizing this same metric. # -# Note that the conditional median estimator is competitive with the least -# squares estimator in terms of MSE on the test set: this can be explained by -# the fact the least squares estimator is very sensitive to large outliers +# Note that the conditional median estimator is competitive with the squared +# error estimator in terms of MSE on the test set: this can be explained by +# the fact the squared error estimator is very sensitive to large outliers # which can cause significant overfitting. This can be seen on the right hand # side of the previous plot. The conditional median estimator is biased # (underestimation for this asymetric noise) but is also naturally robust to diff --git a/examples/ensemble/plot_gradient_boosting_regression.py b/examples/ensemble/plot_gradient_boosting_regression.py index 92d35b26deb9c..3722f4bf2066f 100644 --- a/examples/ensemble/plot_gradient_boosting_regression.py +++ b/examples/ensemble/plot_gradient_boosting_regression.py @@ -67,7 +67,7 @@ 'max_depth': 4, 'min_samples_split': 5, 'learning_rate': 0.01, - 'loss': 'ls'} + 'loss': 'squared_error'} # %% # Fit regression model diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 93891a2b719ab..095d801de166d 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -15,6 +15,7 @@ from ..base import is_classifier, is_regressor from ..base import BaseEstimator from ..base import MetaEstimatorMixin +from ..tree import DecisionTreeRegressor, ExtraTreeRegressor from ..utils import Bunch, _print_elapsed_time from ..utils import check_random_state from ..utils.metaestimators import _BaseComposition @@ -151,6 +152,15 @@ def _make_estimator(self, append=True, random_state=None): estimator.set_params(**{p: getattr(self, p) for p in self.estimator_params}) + # TODO: Remove in v1.2 + # criterion "mse" would cause warnings in every call to + # DecisionTreeRegressor.fit(..) 
+ if ( + isinstance(estimator, (DecisionTreeRegressor, ExtraTreeRegressor)) + and getattr(estimator, "criterion", None) == "mse" + ): + estimator.set_params(criterion="squared_error") + if random_state is not None: _set_random_states(estimator, random_state) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index a93e9b7ee877e..140c1c93e8eef 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -345,6 +345,17 @@ def fit(self, X, y, sample_weight=None): # Check parameters self._validate_estimator() + # TODO: Remove in v1.2 + if ( + isinstance(self, (RandomForestRegressor, ExtraTreesRegressor)) + and self.criterion == "mse" + ): + warn( + "Criterion 'mse' was deprecated in v1.0 and will be " + "removed in version 1.2. Use `criterion='squared_error'` " + "which is equivalent.", + FutureWarning + ) if not self.bootstrap and self.oob_score: raise ValueError("Out of bag estimation only available" @@ -1310,15 +1321,19 @@ class RandomForestRegressor(ForestRegressor): The default value of ``n_estimators`` changed from 10 to 100 in 0.22. - criterion : {"mse", "mae"}, default="mse" + criterion : {"squared_error", "mse", "mae"}, default="squared_error" The function to measure the quality of a split. Supported criteria - are "mse" for the mean squared error, which is equal to variance - reduction as feature selection criterion, and "mae" for the mean - absolute error. + are "squared_error" for the mean squared error, which is equal to + variance reduction as feature selection criterion, and "mae" for the + mean absolute error. .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion. + .. deprecated:: 1.0 + Criterion "mse" was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion="squared_error"` which is equivalent. + max_depth : int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than @@ -1537,7 +1552,7 @@ class RandomForestRegressor(ForestRegressor): @_deprecate_positional_args def __init__(self, n_estimators=100, *, - criterion="mse", + criterion="squared_error", max_depth=None, min_samples_split=2, min_samples_leaf=1, @@ -1921,15 +1936,19 @@ class ExtraTreesRegressor(ForestRegressor): The default value of ``n_estimators`` changed from 10 to 100 in 0.22. - criterion : {"mse", "mae"}, default="mse" + criterion : {"squared_error", "mse", "mae"}, default="squared_error" The function to measure the quality of a split. Supported criteria - are "mse" for the mean squared error, which is equal to variance - reduction as feature selection criterion, and "mae" for the mean - absolute error. + are "squared_error" and "mse" for the mean squared error, which is + equal to variance reduction as feature selection criterion, and "mae" + for the mean absolute error. .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion. + .. deprecated:: 1.0 + Criterion "mse" was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion="squared_error"` which is equivalent. + max_depth : int, default=None The maximum depth of the tree. 
If None, then nodes are expanded until all leaves are pure or until all leaves contain less than @@ -2141,7 +2160,7 @@ class ExtraTreesRegressor(ForestRegressor): @_deprecate_positional_args def __init__(self, n_estimators=100, *, - criterion="mse", + criterion="squared_error", max_depth=None, min_samples_split=2, min_samples_leaf=1, @@ -2353,7 +2372,7 @@ class RandomTreesEmbedding(BaseForest): [0., 1., 1., 0., 1., 0., 0., 1., 1., 0.]]) """ - criterion = 'mse' + criterion = "squared_error" max_features = 1 @_deprecate_positional_args diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index e9f7402188860..4984575bce8c3 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -238,6 +238,12 @@ def _check_params(self): or self.loss not in _gb_losses.LOSS_FUNCTIONS): raise ValueError("Loss '{0:s}' not supported. ".format(self.loss)) + if self.loss == "ls": + warnings.warn("The loss 'ls' was deprecated in v1.0 and " + "will be removed in version 1.2. Use 'squared_error'" + " which is equivalent.", + FutureWarning) + if self.loss == 'deviance': loss_class = (_gb_losses.MultinomialDeviance if len(self.classes_) > 2 @@ -401,6 +407,15 @@ def fit(self, X, y, sample_weight=None, monitor=None): # TODO: This should raise an error from 1.1 self._warn_mae_for_criterion() + if self.criterion == 'mse': + # TODO: Remove in v1.2. By then it should raise an error. + warnings.warn( + "Criterion 'mse' was deprecated in v1.0 and will be " + "removed in version 1.2. Use `criterion='squared_error'` " + "which is equivalent.", + FutureWarning + ) + # if not warmstart - clear the estimator state if not self.warm_start: self._clear_state() @@ -808,20 +823,26 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): Choosing `subsample < 1.0` leads to a reduction of variance and an increase in bias. - criterion : {'friedman_mse', 'mse', 'mae'}, default='friedman_mse' + criterion : {'friedman_mse', 'squared_error', 'mse', 'mae'}, \ + default='friedman_mse' The function to measure the quality of a split. Supported criteria are 'friedman_mse' for the mean squared error with improvement - score by Friedman, 'mse' for mean squared error, and 'mae' for - the mean absolute error. The default value of 'friedman_mse' is - generally the best as it can provide a better approximation in - some cases. + score by Friedman, 'squared_error' for mean squared error, and 'mae' + for the mean absolute error. The default value of 'friedman_mse' is + generally the best as it can provide a better approximation in some + cases. .. versionadded:: 0.18 + .. deprecated:: 0.24 `criterion='mae'` is deprecated and will be removed in version - 1.1 (renaming of 0.26). Use `criterion='friedman_mse'` or `'mse'` - instead, as trees should use a least-square criterion in - Gradient Boosting. + 1.1 (renaming of 0.26). Use `criterion='friedman_mse'` or + `'squared_error'` instead, as trees should use a squared error + criterion in Gradient Boosting. + + .. deprecated:: 1.0 + Criterion 'mse' was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion='squared_error'` which is equivalent. min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: @@ -1128,9 +1149,9 @@ def _warn_mae_for_criterion(self): # TODO: This should raise an error from 1.1 warnings.warn("criterion='mae' was deprecated in version 0.24 and " "will be removed in version 1.1 (renaming of 0.26). 
Use " - "criterion='friedman_mse' or 'mse' instead, as trees " - "should use a least-square criterion in Gradient " - "Boosting.", FutureWarning) + "criterion='friedman_mse' or 'squared_error' instead, as" + " trees should use a squared error criterion in Gradient" + " Boosting.", FutureWarning) def decision_function(self, X): """Compute the decision function of ``X``. @@ -1319,13 +1340,19 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): Parameters ---------- - loss : {'ls', 'lad', 'huber', 'quantile'}, default='ls' - Loss function to be optimized. 'ls' refers to least squares - regression. 'lad' (least absolute deviation) is a highly robust + loss : {'squared_error', 'ls', 'lad', 'huber', 'quantile'}, \ + default='squared_error' + Loss function to be optimized. 'squared_error' refers to the squared + error for regression. + 'lad' (least absolute deviation) is a highly robust loss function solely based on order information of the input variables. 'huber' is a combination of the two. 'quantile' allows quantile regression (use `alpha` to specify the quantile). + .. deprecated:: 1.0 + The loss 'ls' was deprecated in v1.0 and will be removed in + version 1.2. Use `loss='squared_error'` which is equivalent. + learning_rate : float, default=0.1 Learning rate shrinks the contribution of each tree by `learning_rate`. There is a trade-off between learning_rate and n_estimators. @@ -1342,20 +1369,26 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): Choosing `subsample < 1.0` leads to a reduction of variance and an increase in bias. - criterion : {'friedman_mse', 'mse', 'mae'}, default='friedman_mse' + criterion : {'friedman_mse', 'squared_error', 'mse', 'mae'}, \ + default='friedman_mse' The function to measure the quality of a split. Supported criteria are "friedman_mse" for the mean squared error with improvement - score by Friedman, "mse" for mean squared error, and "mae" for - the mean absolute error. The default value of "friedman_mse" is - generally the best as it can provide a better approximation in - some cases. + score by Friedman, "squared_error" for mean squared error, and "mae" + for the mean absolute error. The default value of "friedman_mse" is + generally the best as it can provide a better approximation in some + cases. .. versionadded:: 0.18 + .. deprecated:: 0.24 `criterion='mae'` is deprecated and will be removed in version 1.1 (renaming of 0.26). The correct way of minimizing the absolute error is to use `loss='lad'` instead. + .. deprecated:: 1.0 + Criterion 'mse' was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion='squared_error'` which is equivalent. + min_samples_split : int or float, default=2 The minimum number of samples required to split an internal node: @@ -1427,7 +1460,7 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): ``init`` has to provide :term:`fit` and :term:`predict`. If 'zero', the initial raw predictions are set to zero. By default a ``DummyEstimator`` is used, predicting either the average target value - (for loss='ls'), or a quantile for the other losses. + (for loss='squared_error'), or a quantile for the other losses. random_state : int, RandomState instance or None, default=None Controls the random seed given to each Tree estimator at each @@ -1610,10 +1643,12 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): Elements of Statistical Learning Ed. 2, Springer, 2009. 
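A minimal sketch of the renamed default loss for this estimator; fitting with the old spelling ``'ls'`` still works but raises a FutureWarning, as exercised in the tests further below::

    >>> from sklearn.datasets import make_regression
    >>> from sklearn.ensemble import GradientBoostingRegressor
    >>> X, y = make_regression(n_samples=50, random_state=0)
    >>> est = GradientBoostingRegressor(loss="squared_error").fit(X, y)
    >>> # GradientBoostingRegressor(loss="ls") would also fit, with a FutureWarning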
""" - _SUPPORTED_LOSS = ('ls', 'lad', 'huber', 'quantile') + # TODO: remove "ls" in verion 1.2 + _SUPPORTED_LOSS = ("squared_error", 'ls', 'lad', 'huber', 'quantile') @_deprecate_positional_args - def __init__(self, *, loss='ls', learning_rate=0.1, n_estimators=100, + def __init__(self, *, loss="squared_error", learning_rate=0.1, + n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_depth=3, min_impurity_decrease=0., diff --git a/sklearn/ensemble/_gb_losses.py b/sklearn/ensemble/_gb_losses.py index 82b496ae8109d..f33c7086b596b 100644 --- a/sklearn/ensemble/_gb_losses.py +++ b/sklearn/ensemble/_gb_losses.py @@ -856,7 +856,9 @@ def get_init_raw_predictions(self, X, estimator): return raw_predictions.reshape(-1, 1).astype(np.float64) +# TODO: Remove entry 'ls' in version 1.2. LOSS_FUNCTIONS = { + "squared_error": LeastSquaresError, 'ls': LeastSquaresError, 'lad': LeastAbsoluteError, 'huber': HuberLossFunction, diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 4fff6030b0d5a..c35f79bd79251 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -3,6 +3,7 @@ from abc import ABC, abstractmethod from functools import partial +import warnings import numpy as np from timeit import default_timer as time @@ -903,8 +904,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): Parameters ---------- - loss : {'least_squares', 'least_absolute_deviation', 'poisson'}, \ - default='least_squares' + loss : {'squared_error', 'least_squares', 'least_absolute_deviation', \ + 'poisson'}, default='squared_error' The loss function to use in the boosting process. Note that the "least squares" and "poisson" losses actually implement "half least squares loss" and "half poisson deviance" to simplify the @@ -914,6 +915,10 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): .. versionchanged:: 0.23 Added option 'poisson'. + .. deprecated:: 1.0 + The loss 'least_squares' was deprecated in v1.0 and will be removed + in version 1.2. Use `loss='squared_error'` which is equivalent. + learning_rate : float, default=0.1 The learning rate, also known as *shrinkage*. This is used as a multiplicative factor for the leaves values. Use ``1`` for no @@ -1045,11 +1050,11 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): 0.92... """ - _VALID_LOSSES = ('least_squares', 'least_absolute_deviation', - 'poisson') + _VALID_LOSSES = ('squared_error', 'least_squares', + 'least_absolute_deviation', 'poisson') @_deprecate_positional_args - def __init__(self, loss='least_squares', *, learning_rate=0.1, + def __init__(self, loss='squared_error', *, learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0., max_bins=255, categorical_features=None, monotonic_cst=None, @@ -1121,6 +1126,14 @@ def _encode_y(self, y): return y def _get_loss(self, sample_weight): + if self.loss == "least_squares": + warnings.warn( + "The loss 'least_squares' was deprecated in v1.0 and will be " + "removed in version 1.2. 
Use 'squared_error' which is " + "equivalent.", + FutureWarning) + return _LOSSES["squared_error"](sample_weight=sample_weight) + return _LOSSES[self.loss](sample_weight=sample_weight) diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index 4bbf59dc01088..c336bd347e4cf 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -419,7 +419,7 @@ def predict_proba(self, raw_predictions): _LOSSES = { - 'least_squares': LeastSquares, + 'squared_error': LeastSquares, 'least_absolute_deviation': LeastAbsoluteDeviation, 'binary_crossentropy': BinaryCrossEntropy, 'categorical_crossentropy': CategoricalCrossEntropy, diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 0e5d1e91c3dd0..265b4cf20f8f3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -240,7 +240,7 @@ def test_poisson(): X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test, random_state=rng) gbdt_pois = HistGradientBoostingRegressor(loss='poisson', random_state=rng) - gbdt_ls = HistGradientBoostingRegressor(loss='least_squares', + gbdt_ls = HistGradientBoostingRegressor(loss='squared_error', random_state=rng) gbdt_pois.fit(X_train, y_train) gbdt_ls.fit(X_train, y_train) @@ -248,7 +248,7 @@ def test_poisson(): for X, y in [(X_train, y_train), (X_test, y_test)]: metric_pois = mean_poisson_deviance(y, gbdt_pois.predict(X)) - # least_squares might produce non-positive predictions => clip + # squared_error might produce non-positive predictions => clip metric_ls = mean_poisson_deviance(y, np.clip(gbdt_ls.predict(X), 1e-15, None)) metric_dummy = mean_poisson_deviance(y, dummy.predict(X)) @@ -652,7 +652,7 @@ def test_sample_weight_effect(problem, duplication): est_dup._raw_predict(X_dup)) -@pytest.mark.parametrize('loss_name', ('least_squares', +@pytest.mark.parametrize('loss_name', ('squared_error', 'least_absolute_deviation')) def test_sum_hessians_are_sample_weight(loss_name): # For losses with constant hessians, the sum_hessians field of the @@ -992,3 +992,17 @@ def test_uint8_predict(Est): est = Est() est.fit(X, y) est.predict(X) + + +# TODO: Remove in v1.2 +def test_loss_least_squares_deprecated(): + X, y = make_regression(n_samples=50, random_state=0) + est1 = HistGradientBoostingRegressor(loss="least_squares", random_state=0) + + with pytest.warns(FutureWarning, + match="The loss 'least_squares' was deprecated"): + est1.fit(X, y) + + est2 = HistGradientBoostingRegressor(loss="squared_error", random_state=0) + est2.fit(X, y) + assert_allclose(est1.predict(X), est2.predict(X)) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 221b94183a7ff..ce7b4acedbae5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -47,9 +47,9 @@ def get_hessians(y_true, raw_predictions): @pytest.mark.parametrize('loss, x0, y_true', [ - ('least_squares', -2., 42), - ('least_squares', 117., 1.05), - ('least_squares', 0., 0.), + ("squared_error", -2., 42), + ("squared_error", 117., 1.05), + ("squared_error", 0., 0.), # The argmin of binary_crossentropy for y_true=0 and y_true=1 is resp. -inf # and +inf due to logit, cf. 
"complete separation". Therefore, we use # 0 < y_true < 1. @@ -102,7 +102,7 @@ def fprime2(x: np.ndarray) -> np.ndarray: @pytest.mark.parametrize('loss, n_classes, prediction_dim', [ - ('least_squares', 0, 1), + ("squared_error", 0, 1), ('least_absolute_deviation', 0, 1), ('binary_crossentropy', 2, 1), ('categorical_crossentropy', 3, 3), @@ -118,7 +118,7 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): rng = np.random.RandomState(seed) n_samples = 100 - if loss in ('least_squares', 'least_absolute_deviation'): + if loss in ("squared_error", 'least_absolute_deviation'): y_true = rng.normal(size=n_samples).astype(Y_DTYPE) elif loss in ('poisson'): y_true = rng.poisson(size=n_samples).astype(Y_DTYPE) @@ -161,7 +161,7 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): def test_baseline_least_squares(): rng = np.random.RandomState(0) - loss = _LOSSES['least_squares'](sample_weight=None) + loss = _LOSSES["squared_error"](sample_weight=None) y_train = rng.normal(size=100) baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) assert baseline_prediction.shape == tuple() # scalar @@ -255,7 +255,7 @@ def test_baseline_categorical_crossentropy(): @pytest.mark.parametrize('loss, problem', [ - ('least_squares', 'regression'), + ("squared_error", 'regression'), ('least_absolute_deviation', 'regression'), ('binary_crossentropy', 'classification'), ('categorical_crossentropy', 'classification'), @@ -317,7 +317,7 @@ def test_init_gradient_and_hessians_sample_weight(): prediction_dim = 2 n_samples = 5 sample_weight = None - loss = _LOSSES['least_squares'](sample_weight=sample_weight) + loss = _LOSSES["squared_error"](sample_weight=sample_weight) _, hessians = loss.init_gradients_and_hessians( n_samples=n_samples, prediction_dim=prediction_dim, sample_weight=None) @@ -325,7 +325,7 @@ def test_init_gradient_and_hessians_sample_weight(): assert hessians.shape == (1, 1) sample_weight = np.ones(n_samples) - loss = _LOSSES['least_squares'](sample_weight=sample_weight) + loss = _LOSSES["squared_error"](sample_weight=sample_weight) _, hessians = loss.init_gradients_and_hessians( n_samples=n_samples, prediction_dim=prediction_dim, sample_weight=sample_weight) diff --git a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx index cf2c5a51c90dd..d1168acf94835 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx @@ -42,7 +42,7 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): raise NotImplementedError('Early stopping should be deactivated.') lightgbm_loss_mapping = { - 'least_squares': 'regression_l2', + 'squared_error': 'regression_l2', 'least_absolute_deviation': 'regression_l1', 'binary_crossentropy': 'binary', 'categorical_crossentropy': 'multiclass' @@ -75,7 +75,7 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): # XGB xgboost_loss_mapping = { - 'least_squares': 'reg:linear', + 'squared_error': 'reg:linear', 'least_absolute_deviation': 'LEAST_ABSOLUTE_DEV_NOT_SUPPORTED', 'binary_crossentropy': 'reg:logistic', 'categorical_crossentropy': 'multi:softmax' @@ -99,7 +99,7 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): # Catboost catboost_loss_mapping = { - 'least_squares': 'RMSE', + 'squared_error': 'RMSE', # catboost does not support MAE when leaf_estimation_method is Newton 'least_absolute_deviation': 'LEAST_ASBOLUTE_DEV_NOT_SUPPORTED', 'binary_crossentropy': 'Logloss', diff --git 
a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index efb1a645842bc..b6c1fea0e2f29 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -25,6 +25,7 @@ import pytest import joblib +from numpy.testing import assert_allclose from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal @@ -175,7 +176,7 @@ def check_regression_criterion(name, criterion): @pytest.mark.parametrize('name', FOREST_REGRESSORS) -@pytest.mark.parametrize('criterion', ("mse", "mae", "friedman_mse")) +@pytest.mark.parametrize('criterion', ("squared_error", "mae", "friedman_mse")) def test_regression(name, criterion): check_regression_criterion(name, criterion) @@ -260,7 +261,7 @@ def check_importances(name, criterion, dtype, tolerance): itertools.chain(product(FOREST_CLASSIFIERS, ["gini", "entropy"]), product(FOREST_REGRESSORS, - ["mse", "friedman_mse", "mae"]))) + ["squared_error", "friedman_mse", "mae"]))) def test_importances(dtype, name, criterion): tolerance = 0.01 if name in FOREST_REGRESSORS and criterion == "mae": @@ -1496,6 +1497,19 @@ def test_n_features_deprecation(Estimator): est.n_features_ +# TODO: Remove in v1.2 +def test_mse_deprecated(): + est1 = RandomForestRegressor(criterion="mse", random_state=0) + + with pytest.warns(FutureWarning, + match="Criterion 'mse' was deprecated"): + est1.fit(X, y) + + est2 = RandomForestRegressor(criterion="squared_error", random_state=0) + est2.fit(X, y) + assert_allclose(est1.predict(X), est2.predict(X)) + + @pytest.mark.parametrize('Forest', FOREST_REGRESSORS) def test_mse_criterion_object_segfault_smoke_test(Forest): # This is a smoke test to ensure that passing a mutable criterion diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 63d4e668e674f..166d6bdfc5c11 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -3,6 +3,7 @@ """ import warnings import numpy as np +from numpy.testing import assert_allclose from scipy.sparse import csr_matrix from scipy.sparse import csc_matrix @@ -170,7 +171,7 @@ def test_classification_synthetic(loss): assert error_rate < 0.08 -@pytest.mark.parametrize('loss', ('ls', 'lad', 'huber')) +@pytest.mark.parametrize('loss', ('squared_error', 'lad', 'huber')) @pytest.mark.parametrize('subsample', (1.0, 0.5)) def test_regression_dataset(loss, subsample): # Check consistency on regression dataset with least squares @@ -229,7 +230,7 @@ def test_regression_synthetic(): random_state = check_random_state(1) regression_params = {'n_estimators': 100, 'max_depth': 4, 'min_samples_split': 2, 'learning_rate': 0.1, - 'loss': 'ls'} + 'loss': 'squared_error'} # Friedman1 X, y = datasets.make_friedman1(n_samples=1200, @@ -1066,7 +1067,7 @@ def test_non_uniform_weights_toy_edge_case_reg(): y = [0, 0, 1, 0] # ignore the first 2 training samples by setting their weight to 0 sample_weight = [0, 0, 1, 1] - for loss in ('huber', 'ls', 'lad', 'quantile'): + for loss in ('huber', 'squared_error', 'lad', 'quantile'): gb = GradientBoostingRegressor(learning_rate=1.0, n_estimators=2, loss=loss) gb.fit(X, y, sample_weight=sample_weight) @@ -1369,3 +1370,33 @@ def test_n_features_deprecation(Estimator): with pytest.warns(FutureWarning, match="n_features_ was deprecated"): est.n_features_ + + +# TODO: Remove in v1.2 +@pytest.mark.parametrize("Estimator", GRADIENT_BOOSTING_ESTIMATORS) +def 
test_criterion_mse_deprecated(Estimator): + est1 = Estimator(criterion="mse", random_state=0) + + with pytest.warns(FutureWarning, + match="Criterion 'mse' was deprecated"): + est1.fit(X, y) + + est2 = Estimator(criterion="squared_error", random_state=0) + est2.fit(X, y) + if hasattr(est1, "predict_proba"): + assert_allclose(est1.predict_proba(X), est2.predict_proba(X)) + else: + assert_allclose(est1.predict(X), est2.predict(X)) + + +# TODO: Remove in v1.2 +def test_loss_ls_deprecated(): + est1 = GradientBoostingRegressor(loss="ls", random_state=0) + + with pytest.warns(FutureWarning, + match="The loss 'ls' was deprecated"): + est1.fit(X, y) + + est2 = GradientBoostingRegressor(loss="squared_error", random_state=0) + est2.fit(X, y) + assert_allclose(est1.predict(X), est2.predict(X)) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index f7727210148c6..51dd6e53e4304 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -264,7 +264,8 @@ def test_recursion_decision_tree_vs_forest_and_gbdt(seed): equiv_random_state = check_random_state(tree_seed).randint( np.iinfo(np.int32).max) gbdt = GradientBoostingRegressor(n_estimators=1, learning_rate=1, - criterion='mse', max_depth=max_depth, + criterion='squared_error', + max_depth=max_depth, random_state=equiv_random_state) tree = DecisionTreeRegressor(max_depth=max_depth, random_state=equiv_random_state) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index c9246c121c387..2fc8143f432c8 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -138,9 +138,8 @@ class RANSACRegressor(MetaEstimatorMixin, RegressorMixin, the total number of samples. loss : string, callable, default='absolute_loss' - String inputs, "absolute_loss" and "squared_loss" are supported which - find the absolute loss and squared loss per sample - respectively. + String inputs, 'absolute_loss' and 'squared_error' are supported which + find the absolute loss and squared error per sample respectively. If ``loss`` is a callable, then it should be a function that takes two arrays as inputs, the true and predicted value and returns a 1-D @@ -152,6 +151,10 @@ class RANSACRegressor(MetaEstimatorMixin, RegressorMixin, .. versionadded:: 0.18 + .. deprecated:: 1.0 + The loss 'squared_loss' was deprecated in v1.0 and will be removed + in version 1.2. Use `loss='squared_error'` which is equivalent. + random_state : int, RandomState instance, default=None The generator used to initialize the centers. Pass an int for reproducible output across multiple function calls. @@ -203,7 +206,7 @@ class RANSACRegressor(MetaEstimatorMixin, RegressorMixin, .. [1] https://en.wikipedia.org/wiki/RANSAC .. [2] https://www.sri.com/sites/default/files/publications/ransac-publication.pdf .. [3] http://www.bmva.org/bmvc/2009/Papers/Paper355/Paper355.pdf - """ + """ # noqa: E501 @_deprecate_positional_args def __init__(self, base_estimator=None, *, min_samples=None, residual_threshold=None, is_data_valid=None, @@ -296,8 +299,15 @@ def fit(self, X, y, sample_weight=None): else: loss_function = lambda \ y_true, y_pred: np.sum(np.abs(y_true - y_pred), axis=1) - - elif self.loss == "squared_loss": + # TODO: Remove squared_loss in v1.2. 
+ elif self.loss in ("squared_error", "squared_loss"): + if self.loss == "squared_loss": + warnings.warn( + "The loss 'squared_loss' was deprecated in v1.0 and will " + "be removed in version 1.2. Use `loss='squared_error'` " + "which is equivalent.", + FutureWarning + ) if y.ndim == 1: loss_function = lambda y_true, y_pred: (y_true - y_pred) ** 2 else: @@ -309,9 +319,8 @@ def fit(self, X, y, sample_weight=None): else: raise ValueError( - "loss should be 'absolute_loss', 'squared_loss' or a callable." - "Got %s. " % self.loss) - + "loss should be 'absolute_loss', 'squared_error' or a " + "callable. Got %s. " % self.loss) random_state = check_random_state(self.random_state) diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 65f6cc6966ba4..a426c9a8d95f2 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -155,6 +155,14 @@ def _validate_params(self, for_partial_fit=False): if self.loss not in self.loss_functions: raise ValueError("The loss %s is not supported. " % self.loss) + if self.loss == "squared_loss": + warnings.warn( + "The loss 'squared_loss' was deprecated in v1.0 and will be " + "removed in version 1.2. Use `loss='squared_error'` which is " + "equivalent.", + FutureWarning + ) + def _get_loss_function(self, loss): """Get concrete ``LossFunction`` object for str ``loss``. """ try: @@ -452,12 +460,14 @@ def fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter, class BaseSGDClassifier(LinearClassifierMixin, BaseSGD, metaclass=ABCMeta): + # TODO: Remove squared_loss in v1.2 loss_functions = { "hinge": (Hinge, 1.0), "squared_hinge": (SquaredHinge, 1.0), "perceptron": (Hinge, 0.0), "log": (Log, ), "modified_huber": (ModifiedHuber, ), + "squared_error": (SquaredLoss, ), "squared_loss": (SquaredLoss, ), "huber": (Huber, DEFAULT_EPSILON), "epsilon_insensitive": (EpsilonInsensitive, DEFAULT_EPSILON), @@ -766,7 +776,7 @@ class SGDClassifier(BaseSGDClassifier): linear SVM. The possible options are 'hinge', 'log', 'modified_huber', - 'squared_hinge', 'perceptron', or a regression loss: 'squared_loss', + 'squared_hinge', 'perceptron', or a regression loss: 'squared_error', 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'. The 'log' loss gives logistic regression, a probabilistic classifier. @@ -781,6 +791,10 @@ class SGDClassifier(BaseSGDClassifier): More details about the losses formulas can be found in the :ref:`User Guide `. + .. deprecated:: 1.0 + The loss 'squared_loss' was deprecated in v1.0 and will be removed + in version 1.2. Use `loss='squared_error'` which is equivalent. + penalty : {'l2', 'l1', 'elasticnet'}, default='l2' The penalty (aka regularization term) to be used. Defaults to 'l2' which is the standard regularizer for linear SVM models. 
'l1' and @@ -1117,7 +1131,9 @@ def _more_tags(self): class BaseSGDRegressor(RegressorMixin, BaseSGD): + # TODO: Remove squared_loss in v1.2 loss_functions = { + "squared_error": (SquaredLoss, ), "squared_loss": (SquaredLoss, ), "huber": (Huber, DEFAULT_EPSILON), "epsilon_insensitive": (EpsilonInsensitive, DEFAULT_EPSILON), @@ -1127,7 +1143,7 @@ class BaseSGDRegressor(RegressorMixin, BaseSGD): @abstractmethod @_deprecate_positional_args - def __init__(self, loss="squared_loss", *, penalty="l2", alpha=0.0001, + def __init__(self, loss="squared_error", *, penalty="l2", alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, random_state=None, learning_rate="invscaling", eta0=0.01, @@ -1389,12 +1405,12 @@ class SGDRegressor(BaseSGDRegressor): Parameters ---------- - loss : str, default='squared_loss' - The loss function to be used. The possible values are 'squared_loss', + loss : str, default='squared_error' + The loss function to be used. The possible values are 'squared_error', 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive' - The 'squared_loss' refers to the ordinary least squares fit. - 'huber' modifies 'squared_loss' to focus less on getting outliers + The 'squared_error' refers to the ordinary least squares fit. + 'huber' modifies 'squared_error' to focus less on getting outliers correct by switching from squared to linear loss past a distance of epsilon. 'epsilon_insensitive' ignores errors less than epsilon and is linear past that; this is the loss function used in SVR. @@ -1404,6 +1420,10 @@ class SGDRegressor(BaseSGDRegressor): More details about the losses formulas can be found in the :ref:`User Guide `. + .. deprecated:: 1.0 + The loss 'squared_loss' was deprecated in v1.0 and will be removed + in version 1.2. Use `loss='squared_error'` which is equivalent. + penalty : {'l2', 'l1', 'elasticnet'}, default='l2' The penalty (aka regularization term) to be used. Defaults to 'l2' which is the standard regularizer for linear SVM models. 
'l1' and @@ -1583,7 +1603,7 @@ class SGDRegressor(BaseSGDRegressor): """ @_deprecate_positional_args - def __init__(self, loss="squared_loss", *, penalty="l2", alpha=0.0001, + def __init__(self, loss="squared_error", *, penalty="l2", alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, random_state=None, learning_rate="invscaling", eta0=0.01, diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index f631199a5d268..857696bf387d5 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -406,7 +406,7 @@ def loss_mono(y_true, y_pred): ransac_estimator2.predict(X)) ransac_estimator3 = RANSACRegressor(base_estimator, min_samples=2, residual_threshold=5, random_state=0, - loss="squared_loss") + loss="squared_error") ransac_estimator3.fit(X, y) assert_array_almost_equal(ransac_estimator0.predict(X), ransac_estimator2.predict(X)) @@ -536,3 +536,16 @@ def test_ransac_final_model_fit_sample_weight(): ) assert_allclose(ransac.estimator_.coef_, final_model.coef_, atol=1e-12) + + +# TODO: Remove in v1.2 +def test_loss_squared_loss_deprecated(): + est1 = RANSACRegressor(loss="squared_loss", random_state=0) + + with pytest.warns(FutureWarning, + match="The loss 'squared_loss' was deprecated"): + est1.fit(X, y) + + est2 = RANSACRegressor(loss="squared_error", random_state=0) + est2.fit(X, y) + assert_allclose(est1.predict(X), est2.predict(X)) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 217249631390d..aba043024fea3 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -2,6 +2,7 @@ import pytest import numpy as np +from numpy.testing import assert_allclose import scipy.sparse as sp import joblib @@ -310,10 +311,10 @@ def test_late_onset_averaging_reached(klass): Y_encode[Y_encode == 2] = 1.0 clf1 = klass(average=7, learning_rate="constant", - loss='squared_loss', eta0=eta0, + loss='squared_error', eta0=eta0, alpha=alpha, max_iter=2, shuffle=False) clf2 = klass(average=0, learning_rate="constant", - loss='squared_loss', eta0=eta0, + loss='squared_error', eta0=eta0, alpha=alpha, max_iter=1, shuffle=False) clf1.fit(X, Y_encode) @@ -540,7 +541,7 @@ def test_average_binary_computed_correctly(klass): X = rng.normal(size=(n_samples, n_features)) w = rng.normal(size=n_features) - clf = klass(loss='squared_loss', + clf = klass(loss='squared_error', learning_rate='constant', eta0=eta, alpha=alpha, fit_intercept=True, @@ -611,7 +612,7 @@ def test_sgd_multiclass_average(klass): eta = .001 alpha = .01 # Multi-class average test case - clf = klass(loss='squared_loss', + clf = klass(loss='squared_error', learning_rate='constant', eta0=eta, alpha=alpha, fit_intercept=True, @@ -675,6 +676,8 @@ def test_set_coef_multiclass(klass): clf = klass().fit(X2, Y2, intercept_init=np.zeros((3,))) +# TODO: Remove filterwarnings in v1.2. 
+@pytest.mark.filterwarnings("ignore:.*squared_loss.*:FutureWarning") @pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) def test_sgd_predict_proba_method_access(klass): # Checks that SGDClassifier predict_proba and predict_log_proba methods @@ -1067,7 +1070,7 @@ def test_regression_losses(klass): assert 1.0 == np.mean(clf.predict(X) == Y) clf = klass(alpha=0.01, learning_rate="constant", eta0=0.01, - loss="squared_loss", random_state=random_state) + loss="squared_error", random_state=random_state) clf.fit(X, Y) assert 1.0 == np.mean(clf.predict(X) == Y) @@ -1115,7 +1118,7 @@ def test_sgd_averaged_computed_correctly(klass): # simple linear function without noise y = np.dot(X, w) - clf = klass(loss='squared_loss', + clf = klass(loss='squared_error', learning_rate='constant', eta0=eta, alpha=alpha, fit_intercept=True, @@ -1144,7 +1147,7 @@ def test_sgd_averaged_partial_fit(klass): # simple linear function without noise y = np.dot(X, w) - clf = klass(loss='squared_loss', + clf = klass(loss='squared_error', learning_rate='constant', eta0=eta, alpha=alpha, fit_intercept=True, @@ -1166,7 +1169,7 @@ def test_average_sparse(klass): eta = .001 alpha = .01 - clf = klass(loss='squared_loss', + clf = klass(loss='squared_error', learning_rate='constant', eta0=eta, alpha=alpha, fit_intercept=True, @@ -1194,7 +1197,7 @@ def test_sgd_least_squares_fit(klass): # simple linear function without noise y = 0.5 * X.ravel() - clf = klass(loss='squared_loss', alpha=0.1, max_iter=20, + clf = klass(loss='squared_error', alpha=0.1, max_iter=20, fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) @@ -1203,7 +1206,7 @@ def test_sgd_least_squares_fit(klass): # simple linear function with noise y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() - clf = klass(loss='squared_loss', alpha=0.1, max_iter=20, + clf = klass(loss='squared_error', alpha=0.1, max_iter=20, fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) @@ -1646,3 +1649,25 @@ def test_SGDClassifier_fit_for_all_backends(backend): with joblib.parallel_backend(backend=backend): clf_parallel.fit(X, y) assert_array_almost_equal(clf_sequential.coef_, clf_parallel.coef_) + + +# TODO: Remove in v1.2 +@pytest.mark.parametrize( + 'Estimator', + [linear_model.SGDClassifier, linear_model.SGDRegressor] +) +def test_loss_squared_loss_deprecated(Estimator): + + # Note: class BaseSGD calls self._validate_params() in __init__, therefore + # even instatiation of class raises FutureWarning for squared_loss. 
+ with pytest.warns(FutureWarning, + match="The loss 'squared_loss' was deprecated"): + est1 = Estimator(loss="squared_loss", random_state=0) + est1.fit(X, Y) + + est2 = Estimator(loss="squared_error", random_state=0) + est2.fit(X, Y) + if hasattr(est1, "predict_proba"): + assert_allclose(est1.predict_proba(X), est2.predict_proba(X)) + else: + assert_allclose(est1.predict(X), est2.predict(X)) diff --git a/sklearn/neural_network/_base.py b/sklearn/neural_network/_base.py index 6afe8a23db446..b8b2180bac5e5 100644 --- a/sklearn/neural_network/_base.py +++ b/sklearn/neural_network/_base.py @@ -224,5 +224,5 @@ def binary_log_loss(y_true, y_prob): xlogy(1 - y_true, 1 - y_prob).sum()) / y_prob.shape[0] -LOSS_FUNCTIONS = {'squared_loss': squared_loss, 'log_loss': log_loss, +LOSS_FUNCTIONS = {'squared_error': squared_loss, 'log_loss': log_loss, 'binary_log_loss': binary_log_loss} diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index ae06502d3ce1a..52c94a7129b9f 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -1127,7 +1127,7 @@ def predict_proba(self, X): class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): """Multi-layer Perceptron regressor. - This model optimizes the squared-loss using LBFGS or stochastic gradient + This model optimizes the squared error using LBFGS or stochastic gradient descent. .. versionadded:: 0.18 @@ -1383,7 +1383,7 @@ def __init__(self, hidden_layer_sizes=(100,), activation="relu", *, activation=activation, solver=solver, alpha=alpha, batch_size=batch_size, learning_rate=learning_rate, learning_rate_init=learning_rate_init, power_t=power_t, - max_iter=max_iter, loss='squared_loss', shuffle=shuffle, + max_iter=max_iter, loss='squared_error', shuffle=shuffle, random_state=random_state, tol=tol, verbose=verbose, warm_start=warm_start, momentum=momentum, nesterovs_momentum=nesterovs_momentum, diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index f7ae823c0070f..420292881f7db 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -62,7 +62,9 @@ CRITERIA_CLF = {"gini": _criterion.Gini, "entropy": _criterion.Entropy} -CRITERIA_REG = {"mse": _criterion.MSE, +# TODO: Remove "mse" in version 1.2. +CRITERIA_REG = {"squared_error": _criterion.MSE, + "mse": _criterion.MSE, "friedman_mse": _criterion.FriedmanMSE, "mae": _criterion.MAE, "poisson": _criterion.Poisson} @@ -350,6 +352,14 @@ def fit(self, X, y, sample_weight=None, check_input=True, else: criterion = CRITERIA_REG[self.criterion](self.n_outputs_, n_samples) + # TODO: Remove in v1.2 + if self.criterion == "mse": + warnings.warn( + "Criterion 'mse' was deprecated in v1.0 and will be " + "removed in version 1.2. Use `criterion='squared_error'` " + "which is equivalent.", + FutureWarning + ) else: # Make a deepcopy in case the criterion has mutable attributes that # might be shared and modified concurrently during parallel fitting @@ -991,15 +1001,16 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): Parameters ---------- - criterion : {"mse", "friedman_mse", "mae", "poisson"}, default="mse" + criterion : {"squared_error", "mse", "friedman_mse", "mae", "poisson"}, \ + default="squared_error" The function to measure the quality of a split. 
Supported criteria - are "mse" for the mean squared error, which is equal to variance - reduction as feature selection criterion and minimizes the L2 loss - using the mean of each terminal node, "friedman_mse", which uses mean - squared error with Friedman's improvement score for potential splits, - "mae" for the mean absolute error, which minimizes the L1 loss using - the median of each terminal node, and "poisson" which uses reduction in - Poisson deviance to find splits. + are "squared_error" for the mean squared error, which is equal to + variance reduction as feature selection criterion and minimizes the L2 + loss using the mean of each terminal node, "friedman_mse", which uses + mean squared error with Friedman's improvement score for potential + splits, "mae" for the mean absolute error, which minimizes the L1 loss + using the median of each terminal node, and "poisson" which uses + reduction in Poisson deviance to find splits. .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion. @@ -1007,6 +1018,10 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): .. versionadded:: 0.24 Poisson deviance criterion. + .. deprecated:: 1.0 + Criterion "mse" was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion="squared_error"` which is equivalent. + splitter : {"best", "random"}, default="best" The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose @@ -1187,7 +1202,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): """ @_deprecate_positional_args def __init__(self, *, - criterion="mse", + criterion="squared_error", splitter="best", max_depth=None, min_samples_split=2, @@ -1545,11 +1560,12 @@ class ExtraTreeRegressor(DecisionTreeRegressor): Parameters ---------- - criterion : {"mse", "friedman_mse", "mae"}, default="mse" + criterion : {"squared_error", "mse", "friedman_mse", "mae"}, \ + default="squared_error" The function to measure the quality of a split. Supported criteria - are "mse" for the mean squared error, which is equal to variance - reduction as feature selection criterion and "mae" for the mean - absolute error. + are "squared_error" for the mean squared error, which is equal to + variance reduction as feature selection criterion and "mae" for the + mean absolute error. .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion. @@ -1557,6 +1573,10 @@ class ExtraTreeRegressor(DecisionTreeRegressor): .. versionadded:: 0.24 Poisson deviance criterion. + .. deprecated:: 1.0 + Criterion "mse" was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion="squared_error"` which is equivalent. + splitter : {"random", "best"}, default="random" The strategy used to choose the split at each node. 
Supported strategies are "best" to choose the best split and "random" to choose @@ -1722,7 +1742,7 @@ class ExtraTreeRegressor(DecisionTreeRegressor): """ @_deprecate_positional_args def __init__(self, *, - criterion="mse", + criterion="squared_error", splitter="random", max_depth=None, min_samples_split=2, diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index ff29790e3699e..affe1b68cfe9a 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -299,6 +299,9 @@ def node_to_str(self, tree, node_id, criterion): if self.impurity: if isinstance(criterion, _criterion.FriedmanMSE): criterion = "friedman_mse" + elif (isinstance(criterion, _criterion.MSE) + or criterion == "squared_error"): + criterion = "squared_error" elif not isinstance(criterion, str): criterion = "impurity" if labels: diff --git a/sklearn/tree/tests/test_export.py b/sklearn/tree/tests/test_export.py index 6a7bf33b2143f..7b94fbb527dc9 100644 --- a/sklearn/tree/tests/test_export.py +++ b/sklearn/tree/tests/test_export.py @@ -177,32 +177,34 @@ def test_graphviz_toy(): # Test regression output with plot_options clf = DecisionTreeRegressor(max_depth=3, min_samples_split=2, - criterion="mse", + criterion="squared_error", random_state=2) clf.fit(X, y) contents1 = export_graphviz(clf, filled=True, leaves_parallel=True, out_file=None, rotate=True, rounded=True, fontname="sans") - contents2 = 'digraph Tree {\n' \ - 'node [shape=box, style="filled, rounded", color="black", ' \ - 'fontname="sans"] ;\n' \ - 'graph [ranksep=equally, splines=polyline] ;\n' \ - 'edge [fontname="sans"] ;\n' \ - 'rankdir=LR ;\n' \ - '0 [label="X[0] <= 0.0\\nmse = 1.0\\nsamples = 6\\n' \ - 'value = 0.0", fillcolor="#f2c09c"] ;\n' \ - '1 [label="mse = 0.0\\nsamples = 3\\nvalue = -1.0", ' \ - 'fillcolor="#ffffff"] ;\n' \ - '0 -> 1 [labeldistance=2.5, labelangle=-45, ' \ - 'headlabel="True"] ;\n' \ - '2 [label="mse = 0.0\\nsamples = 3\\nvalue = 1.0", ' \ - 'fillcolor="#e58139"] ;\n' \ - '0 -> 2 [labeldistance=2.5, labelangle=45, ' \ - 'headlabel="False"] ;\n' \ - '{rank=same ; 0} ;\n' \ - '{rank=same ; 1; 2} ;\n' \ - '}' + contents2 = ('digraph Tree {\n' + 'node [shape=box, style="filled, rounded", color="black", ' + 'fontname="sans"] ;\n' + 'graph [ranksep=equally, splines=polyline] ;\n' + 'edge [fontname="sans"] ;\n' + 'rankdir=LR ;\n' + '0 [label="X[0] <= 0.0\\nsquared_error = 1.0\\nsamples = 6\\n' + 'value = 0.0", fillcolor="#f2c09c"] ;\n' + '1 [label="squared_error = 0.0\\nsamples = 3\\' + 'nvalue = -1.0", ' + 'fillcolor="#ffffff"] ;\n' + '0 -> 1 [labeldistance=2.5, labelangle=-45, ' + 'headlabel="True"] ;\n' + '2 [label="squared_error = 0.0\\nsamples = 3\\nvalue = 1.0", ' + 'fillcolor="#e58139"] ;\n' + '0 -> 2 [labeldistance=2.5, labelangle=45, ' + 'headlabel="False"] ;\n' + '{rank=same ; 0} ;\n' + '{rank=same ; 1; 2} ;\n' + '}' + ) assert contents1 == contents2 diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index be66316f7187a..2a1da1e2bfce0 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -51,7 +51,7 @@ from sklearn.utils import compute_sample_weight CLF_CRITERIONS = ("gini", "entropy") -REG_CRITERIONS = ("mse", "mae", "friedman_mse", "poisson") +REG_CRITERIONS = ("squared_error", "mae", "friedman_mse", "poisson") CLF_TREES = { "DecisionTreeClassifier": DecisionTreeClassifier, @@ -293,7 +293,7 @@ def test_diabetes_overfit(name, Tree, criterion): @pytest.mark.parametrize("name, Tree", REG_TREES.items()) @pytest.mark.parametrize( "criterion, max_depth, metric, 
max_loss", - [("mse", 15, mean_squared_error, 60), + [("squared_error", 15, mean_squared_error, 60), ("mae", 20, mean_squared_error, 60), ("friedman_mse", 15, mean_squared_error, 60), ("poisson", 15, mean_poisson_deviance, 30)] @@ -420,8 +420,8 @@ def test_importances_raises(): getattr(clf, 'feature_importances_') -def test_importances_gini_equal_mse(): - # Check that gini is equivalent to mse for binary output variable +def test_importances_gini_equal_squared_error(): + # Check that gini is equivalent to squared_error for binary output variable X, y = datasets.make_classification(n_samples=2000, n_features=10, @@ -436,7 +436,7 @@ def test_importances_gini_equal_mse(): # high tree depth, we restrict this maximal depth. clf = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=0).fit(X, y) - reg = DecisionTreeRegressor(criterion="mse", max_depth=5, + reg = DecisionTreeRegressor(criterion="squared_error", max_depth=5, random_state=0).fit(X, y) assert_almost_equal(clf.feature_importances_, reg.feature_importances_) @@ -1973,7 +1973,9 @@ def test_apply_path_readonly_all_trees(name): check_apply_path_readonly(name) -@pytest.mark.parametrize("criterion", ["mse", "friedman_mse", "poisson"]) +@pytest.mark.parametrize( + "criterion", ["squared_error", "friedman_mse", "poisson"] +) @pytest.mark.parametrize("Tree", REG_TREES.values()) def test_balance_property(criterion, Tree): # Test that sum(y_pred)=sum(y_true) on training set. @@ -1995,7 +1997,7 @@ def test_poisson_zero_nodes(seed): y = [0, 0, 0, 0, 1, 2, 3, 4] # Note that X[:, 0] == 0 is a 100% indicator for y == 0. The tree can # easily learn that: - reg = DecisionTreeRegressor(criterion="mse", random_state=seed) + reg = DecisionTreeRegressor(criterion="squared_error", random_state=seed) reg.fit(X, y) assert np.amin(reg.predict(X)) == 0 # whereas Poisson must predict strictly positive numbers @@ -2023,7 +2025,7 @@ def test_poisson_zero_nodes(seed): def test_poisson_vs_mse(): # For a Poisson distributed target, Poisson loss should give better results - # than least squares measured in Poisson deviance as metric. + # than squared error measured in Poisson deviance as metric. # We have a similar test, test_poisson(), in # sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py # Note: Some fine tuning was needed to have metric_poi < metric_dummy on @@ -2042,7 +2044,7 @@ def test_poisson_vs_mse(): tree_poi = DecisionTreeRegressor(criterion="poisson", min_samples_split=10, random_state=rng) - tree_mse = DecisionTreeRegressor(criterion="mse", + tree_mse = DecisionTreeRegressor(criterion="squared_error", min_samples_split=10, random_state=rng) @@ -2052,12 +2054,13 @@ def test_poisson_vs_mse(): for X, y, val in [(X_train, y_train, "train"), (X_test, y_test, "test")]: metric_poi = mean_poisson_deviance(y, tree_poi.predict(X)) - # mse might produce non-positive predictions => clip + # squared_error might produce non-positive predictions => clip metric_mse = mean_poisson_deviance(y, np.clip(tree_mse.predict(X), 1e-15, None)) metric_dummy = mean_poisson_deviance(y, dummy.predict(X)) - # As MSE might correctly predict 0 in train set, its train score can - # be better than Poisson. This is no longer the case for the test set. + # As squared_error might correctly predict 0 in train set, its train + # score can be better than Poisson. This is no longer the case for the + # test set. 
if val == "test": assert metric_poi < metric_mse assert metric_poi < metric_dummy @@ -2114,3 +2117,16 @@ def test_X_idx_sorted_deprecated(TreeEstimator): with pytest.warns(FutureWarning, match="The parameter 'X_idx_sorted' is deprecated"): tree.fit(X, y, X_idx_sorted=X_idx_sorted) + + +# TODO: Remove in v1.2 +@pytest.mark.parametrize("Tree", REG_TREES.values()) +def test_mse_deprecated(Tree): + tree = Tree(criterion="mse") + + with pytest.warns(FutureWarning, + match="Criterion 'mse' was deprecated"): + tree.fit(X, y) + + tree_sqer = Tree(criterion="squared_error").fit(X, y) + assert_allclose(tree.predict(X), tree_sqer.predict(X)) From 071ddc75e92917d372f84e20a7fca15c1b7c6ca0 Mon Sep 17 00:00:00 2001 From: Avi Gupta <33635739+avigupta2612@users.noreply.github.com> Date: Fri, 19 Mar 2021 20:56:15 +0530 Subject: [PATCH 259/478] Removed assert_warns_message from gaussian_process/tests (#19697) --- sklearn/gaussian_process/tests/test_gpc.py | 18 ++++----- sklearn/gaussian_process/tests/test_gpr.py | 37 ++++++++++--------- .../gaussian_process/tests/test_kernels.py | 12 +++--- 3 files changed, 36 insertions(+), 31 deletions(-) diff --git a/sklearn/gaussian_process/tests/test_gpc.py b/sklearn/gaussian_process/tests/test_gpc.py index 76804906f7fb4..57efc34891c51 100644 --- a/sklearn/gaussian_process/tests/test_gpc.py +++ b/sklearn/gaussian_process/tests/test_gpc.py @@ -17,7 +17,7 @@ from sklearn.exceptions import ConvergenceWarning from sklearn.utils._testing \ - import assert_almost_equal, assert_array_equal, assert_warns_message + import assert_almost_equal, assert_array_equal def f(x): @@ -189,14 +189,14 @@ def test_multi_class_n_jobs(kernel): def test_warning_bounds(): kernel = RBF(length_scale_bounds=[1e-5, 1e-3]) gpc = GaussianProcessClassifier(kernel=kernel) - assert_warns_message(ConvergenceWarning, "The optimal value found for " - "dimension 0 of parameter " - "length_scale is close to " - "the specified upper bound " - "0.001. Increasing the bound " - "and calling fit again may " - "find a better value.", - gpc.fit, X, y) + warning_message = ( + "The optimal value found for dimension 0 of parameter " + "length_scale is close to the specified upper bound " + "0.001. Increasing the bound and calling fit again may " + "find a better value." + ) + with pytest.warns(ConvergenceWarning, match=warning_message): + gpc.fit(X, y) kernel_sum = (WhiteKernel(noise_level_bounds=[1e-5, 1e-3]) + RBF(length_scale_bounds=[1e3, 1e5])) diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py index 83c24c7cc8573..a5bfa05c47313 100644 --- a/sklearn/gaussian_process/tests/test_gpr.py +++ b/sklearn/gaussian_process/tests/test_gpr.py @@ -5,6 +5,7 @@ # License: BSD 3 clause import sys +import re import numpy as np import warnings @@ -21,9 +22,8 @@ from sklearn.utils._testing \ import (assert_array_less, - assert_almost_equal, assert_raise_message, - assert_array_almost_equal, assert_array_equal, - assert_allclose, assert_warns_message) + assert_almost_equal, assert_array_almost_equal, + assert_array_equal, assert_allclose) def f(x): @@ -404,12 +404,15 @@ def test_gpr_correct_error_message(): y = np.ones(6) kernel = DotProduct() gpr = GaussianProcessRegressor(kernel=kernel, alpha=0.0) - assert_raise_message(np.linalg.LinAlgError, - "The kernel, %s, is not returning a " - "positive definite matrix. Try gradually increasing " - "the 'alpha' parameter of your " - "GaussianProcessRegressor estimator." 
- % kernel, gpr.fit, X, y) + message = ( + "The kernel, %s, is not returning a " + "positive definite matrix. Try gradually increasing " + "the 'alpha' parameter of your " + "GaussianProcessRegressor estimator." + % kernel + ) + with pytest.raises(np.linalg.LinAlgError, match=re.escape(message)): + gpr.fit(X, y) @pytest.mark.parametrize('kernel', kernels) @@ -474,14 +477,14 @@ def test_K_inv_reset(kernel): def test_warning_bounds(): kernel = RBF(length_scale_bounds=[1e-5, 1e-3]) gpr = GaussianProcessRegressor(kernel=kernel) - assert_warns_message(ConvergenceWarning, "The optimal value found for " - "dimension 0 of parameter " - "length_scale is close to " - "the specified upper bound " - "0.001. Increasing the bound " - "and calling fit again may " - "find a better value.", - gpr.fit, X, y) + warning_message = ( + "The optimal value found for dimension 0 of parameter " + "length_scale is close to the specified upper bound " + "0.001. Increasing the bound and calling fit again may " + "find a better value." + ) + with pytest.warns(ConvergenceWarning, match=warning_message): + gpr.fit(X, y) kernel_sum = (WhiteKernel(noise_level_bounds=[1e-5, 1e-3]) + RBF(length_scale_bounds=[1e3, 1e5])) diff --git a/sklearn/gaussian_process/tests/test_kernels.py b/sklearn/gaussian_process/tests/test_kernels.py index 1f8e196104e75..b56c0b06b5fc0 100644 --- a/sklearn/gaussian_process/tests/test_kernels.py +++ b/sklearn/gaussian_process/tests/test_kernels.py @@ -20,7 +20,6 @@ from sklearn.utils._testing import (assert_almost_equal, assert_array_equal, assert_array_almost_equal, assert_allclose, - assert_raise_message, fails_if_pypy) @@ -361,7 +360,10 @@ def test_repr_kernels(kernel): def test_rational_quadratic_kernel(): kernel = RationalQuadratic(length_scale=[1., 1.]) - assert_raise_message(AttributeError, - "RationalQuadratic kernel only supports isotropic " - "version, please use a single " - "scalar for length_scale", kernel, X) + message = ( + "RationalQuadratic kernel only supports isotropic " + "version, please use a single " + "scalar for length_scale" + ) + with pytest.raises(AttributeError, match=message): + kernel(X) From cc1b171af86dee040d933aeeae64439e85a0cd54 Mon Sep 17 00:00:00 2001 From: Avi Gupta <33635739+avigupta2612@users.noreply.github.com> Date: Fri, 19 Mar 2021 22:03:57 +0530 Subject: [PATCH 260/478] Replaced assert_raises from utils/tests/test_estimator_checks (#19709) --- sklearn/utils/tests/test_estimator_checks.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 8fabe5f91ea31..4792f50f2baef 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -1,3 +1,7 @@ +# We can not use pytest here, because we run +# build_tools/azure/test_pytest_soft_dependency.sh on these +# tests to make sure estimator_checks works without pytest. 
+ import unittest import sys @@ -139,6 +143,7 @@ def fit(self, X, y=None): X, y = self._validate_data(X, y) return self + class ModifiesValueInsteadOfRaisingError(BaseEstimator): def __init__(self, p=0): self.p = p From 03edffa25f9250cd2861117c096860b1c9e09d2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20P=C3=B6lsterl?= Date: Fri, 19 Mar 2021 19:16:23 +0000 Subject: [PATCH 261/478] DOC Add scikit-survival to related projects (#19728) --- doc/related_projects.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/related_projects.rst b/doc/related_projects.rst index acc2689388896..2b1d41bf4a5e4 100644 --- a/doc/related_projects.rst +++ b/doc/related_projects.rst @@ -270,6 +270,10 @@ Other packages useful for data analysis and machine learning. - `Seaborn `_ Visualization library based on matplotlib. It provides a high-level interface for drawing attractive statistical graphics. +- `scikit-survival `_ A library implementing + models to learn from censored time-to-event data (also called survival analysis). + Models are fully compatible with scikit-learn. + Recommendation Engine packages ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From fe897c0ba0f00171333dcbdb483ca0d0346fed95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20P=C3=B6lsterl?= Date: Fri, 19 Mar 2021 19:17:47 +0000 Subject: [PATCH 262/478] DOC Move Sacred to "Experimentation frameworks" (#19730) --- doc/related_projects.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/related_projects.rst b/doc/related_projects.rst index 2b1d41bf4a5e4..fb02ea8beaf0d 100644 --- a/doc/related_projects.rst +++ b/doc/related_projects.rst @@ -60,6 +60,9 @@ enhance the functionality of scikit-learn's estimators. **Experimentation frameworks** +- `Sacred `_ Tool to help you configure, + organize, log and reproduce experiments + - `REP `_ Environment for conducting data-driven research in a consistent and reproducible way @@ -264,9 +267,6 @@ Other packages useful for data analysis and machine learning. - `PyMC `_ Bayesian statistical models and fitting algorithms. -- `Sacred `_ Tool to help you configure, - organize, log and reproduce experiments - - `Seaborn `_ Visualization library based on matplotlib. It provides a high-level interface for drawing attractive statistical graphics. From 266400e60ddc0bdba1f0de02ed49f45893e5647c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Fri, 19 Mar 2021 21:44:08 +0100 Subject: [PATCH 263/478] DOC Fix doc regarding required_parameters (#19725) --- doc/developers/develop.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index c68becf18f93c..4956530d2bbf6 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -641,7 +641,7 @@ In addition to the tags, estimators also need to declare any non-optional parameters to ``__init__`` in the ``_required_parameters`` class attribute, which is a list or tuple. If ``_required_parameters`` is only ``["estimator"]`` or ``["base_estimator"]``, then the estimator will be -instantiated with an instance of ``LinearDiscriminantAnalysis`` (or +instantiated with an instance of ``LogisticRegression`` (or ``RidgeRegression`` if the estimator is a regressor) in the tests. The choice of these two models is somewhat idiosyncratic but both should provide robust closed-form solutions. 
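The ``_required_parameters`` convention described in the hunk above can be
illustrated with a minimal sketch. Nothing below is part of any patch in this
series: the ``ClippedRegressor`` name and its clipping behaviour are invented
purely as an example of a meta-estimator whose only mandatory ``__init__``
argument is ``estimator``::

    import numpy as np
    from sklearn.base import (BaseEstimator, MetaEstimatorMixin,
                              RegressorMixin, clone)
    from sklearn.linear_model import Ridge


    class ClippedRegressor(MetaEstimatorMixin, RegressorMixin, BaseEstimator):
        """Hypothetical wrapper used only to illustrate the convention."""

        # Declaring the non-optional constructor argument lets the common
        # test machinery know how to build an instance (with a regressor,
        # as explained in the documentation change above).
        _required_parameters = ["estimator"]

        def __init__(self, estimator):
            self.estimator = estimator

        def fit(self, X, y):
            # Fit a clone so the constructor argument is never mutated.
            self.estimator_ = clone(self.estimator).fit(X, y)
            return self

        def predict(self, X):
            # Arbitrary toy behaviour: clip negative predictions to zero.
            return np.clip(self.estimator_.predict(X), 0.0, None)


    X = np.random.RandomState(0).normal(size=(20, 3))
    y = X @ np.array([1.0, -2.0, 0.5])
    print(ClippedRegressor(estimator=Ridge()).fit(X, y).predict(X)[:3])
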
From 0892a98fc9972b1da88ca25282e70105bd463608 Mon Sep 17 00:00:00 2001 From: Rodion Martynov Date: Sat, 20 Mar 2021 13:57:42 +0300 Subject: [PATCH 264/478] Stratified Group KFold implementation (#18649) * Initial implementation * Forgot to add to second __add__ list * Update split method parameter doc * Added example; changed default test_size to 0.1; added to author list * StratifiedGroupKFold impl and other improvements * Add class to __all__ spec * Remove random_state when no shuffle * Tighter formatting * Update the implementation of StratifiedGroupKFold * Add StratifiedGroupKFold to __init__ * Add y checks to StartifiedGroupKFold * Raise error if n_splits > max num samples in class * Warn if n_splits > mn num samples in class * Add SGKfold to general repr test * Add SGKFold to 2d_y test case * Add SGKfold to value erros test case Parameters are the same as for StratifiedKFold to ensure similar behavior given n_groups == n_samples * Add SGKFold to StratifiedKFold test cases The idea is to ensure similar behavior when groups are trivial (n_groups == n_samples) * Add SGKFold to reproducibility test case * Add SGKFold to GroupKFold test case * Add SGKFold to nested cv test case * Add SGKFold to random_state with shuffle=False test case * Add SGKFold to constant splits test case * Fix repr test case * Fix formatting issues * Add samples to a fold with least num samples Required to produce balanced size folds when the distribution of y is more or less the same * Remove GroupShuffleSplit impl * Add notes to StratifiedGroupKFold * Fix doctest * Added stratified group kfold tests * Better variable naming * Add section to documentation * Remove leftover StratifiedGroupShuffleSplit import * Add changelist and reference to original kernel * Better naming for least populated class check * Better expression for number of labels * Remove use of Counter We already have this data in output of np.unique * Add tests for homogeneous groups * Add StratifiedGroupKFold test against GroupKFold * Add changes to changelist in docstring * Add StratifiedGroupKFold to classes.rst * Fix description of StratifiedGroupKFold * Move license notice out of docstring * Disambiguate labels to classes in doc * Add changelog entry * Fix changelog author entry * Fix StratifiedGroupKFold docstring * Better variable names * Remove defaultdict in favor of numpy indexing * Extracted best_fold search into a separate method * Make use of numpy broadcasting instead of for loop * Encode groups and use arrays instead of dicts * Use numpy sort instead of python * Clarify shuffling behavior of StratifiedGroupKF in docs * Switch name from label_idx to class_idx * Remove accidentally leftover comment * Fix np.sort keyword to support numpy < 1.15 * Fix typo in docstring * Add StratifiedGroupKFold to visualization doc * Add visualization for uneven group as an example * Fix image numbers to match updated example * Add author * Add SGKF visualization to docs * Add comments for groups in stratified CV tests Co-authored-by: Leandro Hermida Co-authored-by: marrodion --- doc/modules/classes.rst | 1 + doc/modules/cross_validation.rst | 64 +++++- doc/whats_new/v1.0.rst | 10 + examples/model_selection/plot_cv_indices.py | 35 +++- sklearn/model_selection/__init__.py | 2 + sklearn/model_selection/_split.py | 192 +++++++++++++++++- sklearn/model_selection/tests/test_split.py | 212 ++++++++++++++++---- 7 files changed, 457 insertions(+), 59 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 0cd5abb16829d..ceebfc337352a 
100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1176,6 +1176,7 @@ Splitter Classes model_selection.ShuffleSplit model_selection.StratifiedKFold model_selection.StratifiedShuffleSplit + model_selection.StratifiedGroupKFold model_selection.TimeSeriesSplit Splitter Functions diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index ae3d38f168f3f..0b090fd7385b6 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -353,7 +353,7 @@ Example of 2-fold cross-validation on a dataset with 4 samples:: Here is a visualization of the cross-validation behavior. Note that :class:`KFold` is not affected by classes or groups. -.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_004.png +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_006.png :target: ../auto_examples/model_selection/plot_cv_indices.html :align: center :scale: 75% @@ -509,7 +509,7 @@ Here is a usage example:: Here is a visualization of the cross-validation behavior. Note that :class:`ShuffleSplit` is not affected by classes or groups. -.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_006.png +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_008.png :target: ../auto_examples/model_selection/plot_cv_indices.html :align: center :scale: 75% @@ -566,7 +566,7 @@ We can see that :class:`StratifiedKFold` preserves the class ratios Here is a visualization of the cross-validation behavior. -.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_007.png +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_009.png :target: ../auto_examples/model_selection/plot_cv_indices.html :align: center :scale: 75% @@ -585,7 +585,7 @@ percentage for each target class as in the complete set. Here is a visualization of the cross-validation behavior. -.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_009.png +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_012.png :target: ../auto_examples/model_selection/plot_cv_indices.html :align: center :scale: 75% @@ -645,6 +645,58 @@ size due to the imbalance in the data. Here is a visualization of the cross-validation behavior. +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_007.png + :target: ../auto_examples/model_selection/plot_cv_indices.html + :align: center + :scale: 75% + +.. _stratified_group_k_fold: + +StratifiedGroupKFold +^^^^^^^^^^^^^^^^^^^^ + +:class:`StratifiedGroupKFold` is a cross-validation scheme that combines both +:class:`StratifiedKFold` and :class:`GroupKFold`. The idea is to try to +preserve the distribution of classes in each split while keeping each group +within a single split. That might be useful when you have an unbalanced +dataset so that using just :class:`GroupKFold` might produce skewed splits. + +Example:: + + >>> from sklearn.model_selection import StratifiedGroupKFold + >>> X = list(range(18)) + >>> y = [1] * 6 + [0] * 12 + >>> groups = [1, 2, 3, 3, 4, 4, 1, 1, 2, 2, 3, 4, 5, 5, 5, 6, 6, 6] + >>> sgkf = StratifiedGroupKFold(n_splits=3) + >>> for train, test in sgkf.split(X, y, groups=groups): + ... 
print("%s %s" % (train, test)) + [ 0 2 3 4 5 6 7 10 11 15 16 17] [ 1 8 9 12 13 14] + [ 0 1 4 5 6 7 8 9 11 12 13 14] [ 2 3 10 15 16 17] + [ 1 2 3 8 9 10 12 13 14 15 16 17] [ 0 4 5 6 7 11] + +Implementation notes: + +- With the current implementation full shuffle is not possible in most + scenarios. When shuffle=True, the following happens: + + 1. All groups a shuffled. + 2. Groups are sorted by standard deviation of classes using stable sort. + 3. Sorted groups are iterated over and assigned to folds. + + That means that only groups with the same standard deviation of class + distribution will be shuffled, which might be useful when each group has only + a single class. +- The algorithm greedily assigns each group to one of n_splits test sets, + choosing the test set that minimises the variance in class distribution + across test sets. Group assignment proceeds from groups with highest to + lowest variance in class frequency, i.e. large groups peaked on one or few + classes are assigned first. +- This split is suboptimal in a sense that it might produce imbalanced splits + even if perfect stratification is possible. If you have relatively close + distribution of classes in each group, using :class:`GroupKFold` is better. + +Here is a visualization of cross-validation behavior for uneven groups: + .. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_005.png :target: ../auto_examples/model_selection/plot_cv_indices.html :align: center @@ -733,7 +785,7 @@ Here is a usage example:: Here is a visualization of the cross-validation behavior. -.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_008.png +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_011.png :target: ../auto_examples/model_selection/plot_cv_indices.html :align: center :scale: 75% @@ -835,7 +887,7 @@ Example of 3-split time series cross-validation on a dataset with 6 samples:: Here is a visualization of the cross-validation behavior. -.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_010.png +.. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_cv_indices_013.png :target: ../auto_examples/model_selection/plot_cv_indices.html :align: center :scale: 75% diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index b4ee0c57b97fc..521e358ac2f02 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -217,6 +217,16 @@ Changelog are integral. :pr:`9843` by :user:`Jon Crall `. +:mod:`sklearn.model_selection` +.............................. + +- |Feature| added :class:`model_selection.StratifiedGroupKFold`, that combines + :class:`model_selection.StratifiedKFold` and `model_selection.GroupKFold`, + providing an ability to split data preserving the distribution of classes in + each split while keeping each group within a single split. + :pr:`18649` by `Leandro Hermida ` and + `Rodion Martynov `. + :mod:`sklearn.naive_bayes` .......................... 
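The implementation notes added to ``cross_validation.rst`` above can be made
concrete with a small sketch. The data below is synthetic and the exact
per-fold ratios depend on it; the snippet only relies on the public
``GroupKFold`` and ``StratifiedGroupKFold`` APIs introduced or documented in
this patch::

    import numpy as np
    from sklearn.model_selection import GroupKFold, StratifiedGroupKFold

    rng = np.random.RandomState(0)
    n_samples = 300
    # Imbalanced binary target (~10% positives), 30 groups of 10 samples.
    y = (rng.rand(n_samples) < 0.1).astype(int)
    X = rng.normal(size=(n_samples, 2))
    groups = np.repeat(np.arange(30), 10)

    for cv in (GroupKFold(n_splits=5), StratifiedGroupKFold(n_splits=5)):
        ratios = []
        for train, test in cv.split(X, y, groups):
            # Both splitters keep every group on a single side of the split.
            assert np.intersect1d(groups[train], groups[test]).size == 0
            ratios.append(y[test].mean())
        print(type(cv).__name__, np.round(ratios, 3))

With ``GroupKFold`` the positive-class ratio of the test folds typically
drifts away from the overall ~10%, while ``StratifiedGroupKFold`` keeps it
closer, at the price of the suboptimality discussed in the notes above.
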
diff --git a/examples/model_selection/plot_cv_indices.py b/examples/model_selection/plot_cv_indices.py index 91f71b0451cb2..f07fa1595e860 100644 --- a/examples/model_selection/plot_cv_indices.py +++ b/examples/model_selection/plot_cv_indices.py @@ -13,7 +13,8 @@ from sklearn.model_selection import (TimeSeriesSplit, KFold, ShuffleSplit, StratifiedKFold, GroupShuffleSplit, - GroupKFold, StratifiedShuffleSplit) + GroupKFold, StratifiedShuffleSplit, + StratifiedGroupKFold) import numpy as np import matplotlib.pyplot as plt from matplotlib.patches import Patch @@ -113,16 +114,32 @@ def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10): # %% # As you can see, by default the KFold cross-validation iterator does not # take either datapoint class or group into consideration. We can change this -# by using the ``StratifiedKFold`` like so. +# by using either: +# +# - ``StratifiedKFold`` to preserve the percentage of samples for each class. +# - ``GroupKFold`` to ensure that the same group will not appear in two +# different folds. +# - ``StratifiedGroupKFold`` to keep the constraint of ``GroupKFold`` while +# attempting to return stratified folds. -fig, ax = plt.subplots() -cv = StratifiedKFold(n_splits) -plot_cv_indices(cv, X, y, groups, ax, n_splits) +# To better demonstrate the difference, we will assign samples to groups +# unevenly: + +uneven_groups = np.sort(np.random.randint(0, 10, n_points)) + +cvs = [StratifiedKFold, GroupKFold, StratifiedGroupKFold] + +for cv in cvs: + fig, ax = plt.subplots(figsize=(6, 3)) + plot_cv_indices(cv(n_splits), X, y, uneven_groups, ax, n_splits) + ax.legend([Patch(color=cmap_cv(.8)), Patch(color=cmap_cv(.02))], + ['Testing set', 'Training set'], loc=(1.02, .8)) + # Make the legend fit + plt.tight_layout() + fig.subplots_adjust(right=.7) # %% -# In this case, the cross-validation retained the same ratio of classes across -# each CV split. Next we'll visualize this behavior for a number of CV -# iterators. +# Next we'll visualize this behavior for a number of CV iterators. # # Visualize cross-validation indices for many CV objects # ------------------------------------------------------ @@ -133,7 +150,7 @@ def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10): # # Note how some use the group/class information while others do not. -cvs = [KFold, GroupKFold, ShuffleSplit, StratifiedKFold, +cvs = [KFold, GroupKFold, ShuffleSplit, StratifiedKFold, StratifiedGroupKFold, GroupShuffleSplit, StratifiedShuffleSplit, TimeSeriesSplit] diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py index 897183414b5a6..f79db2a5acc17 100644 --- a/sklearn/model_selection/__init__.py +++ b/sklearn/model_selection/__init__.py @@ -14,6 +14,7 @@ from ._split import ShuffleSplit from ._split import GroupShuffleSplit from ._split import StratifiedShuffleSplit +from ._split import StratifiedGroupKFold from ._split import PredefinedSplit from ._split import train_test_split from ._split import check_cv @@ -57,6 +58,7 @@ 'RandomizedSearchCV', 'ShuffleSplit', 'StratifiedKFold', + 'StratifiedGroupKFold', 'StratifiedShuffleSplit', 'check_cv', 'cross_val_predict', diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 244b2b63af449..13edbeef071f5 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -3,13 +3,16 @@ functions to split the data based on a preset strategy. 
""" -# Author: Alexandre Gramfort , -# Gael Varoquaux , +# Author: Alexandre Gramfort +# Gael Varoquaux # Olivier Grisel # Raghav RV +# Leandro Hermida +# Rodion Martynov # License: BSD 3 clause from collections.abc import Iterable +from collections import defaultdict import warnings from itertools import chain, combinations from math import ceil, floor @@ -40,6 +43,7 @@ 'ShuffleSplit', 'GroupShuffleSplit', 'StratifiedKFold', + 'StratifiedGroupKFold', 'StratifiedShuffleSplit', 'PredefinedSplit', 'train_test_split', @@ -732,6 +736,190 @@ def split(self, X, y, groups=None): return super().split(X, y, groups) +class StratifiedGroupKFold(_BaseKFold): + """Stratified K-Folds iterator variant with non-overlapping groups. + + This cross-validation object is a variation of StratifiedKFold attempts to + return stratified folds with non-overlapping groups. The folds are made by + preserving the percentage of samples for each class. + + The same group will not appear in two different folds (the number of + distinct groups has to be at least equal to the number of folds). + + The difference between GroupKFold and StratifiedGroupKFold is that + the former attempts to create balanced folds such that the number of + distinct groups is approximately the same in each fold, whereas + StratifiedGroupKFold attempts to create folds which preserve the + percentage of samples for each class as much as possible given the + constraint of non-overlapping groups between splits. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + shuffle : bool, default=False + Whether to shuffle each class's samples before splitting into batches. + Note that the samples within each split will not be shuffled. + This implementation can only shuffle groups that have approximately the + same y distribution, no global shuffle will be performed. + + random_state : int or RandomState instance, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold for each class. + Otherwise, leave `random_state` as `None`. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import StratifiedGroupKFold + >>> X = np.ones((17, 2)) + >>> y = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + >>> groups = np.array([1, 1, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 8, 8]) + >>> cv = StratifiedGroupKFold(n_splits=3) + >>> for train_idxs, test_idxs in cv.split(X, y, groups): + ... print("TRAIN:", groups[train_idxs]) + ... print(" ", y[train_idxs]) + ... print(" TEST:", groups[test_idxs]) + ... print(" ", y[test_idxs]) + TRAIN: [1 1 2 2 4 5 5 5 5 8 8] + [0 0 1 1 1 0 0 0 0 0 0] + TEST: [3 3 3 6 6 7] + [1 1 1 0 0 0] + TRAIN: [3 3 3 4 5 5 5 5 6 6 7] + [1 1 1 1 0 0 0 0 0 0 0] + TEST: [1 1 2 2 8 8] + [0 0 1 1 0 0] + TRAIN: [1 1 2 2 3 3 3 6 6 7 8 8] + [0 0 1 1 1 1 1 0 0 0 0 0] + TEST: [4 5 5 5 5] + [1 0 0 0 0] + + Notes + ----- + The implementation is designed to: + + * Mimic the behavior of StratifiedKFold as much as possible for trivial + groups (e.g. when each group contains only one sample). + * Be invariant to class label: relabelling ``y = ["Happy", "Sad"]`` to + ``y = [1, 0]`` should not change the indices generated. + * Stratify based on samples as much as possible while keeping + non-overlapping groups constraint. 
That means that in some cases when + there is a small number of groups containing a large number of samples + the stratification will not be possible and the behavior will be close + to GroupKFold. + + See also + -------- + StratifiedKFold: Takes class information into account to build folds which + retain class distributions (for binary or multiclass classification + tasks). + + GroupKFold: K-fold iterator variant with non-overlapping groups. + """ + + def __init__(self, n_splits=5, shuffle=False, random_state=None): + super().__init__(n_splits=n_splits, shuffle=shuffle, + random_state=random_state) + + def _iter_test_indices(self, X, y, groups): + # Implementation is based on this kaggle kernel: + # https://www.kaggle.com/jakubwasikowski/stratified-group-k-fold-cross-validation + # and is a subject to Apache 2.0 License. You may obtain a copy of the + # License at http://www.apache.org/licenses/LICENSE-2.0 + # Changelist: + # - Refactored function to a class following scikit-learn KFold + # interface. + # - Added heuristic for assigning group to the least populated fold in + # cases when all other criteria are equal + # - Swtch from using python ``Counter`` to ``np.unique`` to get class + # distribution + # - Added scikit-learn checks for input: checking that target is binary + # or multiclass, checking passed random state, checking that number + # of splits is less than number of members in each class, checking + # that least populated class has more members than there are splits. + rng = check_random_state(self.random_state) + y = np.asarray(y) + type_of_target_y = type_of_target(y) + allowed_target_types = ('binary', 'multiclass') + if type_of_target_y not in allowed_target_types: + raise ValueError( + 'Supported target types are: {}. Got {!r} instead.'.format( + allowed_target_types, type_of_target_y)) + + y = column_or_1d(y) + _, y_inv, y_cnt = np.unique(y, return_inverse=True, return_counts=True) + if np.all(self.n_splits > y_cnt): + raise ValueError("n_splits=%d cannot be greater than the" + " number of members in each class." + % (self.n_splits)) + n_smallest_class = np.min(y_cnt) + if self.n_splits > n_smallest_class: + warnings.warn(("The least populated class in y has only %d" + " members, which is less than n_splits=%d." 
+ % (n_smallest_class, self.n_splits)), UserWarning) + n_classes = len(y_cnt) + + _, groups_inv, groups_cnt = np.unique( + groups, return_inverse=True, return_counts=True) + y_counts_per_group = np.zeros((len(groups_cnt), n_classes)) + for class_idx, group_idx in zip(y_inv, groups_inv): + y_counts_per_group[group_idx, class_idx] += 1 + + y_counts_per_fold = np.zeros((self.n_splits, n_classes)) + groups_per_fold = defaultdict(set) + + if self.shuffle: + rng.shuffle(y_counts_per_group) + + # Stable sort to keep shuffled order for groups with the same + # class distribution variance + sorted_groups_idx = np.argsort(-np.std(y_counts_per_group, axis=1), + kind='mergesort') + + for group_idx in sorted_groups_idx: + group_y_counts = y_counts_per_group[group_idx] + best_fold = self._find_best_fold( + y_counts_per_fold=y_counts_per_fold, y_cnt=y_cnt, + group_y_counts=group_y_counts) + y_counts_per_fold[best_fold] += group_y_counts + groups_per_fold[best_fold].add(group_idx) + + for i in range(self.n_splits): + test_indices = [idx for idx, group_idx in enumerate(groups_inv) + if group_idx in groups_per_fold[i]] + yield test_indices + + def _find_best_fold( + self, y_counts_per_fold, y_cnt, group_y_counts): + best_fold = None + min_eval = np.inf + min_samples_in_fold = np.inf + for i in range(self.n_splits): + y_counts_per_fold[i] += group_y_counts + # Summarise the distribution over classes in each proposed fold + std_per_class = np.std( + y_counts_per_fold / y_cnt.reshape(1, -1), + axis=0) + y_counts_per_fold[i] -= group_y_counts + fold_eval = np.mean(std_per_class) + samples_in_fold = np.sum(y_counts_per_fold[i]) + is_current_fold_better = ( + fold_eval < min_eval or + np.isclose(fold_eval, min_eval) + and samples_in_fold < min_samples_in_fold + ) + if is_current_fold_better: + min_eval = fold_eval + min_samples_in_fold = samples_in_fold + best_fold = i + return best_fold + + class TimeSeriesSplit(_BaseKFold): """Time Series cross-validator diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 80c19c7f2e08c..c66d8e1836ac9 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -35,6 +35,7 @@ from sklearn.model_selection import GridSearchCV from sklearn.model_selection import RepeatedKFold from sklearn.model_selection import RepeatedStratifiedKFold +from sklearn.model_selection import StratifiedGroupKFold from sklearn.linear_model import Ridge @@ -80,6 +81,7 @@ def test_cross_validator_with_default_params(): lopo = LeavePGroupsOut(p) ss = ShuffleSplit(random_state=0) ps = PredefinedSplit([1, 1, 2, 2]) # n_splits = np of unique folds = 2 + sgkf = StratifiedGroupKFold(n_splits) loo_repr = "LeaveOneOut()" lpo_repr = "LeavePOut(p=2)" @@ -90,15 +92,17 @@ def test_cross_validator_with_default_params(): ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, " "test_size=None, train_size=None)") ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))" + sgkf_repr = ("StratifiedGroupKFold(n_splits=2, random_state=None, " + "shuffle=False)") n_splits_expected = [n_samples, comb(n_samples, p), n_splits, n_splits, n_unique_groups, comb(n_unique_groups, p), - n_shuffle_splits, 2] + n_shuffle_splits, 2, n_splits] for i, (cv, cv_repr) in enumerate(zip( - [loo, lpo, kf, skf, lolo, lopo, ss, ps], + [loo, lpo, kf, skf, lolo, lopo, ss, ps, sgkf], [loo_repr, lpo_repr, kf_repr, skf_repr, lolo_repr, lopo_repr, - ss_repr, ps_repr])): + ss_repr, ps_repr, sgkf_repr])): # Test if get_n_splits works correctly assert 
n_splits_expected[i] == cv.get_n_splits(X, y, groups) @@ -133,10 +137,11 @@ def test_2d_y(): groups = rng.randint(0, 3, size=(n_samples,)) splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(), RepeatedKFold(), RepeatedStratifiedKFold(), - ShuffleSplit(), StratifiedShuffleSplit(test_size=.5), - GroupShuffleSplit(), LeaveOneGroupOut(), - LeavePGroupsOut(n_groups=2), GroupKFold(n_splits=3), - TimeSeriesSplit(), PredefinedSplit(test_fold=groups)] + StratifiedGroupKFold(), ShuffleSplit(), + StratifiedShuffleSplit(test_size=.5), GroupShuffleSplit(), + LeaveOneGroupOut(), LeavePGroupsOut(n_groups=2), + GroupKFold(n_splits=3), TimeSeriesSplit(), + PredefinedSplit(test_fold=groups)] for splitter in splitters: list(splitter.split(X, y, groups)) list(splitter.split(X, y_2d, groups)) @@ -193,6 +198,11 @@ def test_kfold_valueerrors(): with pytest.warns(Warning, match="The least populated class"): next(skf_3.split(X2, y)) + sgkf_3 = StratifiedGroupKFold(3) + naive_groups = np.arange(len(y)) + with pytest.warns(Warning, match="The least populated class"): + next(sgkf_3.split(X2, y, naive_groups)) + # Check that despite the warning the folds are still computed even # though all the classes are not necessarily represented at on each # side of the split at each split @@ -200,12 +210,20 @@ def test_kfold_valueerrors(): warnings.simplefilter("ignore") check_cv_coverage(skf_3, X2, y, groups=None, expected_n_splits=3) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + check_cv_coverage( + sgkf_3, X2, y, groups=naive_groups, expected_n_splits=3 + ) + # Check that errors are raised if all n_groups for individual # classes are less than n_splits. y = np.array([3, 3, -1, -1, 2]) with pytest.raises(ValueError): next(skf_3.split(X2, y)) + with pytest.raises(ValueError): + next(sgkf_3.split(X2, y)) # Error when number of folds is <= 1 with pytest.raises(ValueError): @@ -218,6 +236,10 @@ def test_kfold_valueerrors(): StratifiedKFold(0) with pytest.raises(ValueError, match=error_string): StratifiedKFold(1) + with pytest.raises(ValueError, match=error_string): + StratifiedGroupKFold(0) + with pytest.raises(ValueError, match=error_string): + StratifiedGroupKFold(1) # When n_splits is not integer: with pytest.raises(ValueError): @@ -228,6 +250,10 @@ def test_kfold_valueerrors(): StratifiedKFold(1.5) with pytest.raises(ValueError): StratifiedKFold(2.0) + with pytest.raises(ValueError): + StratifiedGroupKFold(1.5) + with pytest.raises(ValueError): + StratifiedGroupKFold(2.0) # When shuffle is not a bool: with pytest.raises(TypeError): @@ -318,7 +344,8 @@ def test_stratified_kfold_no_shuffle(): @pytest.mark.parametrize('shuffle', [False, True]) @pytest.mark.parametrize('k', [4, 5, 6, 7, 8, 9, 10]) -def test_stratified_kfold_ratios(k, shuffle): +@pytest.mark.parametrize('kfold', [StratifiedKFold, StratifiedGroupKFold]) +def test_stratified_kfold_ratios(k, shuffle, kfold): # Check that stratified kfold preserves class ratios in individual splits # Repeat with shuffling turned off and on n_samples = 1000 @@ -326,12 +353,14 @@ def test_stratified_kfold_ratios(k, shuffle): y = np.array([4] * int(0.10 * n_samples) + [0] * int(0.89 * n_samples) + [1] * int(0.01 * n_samples)) + # ensure perfect stratification with StratifiedGroupKFold + groups = np.arange(len(y)) distr = np.bincount(y) / len(y) test_sizes = [] random_state = None if not shuffle else 0 - skf = StratifiedKFold(k, random_state=random_state, shuffle=shuffle) - for train, test in skf.split(X, y): + skf = kfold(k, 
random_state=random_state, shuffle=shuffle) + for train, test in skf.split(X, y, groups=groups): assert_allclose(np.bincount(y[train]) / len(train), distr, atol=0.02) assert_allclose(np.bincount(y[test]) / len(test), distr, atol=0.02) test_sizes.append(len(test)) @@ -340,20 +369,23 @@ def test_stratified_kfold_ratios(k, shuffle): @pytest.mark.parametrize('shuffle', [False, True]) @pytest.mark.parametrize('k', [4, 6, 7]) -def test_stratified_kfold_label_invariance(k, shuffle): +@pytest.mark.parametrize('kfold', [StratifiedKFold, StratifiedGroupKFold]) +def test_stratified_kfold_label_invariance(k, shuffle, kfold): # Check that stratified kfold gives the same indices regardless of labels n_samples = 100 y = np.array([2] * int(0.10 * n_samples) + [0] * int(0.89 * n_samples) + [1] * int(0.01 * n_samples)) X = np.ones(len(y)) + # ensure perfect stratification with StratifiedGroupKFold + groups = np.arange(len(y)) def get_splits(y): random_state = None if not shuffle else 0 return [(list(train), list(test)) for train, test - in StratifiedKFold(k, random_state=random_state, - shuffle=shuffle).split(X, y)] + in kfold(k, random_state=random_state, + shuffle=shuffle).split(X, y, groups=groups)] splits_base = get_splits(y) for perm in permutations([0, 1, 2]): @@ -372,17 +404,20 @@ def test_kfold_balance(): assert np.sum(sizes) == i -def test_stratifiedkfold_balance(): +@pytest.mark.parametrize('kfold', [StratifiedKFold, StratifiedGroupKFold]) +def test_stratifiedkfold_balance(kfold): # Check that KFold returns folds with balanced sizes (only when # stratification is possible) # Repeat with shuffling turned off and on X = np.ones(17) y = [0] * 3 + [1] * 14 + # ensure perfect stratification with StratifiedGroupKFold + groups = np.arange(len(y)) for shuffle in (True, False): - cv = StratifiedKFold(3, shuffle=shuffle) + cv = kfold(3, shuffle=shuffle) for i in range(11, 17): - skf = cv.split(X[:i], y[:i]) + skf = cv.split(X[:i], y[:i], groups[:i]) sizes = [len(test) for _, test in skf] assert (np.max(sizes) - np.min(sizes)) <= 1 @@ -411,39 +446,39 @@ def test_shuffle_kfold(): assert sum(all_folds) == 300 -def test_shuffle_kfold_stratifiedkfold_reproducibility(): +@pytest.mark.parametrize("kfold", + [KFold, StratifiedKFold, StratifiedGroupKFold]) +def test_shuffle_kfold_stratifiedkfold_reproducibility(kfold): X = np.ones(15) # Divisible by 3 y = [0] * 7 + [1] * 8 + groups_1 = np.arange(len(y)) X2 = np.ones(16) # Not divisible by 3 y2 = [0] * 8 + [1] * 8 + groups_2 = np.arange(len(y2)) # Check that when the shuffle is True, multiple split calls produce the # same split when random_state is int - kf = KFold(3, shuffle=True, random_state=0) - skf = StratifiedKFold(3, shuffle=True, random_state=0) + kf = kfold(3, shuffle=True, random_state=0) - for cv in (kf, skf): - np.testing.assert_equal(list(cv.split(X, y)), list(cv.split(X, y))) - np.testing.assert_equal(list(cv.split(X2, y2)), list(cv.split(X2, y2))) + np.testing.assert_equal( + list(kf.split(X, y, groups_1)), + list(kf.split(X, y, groups_1)) + ) # Check that when the shuffle is True, multiple split calls often # (not always) produce different splits when random_state is # RandomState instance or None - kf = KFold(3, shuffle=True, random_state=np.random.RandomState(0)) - skf = StratifiedKFold(3, shuffle=True, - random_state=np.random.RandomState(0)) - - for cv in (kf, skf): - for data in zip((X, X2), (y, y2)): - # Test if the two splits are different cv - for (_, test_a), (_, test_b) in zip(cv.split(*data), - cv.split(*data)): - # cv.split(...) 
returns an array of tuples, each tuple - # consisting of an array with train indices and test indices - # Ensure that the splits for data are not same - # when random state is not set - with pytest.raises(AssertionError): - np.testing.assert_array_equal(test_a, test_b) + kf = kfold(3, shuffle=True, random_state=np.random.RandomState(0)) + for data in zip((X, X2), (y, y2), (groups_1, groups_2)): + # Test if the two splits are different cv + for (_, test_a), (_, test_b) in zip(kf.split(*data), + kf.split(*data)): + # cv.split(...) returns an array of tuples, each tuple + # consisting of an array with train indices and test indices + # Ensure that the splits for data are not same + # when random state is not set + with pytest.raises(AssertionError): + np.testing.assert_array_equal(test_a, test_b) def test_shuffle_stratifiedkfold(): @@ -514,6 +549,96 @@ def test_kfold_can_detect_dependent_samples_on_digits(): # see #2372 assert mean_score > 0.80 +def test_stratified_group_kfold_trivial(): + sgkf = StratifiedGroupKFold(n_splits=3) + # Trivial example - groups with the same distribution + y = np.array([1] * 6 + [0] * 12) + X = np.ones_like(y).reshape(-1, 1) + groups = np.asarray((1, 2, 3, 4, 5, 6, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6)) + distr = np.bincount(y) / len(y) + test_sizes = [] + for train, test in sgkf.split(X, y, groups): + # check group constraint + assert np.intersect1d(groups[train], groups[test]).size == 0 + # check y distribution + assert_allclose(np.bincount(y[train]) / len(train), distr, atol=0.02) + assert_allclose(np.bincount(y[test]) / len(test), distr, atol=0.02) + test_sizes.append(len(test)) + assert np.ptp(test_sizes) <= 1 + + +def test_stratified_group_kfold_approximate(): + # Not perfect stratification (even though it is possible) because of + # iteration over groups + sgkf = StratifiedGroupKFold(n_splits=3) + y = np.array([1] * 6 + [0] * 12) + X = np.ones_like(y).reshape(-1, 1) + groups = np.array([1, 2, 3, 3, 4, 4, 1, 1, 2, 2, 3, 4, 5, 5, 5, 6, 6, 6]) + expected = np.asarray([[0.833, 0.166], [0.666, 0.333], [0.5, 0.5]]) + test_sizes = [] + for (train, test), expect_dist in zip(sgkf.split(X, y, groups), expected): + # check group constraint + assert np.intersect1d(groups[train], groups[test]).size == 0 + split_dist = np.bincount(y[test]) / len(test) + assert_allclose(split_dist, expect_dist, atol=0.001) + test_sizes.append(len(test)) + assert np.ptp(test_sizes) <= 1 + + +@pytest.mark.parametrize('y, groups, expected', + [(np.array([0] * 6 + [1] * 6), + np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6]), + np.asarray([[.5, .5], + [.5, .5], + [.5, .5]])), + (np.array([0] * 9 + [1] * 3), + np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5, 6]), + np.asarray([[.75, .25], + [.75, .25], + [.75, .25]]))]) +def test_stratified_group_kfold_homogeneous_groups(y, groups, expected): + sgkf = StratifiedGroupKFold(n_splits=3) + X = np.ones_like(y).reshape(-1, 1) + for (train, test), expect_dist in zip(sgkf.split(X, y, groups), expected): + # check group constraint + assert np.intersect1d(groups[train], groups[test]).size == 0 + split_dist = np.bincount(y[test]) / len(test) + assert_allclose(split_dist, expect_dist, atol=0.001) + + +@pytest.mark.parametrize('cls_distr', + [(0.4, 0.6), + (0.3, 0.7), + (0.2, 0.8), + (0.8, 0.2)]) +@pytest.mark.parametrize('n_groups', [5, 30, 70]) +def test_stratified_group_kfold_against_group_kfold(cls_distr, n_groups): + # Check that given sufficient amount of samples StratifiedGroupKFold + # produces better stratified folds than regular GroupKFold + n_splits = 5 + 
sgkf = StratifiedGroupKFold(n_splits=n_splits) + gkf = GroupKFold(n_splits=n_splits) + rng = np.random.RandomState(0) + n_points = 1000 + y = rng.choice(2, size=n_points, p=cls_distr) + X = np.ones_like(y).reshape(-1, 1) + g = rng.choice(n_groups, n_points) + sgkf_folds = sgkf.split(X, y, groups=g) + gkf_folds = gkf.split(X, y, groups=g) + sgkf_entr = 0 + gkf_entr = 0 + for (sgkf_train, sgkf_test), (_, gkf_test) in zip(sgkf_folds, gkf_folds): + # check group constraint + assert np.intersect1d(g[sgkf_train], g[sgkf_test]).size == 0 + sgkf_distr = np.bincount(y[sgkf_test]) / len(sgkf_test) + gkf_distr = np.bincount(y[gkf_test]) / len(gkf_test) + sgkf_entr += stats.entropy(sgkf_distr, qk=cls_distr) + gkf_entr += stats.entropy(gkf_distr, qk=cls_distr) + sgkf_entr /= n_splits + gkf_entr /= n_splits + assert sgkf_entr <= gkf_entr + + def test_shuffle_split(): ss1 = ShuffleSplit(test_size=0.2, random_state=0).split(X) ss2 = ShuffleSplit(test_size=2, random_state=0).split(X) @@ -1310,7 +1435,8 @@ def test_cv_iterable_wrapper(): "successive calls to split should yield different results") -def test_group_kfold(): +@pytest.mark.parametrize('kfold', [GroupKFold, StratifiedGroupKFold]) +def test_group_kfold(kfold): rng = np.random.RandomState(0) # Parameters of the test @@ -1329,7 +1455,7 @@ def test_group_kfold(): len(np.unique(groups)) # Get the test fold indices from the test set indices of each fold folds = np.zeros(n_samples) - lkf = GroupKFold(n_splits=n_splits) + lkf = kfold(n_splits=n_splits) for i, (_, test) in enumerate(lkf.split(X, y, groups)): folds[test] = i @@ -1569,7 +1695,7 @@ def test_nested_cv(): groups = rng.randint(0, 5, 15) cvs = [LeaveOneGroupOut(), LeaveOneOut(), GroupKFold(n_splits=3), - StratifiedKFold(), + StratifiedKFold(), StratifiedGroupKFold(), StratifiedShuffleSplit(n_splits=3, random_state=0)] for inner_cv, outer_cv in combinations_with_replacement(cvs, 2): @@ -1640,7 +1766,8 @@ def test_leave_p_out_empty_trainset(): next(cv.split(X, y, groups=[1, 2])) -@pytest.mark.parametrize('Klass', (KFold, StratifiedKFold)) +@pytest.mark.parametrize('Klass', + (KFold, StratifiedKFold, StratifiedGroupKFold)) def test_random_state_shuffle_false(Klass): # passing a non-default random_state when shuffle=False makes no sense with pytest.raises(ValueError, @@ -1653,6 +1780,8 @@ def test_random_state_shuffle_false(Klass): (KFold(shuffle=True, random_state=123), True), (StratifiedKFold(), True), (StratifiedKFold(shuffle=True, random_state=123), True), + (StratifiedGroupKFold(shuffle=True, random_state=123), True), + (StratifiedGroupKFold(), True), (RepeatedKFold(random_state=123), True), (RepeatedStratifiedKFold(random_state=123), True), (ShuffleSplit(random_state=123), True), @@ -1664,7 +1793,6 @@ def test_random_state_shuffle_false(Klass): (LeaveOneGroupOut(), True), (LeavePGroupsOut(n_groups=2), True), (LeavePOut(p=2), True), - (KFold(shuffle=True, random_state=None), False), (KFold(shuffle=True, random_state=None), False), (StratifiedKFold(shuffle=True, random_state=np.random.RandomState(0)), From 3e45aeef901871b84ce59709e62f3d2245463cd8 Mon Sep 17 00:00:00 2001 From: LSturtew <56136443+LSturtew@users.noreply.github.com> Date: Sat, 20 Mar 2021 15:07:09 +0100 Subject: [PATCH 265/478] TST Remove assert warn from preprocessing tests (#19691) --- sklearn/preprocessing/tests/test_data.py | 37 ++++++++++++------- .../tests/test_discretization.py | 21 ++++++----- .../tests/test_function_transformer.py | 27 ++++++++------ sklearn/preprocessing/tests/test_label.py | 14 +++---- 4 files changed, 
56 insertions(+), 43 deletions(-) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 196060388ddd2..8a30eba27cff7 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -19,8 +19,6 @@ from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_less -from sklearn.utils._testing import assert_warns_message -from sklearn.utils._testing import assert_no_warnings from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_allclose_dense_sparse from sklearn.utils._testing import skip_if_32bit @@ -291,28 +289,37 @@ def test_standard_scaler_numerical_stability(): x = np.full(8, np.log(1e-5), dtype=np.float64) # This does not raise a warning as the number of samples is too low # to trigger the problem in recent numpy - x_scaled = assert_no_warnings(scale, x) + with pytest.warns(None) as record: + scale(x) + assert len(record) == 0 assert_array_almost_equal(scale(x), np.zeros(8)) # with 2 more samples, the std computation run into numerical issues: x = np.full(10, np.log(1e-5), dtype=np.float64) - w = "standard deviation of the data is probably very close to 0" - x_scaled = assert_warns_message(UserWarning, w, scale, x) + warning_message = ( + "standard deviation of the data is probably very close to 0" + ) + with pytest.warns(UserWarning, match=warning_message): + x_scaled = scale(x) assert_array_almost_equal(x_scaled, np.zeros(10)) x = np.full(10, 1e-100, dtype=np.float64) - x_small_scaled = assert_no_warnings(scale, x) + with pytest.warns(None) as record: + x_small_scaled = scale(x) + assert len(record) == 0 assert_array_almost_equal(x_small_scaled, np.zeros(10)) # Large values can cause (often recoverable) numerical stability issues: x_big = np.full(10, 1e100, dtype=np.float64) - w = "Dataset may contain too large values" - x_big_scaled = assert_warns_message(UserWarning, w, scale, x_big) + warning_message = ( + "Dataset may contain too large values" + ) + with pytest.warns(UserWarning, match=warning_message): + x_big_scaled = scale(x_big) assert_array_almost_equal(x_big_scaled, np.zeros(10)) assert_array_almost_equal(x_big_scaled, x_small_scaled) - - x_big_centered = assert_warns_message(UserWarning, w, scale, x_big, - with_std=False) + with pytest.warns(UserWarning, match=warning_message): + x_big_centered = scale(x_big, with_std=False) assert_array_almost_equal(x_big_centered, np.zeros(10)) assert_array_almost_equal(x_big_centered, x_small_scaled) @@ -1239,9 +1246,11 @@ def test_quantile_transform_sparse_ignore_zeros(): n_quantiles=5) # dense case -> warning raise - assert_warns_message(UserWarning, "'ignore_implicit_zeros' takes effect" - " only with sparse matrix. This parameter has no" - " effect.", transformer.fit, X) + warning_message = ("'ignore_implicit_zeros' takes effect" + " only with sparse matrix. 
This parameter has no" + " effect.") + with pytest.warns(UserWarning, match=warning_message): + transformer.fit(X) X_expected = np.array([[0, 0], [0, 0], diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py index 9d607c82d5831..87f3de1ce4c6c 100644 --- a/sklearn/preprocessing/tests/test_discretization.py +++ b/sklearn/preprocessing/tests/test_discretization.py @@ -9,7 +9,6 @@ from sklearn.utils._testing import ( assert_array_almost_equal, assert_array_equal, - assert_warns_message, assert_allclose_dense_sparse ) @@ -109,9 +108,10 @@ def test_same_min_max(strategy): [1, 0], [1, 1]]) est = KBinsDiscretizer(strategy=strategy, n_bins=3, encode='ordinal') - assert_warns_message(UserWarning, - "Feature 0 is constant and will be replaced " - "with 0.", est.fit, X) + warning_message = ("Feature 0 is constant and will be replaced " + "with 0.") + with pytest.warns(UserWarning, match=warning_message): + est.fit(X) assert est.n_bins_[0] == 1 # replace the feature with zeros Xt = est.transform(X) @@ -257,9 +257,9 @@ def test_overwrite(): def test_redundant_bins(strategy, expected_bin_edges): X = [[0], [0], [0], [0], [3], [3]] kbd = KBinsDiscretizer(n_bins=3, strategy=strategy) - msg = ("Bins whose width are too small (i.e., <= 1e-8) in feature 0 " - "are removed. Consider decreasing the number of bins.") - assert_warns_message(UserWarning, msg, kbd.fit, X) + warning_message = ("Consider decreasing the number of bins.") + with pytest.warns(UserWarning, match=warning_message): + kbd.fit(X) assert_array_almost_equal(kbd.bin_edges_[0], expected_bin_edges) @@ -269,9 +269,10 @@ def test_percentile_numeric_stability(): Xt = np.array([0, 0, 4]).reshape(-1, 1) kbd = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile') - msg = ("Bins whose width are too small (i.e., <= 1e-8) in feature 0 " - "are removed. Consider decreasing the number of bins.") - assert_warns_message(UserWarning, msg, kbd.fit, X) + warning_message = ("Consider decreasing the number of bins.") + with pytest.warns(UserWarning, match=warning_message): + kbd.fit(X) + assert_array_almost_equal(kbd.bin_edges_[0], bin_edges) assert_array_almost_equal(kbd.transform(X), Xt) diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index 924975fbed2e1..327bfa95f1160 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py @@ -4,8 +4,7 @@ from sklearn.preprocessing import FunctionTransformer from sklearn.utils._testing import (assert_array_equal, - assert_allclose_dense_sparse) -from sklearn.utils._testing import assert_warns_message, assert_no_warnings + assert_allclose_dense_sparse) def _make_func(args_store, kwargs_store, func=lambda X, *a, **k: X): @@ -127,29 +126,35 @@ def test_check_inverse(): accept_sparse=accept_sparse, check_inverse=True, validate=True) - assert_warns_message(UserWarning, - "The provided functions are not strictly" - " inverse of each other. If you are sure you" - " want to proceed regardless, set" - " 'check_inverse=False'.", - trans.fit, X) + warning_message = ("The provided functions are not strictly" + " inverse of each other. 
If you are sure you" + " want to proceed regardless, set" + " 'check_inverse=False'.") + with pytest.warns(UserWarning, match=warning_message): + trans.fit(X) trans = FunctionTransformer(func=np.expm1, inverse_func=np.log1p, accept_sparse=accept_sparse, check_inverse=True, validate=True) - Xt = assert_no_warnings(trans.fit_transform, X) + with pytest.warns(None) as record: + Xt = trans.fit_transform(X) + assert len(record) == 0 assert_allclose_dense_sparse(X, trans.inverse_transform(Xt)) # check that we don't check inverse when one of the func or inverse is not # provided. trans = FunctionTransformer(func=np.expm1, inverse_func=None, check_inverse=True, validate=True) - assert_no_warnings(trans.fit, X_dense) + with pytest.warns(None) as record: + trans.fit(X_dense) + assert len(record) == 0 trans = FunctionTransformer(func=None, inverse_func=np.expm1, check_inverse=True, validate=True) - assert_no_warnings(trans.fit, X_dense) + with pytest.warns(None) as record: + trans.fit(X_dense) + assert len(record) == 0 def test_function_transformer_frame(): diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index aa9361d9164de..fd396ceb90712 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -12,7 +12,6 @@ from sklearn.utils.multiclass import type_of_target from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import ignore_warnings from sklearn.utils import _to_object_array @@ -351,15 +350,14 @@ def test_multilabel_binarizer_unknown_class(): mlb = MultiLabelBinarizer() y = [[1, 2]] Y = np.array([[1, 0], [0, 1]]) - w = 'unknown class(es) [0, 4] will be ignored' - matrix = assert_warns_message(UserWarning, w, - mlb.fit(y).transform, [[4, 1], [2, 0]]) - assert_array_equal(matrix, Y) + warning_message = 'unknown class.* will be ignored' + with pytest.warns(UserWarning, match=warning_message): + matrix = mlb.fit(y).transform([[4, 1], [2, 0]]) Y = np.array([[1, 0, 0], [0, 1, 0]]) mlb = MultiLabelBinarizer(classes=[1, 2, 3]) - matrix = assert_warns_message(UserWarning, w, - mlb.fit(y).transform, [[4, 1], [2, 0]]) + with pytest.warns(UserWarning, match=warning_message): + matrix = mlb.fit(y).transform([[4, 1], [2, 0]]) assert_array_equal(matrix, Y) @@ -535,7 +533,7 @@ def check_binarized_results(y, classes, pos_label, neg_label, expected): output_type=y_type, classes=classes, threshold=((neg_label + - pos_label) / + pos_label) / 2.)) assert_array_equal(toarray(inversed), toarray(y)) From 97bfa9266956993ed2c487ff085a6cea94b8eb59 Mon Sep 17 00:00:00 2001 From: JohanWork <39947546+JohanWork@users.noreply.github.com> Date: Sun, 21 Mar 2021 14:10:57 +0100 Subject: [PATCH 266/478] DOC Update broken link in conftest.py (#19736) --- conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conftest.py b/conftest.py index aec49c03ae13d..006dc973140a5 100644 --- a/conftest.py +++ b/conftest.py @@ -1,7 +1,7 @@ # Even if empty this file is useful so that when running from the root folder # ./sklearn is added to sys.path by pytest. See -# https://docs.pytest.org/en/latest/pythonpath.html for more details. For -# example, this allows to build extensions in place and run pytest +# https://docs.pytest.org/en/latest/explanation/pythonpath.html for more +# details. 
For example, this allows to build extensions in place and run pytest # doc/modules/clustering.rst and use sklearn from the local folder rather than # the one from site-packages. From e377d858325276ccbe1c5ac19403182c385c2184 Mon Sep 17 00:00:00 2001 From: cliffordEmmanuel <45907515+cliffordEmmanuel@users.noreply.github.com> Date: Sun, 21 Mar 2021 14:04:22 +0000 Subject: [PATCH 267/478] DOC Fix doc for single linkage in feature agglomeration (#19715) --- sklearn/cluster/_agglomerative.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 66342797e33b5..ee0a117824dd8 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -1000,7 +1000,7 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): the two sets. - complete or maximum linkage uses the maximum distances between all features of the two sets. - - single uses the minimum of the distances between all observations + - single uses the minimum of the distances between all features of the two sets. pooling_func : callable, default=np.mean From 40f2dd1be18b70db53c746d2ae02465b62fbe01f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jon=20Haitz=20Legarreta=20Gorro=C3=B1o?= Date: Sun, 21 Mar 2021 13:06:49 -0400 Subject: [PATCH 268/478] DOC Fix typo in KDE metric docstring default value (#19735) --- sklearn/neighbors/_kde.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 57f80f83762fb..5a5ad55d3261c 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -41,7 +41,7 @@ class KernelDensity(BaseEstimator): 'cosine'}, default='gaussian' The kernel to use. - metric : str, default='euclidian' + metric : str, default='euclidean' The distance metric to use. Note that not all metrics are valid with all algorithms. Refer to the documentation of :class:`BallTree` and :class:`KDTree` for a description of From 1db2681a051dc54b3e8b2af23a90830c67c1f56a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mehmet=20Ali=20=C3=96zer?= Date: Sun, 21 Mar 2021 20:07:24 +0300 Subject: [PATCH 269/478] DOC Fix load iris datasets (#19729) --- doc/modules/tree.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index 6d30fdcc6bf2f..d62ca5d8ed3e4 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -130,7 +130,8 @@ Using the Iris dataset, we can construct a tree as follows:: >>> from sklearn.datasets import load_iris >>> from sklearn import tree - >>> X, y = load_iris(return_X_y=True) + >>> iris = load_iris() + >>> X, y = iris.data, iris.target >>> clf = tree.DecisionTreeClassifier() >>> clf = clf.fit(X, y) From 81102146e35c81d7aab16d448f1c2b66d8a67ed9 Mon Sep 17 00:00:00 2001 From: guiweber Date: Sun, 21 Mar 2021 15:06:12 -0400 Subject: [PATCH 270/478] DOC Fixed typo in cross_val_predict docstring (#19739) --- sklearn/model_selection/_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index e61e693b2fa74..5f5338512a0f2 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -784,7 +784,7 @@ def cross_val_predict(estimator, X, y=None, *, groups=None, cv=None, verbose : int, default=0 The verbosity level. - fit_params : dict, defualt=None + fit_params : dict, default=None Parameters to pass to the fit method of the estimator. 
pre_dispatch : int or str, default='2*n_jobs' From c854b83c91dd8c1bf9282112dba10d50e43b59a4 Mon Sep 17 00:00:00 2001 From: Albert Thomas Date: Tue, 23 Mar 2021 10:53:57 +0100 Subject: [PATCH 271/478] [MRG] Linear One-Class SVM using SGD implementation (#10027) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tom Dupré la Tour Co-authored-by: Olivier Grisel --- benchmarks/bench_online_ocsvm.py | 279 ++++++++++ doc/modules/classes.rst | 1 + doc/modules/outlier_detection.rst | 32 +- doc/modules/sgd.rst | 52 ++ doc/whats_new/v1.0.rst | 7 + .../linear_model/plot_sgdocsvm_vs_ocsvm.py | 135 +++++ .../miscellaneous/plot_anomaly_comparison.py | 30 +- sklearn/linear_model/__init__.py | 3 +- sklearn/linear_model/_sgd_fast.pyx | 16 +- sklearn/linear_model/_stochastic_gradient.py | 486 +++++++++++++++++- sklearn/linear_model/tests/test_sgd.py | 384 +++++++++++++- sklearn/svm/_classes.py | 4 + 12 files changed, 1381 insertions(+), 48 deletions(-) create mode 100644 benchmarks/bench_online_ocsvm.py create mode 100644 examples/linear_model/plot_sgdocsvm_vs_ocsvm.py diff --git a/benchmarks/bench_online_ocsvm.py b/benchmarks/bench_online_ocsvm.py new file mode 100644 index 0000000000000..33262e8fcb690 --- /dev/null +++ b/benchmarks/bench_online_ocsvm.py @@ -0,0 +1,279 @@ +""" +===================================== +SGDOneClassSVM benchmark +===================================== +This benchmark compares the :class:`SGDOneClassSVM` with :class:`OneClassSVM`. +The former is an online One-Class SVM implemented with a Stochastic Gradient +Descent (SGD). The latter is based on the LibSVM implementation. The +complexity of :class:`SGDOneClassSVM` is linear in the number of samples +whereas the one of :class:`OneClassSVM` is at best quadratic in the number of +samples. We here compare the performance in terms of AUC and training time on +classical anomaly detection datasets. + +The :class:`OneClassSVM` is applied with a Gaussian kernel and we therefore +use a kernel approximation prior to the application of :class:`SGDOneClassSVM`. +""" + +from time import time +import numpy as np + +from scipy.interpolate import interp1d + +from sklearn.metrics import roc_curve, auc +from sklearn.datasets import fetch_kddcup99, fetch_covtype +from sklearn.preprocessing import LabelBinarizer, StandardScaler +from sklearn.pipeline import make_pipeline +from sklearn.utils import shuffle +from sklearn.kernel_approximation import Nystroem +from sklearn.svm import OneClassSVM +from sklearn.linear_model import SGDOneClassSVM + +import matplotlib.pyplot as plt +import matplotlib + +font = {'weight': 'normal', + 'size': 15} + +matplotlib.rc('font', **font) + +print(__doc__) + + +def print_outlier_ratio(y): + """ + Helper function to show the distinct value count of element in the target. + Useful indicator for the datasets used in bench_isolation_forest.py. 
+ """ + uniq, cnt = np.unique(y, return_counts=True) + print("----- Target count values: ") + for u, c in zip(uniq, cnt): + print("------ %s -> %d occurrences" % (str(u), c)) + print("----- Outlier ratio: %.5f" % (np.min(cnt) / len(y))) + + +# for roc curve computation +n_axis = 1000 +x_axis = np.linspace(0, 1, n_axis) + +datasets = ['http', 'smtp', 'SA', 'SF', 'forestcover'] + +novelty_detection = False # if False, training set polluted by outliers + +random_states = [42] +nu = 0.05 + +results_libsvm = np.empty((len(datasets), n_axis + 5)) +results_online = np.empty((len(datasets), n_axis + 5)) + +for dat, dataset_name in enumerate(datasets): + + print(dataset_name) + + # Loading datasets + if dataset_name in ['http', 'smtp', 'SA', 'SF']: + dataset = fetch_kddcup99(subset=dataset_name, shuffle=False, + percent10=False, random_state=88) + X = dataset.data + y = dataset.target + + if dataset_name == 'forestcover': + dataset = fetch_covtype(shuffle=False) + X = dataset.data + y = dataset.target + # normal data are those with attribute 2 + # abnormal those with attribute 4 + s = (y == 2) + (y == 4) + X = X[s, :] + y = y[s] + y = (y != 2).astype(int) + + # Vectorizing data + if dataset_name == 'SF': + # Casting type of X (object) as string is needed for string categorical + # features to apply LabelBinarizer + lb = LabelBinarizer() + x1 = lb.fit_transform(X[:, 1].astype(str)) + X = np.c_[X[:, :1], x1, X[:, 2:]] + y = (y != b'normal.').astype(int) + + if dataset_name == 'SA': + lb = LabelBinarizer() + # Casting type of X (object) as string is needed for string categorical + # features to apply LabelBinarizer + x1 = lb.fit_transform(X[:, 1].astype(str)) + x2 = lb.fit_transform(X[:, 2].astype(str)) + x3 = lb.fit_transform(X[:, 3].astype(str)) + X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]] + y = (y != b'normal.').astype(int) + + if dataset_name in ['http', 'smtp']: + y = (y != b'normal.').astype(int) + + print_outlier_ratio(y) + + n_samples, n_features = np.shape(X) + if dataset_name == 'SA': # LibSVM too long with n_samples // 2 + n_samples_train = n_samples // 20 + else: + n_samples_train = n_samples // 2 + + n_samples_test = n_samples - n_samples_train + print('n_train: ', n_samples_train) + print('n_features: ', n_features) + + tpr_libsvm = np.zeros(n_axis) + tpr_online = np.zeros(n_axis) + fit_time_libsvm = 0 + fit_time_online = 0 + predict_time_libsvm = 0 + predict_time_online = 0 + + X = X.astype(float) + + gamma = 1 / n_features # OCSVM default parameter + + for random_state in random_states: + + print('random state: %s' % random_state) + + X, y = shuffle(X, y, random_state=random_state) + X_train = X[:n_samples_train] + X_test = X[n_samples_train:] + y_train = y[:n_samples_train] + y_test = y[n_samples_train:] + + if novelty_detection: + X_train = X_train[y_train == 0] + y_train = y_train[y_train == 0] + + std = StandardScaler() + + print('----------- LibSVM OCSVM ------------') + ocsvm = OneClassSVM(kernel='rbf', gamma=gamma, nu=nu) + pipe_libsvm = make_pipeline(std, ocsvm) + + tstart = time() + pipe_libsvm.fit(X_train) + fit_time_libsvm += time() - tstart + + tstart = time() + # scoring such that the lower, the more normal + scoring = -pipe_libsvm.decision_function(X_test) + predict_time_libsvm += time() - tstart + fpr_libsvm_, tpr_libsvm_, _ = roc_curve(y_test, scoring) + + f_libsvm = interp1d(fpr_libsvm_, tpr_libsvm_) + tpr_libsvm += f_libsvm(x_axis) + + print('----------- Online OCSVM ------------') + nystroem = Nystroem(gamma=gamma, random_state=random_state) + online_ocsvm = 
SGDOneClassSVM(nu=nu, random_state=random_state) + pipe_online = make_pipeline(std, nystroem, online_ocsvm) + + tstart = time() + pipe_online.fit(X_train) + fit_time_online += time() - tstart + + tstart = time() + # scoring such that the lower, the more normal + scoring = -pipe_online.decision_function(X_test) + predict_time_online += time() - tstart + fpr_online_, tpr_online_, _ = roc_curve(y_test, scoring) + + f_online = interp1d(fpr_online_, tpr_online_) + tpr_online += f_online(x_axis) + + tpr_libsvm /= len(random_states) + tpr_libsvm[0] = 0. + fit_time_libsvm /= len(random_states) + predict_time_libsvm /= len(random_states) + auc_libsvm = auc(x_axis, tpr_libsvm) + + results_libsvm[dat] = ([fit_time_libsvm, predict_time_libsvm, + auc_libsvm, n_samples_train, + n_features] + list(tpr_libsvm)) + + tpr_online /= len(random_states) + tpr_online[0] = 0. + fit_time_online /= len(random_states) + predict_time_online /= len(random_states) + auc_online = auc(x_axis, tpr_online) + + results_online[dat] = ([fit_time_online, predict_time_online, + auc_online, n_samples_train, + n_features] + list(tpr_libsvm)) + + +# -------- Plotting bar charts ------------- +fit_time_libsvm_all = results_libsvm[:, 0] +predict_time_libsvm_all = results_libsvm[:, 1] +auc_libsvm_all = results_libsvm[:, 2] +n_train_all = results_libsvm[:, 3] +n_features_all = results_libsvm[:, 4] + +fit_time_online_all = results_online[:, 0] +predict_time_online_all = results_online[:, 1] +auc_online_all = results_online[:, 2] + + +width = 0.7 +ind = 2 * np.arange(len(datasets)) +x_tickslabels = [(name + '\n' + r'$n={:,d}$' + '\n' + r'$d={:d}$') + .format(int(n), int(d)) + for name, n, d in zip(datasets, n_train_all, n_features_all)] + + +def autolabel_auc(rects, ax): + """Attach a text label above each bar displaying its height.""" + for rect in rects: + height = rect.get_height() + ax.text(rect.get_x() + rect.get_width() / 2., 1.05 * height, + '%.3f' % height, ha='center', va='bottom') + + +def autolabel_time(rects, ax): + """Attach a text label above each bar displaying its height.""" + for rect in rects: + height = rect.get_height() + ax.text(rect.get_x() + rect.get_width() / 2., 1.05 * height, + '%.1f' % height, ha='center', va='bottom') + + +fig, ax = plt.subplots(figsize=(15, 8)) +ax.set_ylabel('AUC') +ax.set_ylim((0, 1.3)) +rect_libsvm = ax.bar(ind, auc_libsvm_all, width=width, color='r') +rect_online = ax.bar(ind + width, auc_online_all, width=width, color='y') +ax.legend((rect_libsvm[0], rect_online[0]), ('LibSVM', 'Online SVM')) +ax.set_xticks(ind + width / 2) +ax.set_xticklabels(x_tickslabels) +autolabel_auc(rect_libsvm, ax) +autolabel_auc(rect_online, ax) +plt.show() + + +fig, ax = plt.subplots(figsize=(15, 8)) +ax.set_ylabel('Training time (sec) - Log scale') +ax.set_yscale('log') +rect_libsvm = ax.bar(ind, fit_time_libsvm_all, color='r', width=width) +rect_online = ax.bar(ind + width, fit_time_online_all, color='y', width=width) +ax.legend((rect_libsvm[0], rect_online[0]), ('LibSVM', 'Online SVM')) +ax.set_xticks(ind + width / 2) +ax.set_xticklabels(x_tickslabels) +autolabel_time(rect_libsvm, ax) +autolabel_time(rect_online, ax) +plt.show() + + +fig, ax = plt.subplots(figsize=(15, 8)) +ax.set_ylabel('Testing time (sec) - Log scale') +ax.set_yscale('log') +rect_libsvm = ax.bar(ind, predict_time_libsvm_all, color='r', width=width) +rect_online = ax.bar(ind + width, predict_time_online_all, + color='y', width=width) +ax.legend((rect_libsvm[0], rect_online[0]), ('LibSVM', 'Online SVM')) +ax.set_xticks(ind + width / 2) 
+ax.set_xticklabels(x_tickslabels) +autolabel_time(rect_libsvm, ax) +autolabel_time(rect_online, ax) +plt.show() diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index ceebfc337352a..45195dcedec64 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -762,6 +762,7 @@ Linear classifiers linear_model.RidgeClassifier linear_model.RidgeClassifierCV linear_model.SGDClassifier + linear_model.SGDOneClassSVM Classical linear regressors --------------------------- diff --git a/doc/modules/outlier_detection.rst b/doc/modules/outlier_detection.rst index 5d2008f3c3f58..14495bc558dab 100644 --- a/doc/modules/outlier_detection.rst +++ b/doc/modules/outlier_detection.rst @@ -110,9 +110,14 @@ does not perform very well for outlier detection. That being said, outlier detection in high-dimension, or without any assumptions on the distribution of the inlying data is very challenging. :class:`svm.OneClassSVM` may still be used with outlier detection but requires fine-tuning of its hyperparameter -`nu` to handle outliers and prevent overfitting. Finally, -:class:`covariance.EllipticEnvelope` assumes the data is Gaussian and learns -an ellipse. For more details on the different estimators refer to the example +`nu` to handle outliers and prevent overfitting. +:class:`linear_model.SGDOneClassSVM` provides an implementation of a +linear One-Class SVM with a linear complexity in the number of samples. This +implementation is here used with a kernel approximation technique to obtain +results similar to :class:`svm.OneClassSVM` which uses a Gaussian kernel +by default. Finally, :class:`covariance.EllipticEnvelope` assumes the data is +Gaussian and learns an ellipse. For more details on the different estimators +refer to the example :ref:`sphx_glr_auto_examples_miscellaneous_plot_anomaly_comparison.py` and the sections hereunder. @@ -173,6 +178,23 @@ but regular, observation outside the frontier. :scale: 75% +Scaling up the One-Class SVM +---------------------------- + +An online linear version of the One-Class SVM is implemented in +:class:`linear_model.SGDOneClassSVM`. This implementation scales linearly with +the number of samples and can be used with a kernel approximation to +approximate the solution of a kernelized :class:`svm.OneClassSVM` whose +complexity is at best quadratic in the number of samples. See section +:ref:`sgd_online_one_class_svm` for more details. + +.. topic:: Examples: + + * See :ref:`sphx_glr_auto_examples_linear_model_plot_sgdocsvm_vs_ocsvm.py` + for an illustration of the approximation of a kernelized One-Class SVM + with the `linear_model.SGDOneClassSVM` combined with kernel approximation. + + Outlier Detection ================= @@ -278,8 +300,8 @@ allows you to add more trees to an already fitted model:: for a comparison of :class:`ensemble.IsolationForest` with :class:`neighbors.LocalOutlierFactor`, :class:`svm.OneClassSVM` (tuned to perform like an outlier detection - method) and a covariance-based outlier detection with - :class:`covariance.EllipticEnvelope`. + method), :class:`linear_model.SGDOneClassSVM`, and a covariance-based + outlier detection with :class:`covariance.EllipticEnvelope`. .. 
topic:: References: diff --git a/doc/modules/sgd.rst b/doc/modules/sgd.rst index 1376947540e78..0a1d8407e64ae 100644 --- a/doc/modules/sgd.rst +++ b/doc/modules/sgd.rst @@ -232,6 +232,58 @@ For regression with a squared loss and a l2 penalty, another variant of SGD with an averaging strategy is available with Stochastic Average Gradient (SAG) algorithm, available as a solver in :class:`Ridge`. +.. _sgd_online_one_class_svm: + +Online One-Class SVM +==================== + +The class :class:`sklearn.linear_model.SGDOneClassSVM` implements an online +linear version of the One-Class SVM using a stochastic gradient descent. +Combined with kernel approximation techniques, +:class:`sklearn.linear_model.SGDOneClassSVM` can be used to approximate the +solution of a kernelized One-Class SVM, implemented in +:class:`sklearn.svm.OneClassSVM`, with a linear complexity in the number of +samples. Note that the complexity of a kernelized One-Class SVM is at best +quadratic in the number of samples. +:class:`sklearn.linear_model.SGDOneClassSVM` is thus well suited for datasets +with a large number of training samples (> 10,000) for which the SGD +variant can be several orders of magnitude faster. + +Its implementation is based on the implementation of the stochastic +gradient descent. Indeed, the original optimization problem of the One-Class +SVM is given by + +.. math:: + + \begin{aligned} + \min_{w, \rho, \xi} & \quad \frac{1}{2}\Vert w \Vert^2 - \rho + \frac{1}{\nu n} \sum_{i=1}^n \xi_i \\ + \text{s.t.} & \quad \langle w, x_i \rangle \geq \rho - \xi_i \quad 1 \leq i \leq n \\ + & \quad \xi_i \geq 0 \quad 1 \leq i \leq n + \end{aligned} + +where :math:`\nu \in (0, 1]` is the user-specified parameter controlling the +proportion of outliers and the proportion of support vectors. Getting rid of +the slack variables :math:`\xi_i` this problem is equivalent to + +.. math:: + + \min_{w, \rho} \frac{1}{2}\Vert w \Vert^2 - \rho + \frac{1}{\nu n} \sum_{i=1}^n \max(0, \rho - \langle w, x_i \rangle) \, . + +Multiplying by the constant :math:`\nu` and introducing the intercept +:math:`b = 1 - \rho` we obtain the following equivalent optimization problem + +.. math:: + + \min_{w, b} \frac{\nu}{2}\Vert w \Vert^2 + b\nu + \frac{1}{n} \sum_{i=1}^n \max(0, 1 - (\langle w, x_i \rangle + b)) \, . + +This is similar to the optimization problems studied in section +:ref:`sgd_mathematical_formulation` with :math:`y_i = 1, 1 \leq i \leq n` and +:math:`\alpha = \nu/2`, :math:`L` being the hinge loss function and :math:`R` +being the L2 norm. We just need to add the term :math:`b\nu` in the +optimization loop. + +As :class:`SGDClassifier` and :class:`SGDRegressor`, :class:`SGDOneClassSVM` +supports averaged SGD. Averaging can be enabled by setting ``average=True``. Stochastic Gradient Descent for sparse data =========================================== diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 521e358ac2f02..c252f5df1074e 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -147,6 +147,13 @@ Changelog :mod:`sklearn.linear_model` ........................... +- |Feature| The new :class:`linear_model.SGDOneClassSVM` provides an SGD + implementation of the linear One-Class SVM. Combined with kernel + approximation techniques, this implementation approximates the solution of + a kernelized One Class SVM while benefitting from a linear + complexity in the number of samples. + :pr:`10027` by :user:`Albert Thomas `. 
+ - |Efficiency| The implementation of :class:`linear_model.LogisticRegression` has been optimised for dense matrices when using `solver='newton-cg'` and `multi_class!='multinomial'`. diff --git a/examples/linear_model/plot_sgdocsvm_vs_ocsvm.py b/examples/linear_model/plot_sgdocsvm_vs_ocsvm.py new file mode 100644 index 0000000000000..e70694cdb1c1b --- /dev/null +++ b/examples/linear_model/plot_sgdocsvm_vs_ocsvm.py @@ -0,0 +1,135 @@ +""" +==================================================================== +One-Class SVM versus One-Class SVM using Stochastic Gradient Descent +==================================================================== + +This example shows how to approximate the solution of +:class:`sklearn.svm.OneClassSVM` in the case of an RBF kernel with +:class:`sklearn.linear_model.SGDOneClassSVM`, a Stochastic Gradient Descent +(SGD) version of the One-Class SVM. A kernel approximation is first used in +order to apply :class:`sklearn.linear_model.SGDOneClassSVM` which implements a +linear One-Class SVM using SGD. + +Note that :class:`sklearn.linear_model.SGDOneClassSVM` scales linearly with +the number of samples whereas the complexity of a kernelized +:class:`sklearn.svm.OneClassSVM` is at best quadratic with respect to the +number of samples. It is not the purpose of this example to illustrate the +benefits of such an approximation in terms of computation time but rather to +show that we obtain similar results on a toy dataset. +""" +print(__doc__) # noqa + +import numpy as np +import matplotlib.pyplot as plt +import matplotlib +from sklearn.svm import OneClassSVM +from sklearn.linear_model import SGDOneClassSVM +from sklearn.kernel_approximation import Nystroem +from sklearn.pipeline import make_pipeline + +font = {'weight': 'normal', + 'size': 15} + +matplotlib.rc('font', **font) + +random_state = 42 +rng = np.random.RandomState(random_state) + +# Generate train data +X = 0.3 * rng.randn(500, 2) +X_train = np.r_[X + 2, X - 2] +# Generate some regular novel observations +X = 0.3 * rng.randn(20, 2) +X_test = np.r_[X + 2, X - 2] +# Generate some abnormal novel observations +X_outliers = rng.uniform(low=-4, high=4, size=(20, 2)) + +xx, yy = np.meshgrid(np.linspace(-4.5, 4.5, 50), np.linspace(-4.5, 4.5, 50)) + +# OCSVM hyperparameters +nu = 0.05 +gamma = 2. 
+ +# Fit the One-Class SVM +clf = OneClassSVM(gamma=gamma, kernel='rbf', nu=nu) +clf.fit(X_train) +y_pred_train = clf.predict(X_train) +y_pred_test = clf.predict(X_test) +y_pred_outliers = clf.predict(X_outliers) +n_error_train = y_pred_train[y_pred_train == -1].size +n_error_test = y_pred_test[y_pred_test == -1].size +n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size + +Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) +Z = Z.reshape(xx.shape) + + +# Fit the One-Class SVM using a kernel approximation and SGD +transform = Nystroem(gamma=gamma, random_state=random_state) +clf_sgd = SGDOneClassSVM(nu=nu, shuffle=True, fit_intercept=True, + random_state=random_state, tol=1e-4) +pipe_sgd = make_pipeline(transform, clf_sgd) +pipe_sgd.fit(X_train) +y_pred_train_sgd = pipe_sgd.predict(X_train) +y_pred_test_sgd = pipe_sgd.predict(X_test) +y_pred_outliers_sgd = pipe_sgd.predict(X_outliers) +n_error_train_sgd = y_pred_train_sgd[y_pred_train_sgd == -1].size +n_error_test_sgd = y_pred_test_sgd[y_pred_test_sgd == -1].size +n_error_outliers_sgd = y_pred_outliers_sgd[y_pred_outliers_sgd == 1].size + +Z_sgd = pipe_sgd.decision_function(np.c_[xx.ravel(), yy.ravel()]) +Z_sgd = Z_sgd.reshape(xx.shape) + +# plot the level sets of the decision function +plt.figure(figsize=(9, 6)) +plt.title('One Class SVM') +plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu) +a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred') +plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred') + +s = 20 +b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s, edgecolors='k') +b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s, + edgecolors='k') +c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s, + edgecolors='k') +plt.axis('tight') +plt.xlim((-4.5, 4.5)) +plt.ylim((-4.5, 4.5)) +plt.legend([a.collections[0], b1, b2, c], + ["learned frontier", "training observations", + "new regular observations", "new abnormal observations"], + loc="upper left") +plt.xlabel( + "error train: %d/%d; errors novel regular: %d/%d; " + "errors novel abnormal: %d/%d" + % (n_error_train, X_train.shape[0], n_error_test, X_test.shape[0], + n_error_outliers, X_outliers.shape[0])) +plt.show() + +plt.figure(figsize=(9, 6)) +plt.title('Online One-Class SVM') +plt.contourf(xx, yy, Z_sgd, levels=np.linspace(Z_sgd.min(), 0, 7), + cmap=plt.cm.PuBu) +a = plt.contour(xx, yy, Z_sgd, levels=[0], linewidths=2, colors='darkred') +plt.contourf(xx, yy, Z_sgd, levels=[0, Z_sgd.max()], colors='palevioletred') + +s = 20 +b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s, edgecolors='k') +b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s, + edgecolors='k') +c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s, + edgecolors='k') +plt.axis('tight') +plt.xlim((-4.5, 4.5)) +plt.ylim((-4.5, 4.5)) +plt.legend([a.collections[0], b1, b2, c], + ["learned frontier", "training observations", + "new regular observations", "new abnormal observations"], + loc="upper left") +plt.xlabel( + "error train: %d/%d; errors novel regular: %d/%d; " + "errors novel abnormal: %d/%d" + % (n_error_train_sgd, X_train.shape[0], n_error_test_sgd, X_test.shape[0], + n_error_outliers_sgd, X_outliers.shape[0])) +plt.show() diff --git a/examples/miscellaneous/plot_anomaly_comparison.py b/examples/miscellaneous/plot_anomaly_comparison.py index b5ebd96bd8815..c0c3a4f890923 100644 --- a/examples/miscellaneous/plot_anomaly_comparison.py +++ 
b/examples/miscellaneous/plot_anomaly_comparison.py @@ -22,7 +22,17 @@ One-class SVM might give useful results in these situations depending on the value of its hyperparameters. -:class:`~sklearn.covariance.EllipticEnvelope` assumes the data is Gaussian and +The :class:`sklearn.linear_model.SGDOneClassSVM` is an implementation of the +One-Class SVM based on stochastic gradient descent (SGD). Combined with kernel +approximation, this estimator can be used to approximate the solution +of a kernelized :class:`sklearn.svm.OneClassSVM`. We note that, although not +identical, the decision boundaries of the +:class:`sklearn.linear_model.SGDOneClassSVM` and the ones of +:class:`sklearn.svm.OneClassSVM` are very similar. The main advantage of using +:class:`sklearn.linear_model.SGDOneClassSVM` is that it scales linearly with +the number of samples. + +:class:`sklearn.covariance.EllipticEnvelope` assumes the data is Gaussian and learns an ellipse. It thus degrades when the data is not unimodal. Notice however that this estimator is robust to outliers. @@ -66,6 +76,9 @@ from sklearn.covariance import EllipticEnvelope from sklearn.ensemble import IsolationForest from sklearn.neighbors import LocalOutlierFactor +from sklearn.linear_model import SGDOneClassSVM +from sklearn.kernel_approximation import Nystroem +from sklearn.pipeline import make_pipeline print(__doc__) @@ -77,11 +90,18 @@ n_outliers = int(outliers_fraction * n_samples) n_inliers = n_samples - n_outliers -# define outlier/anomaly detection methods to be compared +# define outlier/anomaly detection methods to be compared. +# the SGDOneClassSVM must be used in a pipeline with a kernel approximation +# to give similar results to the OneClassSVM anomaly_algorithms = [ ("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)), ("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)), + ("One-Class SVM (SGD)", make_pipeline( + Nystroem(gamma=0.1, random_state=42, n_components=150), + SGDOneClassSVM(nu=outliers_fraction, shuffle=True, + fit_intercept=True, random_state=42, tol=1e-6) + )), ("Isolation Forest", IsolationForest(contamination=outliers_fraction, random_state=42)), ("Local Outlier Factor", LocalOutlierFactor( @@ -104,7 +124,7 @@ xx, yy = np.meshgrid(np.linspace(-7, 7, 150), np.linspace(-7, 7, 150)) -plt.figure(figsize=(len(anomaly_algorithms) * 2 + 3, 12.5)) +plt.figure(figsize=(len(anomaly_algorithms) * 2 + 4, 12.5)) plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05, hspace=.01) @@ -113,8 +133,8 @@ for i_dataset, X in enumerate(datasets): # Add outliers - X = np.concatenate([X, rng.uniform(low=-6, high=6, - size=(n_outliers, 2))], axis=0) + X = np.concatenate([X, rng.uniform(low=-6, high=6, size=(n_outliers, 2))], + axis=0) for name, algorithm in anomaly_algorithms: t0 = time.time() diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 110e0008bccc9..f715e30795961 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,7 +18,7 @@ GammaRegressor, TweedieRegressor) from ._huber import HuberRegressor from ._sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber -from ._stochastic_gradient import SGDClassifier, SGDRegressor +from ._stochastic_gradient import SGDClassifier, SGDRegressor, SGDOneClassSVM from ._ridge import (Ridge, RidgeCV, RidgeClassifier, RidgeClassifierCV, ridge_regression) from ._logistic import LogisticRegression, LogisticRegressionCV @@ -65,6 +65,7 @@ 'RidgeClassifierCV', 
'SGDClassifier', 'SGDRegressor', + 'SGDOneClassSVM', 'SquaredLoss', 'TheilSenRegressor', 'enet_path', diff --git a/sklearn/linear_model/_sgd_fast.pyx b/sklearn/linear_model/_sgd_fast.pyx index 3940e5d873669..dab7b36b14d0e 100644 --- a/sklearn/linear_model/_sgd_fast.pyx +++ b/sklearn/linear_model/_sgd_fast.pyx @@ -55,7 +55,7 @@ cdef class LossFunction: Parameters ---------- p : double - The prediction, p = w^T x + The prediction, p = w^T x + intercept y : double The true value (aka target) @@ -358,6 +358,7 @@ def _plain_sgd(np.ndarray[double, ndim=1, mode='c'] weights, double weight_pos, double weight_neg, int learning_rate, double eta0, double power_t, + bint one_class, double t=1.0, double intercept_decay=1.0, int average=0): @@ -427,6 +428,8 @@ def _plain_sgd(np.ndarray[double, ndim=1, mode='c'] weights, The initial learning rate. power_t : double The exponent for inverse scaling learning rate. + one_class : boolean + Whether to solve the One-Class SVM optimization problem. t : double Initial state of the learning rate. This value is equal to the iteration count except when the learning rate is set to `optimal`. @@ -435,6 +438,7 @@ def _plain_sgd(np.ndarray[double, ndim=1, mode='c'] weights, The number of iterations before averaging starts. average=1 is equivalent to averaging for all iterations. + Returns ------- weights : array, shape=[n_features] @@ -468,6 +472,7 @@ def _plain_sgd(np.ndarray[double, ndim=1, mode='c'] weights, cdef double eta = 0.0 cdef double p = 0.0 cdef double update = 0.0 + cdef double intercept_update = 0.0 cdef double sumloss = 0.0 cdef double score = 0.0 cdef double best_loss = INFINITY @@ -574,10 +579,15 @@ def _plain_sgd(np.ndarray[double, ndim=1, mode='c'] weights, # do not scale to negative values when eta or alpha are too # big: instead set the weights to zero w.scale(max(0, 1.0 - ((1.0 - l1_ratio) * eta * alpha))) + if update != 0.0: w.add(x_data_ptr, x_ind_ptr, xnnz, update) - if fit_intercept == 1: - intercept += update * intercept_decay + if fit_intercept == 1: + intercept_update = update + if one_class: # specific for One-Class SVM + intercept_update -= 2. * eta * alpha + if intercept_update != 0: + intercept += intercept_update * intercept_decay if 0 < average <= t: # compute the average for the intercept and update the diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index a426c9a8d95f2..44ecf564ffcc5 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -2,7 +2,9 @@ # Mathieu Blondel (partial_fit support) # # License: BSD 3 clause -"""Classification and regression using Stochastic Gradient Descent (SGD).""" +"""Classification, regression and One-Class SVM using Stochastic Gradient +Descent (SGD). +""" import numpy as np import warnings @@ -14,7 +16,7 @@ from ..base import clone, is_classifier from ._base import LinearClassifierMixin, SparseCoefMixin from ._base import make_dataset -from ..base import BaseEstimator, RegressorMixin +from ..base import BaseEstimator, RegressorMixin, OutlierMixin from ..utils import check_random_state from ..utils.extmath import safe_sparse_dot from ..utils.multiclass import _check_partial_fit_first_call @@ -134,7 +136,7 @@ def _validate_params(self, for_partial_fit=False): raise ValueError("max_iter must be > zero. 
Got %f" % self.max_iter) if not (0.0 <= self.l1_ratio <= 1.0): raise ValueError("l1_ratio must be in [0, 1]") - if self.alpha < 0.0: + if not isinstance(self, SGDOneClassSVM) and self.alpha < 0.0: raise ValueError("alpha must be >= 0") if self.n_iter_no_change < 1: raise ValueError("n_iter_no_change must be >= 1") @@ -190,7 +192,7 @@ def _get_penalty_type(self, penalty): raise ValueError("Penalty %s is not supported. " % penalty) from e def _allocate_parameter_mem(self, n_classes, n_features, coef_init=None, - intercept_init=None): + intercept_init=None, one_class=0): """Allocate mem for parameters; initialize if provided.""" if n_classes > 2: # allocate coef_ for multi-class @@ -215,7 +217,7 @@ def _allocate_parameter_mem(self, n_classes, n_features, coef_init=None, self.intercept_ = np.zeros(n_classes, dtype=np.float64, order="C") else: - # allocate coef_ for binary problem + # allocate coef_ if coef_init is not None: coef_init = np.asarray(coef_init, dtype=np.float64, order="C") @@ -229,26 +231,36 @@ def _allocate_parameter_mem(self, n_classes, n_features, coef_init=None, dtype=np.float64, order="C") - # allocate intercept_ for binary problem + # allocate intercept_ if intercept_init is not None: intercept_init = np.asarray(intercept_init, dtype=np.float64) if intercept_init.shape != (1,) and intercept_init.shape != (): raise ValueError("Provided intercept_init " "does not match dataset.") - self.intercept_ = intercept_init.reshape(1,) + if one_class: + self.offset_ = intercept_init.reshape(1,) + else: + self.intercept_ = intercept_init.reshape(1,) else: - self.intercept_ = np.zeros(1, dtype=np.float64, order="C") + if one_class: + self.offset_ = np.zeros(1, dtype=np.float64, order="C") + else: + self.intercept_ = np.zeros(1, dtype=np.float64, order="C") # initialize average parameters if self.average > 0: self._standard_coef = self.coef_ - self._standard_intercept = self.intercept_ self._average_coef = np.zeros(self.coef_.shape, dtype=np.float64, order="C") - self._average_intercept = np.zeros(self._standard_intercept.shape, - dtype=np.float64, - order="C") + if one_class: + self._standard_intercept = 1 - self.offset_ + else: + self._standard_intercept = self.intercept_ + + self._average_intercept = np.zeros( + self._standard_intercept.shape, dtype=np.float64, + order="C") def _make_validation_split(self, y): """Split the dataset between training set and validation set. @@ -447,7 +459,7 @@ def fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter, est.early_stopping, validation_score_cb, int(est.n_iter_no_change), max_iter, tol, int(est.fit_intercept), int(est.verbose), int(est.shuffle), seed, pos_weight, neg_weight, learning_rate_type, - est.eta0, est.power_t, est.t_, intercept_decay, est.average) + est.eta0, est.power_t, 0, est.t_, intercept_decay, est.average) if est.average: if len(est.classes_) == 2: @@ -1363,7 +1375,7 @@ def _fit_regressor(self, X, y, alpha, C, loss, learning_rate, seed, 1.0, 1.0, learning_rate_type, - self.eta0, self.power_t, self.t_, + self.eta0, self.power_t, 0, self.t_, intercept_decay, self.average) self.t_ += self.n_iter_ * X.shape[0] @@ -1626,3 +1638,449 @@ def _more_tags(self): 'zero sample_weight is not equivalent to removing samples', } } + + +class SGDOneClassSVM(BaseSGD, OutlierMixin): + """Solves linear One-Class SVM using Stochastic Gradient Descent. + + This implementation is meant to be used with a kernel approximation + technique (e.g. 
`sklearn.kernel_approximation.Nystroem`) to obtain results + similar to `sklearn.svm.OneClassSVM` which uses a Gaussian kernel by + default. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + nu : float, optional + The nu parameter of the One Class SVM: an upper bound on the + fraction of training errors and a lower bound of the fraction of + support vectors. Should be in the interval (0, 1]. By default 0.5 + will be taken. + + fit_intercept : bool + Whether the intercept should be estimated or not. Defaults to True. + + max_iter : int, optional + The maximum number of passes over the training data (aka epochs). + It only impacts the behavior in the ``fit`` method, and not the + `partial_fit`. Defaults to 1000. + + tol : float or None, optional + The stopping criterion. If it is not None, the iterations will stop + when (loss > previous_loss - tol). Defaults to 1e-3. + + shuffle : bool, optional + Whether or not the training data should be shuffled after each epoch. + Defaults to True. + + verbose : integer, optional + The verbosity level + + random_state : int, RandomState instance or None, optional (default=None) + The seed of the pseudo random number generator to use when shuffling + the data. If int, random_state is the seed used by the random number + generator; If RandomState instance, random_state is the random number + generator; If None, the random number generator is the RandomState + instance used by `np.random`. + + learning_rate : string, optional + The learning rate schedule: + + 'constant': + eta = eta0 + 'optimal': [default] + eta = 1.0 / (alpha * (t + t0)) + where t0 is chosen by a heuristic proposed by Leon Bottou. + 'invscaling': + eta = eta0 / pow(t, power_t) + 'adaptive': + eta = eta0, as long as the training keeps decreasing. + Each time n_iter_no_change consecutive epochs fail to decrease the + training loss by tol or fail to increase validation score by tol if + early_stopping is True, the current learning rate is divided by 5. + + eta0 : double + The initial learning rate for the 'constant', 'invscaling' or + 'adaptive' schedules. The default value is 0.0 as eta0 is not used by + the default schedule 'optimal'. + + power_t : double + The exponent for inverse scaling learning rate [default 0.5]. + + warm_start : bool, optional + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + + Repeatedly calling fit or partial_fit when warm_start is True can + result in a different solution than when calling fit a single time + because of the way the data is shuffled. + If a dynamic learning rate is used, the learning rate is adapted + depending on the number of samples already seen. Calling ``fit`` resets + this counter, while ``partial_fit`` will result in increasing the + existing counter. + + average : bool or int, optional + When set to True, computes the averaged SGD weights and stores the + result in the ``coef_`` attribute. If set to an int greater than 1, + averaging will begin once the total number of samples seen reaches + average. So ``average=10`` will begin averaging after seeing 10 + samples. + + Attributes + ---------- + coef_ : array, shape (1, n_features) + Weights assigned to the features. + + offset_ : array, shape (1,) + Offset used to define the decision function from the raw scores. + We have the relation: decision_function = score_samples - offset. 
+ + n_iter_ : int + The actual number of iterations to reach the stopping criterion. + + t_ : int + Number of weight updates performed during training. + Same as ``(n_iter_ * n_samples)``. + + loss_function_ : concrete ``LossFunction`` + + Examples + -------- + >>> import numpy as np + >>> from sklearn import linear_model + >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) + >>> clf = linear_model.SGDOneClassSVM(random_state=42) + >>> clf.fit(X) + SGDOneClassSVM(random_state=42) + + >>> print(clf.predict([[4, 4]])) + [1] + + See also + -------- + sklearn.svm.OneClassSVM + + Notes + ----- + This estimator has a linear complexity in the number of training samples + and is thus better suited than the `sklearn.svm.OneClassSVM` + implementation for datasets with a large number of training samples (say + > 10,000). + """ + + loss_functions = {"hinge": (Hinge, 1.0)} + + def __init__(self, nu=0.5, fit_intercept=True, max_iter=1000, tol=1e-3, + shuffle=True, verbose=0, random_state=None, + learning_rate="optimal", eta0=0.0, power_t=0.5, + warm_start=False, average=False): + + alpha = nu / 2 + self.nu = nu + super(SGDOneClassSVM, self).__init__( + loss="hinge", penalty='l2', alpha=alpha, C=1.0, l1_ratio=0, + fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, + shuffle=shuffle, verbose=verbose, epsilon=DEFAULT_EPSILON, + random_state=random_state, learning_rate=learning_rate, + eta0=eta0, power_t=power_t, early_stopping=False, + validation_fraction=0.1, n_iter_no_change=5, + warm_start=warm_start, average=average) + + def _validate_params(self, for_partial_fit=False): + """Validate input params. """ + if not(0 < self.nu <= 1): + raise ValueError("nu must be in (0, 1], got nu=%f" % self.nu) + + super(SGDOneClassSVM, self)._validate_params( + for_partial_fit=for_partial_fit) + + def _fit_one_class(self, X, alpha, C, sample_weight, + learning_rate, max_iter): + """Uses SGD implementation with X and y=np.ones(n_samples).""" + + # The One-Class SVM uses the SGD implementation with + # y=np.ones(n_samples). + n_samples = X.shape[0] + y = np.ones(n_samples, dtype=np.float64, order="C") + + dataset, offset_decay = make_dataset(X, y, sample_weight) + + penalty_type = self._get_penalty_type(self.penalty) + learning_rate_type = self._get_learning_rate_type(learning_rate) + + # early stopping is set to False for the One-Class SVM. thus + # validation_mask and validation_score_cb will be set to values + # associated to early_stopping=False in _make_validation_split and + # _make_validation_score_cb respectively. + validation_mask = self._make_validation_split(y) + validation_score_cb = self._make_validation_score_cb( + validation_mask, X, y, sample_weight) + + random_state = check_random_state(self.random_state) + # numpy mtrand expects a C long which is a signed 32 bit integer under + # Windows + seed = random_state.randint(0, np.iinfo(np.int32).max) + + tol = self.tol if self.tol is not None else -np.inf + + one_class = 1 + # There are no class weights for the One-Class SVM and they are + # therefore set to 1. 
+ pos_weight = 1 + neg_weight = 1 + + if self.average: + coef = self._standard_coef + intercept = self._standard_intercept + average_coef = self._average_coef + average_intercept = self._average_intercept + else: + coef = self.coef_ + intercept = 1 - self.offset_ + average_coef = None # Not used + average_intercept = [0] # Not used + + coef, intercept, average_coef, average_intercept, self.n_iter_ = \ + _plain_sgd(coef, + intercept[0], + average_coef, + average_intercept[0], + self.loss_function_, + penalty_type, + alpha, C, + self.l1_ratio, + dataset, + validation_mask, self.early_stopping, + validation_score_cb, + int(self.n_iter_no_change), + max_iter, tol, + int(self.fit_intercept), + int(self.verbose), + int(self.shuffle), + seed, + neg_weight, pos_weight, + learning_rate_type, + self.eta0, self.power_t, + one_class, self.t_, + offset_decay, self.average) + + self.t_ += self.n_iter_ * n_samples + + if self.average > 0: + + self._average_intercept = np.atleast_1d(average_intercept) + self._standard_intercept = np.atleast_1d(intercept) + + if self.average <= self.t_ - 1.0: + # made enough updates for averaging to be taken into account + self.coef_ = average_coef + self.offset_ = 1 - np.atleast_1d(average_intercept) + else: + self.coef_ = coef + self.offset_ = 1 - np.atleast_1d(intercept) + + else: + self.offset_ = 1 - np.atleast_1d(intercept) + + def _partial_fit(self, X, alpha, C, loss, learning_rate, max_iter, + sample_weight, coef_init, offset_init): + first_call = getattr(self, "coef_", None) is None + X = self._validate_data( + X, None, accept_sparse='csr', dtype=np.float64, + order="C", accept_large_sparse=False, + reset=first_call) + + n_features = X.shape[1] + + # Allocate datastructures from input arguments + sample_weight = _check_sample_weight(sample_weight, X) + + # We use intercept = 1 - offset where intercept is the intercept of + # the SGD implementation and offset is the offset of the One-Class SVM + # optimization problem. + if getattr(self, "coef_", None) is None or coef_init is not None: + self._allocate_parameter_mem(1, n_features, + coef_init, offset_init, 1) + elif n_features != self.coef_.shape[-1]: + raise ValueError("Number of features %d does not match previous " + "data %d." % (n_features, self.coef_.shape[-1])) + + if self.average and getattr(self, "_average_coef", None) is None: + self._average_coef = np.zeros(n_features, dtype=np.float64, + order="C") + self._average_intercept = np.zeros(1, dtype=np.float64, order="C") + + self.loss_function_ = self._get_loss_function(loss) + if not hasattr(self, "t_"): + self.t_ = 1.0 + + # delegate to concrete training procedure + self._fit_one_class(X, alpha=alpha, C=C, + learning_rate=learning_rate, + sample_weight=sample_weight, + max_iter=max_iter) + + return self + + def partial_fit(self, X, y=None, sample_weight=None): + """Fit linear One-Class SVM with Stochastic Gradient Descent. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Subset of the training data. + + sample_weight : array-like, shape (n_samples,), optional + Weights applied to individual samples. + If not provided, uniform weights are assumed. + + Returns + ------- + self : returns an instance of self. 
+ """ + + alpha = self.nu / 2 + self._validate_params(for_partial_fit=True) + + return self._partial_fit(X, alpha, C=1.0, loss=self.loss, + learning_rate=self.learning_rate, + max_iter=1, + sample_weight=sample_weight, + coef_init=None, offset_init=None) + + def _fit(self, X, alpha, C, loss, learning_rate, coef_init=None, + offset_init=None, sample_weight=None): + self._validate_params() + + if self.warm_start and hasattr(self, "coef_"): + if coef_init is None: + coef_init = self.coef_ + if offset_init is None: + offset_init = self.offset_ + else: + self.coef_ = None + self.offset_ = None + + # Clear iteration count for multiple call to fit. + self.t_ = 1.0 + + self._partial_fit(X, alpha, C, loss, learning_rate, self.max_iter, + sample_weight, coef_init, offset_init) + + if (self.tol is not None and self.tol > -np.inf + and self.n_iter_ == self.max_iter): + warnings.warn("Maximum number of iteration reached before " + "convergence. Consider increasing max_iter to " + "improve the fit.", + ConvergenceWarning) + + return self + + def fit(self, X, y=None, coef_init=None, offset_init=None, + sample_weight=None): + """Fit linear One-Class SVM with Stochastic Gradient Descent. + + This solves an equivalent optimization problem of the + One-Class SVM primal optimization problem and returns a weight vector + w and an offset rho such that the decision function is given by + - rho. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Training data. + + coef_init : array, shape (n_classes, n_features) + The initial coefficients to warm-start the optimization. + + offset_init : array, shape (n_classes,) + The initial offset to warm-start the optimization. + + sample_weight : array-like, shape (n_samples,), optional + Weights applied to individual samples. + If not provided, uniform weights are assumed. These weights will + be multiplied with class_weight (passed through the + constructor) if class_weight is specified. + + Returns + ------- + self : returns an instance of self. + """ + + alpha = self.nu / 2 + self._fit(X, alpha=alpha, C=1.0, + loss=self.loss, learning_rate=self.learning_rate, + coef_init=coef_init, offset_init=offset_init, + sample_weight=sample_weight) + + return self + + def decision_function(self, X): + """Signed distance to the separating hyperplane. + + Signed distance is positive for an inlier and negative for an + outlier. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Testing data. + + Returns + ------- + dec : array-like, shape (n_samples,) + Decision function values of the samples. + """ + + check_is_fitted(self, "coef_") + + X = self._validate_data(X, accept_sparse='csr', reset=False) + decisions = safe_sparse_dot(X, self.coef_.T, + dense_output=True) - self.offset_ + + return decisions.ravel() + + def score_samples(self, X): + """Raw scoring function of the samples. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Testing data. + + Returns + ------- + score_samples : array-like, shape (n_samples,) + Unshiffted scoring function values of the samples. + """ + score_samples = self.decision_function(X) + self.offset_ + return score_samples + + def predict(self, X): + """Return labels (1 inlier, -1 outlier) of the samples. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Testing data. + + Returns + ------- + y : array, shape (n_samples,) + Labels of the samples. 
+ """ + y = (self.decision_function(X) >= 0).astype(np.int32) + y[y == 0] = -1 # for consistency with outlier detectors + return y + + def _more_tags(self): + return { + '_xfail_checks': { + 'check_sample_weights_invariance': + 'zero sample_weight is not equivalent to removing samples', + } + } diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index aba043024fea3..f943592c02005 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -9,14 +9,16 @@ from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_raises_regexp from sklearn.utils._testing import ignore_warnings from sklearn.utils.fixes import parse_version from sklearn import linear_model, datasets, metrics from sklearn.base import clone, is_classifier +from sklearn.svm import OneClassSVM from sklearn.preprocessing import LabelEncoder, scale, MinMaxScaler from sklearn.preprocessing import StandardScaler +from sklearn.kernel_approximation import Nystroem +from sklearn.pipeline import make_pipeline from sklearn.exceptions import ConvergenceWarning from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit from sklearn.linear_model import _sgd_fast as sgd_fast @@ -67,6 +69,21 @@ def decision_function(self, X, *args, **kw): **kw) +class _SparseSGDOneClassSVM(linear_model.SGDOneClassSVM): + def fit(self, X, *args, **kw): + X = sp.csr_matrix(X) + return linear_model.SGDOneClassSVM.fit(self, X, *args, **kw) + + def partial_fit(self, X, *args, **kw): + X = sp.csr_matrix(X) + return linear_model.SGDOneClassSVM.partial_fit(self, X, *args, **kw) + + def decision_function(self, X, *args, **kw): + X = sp.csr_matrix(X) + return linear_model.SGDOneClassSVM.decision_function(self, X, *args, + **kw) + + def SGDClassifier(**kwargs): _update_kwargs(kwargs) return linear_model.SGDClassifier(**kwargs) @@ -77,6 +94,11 @@ def SGDRegressor(**kwargs): return linear_model.SGDRegressor(**kwargs) +def SGDOneClassSVM(**kwargs): + _update_kwargs(kwargs) + return linear_model.SGDOneClassSVM(**kwargs) + + def SparseSGDClassifier(**kwargs): _update_kwargs(kwargs) return _SparseSGDClassifier(**kwargs) @@ -87,6 +109,11 @@ def SparseSGDRegressor(**kwargs): return _SparseSGDRegressor(**kwargs) +def SparseSGDOneClassSVM(**kwargs): + _update_kwargs(kwargs) + return _SparseSGDOneClassSVM(**kwargs) + + # Test Data # test sample 1 @@ -252,7 +279,8 @@ def test_clone(klass): @pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) + SGDRegressor, SparseSGDRegressor, + SGDOneClassSVM, SparseSGDOneClassSVM]) def test_plain_has_no_average_attr(klass): clf = klass(average=True, eta0=.01) clf.fit(X, Y) @@ -285,7 +313,8 @@ def test_sgd_deprecated_attr(klass): @pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, - SGDRegressor, SparseSGDRegressor]) + SGDRegressor, SparseSGDRegressor, + SGDOneClassSVM, SparseSGDOneClassSVM]) def test_late_onset_averaging_not_reached(klass): clf1 = klass(average=600) clf2 = klass() @@ -298,7 +327,11 @@ def test_late_onset_averaging_not_reached(klass): clf2.partial_fit(X, Y) assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=16) - assert_almost_equal(clf1.intercept_, clf2.intercept_, decimal=16) + if klass in [SGDClassifier, SparseSGDClassifier, SGDRegressor, + SparseSGDRegressor]: + assert_almost_equal(clf1.intercept_, 
clf2.intercept_, decimal=16) + elif klass in [SGDOneClassSVM, SparseSGDOneClassSVM]: + assert_allclose(clf1.offset_, clf2.offset_) @pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, @@ -444,28 +477,32 @@ def test_sgd_bad_l1_ratio(klass): klass(l1_ratio=1.1) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDOneClassSVM, SparseSGDOneClassSVM]) def test_sgd_bad_learning_rate_schedule(klass): # Check whether expected ValueError on bad learning_rate with pytest.raises(ValueError): klass(learning_rate="") -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDOneClassSVM, SparseSGDOneClassSVM]) def test_sgd_bad_eta0(klass): # Check whether expected ValueError on bad eta0 with pytest.raises(ValueError): klass(eta0=0, learning_rate="constant") -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDOneClassSVM, SparseSGDOneClassSVM]) def test_sgd_max_iter_param(klass): # Test parameter validity check with pytest.raises(ValueError): klass(max_iter=-10000) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDOneClassSVM, SparseSGDOneClassSVM]) def test_sgd_shuffle_param(klass): # Test parameter validity check with pytest.raises(ValueError): @@ -493,7 +530,8 @@ def test_sgd_n_iter_no_change(klass): klass(n_iter_no_change=0) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDOneClassSVM, SparseSGDOneClassSVM]) def test_argument_coef(klass): # Checks coef_init not allowed as model argument (only fit) # Provided coef_ does not match dataset @@ -501,7 +539,8 @@ def test_argument_coef(klass): klass(coef_init=np.zeros((3,))) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDOneClassSVM, SparseSGDOneClassSVM]) def test_provide_coef(klass): # Checks coef_init shape for the warm starts # Provided coef_ does not match dataset. @@ -509,12 +548,17 @@ def test_provide_coef(klass): klass().fit(X, Y, coef_init=np.zeros((3,))) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDOneClassSVM, SparseSGDOneClassSVM]) def test_set_intercept(klass): # Checks intercept_ shape for the warm starts # Provided intercept_ does not match dataset. - with pytest.raises(ValueError): - klass().fit(X, Y, intercept_init=np.zeros((3,))) + if klass in [SGDClassifier, SparseSGDClassifier]: + with pytest.raises(ValueError): + klass().fit(X, Y, intercept_init=np.zeros((3,))) + elif klass in [SGDOneClassSVM, SparseSGDOneClassSVM]: + with pytest.raises(ValueError): + klass().fit(X, Y, offset_init=np.zeros((3,))) @pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) @@ -590,10 +634,8 @@ def test_partial_fit_weight_class_balanced(klass): r"estimate the class frequency distributions\. 
" r"Pass the resulting weights as the class_weight " r"parameter\.") - assert_raises_regexp(ValueError, - regex, - klass(class_weight='balanced').partial_fit, - X, Y, classes=np.unique(Y)) + with pytest.raises(ValueError, match=regex): + klass(class_weight='balanced').partial_fit(X, Y, classes=np.unique(Y)) @pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) @@ -947,10 +989,14 @@ def test_sample_weights(klass): assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) -@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, + SGDOneClassSVM, SparseSGDOneClassSVM]) def test_wrong_sample_weights(klass): # Test if ValueError is raised if sample_weight has wrong shape - clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False) + if klass in [SGDClassifier, SparseSGDClassifier]: + clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False) + elif klass in [SGDOneClassSVM, SparseSGDOneClassSVM]: + clf = klass(nu=0.1, max_iter=1000, fit_intercept=False) # provided sample_weight too long with pytest.raises(ValueError): clf.fit(X, Y, sample_weight=np.arange(7)) @@ -1341,6 +1387,303 @@ def test_loss_function_epsilon(klass): assert clf.loss_functions['huber'][1] == 0.1 +############################################################################### +# SGD One Class SVM Test Case + +# a simple implementation of ASGD to use for testing SGDOneClassSVM +def asgd_oneclass(klass, X, eta, nu, coef_init=None, offset_init=0.0): + if coef_init is None: + coef = np.zeros(X.shape[1]) + else: + coef = coef_init + + average_coef = np.zeros(X.shape[1]) + offset = offset_init + intercept = 1 - offset + average_intercept = 0.0 + decay = 1.0 + + # sparse data has a fixed decay of .01 + if klass == SparseSGDOneClassSVM: + decay = .01 + + for i, entry in enumerate(X): + p = np.dot(entry, coef) + p += intercept + if p <= 1.0: + gradient = -1 + else: + gradient = 0 + coef *= max(0, 1.0 - (eta * nu / 2)) + coef += -(eta * gradient * entry) + intercept += -(eta * (nu + gradient)) * decay + + average_coef *= i + average_coef += coef + average_coef /= i + 1.0 + + average_intercept *= i + average_intercept += intercept + average_intercept /= i + 1.0 + + return average_coef, 1 - average_intercept + + +@pytest.mark.parametrize('klass', [SGDOneClassSVM, SparseSGDOneClassSVM]) +@pytest.mark.parametrize('nu', [-0.5, 2]) +def test_bad_nu_values(klass, nu): + msg = r"nu must be in \(0, 1]" + with pytest.raises(ValueError, match=msg): + klass(nu=nu) + + clf = klass(nu=0.05) + clf2 = clone(clf) + with pytest.raises(ValueError, match=msg): + clf2.set_params(nu=nu) + + +@pytest.mark.parametrize('klass', [SGDOneClassSVM, SparseSGDOneClassSVM]) +def _test_warm_start_oneclass(klass, X, lr): + # Test that explicit warm restart... + clf = klass(nu=0.5, eta0=0.01, shuffle=False, + learning_rate=lr) + clf.fit(X) + + clf2 = klass(nu=0.1, eta0=0.01, shuffle=False, + learning_rate=lr) + clf2.fit(X, coef_init=clf.coef_.copy(), + offset_init=clf.offset_.copy()) + + # ... and implicit warm restart are equivalent. 
+ clf3 = klass(nu=0.5, eta0=0.01, shuffle=False, + warm_start=True, learning_rate=lr) + clf3.fit(X) + + assert clf3.t_ == clf.t_ + assert_allclose(clf3.coef_, clf.coef_) + + clf3.set_params(nu=0.1) + clf3.fit(X) + + assert clf3.t_ == clf2.t_ + assert_allclose(clf3.coef_, clf2.coef_) + + +@pytest.mark.parametrize('klass', [SGDOneClassSVM, SparseSGDOneClassSVM]) +@pytest.mark.parametrize('lr', + ["constant", "optimal", "invscaling", "adaptive"]) +def test_warm_start_oneclass(klass, lr): + _test_warm_start_oneclass(klass, X, lr) + + +@pytest.mark.parametrize('klass', [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_clone_oneclass(klass): + # Test whether clone works ok. + clf = klass(nu=0.5) + clf = clone(clf) + clf.set_params(nu=0.1) + clf.fit(X) + + clf2 = klass(nu=0.1) + clf2.fit(X) + + assert_array_equal(clf.coef_, clf2.coef_) + + +@pytest.mark.parametrize('klass', [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_partial_fit_oneclass(klass): + third = X.shape[0] // 3 + clf = klass(nu=0.1) + + clf.partial_fit(X[:third]) + assert clf.coef_.shape == (X.shape[1], ) + assert clf.offset_.shape == (1,) + assert clf.predict([[0, 0]]).shape == (1, ) + id1 = id(clf.coef_.data) + + clf.partial_fit(X[third:]) + id2 = id(clf.coef_.data) + # check that coef_ haven't been re-allocated + assert id1 == id2 + + # raises ValueError if number of features does not match previous data + with pytest.raises(ValueError): + clf.partial_fit(X[:, 1]) + + +@pytest.mark.parametrize('klass', [SGDOneClassSVM, SparseSGDOneClassSVM]) +@pytest.mark.parametrize('lr', + ["constant", "optimal", "invscaling", "adaptive"]) +def test_partial_fit_equal_fit_oneclass(klass, lr): + clf = klass(nu=0.05, max_iter=2, eta0=0.01, + learning_rate=lr, shuffle=False) + clf.fit(X) + y_scores = clf.decision_function(T) + t = clf.t_ + coef = clf.coef_ + offset = clf.offset_ + + clf = klass(nu=0.05, eta0=0.01, max_iter=1, + learning_rate=lr, shuffle=False) + for _ in range(2): + clf.partial_fit(X) + y_scores2 = clf.decision_function(T) + + assert clf.t_ == t + assert_allclose(y_scores, y_scores2) + assert_allclose(clf.coef_, coef) + assert_allclose(clf.offset_, offset) + + +@pytest.mark.parametrize('klass', [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_late_onset_averaging_reached_oneclass(klass): + # Test average + eta0 = .001 + nu = .05 + + # 2 passes over the training set but average only at second pass + clf1 = klass(average=7, learning_rate="constant", eta0=eta0, + nu=nu, max_iter=2, shuffle=False) + # 1 pass over the training set with no averaging + clf2 = klass(average=0, learning_rate="constant", eta0=eta0, + nu=nu, max_iter=1, shuffle=False) + + clf1.fit(X) + clf2.fit(X) + + # Start from clf2 solution, compute averaging using asgd function and + # compare with clf1 solution + average_coef, average_offset = \ + asgd_oneclass(klass, X, eta0, nu, + coef_init=clf2.coef_.ravel(), + offset_init=clf2.offset_) + + assert_allclose(clf1.coef_.ravel(), average_coef.ravel()) + assert_allclose(clf1.offset_, average_offset) + + +@pytest.mark.parametrize('klass', [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_sgd_averaged_computed_correctly_oneclass(klass): + # Tests the average SGD One-Class SVM matches the naive implementation + eta = .001 + nu = .05 + n_samples = 20 + n_features = 10 + rng = np.random.RandomState(0) + X = rng.normal(size=(n_samples, n_features)) + + clf = klass(learning_rate='constant', + eta0=eta, nu=nu, + fit_intercept=True, + max_iter=1, average=True, shuffle=False) + + clf.fit(X) + average_coef, average_offset = 
asgd_oneclass(klass, X, eta, nu) + + assert_allclose(clf.coef_, average_coef) + assert_allclose(clf.offset_, average_offset) + + +@pytest.mark.parametrize('klass', [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_sgd_averaged_partial_fit_oneclass(klass): + # Tests whether the partial fit yields the same average as the fit + eta = .001 + nu = .05 + n_samples = 20 + n_features = 10 + rng = np.random.RandomState(0) + X = rng.normal(size=(n_samples, n_features)) + + clf = klass(learning_rate='constant', + eta0=eta, nu=nu, + fit_intercept=True, + max_iter=1, average=True, shuffle=False) + + clf.partial_fit(X[:int(n_samples / 2)][:]) + clf.partial_fit(X[int(n_samples / 2):][:]) + average_coef, average_offset = asgd_oneclass(klass, X, eta, nu) + + assert_allclose(clf.coef_, average_coef) + assert_allclose(clf.offset_, average_offset) + + +@pytest.mark.parametrize('klass', [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_average_sparse_oneclass(klass): + # Checks the average coef on data with 0s + eta = .001 + nu = .01 + clf = klass(learning_rate='constant', + eta0=eta, nu=nu, + fit_intercept=True, + max_iter=1, average=True, shuffle=False) + + n_samples = X3.shape[0] + + clf.partial_fit(X3[:int(n_samples / 2)]) + clf.partial_fit(X3[int(n_samples / 2):]) + average_coef, average_offset = asgd_oneclass(klass, X3, eta, nu) + + assert_allclose(clf.coef_, average_coef) + assert_allclose(clf.offset_, average_offset) + + +def test_sgd_oneclass(): + # Test fit, decision_function, predict and score_samples on a toy + # dataset + X_train = np.array([[-2, -1], [-1, -1], [1, 1]]) + X_test = np.array([[0.5, -2], [2, 2]]) + clf = SGDOneClassSVM(nu=0.5, eta0=1, learning_rate='constant', + shuffle=False, max_iter=1) + clf.fit(X_train) + assert_allclose(clf.coef_, np.array([-0.125, 0.4375])) + assert clf.offset_[0] == -0.5 + + scores = clf.score_samples(X_test) + assert_allclose(scores, np.array([-0.9375, 0.625])) + + dec = clf.score_samples(X_test) - clf.offset_ + assert_allclose(clf.decision_function(X_test), dec) + + pred = clf.predict(X_test) + assert_array_equal(pred, np.array([-1, 1])) + + +def test_ocsvm_vs_sgdocsvm(): + # Checks SGDOneClass SVM gives a good approximation of kernelized + # One-Class SVM + nu = 0.05 + gamma = 2. + random_state = 42 + + # Generate train and test data + rng = np.random.RandomState(random_state) + X = 0.3 * rng.randn(500, 2) + X_train = np.r_[X + 2, X - 2] + X = 0.3 * rng.randn(100, 2) + X_test = np.r_[X + 2, X - 2] + + # One-Class SVM + clf = OneClassSVM(gamma=gamma, kernel='rbf', nu=nu) + clf.fit(X_train) + y_pred_ocsvm = clf.predict(X_test) + dec_ocsvm = clf.decision_function(X_test).reshape(1, -1) + + # SGDOneClassSVM using kernel approximation + max_iter = 15 + transform = Nystroem(gamma=gamma, random_state=random_state) + clf_sgd = SGDOneClassSVM(nu=nu, shuffle=True, fit_intercept=True, + max_iter=max_iter, random_state=random_state, + tol=-np.inf) + pipe_sgd = make_pipeline(transform, clf_sgd) + pipe_sgd.fit(X_train) + y_pred_sgdocsvm = pipe_sgd.predict(X_test) + dec_sgdocsvm = pipe_sgd.decision_function(X_test).reshape(1, -1) + + assert np.mean(y_pred_sgdocsvm == y_pred_ocsvm) >= 0.99 + corrcoef = np.corrcoef(np.concatenate((dec_ocsvm, dec_sgdocsvm)))[0, 1] + assert corrcoef >= 0.9 + + def test_l1_ratio(): # Test if l1 ratio extremes match L1 and L2 penalty settings. 
X, y = datasets.make_classification(n_samples=1000, @@ -1396,7 +1739,8 @@ def test_underflow_or_overlow(): msg_regxp = (r"Floating-point under-/overflow occurred at epoch #.*" " Scaling input data with StandardScaler or MinMaxScaler" " might help.") - assert_raises_regexp(ValueError, msg_regxp, model.fit, X, y) + with pytest.raises(ValueError, match=msg_regxp): + model.fit(X, y) def test_numerical_stability_large_gradient(): diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 908ece408bb1d..c402779f4eeb6 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1334,6 +1334,10 @@ class OneClassSVM(OutlierMixin, BaseLibSVM): array([-1, 1, 1, 1, -1]) >>> clf.score_samples(X) array([1.7798..., 2.0547..., 2.0556..., 2.0561..., 1.7332...]) + + See also + -------- + sklearn.linear_model.SGDOneClassSVM """ _impl = 'one_class' From 3e64e9e6ce6f5356c08134dd9538e94dd10302f1 Mon Sep 17 00:00:00 2001 From: James Budarz Date: Tue, 23 Mar 2021 04:46:14 -0700 Subject: [PATCH 272/478] DOC Clarified n_jobs parallelization in plot_partial_dependence (#19750) Co-authored-by: James Michael Budarz --- sklearn/inspection/_plot/partial_dependence.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/inspection/_plot/partial_dependence.py b/sklearn/inspection/_plot/partial_dependence.py index d6604d7ae675f..a2ee1886066e2 100644 --- a/sklearn/inspection/_plot/partial_dependence.py +++ b/sklearn/inspection/_plot/partial_dependence.py @@ -174,6 +174,9 @@ def plot_partial_dependence( n_jobs : int, default=None The number of CPUs to use to compute the partial dependences. + Computation is parallelized over features specified by the `features` + parameter. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. From cf296c74ba91def816045f305dfa6a6dba539ad1 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 23 Mar 2021 09:08:11 -0400 Subject: [PATCH 273/478] ENH Checks n_features_in_ after fitting in mixture (#19540) --- sklearn/mixture/_base.py | 42 +++++-------------- sklearn/mixture/_bayesian_mixture.py | 5 +++ sklearn/mixture/_gaussian_mixture.py | 5 +++ .../mixture/tests/test_gaussian_mixture.py | 24 ----------- sklearn/mixture/tests/test_mixture.py | 16 +++++++ sklearn/tests/test_common.py | 1 - sklearn/tests/test_docstring_parameters.py | 1 - 7 files changed, 36 insertions(+), 58 deletions(-) diff --git a/sklearn/mixture/_base.py b/sklearn/mixture/_base.py index a9aac7f4dac19..6acb6c2e09292 100644 --- a/sklearn/mixture/_base.py +++ b/sklearn/mixture/_base.py @@ -15,7 +15,7 @@ from ..base import BaseEstimator from ..base import DensityMixin from ..exceptions import ConvergenceWarning -from ..utils import check_array, check_random_state +from ..utils import check_random_state from ..utils.validation import check_is_fitted @@ -36,32 +36,6 @@ def _check_shape(param, param_shape, name): "but got %s" % (name, param_shape, param.shape)) -def _check_X(X, n_components=None, n_features=None, ensure_min_samples=1): - """Check the input data X. 
- - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - - n_components : int - - Returns - ------- - X : array, shape (n_samples, n_features) - """ - X = check_array(X, dtype=[np.float64, np.float32], - ensure_min_samples=ensure_min_samples) - if n_components is not None and X.shape[0] < n_components: - raise ValueError('Expected n_samples >= n_components ' - 'but got n_components = %d, n_samples = %d' - % (n_components, X.shape[0])) - if n_features is not None and X.shape[1] != n_features: - raise ValueError("Expected the input data X have %d features, " - "but got %d features" - % (n_features, X.shape[1])) - return X - - class BaseMixture(DensityMixin, BaseEstimator, metaclass=ABCMeta): """Base class for mixture models. @@ -217,8 +191,12 @@ def fit_predict(self, X, y=None): labels : array, shape (n_samples,) Component labels. """ - X = _check_X(X, self.n_components, ensure_min_samples=2) - self._check_n_features(X, reset=True) + X = self._validate_data(X, dtype=[np.float64, np.float32], + ensure_min_samples=2) + if X.shape[0] < self.n_components: + raise ValueError("Expected n_samples >= n_components " + f"but got n_components = {self.n_components}, " + f"n_samples = {X.shape[0]}") self._check_initial_parameters(X) # if we enable warm_start, we will have a unique initialisation @@ -335,7 +313,7 @@ def score_samples(self, X): Log probabilities of each data point in X. """ check_is_fitted(self) - X = _check_X(X, None, self.means_.shape[1]) + X = self._validate_data(X, reset=False) return logsumexp(self._estimate_weighted_log_prob(X), axis=1) @@ -370,7 +348,7 @@ def predict(self, X): Component labels. """ check_is_fitted(self) - X = _check_X(X, None, self.means_.shape[1]) + X = self._validate_data(X, reset=False) return self._estimate_weighted_log_prob(X).argmax(axis=1) def predict_proba(self, X): @@ -389,7 +367,7 @@ def predict_proba(self, X): the model given each sample. """ check_is_fitted(self) - X = _check_X(X, None, self.means_.shape[1]) + X = self._validate_data(X, reset=False) _, log_resp = self._estimate_log_prob_resp(X) return np.exp(log_resp) diff --git a/sklearn/mixture/_bayesian_mixture.py b/sklearn/mixture/_bayesian_mixture.py index 34cef090be22b..bd1954ddc15c8 100644 --- a/sklearn/mixture/_bayesian_mixture.py +++ b/sklearn/mixture/_bayesian_mixture.py @@ -288,6 +288,11 @@ class BayesianGaussianMixture(BaseMixture): (n_features) if 'diag', float if 'spherical' + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/mixture/_gaussian_mixture.py b/sklearn/mixture/_gaussian_mixture.py index f510c81cec2dd..4bb14f9ca5bd7 100644 --- a/sklearn/mixture/_gaussian_mixture.py +++ b/sklearn/mixture/_gaussian_mixture.py @@ -582,6 +582,11 @@ class GaussianMixture(BaseMixture): Lower bound value on the log-likelihood (of the training data with respect to the model) of the best fit of EM. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/mixture/tests/test_gaussian_mixture.py b/sklearn/mixture/tests/test_gaussian_mixture.py index 403fbb2208618..ea5ea0c2eb649 100644 --- a/sklearn/mixture/tests/test_gaussian_mixture.py +++ b/sklearn/mixture/tests/test_gaussian_mixture.py @@ -172,30 +172,6 @@ def test_gaussian_mixture_attributes(): assert gmm.init_params == init_params -def test_check_X(): - from sklearn.mixture._base import _check_X - rng = np.random.RandomState(0) - - n_samples, n_components, n_features = 10, 2, 2 - - X_bad_dim = rng.rand(n_components - 1, n_features) - assert_raise_message(ValueError, - 'Expected n_samples >= n_components ' - 'but got n_components = %d, n_samples = %d' - % (n_components, X_bad_dim.shape[0]), - _check_X, X_bad_dim, n_components) - - X_bad_dim = rng.rand(n_components, n_features + 1) - assert_raise_message(ValueError, - 'Expected the input data X have %d features, ' - 'but got %d features' - % (n_features, X_bad_dim.shape[1]), - _check_X, X_bad_dim, n_components, n_features) - - X = rng.rand(n_samples, n_features) - assert_array_equal(X, _check_X(X, n_components, n_features)) - - def test_check_weights(): rng = np.random.RandomState(0) rand_data = RandomData(rng) diff --git a/sklearn/mixture/tests/test_mixture.py b/sklearn/mixture/tests/test_mixture.py index a79cafe3bccec..7f497cfe76642 100644 --- a/sklearn/mixture/tests/test_mixture.py +++ b/sklearn/mixture/tests/test_mixture.py @@ -21,3 +21,19 @@ def test_gaussian_mixture_n_iter(estimator): estimator.set_params(max_iter=max_iter) estimator.fit(X) assert estimator.n_iter_ == max_iter + + +@pytest.mark.parametrize( + "estimator", + [GaussianMixture(), + BayesianGaussianMixture()] +) +def test_mixture_n_components_greater_than_n_samples_error(estimator): + """Check error when n_components <= n_samples""" + rng = np.random.RandomState(0) + X = rng.rand(10, 5) + estimator.set_params(n_components=12) + + msg = "Expected n_samples >= n_components" + with pytest.raises(ValueError, match=msg): + estimator.fit(X) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index bfd7f98268350..05f45a51de63d 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -264,7 +264,6 @@ def test_search_cv(estimator, check, request): 'calibration', 'compose', 'feature_extraction', - 'mixture', 'model_selection', 'multiclass', 'multioutput', diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index cd2bdba449799..38f22bc667f5b 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -193,7 +193,6 @@ def _construct_searchcv_instance(SearchCV): 'kernel_ridge', 'linear_model', 'manifold', - 'mixture', 'model_selection', 'multiclass', 'multioutput', From 5788d4a69182c6f150286757b7a0105f8adf2b12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Tue, 23 Mar 2021 14:09:31 +0100 Subject: [PATCH 274/478] MAINT Improve issue template (#19704) Co-authored-by: Thomas J. 
Fan --- .github/ISSUE_TEMPLATE/config.yml | 9 ++++++--- .github/ISSUE_TEMPLATE/other_template.md | 10 ---------- 2 files changed, 6 insertions(+), 13 deletions(-) delete mode 100644 .github/ISSUE_TEMPLATE/other_template.md diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 7d39c399ca81b..c6af207bba1e8 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,14 +1,17 @@ blank_issues_enabled: false contact_links: - name: Discussions - url: https://github.com/scikit-learn/scikit-learn/discussions + url: https://github.com/scikit-learn/scikit-learn/discussions/new about: Ask questions and discuss with other scikit-learn community members - - name: Stack overflow + - name: Stack Overflow url: https://stackoverflow.com/questions/tagged/scikit-learn - about: Please ask and answer usage questions on stackoverflow + about: Please ask and answer usage questions on Stack Overflow - name: Mailing list url: https://mail.python.org/mailman/listinfo/scikit-learn about: General discussions and announcements on the mailing list - name: Gitter url: https://gitter.im/scikit-learn/scikit-learn about: Users and developers can sometimes be found on the gitter channel + - name: Blank issue + url: https://github.com/scikit-learn/scikit-learn/issues/new + about: Please note that Github Discussions should be used in most cases instead diff --git a/.github/ISSUE_TEMPLATE/other_template.md b/.github/ISSUE_TEMPLATE/other_template.md deleted file mode 100644 index d46ae9e50b18f..0000000000000 --- a/.github/ISSUE_TEMPLATE/other_template.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -name: Other -about: For all other issues to reach the community... -title: '' -labels: '' -assignees: '' - ---- - - From df3f1bda424911e2d746d68a731e188a60de925f Mon Sep 17 00:00:00 2001 From: makoeppel Date: Tue, 23 Mar 2021 14:14:41 +0100 Subject: [PATCH 275/478] FIX Adds check_array to inverse_transform of StandardScaler (#19356) Co-authored-by: Thomas J. Fan --- doc/whats_new/v1.0.rst | 3 +++ sklearn/preprocessing/_data.py | 11 +++-------- sklearn/preprocessing/tests/test_data.py | 20 ++++++++++++++++++++ 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index c252f5df1074e..be894774f5a27 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -268,6 +268,9 @@ Changelog centering is typically disabled. :pr:`19527` by :user:`Oliver Grisel ` and :user:`Maria Telenczuk `. +- |Fix| :meth:`preprocessing.StandardScaler.inverse_transform` now + correctly handles integer dtypes. :pr:`19356` by :user:`makoeppel`. + :mod:`sklearn.tree` ................... 
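A minimal sketch of the scenario targeted by the ``StandardScaler.inverse_transform`` changelog entry above; the toy integer array is arbitrary and the pre-fix failure mode is assumed from the referenced issue rather than reproduced here::

    # Integer input to inverse_transform: with this fix the array is routed
    # through check_array and cast to float before the in-place rescaling.
    import numpy as np
    from sklearn.preprocessing import StandardScaler

    X = np.array([[1, 0], [4, 1], [8, 1]], dtype=np.int32)
    scaler = StandardScaler().fit(X)

    X_back = scaler.inverse_transform(X)  # no error; returns a float array
    print(X_back.dtype)                   # float64
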
diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 5e85b932a1e39..6191fb2fd8bcd 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -926,22 +926,17 @@ def inverse_transform(self, X, copy=None): check_is_fitted(self) copy = copy if copy is not None else self.copy + X = check_array(X, accept_sparse='csr', copy=copy, ensure_2d=False, + dtype=FLOAT_DTYPES, force_all_finite="allow-nan") + if sparse.issparse(X): if self.with_mean: raise ValueError( "Cannot uncenter sparse matrices: pass `with_mean=False` " "instead See docstring for motivation and alternatives.") - if not sparse.isspmatrix_csr(X): - X = X.tocsr() - copy = False - if copy: - X = X.copy() if self.scale_ is not None: inplace_column_scale(X, self.scale_) else: - X = np.asarray(X) - if copy: - X = X.copy() if self.with_std: X *= self.scale_ if self.with_mean: diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 8a30eba27cff7..5557562283850 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -613,6 +613,26 @@ def test_standard_scaler_trasform_with_partial_fit(sample_weight): ) +def test_standard_check_array_of_inverse_transform(): + # Check if StandardScaler inverse_transform is + # converting the integer array to float + x = np.array([ + [1, 1, 1, 0, 1, 0], + [1, 1, 1, 0, 1, 0], + [0, 8, 0, 1, 0, 0], + [1, 4, 1, 1, 0, 0], + [0, 1, 0, 0, 1, 0], + [0, 4, 0, 1, 0, 1]], dtype=np.int32) + + scaler = StandardScaler() + scaler.fit(x) + + # The of inverse_transform should be converted + # to a float array. + # If not X *= self.scale_ will fail. + scaler.inverse_transform(x) + + def test_min_max_scaler_iris(): X = iris.data scaler = MinMaxScaler() From 114616d9f6ce9eba7c1aacd3d4a254f868010e25 Mon Sep 17 00:00:00 2001 From: Isaack Mungui <41724425+isaack-mungui@users.noreply.github.com> Date: Tue, 23 Mar 2021 20:09:05 +0300 Subject: [PATCH 276/478] TST Replace assert_warns in covariance/tests (#19757) --- sklearn/covariance/tests/test_covariance.py | 23 +++++++++++++++---- .../tests/test_robust_covariance.py | 8 +++---- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/sklearn/covariance/tests/test_covariance.py b/sklearn/covariance/tests/test_covariance.py index bcf163e8182d8..2557299cd395d 100644 --- a/sklearn/covariance/tests/test_covariance.py +++ b/sklearn/covariance/tests/test_covariance.py @@ -10,7 +10,6 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_warns from sklearn import datasets from sklearn.covariance import empirical_covariance, EmpiricalCovariance, \ @@ -57,7 +56,12 @@ def test_covariance(): # Create X with 1 sample and 5 features X_1sample = np.arange(5).reshape(1, 5) cov = EmpiricalCovariance() - assert_warns(UserWarning, cov.fit, X_1sample) + warn_msg = ( + "Only one sample available. You may want to reshape your data array" + ) + with pytest.warns(UserWarning, match=warn_msg): + cov.fit(X_1sample) + assert_array_almost_equal(cov.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)) @@ -175,7 +179,13 @@ def test_ledoit_wolf(): # warning should be raised when using only 1 sample X_1sample = np.arange(5).reshape(1, 5) lw = LedoitWolf() - assert_warns(UserWarning, lw.fit, X_1sample) + + warn_msg = ( + "Only one sample available. 
You may want to reshape your data array" + ) + with pytest.warns(UserWarning, match=warn_msg): + lw.fit(X_1sample) + assert_array_almost_equal(lw.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)) @@ -294,7 +304,12 @@ def test_oas(): # warning should be raised when using only 1 sample X_1sample = np.arange(5).reshape(1, 5) oa = OAS() - assert_warns(UserWarning, oa.fit, X_1sample) + warn_msg = ( + "Only one sample available. You may want to reshape your data array" + ) + with pytest.warns(UserWarning, match=warn_msg): + oa.fit(X_1sample) + assert_array_almost_equal(oa.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)) diff --git a/sklearn/covariance/tests/test_robust_covariance.py b/sklearn/covariance/tests/test_robust_covariance.py index 55100702bd365..01f32563710aa 100644 --- a/sklearn/covariance/tests/test_robust_covariance.py +++ b/sklearn/covariance/tests/test_robust_covariance.py @@ -7,10 +7,10 @@ import itertools import numpy as np +import pytest from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_raise_message -from sklearn.utils._testing import assert_warns_message from sklearn import datasets from sklearn.covariance import empirical_covariance, MinCovDet @@ -163,6 +163,6 @@ def test_mcd_increasing_det_warning(): [5.2, 3.5, 1.5, 0.2]] mcd = MinCovDet(random_state=1) - assert_warns_message(RuntimeWarning, - "Determinant has increased", - mcd.fit, X) + warn_msg = "Determinant has increased" + with pytest.warns(RuntimeWarning, match=warn_msg): + mcd.fit(X) From 4dfdfb4e1bb3719628753a4ece995a1b2fa5312a Mon Sep 17 00:00:00 2001 From: waijean Date: Thu, 25 Mar 2021 19:39:16 +0000 Subject: [PATCH 277/478] DOC Fix typo in Truncated SVD documentation (#19765) --- doc/modules/decomposition.rst | 54 +++++++++++++++++------------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst index 7e8e79d9d8bdd..e971d784c63d6 100644 --- a/doc/modules/decomposition.rst +++ b/doc/modules/decomposition.rst @@ -310,7 +310,7 @@ produces a low-rank approximation :math:`X`: .. math:: X \approx X_k = U_k \Sigma_k V_k^\top -After this operation, :math:`U_k \Sigma_k^\top` +After this operation, :math:`U_k \Sigma_k` is the transformed training set with :math:`k` features (called ``n_components`` in the API). @@ -872,34 +872,34 @@ The graphical model of LDA is a three-level generative model: .. image:: ../images/lda_model_graph.png :align: center -Note on notations presented in the graphical model above, which can be found in +Note on notations presented in the graphical model above, which can be found in Hoffman et al. (2013): * The corpus is a collection of :math:`D` documents. * A document is a sequence of :math:`N` words. - * There are :math:`K` topics in the corpus. - * The boxes represent repeated sampling. - -In the graphical model, each node is a random variable and has a role in the -generative process. A shaded node indicates an observed variable and an unshaded -node indicates a hidden (latent) variable. In this case, words in the corpus are -the only data that we observe. The latent variables determine the random mixture -of topics in the corpus and the distribution of words in the documents. -The goal of LDA is to use the observed words to infer the hidden topic -structure. 
- -When modeling text corpora, the model assumes the following generative process -for a corpus with :math:`D` documents and :math:`K` topics, with :math:`K` + * There are :math:`K` topics in the corpus. + * The boxes represent repeated sampling. + +In the graphical model, each node is a random variable and has a role in the +generative process. A shaded node indicates an observed variable and an unshaded +node indicates a hidden (latent) variable. In this case, words in the corpus are +the only data that we observe. The latent variables determine the random mixture +of topics in the corpus and the distribution of words in the documents. +The goal of LDA is to use the observed words to infer the hidden topic +structure. + +When modeling text corpora, the model assumes the following generative process +for a corpus with :math:`D` documents and :math:`K` topics, with :math:`K` corresponding to :attr:`n_components` in the API: - 1. For each topic :math:`k \in K`, draw :math:`\beta_k \sim - \mathrm{Dirichlet}(\eta)`. This provides a distribution over the words, - i.e. the probability of a word appearing in topic :math:`k`. - :math:`\eta` corresponds to :attr:`topic_word_prior`. + 1. For each topic :math:`k \in K`, draw :math:`\beta_k \sim + \mathrm{Dirichlet}(\eta)`. This provides a distribution over the words, + i.e. the probability of a word appearing in topic :math:`k`. + :math:`\eta` corresponds to :attr:`topic_word_prior`. - 2. For each document :math:`d \in D`, draw the topic proportions - :math:`\theta_d \sim \mathrm{Dirichlet}(\alpha)`. :math:`\alpha` - corresponds to :attr:`doc_topic_prior`. + 2. For each document :math:`d \in D`, draw the topic proportions + :math:`\theta_d \sim \mathrm{Dirichlet}(\alpha)`. :math:`\alpha` + corresponds to :attr:`doc_topic_prior`. 3. For each word :math:`i` in document :math:`d`: @@ -916,8 +916,8 @@ For parameter estimation, the posterior distribution is: Since the posterior is intractable, variational Bayesian method uses a simpler distribution :math:`q(z,\theta,\beta | \lambda, \phi, \gamma)` -to approximate it, and those variational parameters :math:`\lambda`, -:math:`\phi`, :math:`\gamma` are optimized to maximize the Evidence +to approximate it, and those variational parameters :math:`\lambda`, +:math:`\phi`, :math:`\gamma` are optimized to maximize the Evidence Lower Bound (ELBO): .. math:: @@ -928,10 +928,10 @@ Maximizing ELBO is equivalent to minimizing the Kullback-Leibler(KL) divergence between :math:`q(z,\theta,\beta)` and the true posterior :math:`p(z, \theta, \beta |w, \alpha, \eta)`. -:class:`LatentDirichletAllocation` implements the online variational Bayes +:class:`LatentDirichletAllocation` implements the online variational Bayes algorithm and supports both online and batch update methods. -While the batch method updates variational variables after each full pass through -the data, the online method updates variational variables from mini-batch data +While the batch method updates variational variables after each full pass through +the data, the online method updates variational variables from mini-batch data points. .. note:: From 80f923e00d6949b2385612b024981ac78a79e45a Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Mon, 29 Mar 2021 10:12:54 -0400 Subject: [PATCH 278/478] ENH num_features for a 1d collection of dicts is undefined (#19740) Co-authored-by: Christian Lorentzen --- sklearn/utils/tests/test_validation.py | 7 +++++-- sklearn/utils/validation.py | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index f3db51e694b52..66f7d9ae77687 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1345,9 +1345,10 @@ def test_num_features(constructor_name): [1, 2, 3], ["a", "b", "c"], [False, True, False], - [1.0, 3.4, 4.0] + [1.0, 3.4, 4.0], + [{"a": 1}, {"b": 2}, {"c": 3}], ], - ids=["int", "str", "bool", "float"] + ids=["int", "str", "bool", "float", "dict"] ) @pytest.mark.parametrize("constructor_name", [ "list", "tuple", "array", "series" @@ -1368,6 +1369,8 @@ def test_num_features_errors_1d_containers(X, constructor_name): message += " with shape (3,)" elif isinstance(X[0], str): message += " where the samples are of type str" + elif isinstance(X[0], dict): + message += " where the samples are of type dict" with pytest.raises(TypeError, match=re.escape(message)): _num_features(X) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index d0f410dd7f5d8..ce0fc0ead7e6d 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -226,8 +226,8 @@ def _num_features(X): first_sample = X[0] - # Do not consider an array-like of strings to be a 2D array - if isinstance(first_sample, (str, bytes)): + # Do not consider an array-like of strings or dicts to be a 2D array + if isinstance(first_sample, (str, bytes, dict)): message += (f" where the samples are of type " f"{type(first_sample).__qualname__}") raise TypeError(message) From 54ff7b7c4f745166258a529c33fec6a5ead0a432 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 30 Mar 2021 18:19:56 +0200 Subject: [PATCH 279/478] Test and doc for n_features_in_ for sklearn.calibration (#19555) Co-authored-by: Thomas J. Fan --- sklearn/calibration.py | 48 ++++++++++++---------- sklearn/tests/test_calibration.py | 37 ++++++++++++----- sklearn/tests/test_common.py | 1 - sklearn/tests/test_docstring_parameters.py | 1 - sklearn/utils/estimator_checks.py | 7 +++- 5 files changed, 57 insertions(+), 37 deletions(-) diff --git a/sklearn/calibration.py b/sklearn/calibration.py index b60a415b4419b..c6289d1df2936 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -9,7 +9,6 @@ import warnings from inspect import signature -from contextlib import suppress from functools import partial from math import log @@ -33,7 +32,7 @@ from .utils.fixes import delayed from .utils.validation import check_is_fitted, check_consistent_length from .utils.validation import _check_sample_weight, _num_samples -from .pipeline import Pipeline +from .utils import _safe_indexing from .isotonic import IsotonicRegression from .svm import LinearSVC from .model_selection import check_cv, cross_val_predict @@ -141,6 +140,12 @@ class CalibratedClassifierCV(ClassifierMixin, classes_ : ndarray of shape (n_classes,) The class labels. + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying base_estimator exposes such an attribute when fit. + + .. versionadded:: 0.24 + calibrated_classifiers_ : list (len() equal to cv or 1 if `cv="prefit"` \ or `ensemble=False`) The list of classifier and calibrator pairs. 
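A small usage sketch of the ``n_features_in_`` attribute documented above, loosely mirroring the prefit case exercised by the new tests; the dataset shape and the ``LinearSVC`` base estimator are illustrative only::

    # With cv="prefit", n_features_in_ is mirrored from the fitted base
    # estimator when that estimator exposes the attribute.
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.datasets import make_classification
    from sklearn.svm import LinearSVC

    X, y = make_classification(n_samples=100, n_features=5, n_classes=2,
                               random_state=7)
    base = LinearSVC(C=1).fit(X, y)

    calib = CalibratedClassifierCV(base, cv="prefit").fit(X, y)
    print(calib.n_features_in_)  # 5, taken from the underlying LinearSVC
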
@@ -250,14 +255,8 @@ def fit(self, X, y, sample_weight=None): self.calibrated_classifiers_ = [] if self.cv == "prefit": - # `classes_` and `n_features_in_` should be consistent with that - # of base_estimator - if isinstance(self.base_estimator, Pipeline): - check_is_fitted(self.base_estimator[-1]) - else: - check_is_fitted(self.base_estimator) - with suppress(AttributeError): - self.n_features_in_ = base_estimator.n_features_in_ + # `classes_` should be consistent with that of base_estimator + check_is_fitted(self.base_estimator, attributes=["classes_"]) self.classes_ = self.base_estimator.classes_ pred_method = _get_prediction_method(base_estimator) @@ -270,10 +269,6 @@ def fit(self, X, y, sample_weight=None): ) self.calibrated_classifiers_.append(calibrated_classifier) else: - X, y = self._validate_data( - X, y, accept_sparse=['csc', 'csr', 'coo'], - force_all_finite=False, allow_nd=True - ) # Set `classes_` using all `y` label_encoder_ = LabelEncoder().fit(y) self.classes_ = label_encoder_.classes_ @@ -334,6 +329,9 @@ def fit(self, X, y, sample_weight=None): ) self.calibrated_classifiers_.append(calibrated_classifier) + first_clf = self.calibrated_classifiers_[0].base_estimator + if hasattr(first_clf, "n_features_in_"): + self.n_features_in_ = first_clf.n_features_in_ return self def predict_proba(self, X): @@ -352,7 +350,6 @@ def predict_proba(self, X): The predicted probas. """ check_is_fitted(self) - # Compute the arithmetic mean of the predictions of the calibrated # classifiers mean_proba = np.zeros((_num_samples(X), len(self.classes_))) @@ -431,19 +428,26 @@ def _fit_classifier_calibrator_pair(estimator, X, y, train, test, supports_sw, ------- calibrated_classifier : _CalibratedClassifier instance """ - if sample_weight is not None and supports_sw: - estimator.fit(X[train], y[train], - sample_weight=sample_weight[train]) + X_train, y_train = _safe_indexing(X, train), _safe_indexing(y, train) + X_test, y_test = _safe_indexing(X, test), _safe_indexing(y, test) + if supports_sw and sample_weight is not None: + sw_train = _safe_indexing(sample_weight, train) + sw_test = _safe_indexing(sample_weight, test) + else: + sw_train = None + sw_test = None + + if supports_sw: + estimator.fit(X_train, y_train, sample_weight=sw_train) else: - estimator.fit(X[train], y[train]) + estimator.fit(X_train, y_train) n_classes = len(classes) pred_method = _get_prediction_method(estimator) - predictions = _compute_predictions(pred_method, X[test], n_classes) + predictions = _compute_predictions(pred_method, X_test, n_classes) - sw = None if sample_weight is None else sample_weight[test] calibrated_classifier = _fit_calibrator( - estimator, predictions, y[test], classes, method, sample_weight=sw + estimator, predictions, y_test, classes, method, sample_weight=sw_test ) return calibrated_classifier diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py index 86a638c4a7679..53d620b41031c 100644 --- a/sklearn/tests/test_calibration.py +++ b/sklearn/tests/test_calibration.py @@ -512,19 +512,19 @@ def decision_function(self, X): @pytest.fixture -def text_data(): - text_data = [ +def dict_data(): + dict_data = [ {'state': 'NY', 'age': 'adult'}, {'state': 'TX', 'age': 'adult'}, {'state': 'VT', 'age': 'child'}, ] text_labels = [1, 0, 1] - return text_data, text_labels + return dict_data, text_labels @pytest.fixture -def text_data_pipeline(text_data): - X, y = text_data +def dict_data_pipeline(dict_data): + X, y = dict_data pipeline_prefit = Pipeline([ ('vectorizer', DictVectorizer()), 
('clf', RandomForestClassifier()) @@ -532,7 +532,7 @@ def text_data_pipeline(text_data): return pipeline_prefit.fit(X, y) -def test_calibration_pipeline(text_data, text_data_pipeline): +def test_calibration_dict_pipeline(dict_data, dict_data_pipeline): """Test that calibration works in prefit pipeline with transformer `X` is not array-like, sparse matrix or dataframe at the start. @@ -541,15 +541,17 @@ def test_calibration_pipeline(text_data, text_data_pipeline): Also test it can predict without running into validation errors. See https://github.com/scikit-learn/scikit-learn/issues/19637 """ - X, y = text_data - clf = text_data_pipeline + X, y = dict_data + clf = dict_data_pipeline calib_clf = CalibratedClassifierCV(clf, cv='prefit') calib_clf.fit(X, y) # Check attributes are obtained from fitted estimator assert_array_equal(calib_clf.classes_, clf.classes_) - msg = "'CalibratedClassifierCV' object has no attribute" - with pytest.raises(AttributeError, match=msg): - calib_clf.n_features_in_ + + # Neither the pipeline nor the calibration meta-estimator + # expose the n_features_in_ check on this kind of data. + assert not hasattr(clf, 'n_features_in_') + assert not hasattr(calib_clf, 'n_features_in_') # Ensure that no error is thrown with predict and predict_proba calib_clf.predict(X) @@ -578,6 +580,19 @@ def test_calibration_attributes(clf, cv): assert calib_clf.n_features_in_ == X.shape[1] +def test_calibration_inconsistent_prefit_n_features_in(): + # Check that `n_features_in_` from prefit base estimator + # is consistent with training set + X, y = make_classification(n_samples=10, n_features=5, + n_classes=2, random_state=7) + clf = LinearSVC(C=1).fit(X, y) + calib_clf = CalibratedClassifierCV(clf, cv='prefit') + + msg = "X has 3 features, but LinearSVC is expecting 5 features as input." 
+ with pytest.raises(ValueError, match=msg): + calib_clf.fit(X[:, :3], y) + + # FIXME: remove in 1.1 def test_calibrated_classifier_cv_deprecation(data): # Check that we raise the proper deprecation warning if accessing diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 05f45a51de63d..8ec4125547722 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -261,7 +261,6 @@ def test_search_cv(estimator, check, request): # # check_classifiers_train would need to be updated with the error message N_FEATURES_IN_AFTER_FIT_MODULES_TO_IGNORE = { - 'calibration', 'compose', 'feature_extraction', 'model_selection', diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 38f22bc667f5b..ee2fe055a4b43 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -176,7 +176,6 @@ def _construct_searchcv_instance(SearchCV): N_FEATURES_MODULES_TO_IGNORE = { - 'calibration', 'cluster', 'compose', 'covariance', diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 7561c64abe6a8..71f5b3b42de42 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -959,10 +959,13 @@ def check_dtype_object(name, estimator_orig): def check_complex_data(name, estimator_orig): + rng = np.random.RandomState(42) # check that estimators raise an exception on providing complex data - X = np.random.sample(10) + 1j * np.random.sample(10) + X = rng.uniform(size=10) + 1j * rng.uniform(size=10) X = X.reshape(-1, 1) - y = np.random.sample(10) + 1j * np.random.sample(10) + + # Something both valid for classification and regression + y = rng.randint(low=0, high=2, size=10) + 1j estimator = clone(estimator_orig) with raises(ValueError, match="Complex data not supported"): estimator.fit(X, y) From 57d3668f2a1fea69dafc2e68208576a56812cd45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Wed, 31 Mar 2021 09:20:22 +0200 Subject: [PATCH 280/478] MNT Avoid catastrophic cancellation in mean_variance_axis (#19766) --- doc/whats_new/v1.0.rst | 11 ++++- sklearn/utils/sparsefuncs_fast.pyx | 62 +++++++++++++++++-------- sklearn/utils/tests/test_sparsefuncs.py | 20 ++++++++ 3 files changed, 71 insertions(+), 22 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index be894774f5a27..2b108d2f0e197 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -288,12 +288,19 @@ Changelog :user:`Clifford Akai-Nettey`. :mod:`sklearn.calibration` -............................ +.......................... - |Fix| The predict and predict_proba methods of - :class:`calibration.CalibratedClassifierCV can now properly be used on + :class:`calibration.CalibratedClassifierCV` can now properly be used on prefitted pipelines. :pr:`19641` by :user:`Alek Lefebvre ` +:mod:`sklearn.utils` +.................... + + - |Fix| Fixed a bug in :func:`utils.sparsefuncs.mean_variance_axis` where the + precision of the computed variance was very poor when the real variance is + exactly zero. :pr:`19766` by :user:`Jérémie du Boisberranger `. 
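A hedged sketch of the zero-variance situation described in this entry; the weighted call assumes the ``weights`` support visible in the Cython helpers below, and the printed values are indicative only::

    # A feature that is exactly constant has true variance 0; the reported
    # variance should stay at (numerically) zero instead of being inflated
    # by the mean**2 correction term.
    import numpy as np
    import scipy.sparse as sp
    from sklearn.utils.sparsefuncs import mean_variance_axis

    rng = np.random.RandomState(0)
    X = sp.csr_matrix(np.full((1000, 3), 100.0))
    sample_weight = rng.rand(1000)

    means, variances = mean_variance_axis(X, axis=0, weights=sample_weight)
    print(means)      # close to [100. 100. 100.]
    print(variances)  # expected to be (numerically) zero after this fix
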
+ Code and Documentation Contributors ----------------------------------- diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index e89599918ec5e..4a84c03eff86b 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -124,23 +124,32 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, variances = np.zeros_like(means, dtype=dtype) cdef: - np.ndarray[floating, ndim=1] sum_weights = \ - np.full(fill_value=np.sum(weights), shape=n_features, dtype=dtype) - np.ndarray[floating, ndim=1] sum_weights_nan = \ - np.zeros(shape=n_features, dtype=dtype) - np.ndarray[floating, ndim=1] sum_weights_nz = \ - np.zeros(shape=n_features, dtype=dtype) + np.ndarray[floating, ndim=1] sum_weights = np.full( + fill_value=np.sum(weights), shape=n_features, dtype=dtype) + np.ndarray[floating, ndim=1] sum_weights_nz = np.zeros( + shape=n_features, dtype=dtype) + + np.ndarray[np.uint64_t, ndim=1] counts = np.full( + fill_value=weights.shape[0], shape=n_features, dtype=np.uint64) + np.ndarray[np.uint64_t, ndim=1] counts_nz = np.zeros( + shape=n_features, dtype=np.uint64) for row_ind in range(len(X_indptr) - 1): for i in range(X_indptr[row_ind], X_indptr[row_ind + 1]): col_ind = X_indices[i] if not isnan(X_data[i]): means[col_ind] += (X_data[i] * weights[row_ind]) + # sum of weights where X[:, col_ind] is non-zero + sum_weights_nz[col_ind] += weights[row_ind] + # number of non-zero elements of X[:, col_ind] + counts_nz[col_ind] += 1 else: - sum_weights_nan[col_ind] += weights[row_ind] + # sum of weights where X[:, col_ind] is not nan + sum_weights[col_ind] -= weights[row_ind] + # number of non nan elements of X[:, col_ind] + counts[col_ind] -= 1 for i in range(n_features): - sum_weights[i] -= sum_weights_nan[i] means[i] /= sum_weights[i] for row_ind in range(len(X_indptr) - 1): @@ -149,10 +158,12 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, if not isnan(X_data[i]): diff = X_data[i] - means[col_ind] variances[col_ind] += diff * diff * weights[row_ind] - sum_weights_nz[col_ind] += weights[row_ind] for i in range(n_features): - variances[i] += (sum_weights[i] - sum_weights_nz[i]) * means[i]**2 + if counts[i] != counts_nz[i]: + # only compute it when it's guaranteed to be non-zero to avoid + # catastrophic cancellation. 
+ variances[i] += (sum_weights[i] - sum_weights_nz[i]) * means[i]**2 variances[i] /= sum_weights[i] return means, variances, sum_weights @@ -228,23 +239,32 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, variances = np.zeros_like(means, dtype=dtype) cdef: - np.ndarray[floating, ndim=1] sum_weights = \ - np.full(fill_value=np.sum(weights), shape=n_features, dtype=dtype) - np.ndarray[floating, ndim=1] sum_weights_nan = \ - np.zeros(shape=n_features, dtype=dtype) - np.ndarray[floating, ndim=1] sum_weights_nz = \ - np.zeros(shape=n_features, dtype=dtype) + np.ndarray[floating, ndim=1] sum_weights = np.full( + fill_value=np.sum(weights), shape=n_features, dtype=dtype) + np.ndarray[floating, ndim=1] sum_weights_nz = np.zeros( + shape=n_features, dtype=dtype) + + np.ndarray[np.uint64_t, ndim=1] counts = np.full( + fill_value=weights.shape[0], shape=n_features, dtype=np.uint64) + np.ndarray[np.uint64_t, ndim=1] counts_nz = np.zeros( + shape=n_features, dtype=np.uint64) for col_ind in range(n_features): for i in range(X_indptr[col_ind], X_indptr[col_ind + 1]): row_ind = X_indices[i] if not isnan(X_data[i]): means[col_ind] += (X_data[i] * weights[row_ind]) + # sum of weights where X[:, col_ind] is non-zero + sum_weights_nz[col_ind] += weights[row_ind] + # number of non-zero elements of X[:, col_ind] + counts_nz[col_ind] += 1 else: - sum_weights_nan[col_ind] += weights[row_ind] + # sum of weights where X[:, col_ind] is not nan + sum_weights[col_ind] -= weights[row_ind] + # number of non nan elements of X[:, col_ind] + counts[col_ind] -= 1 for i in range(n_features): - sum_weights[i] -= sum_weights_nan[i] means[i] /= sum_weights[i] for col_ind in range(n_features): @@ -253,10 +273,12 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, if not isnan(X_data[i]): diff = X_data[i] - means[col_ind] variances[col_ind] += diff * diff * weights[row_ind] - sum_weights_nz[col_ind] += weights[row_ind] for i in range(n_features): - variances[i] += (sum_weights[i] - sum_weights_nz[i]) * means[i]**2 + if counts[i] != counts_nz[i]: + # only compute it when it's guaranteed to be non-zero to avoid + # catastrophic cancellation. + variances[i] += (sum_weights[i] - sum_weights_nz[i]) * means[i]**2 variances[i] /= sum_weights[i] return means, variances, sum_weights diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py index 8366aabd751ad..8b087145c3d36 100644 --- a/sklearn/utils/tests/test_sparsefuncs.py +++ b/sklearn/utils/tests/test_sparsefuncs.py @@ -53,6 +53,26 @@ def test_mean_variance_axis0(): assert_array_almost_equal(X_vars, np.var(X_test, axis=0)) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("sparse_constructor", [sp.csr_matrix, sp.csc_matrix]) +def test_mean_variance_axis0_precision(dtype, sparse_constructor): + # Check that there's no big loss of precision when the real variance is + # exactly 0. 
(#19766) + rng = np.random.RandomState(0) + X = np.full(fill_value=100., shape=(1000, 1), dtype=dtype) + # Add some missing records which should be ignored: + missing_indices = rng.choice(np.arange(X.shape[0]), 10, replace=False) + X[missing_indices, 0] = np.nan + X = sparse_constructor(X) + + # Random positive weights: + sample_weight = rng.rand(X.shape[0]).astype(dtype) + + _, var = mean_variance_axis(X, weights=sample_weight, axis=0) + + assert var < np.finfo(dtype).eps + + def test_mean_variance_axis1(): X, _ = make_classification(5, 4, random_state=0) # Sparsify the array a little bit From c9c89cfc85dd8dfefd7921c16c87327d03140a06 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 31 Mar 2021 10:45:38 -0400 Subject: [PATCH 281/478] ENH Adds support for drop + handle_unknown=ignore in the OneHotEncoder (#19041) Co-authored-by: Olivier Grisel --- doc/modules/preprocessing.rst | 28 ++++++- doc/whats_new/v1.0.rst | 7 ++ sklearn/preprocessing/_encoders.py | 40 +++++++--- sklearn/preprocessing/tests/test_encoders.py | 83 +++++++++++++++++++- 4 files changed, 140 insertions(+), 18 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index b87971ec4ae5a..cdde7479b1a4f 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -560,9 +560,7 @@ parameter allows the user to specify a category for each feature to be dropped. This is useful to avoid co-linearity in the input matrix in some classifiers. Such functionality is useful, for example, when using non-regularized regression (:class:`LinearRegression `), -since co-linearity would cause the covariance matrix to be non-invertible. -When this parameter is not None, ``handle_unknown`` must be set to -``error``:: +since co-linearity would cause the covariance matrix to be non-invertible:: >>> X = [['male', 'from US', 'uses Safari'], ... ['female', 'from Europe', 'uses Firefox']] @@ -591,6 +589,30 @@ In the transformed `X`, the first column is the encoding of the feature with categories "male"/"female", while the remaining 6 columns is the encoding of the 2 features with respectively 3 categories each. +When `handle_unknown='ignore'` and `drop` is not None, unknown categories will +be encoded as all zeros:: + + >>> drop_enc = preprocessing.OneHotEncoder(drop='first', + ... handle_unknown='ignore').fit(X) + >>> X_test = [['unknown', 'America', 'IE']] + >>> drop_enc.transform(X_test).toarray() + array([[0., 0., 0., 0., 0.]]) + +All the categories in `X_test` are unknown during transform and will be mapped +to all zeros. This means that unknown categories will have the same mapping as +the dropped category. :meth`OneHotEncoder.inverse_transform` will map all zeros +to the dropped category if a category is dropped and `None` if a category is +not dropped:: + + >>> drop_enc = preprocessing.OneHotEncoder(drop='if_binary', sparse=False, + ... handle_unknown='ignore').fit(X) + >>> X_test = [['unknown', 'America', 'IE']] + >>> X_trans = drop_enc.transform(X_test) + >>> X_trans + array([[0., 0., 0., 0., 0., 0., 0.]]) + >>> drop_enc.inverse_transform(X_trans) + array([['female', None, None]], dtype=object) + :class:`OneHotEncoder` supports categorical features with missing values by considering the missing values as an additional category:: diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 2b108d2f0e197..2aaecb6d9b438 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -79,6 +79,13 @@ Changelog :mod:`sklearn.cluster` ...................... 
+:mod:`sklearn.preprocessing` +............................ + +- |Feature| :class:`preprocessing.OneHotEncoder` now supports + `handle_unknown='ignore'` and dropping categories. :pr:`19041` by + `Thomas Fan`_. + - |Efficiency| The "k-means++" initialization of :class:`cluster.KMeans` and :class:`cluster.MiniBatchKMeans` is now faster, especially in multicore settings. :pr:`19002` by :user:`Jon Crall ` and diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 043f9fc40ef53..4344e010bba1a 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -2,6 +2,7 @@ # Joris Van den Bossche # License: BSD 3 clause +import warnings import numpy as np from scipy import sparse import numbers @@ -110,7 +111,8 @@ def _fit(self, X, handle_unknown='error', force_all_finite=True): raise ValueError(msg) self.categories_.append(cats) - def _transform(self, X, handle_unknown='error', force_all_finite=True): + def _transform(self, X, handle_unknown='error', force_all_finite=True, + warn_on_unknown=False): X_list, n_samples, n_features = self._check_X( X, force_all_finite=force_all_finite) @@ -125,6 +127,7 @@ def _transform(self, X, handle_unknown='error', force_all_finite=True): .format(len(self.categories_,), n_features) ) + columns_with_unknown = [] for i in range(n_features): Xi = X_list[i] diff, valid_mask = _check_unknown(Xi, self.categories_[i], @@ -136,6 +139,8 @@ def _transform(self, X, handle_unknown='error', force_all_finite=True): " during transform".format(diff, i)) raise ValueError(msg) else: + if warn_on_unknown: + columns_with_unknown.append(i) # Set the problematic rows to an acceptable value and # continue `The rows are marked `X_mask` and will be # removed later. @@ -153,6 +158,11 @@ def _transform(self, X, handle_unknown='error', force_all_finite=True): # already called above. X_int[:, i] = _encode(Xi, uniques=self.categories_[i], check_unknown=False) + if columns_with_unknown: + warnings.warn("Found unknown categories in columns " + f"{columns_with_unknown} during transform. These " + "unknown categories will be encoded as all zeros", + UserWarning) return X_int, X_mask @@ -327,14 +337,6 @@ def _validate_keywords(self): msg = ("handle_unknown should be either 'error' or 'ignore', " "got {0}.".format(self.handle_unknown)) raise ValueError(msg) - # If we have both dropped columns and ignored unknown - # values, there will be ambiguous cells. This creates difficulties - # in interpreting the model. - if self.drop is not None and self.handle_unknown != 'error': - raise ValueError( - "`handle_unknown` must be 'error' when the drop parameter is " - "specified, as both would create categories that are all " - "zero.") def _compute_drop_idx(self): if self.drop is None: @@ -459,8 +461,11 @@ def transform(self, X): """ check_is_fitted(self) # validation of X happens in _check_X called by _transform + warn_on_unknown = (self.handle_unknown == "ignore" + and self.drop is not None) X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown, - force_all_finite='allow-nan') + force_all_finite='allow-nan', + warn_on_unknown=warn_on_unknown) n_samples, n_features = X_int.shape @@ -509,8 +514,10 @@ def inverse_transform(self, X): """ Convert the data back to the original representation. - In case unknown categories are encountered (all zeros in the - one-hot encoding), ``None`` is used to represent this category. 
+ When unknown categories are encountered (all zeros in the + one-hot encoding), ``None`` is used to represent this category. If the + feature with the unknown category has a dropped caregory, the dropped + category will be its inverse. Parameters ---------- @@ -571,7 +578,14 @@ def inverse_transform(self, X): unknown = np.asarray(sub.sum(axis=1) == 0).flatten() # ignored unknown categories: we have a row of all zero if unknown.any(): - found_unknown[i] = unknown + # if categories were dropped then unknown categories will + # be mapped to the dropped category + if self.drop_idx_ is None or self.drop_idx_[i] is None: + found_unknown[i] = unknown + else: + X_tr[unknown, i] = self.categories_[i][ + self.drop_idx_[i] + ] else: dropped = np.asarray(sub.sum(axis=1) == 0).flatten() if dropped.any(): diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index b1eff0cad21e0..eb776c4c25267 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -775,8 +775,6 @@ def test_one_hot_encoder_drop_manual(missing_value): "X_fit, params, err_msg", [([["Male"], ["Female"]], {'drop': 'second'}, "Wrong input for parameter `drop`"), - ([["Male"], ["Female"]], {'drop': 'first', 'handle_unknown': 'ignore'}, - "`handle_unknown` must be 'error'"), ([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]], {'drop': np.asarray('b', dtype=object)}, "Wrong input for parameter `drop`"), @@ -914,6 +912,87 @@ def test_ohe_missing_value_support_pandas_categorical(pd_nan_type): assert np.isnan(ohe.categories_[0][-1]) +def test_ohe_drop_first_handle_unknown_ignore_warns(): + """Check drop='first' and handle_unknown='ignore' during transform.""" + X = [['a', 0], ['b', 2], ['b', 1]] + + ohe = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore') + X_trans = ohe.fit_transform(X) + + X_expected = np.array([ + [0, 0, 0], + [1, 0, 1], + [1, 1, 0], + ]) + assert_allclose(X_trans, X_expected) + + # Both categories are unknown + X_test = [['c', 3]] + X_expected = np.array([[0, 0, 0]]) + + warn_msg = (r"Found unknown categories in columns \[0, 1\] during " + "transform. These unknown categories will be encoded as all " + "zeros") + with pytest.warns(UserWarning, match=warn_msg): + X_trans = ohe.transform(X_test) + assert_allclose(X_trans, X_expected) + + # inverse_transform maps to None + X_inv = ohe.inverse_transform(X_expected) + assert_array_equal(X_inv, np.array([['a', 0]], dtype=object)) + + +def test_ohe_drop_if_binary_handle_unknown_ignore_warns(): + """Check drop='if_binary' and handle_unknown='ignore' during transform.""" + X = [['a', 0], ['b', 2], ['b', 1]] + + ohe = OneHotEncoder(drop='if_binary', sparse=False, + handle_unknown='ignore') + X_trans = ohe.fit_transform(X) + + X_expected = np.array([ + [0, 1, 0, 0], + [1, 0, 0, 1], + [1, 0, 1, 0], + ]) + assert_allclose(X_trans, X_expected) + + # Both categories are unknown + X_test = [['c', 3]] + X_expected = np.array([[0, 0, 0, 0]]) + + warn_msg = (r"Found unknown categories in columns \[0, 1\] during " + "transform. 
These unknown categories will be encoded as all " + "zeros") + with pytest.warns(UserWarning, match=warn_msg): + X_trans = ohe.transform(X_test) + assert_allclose(X_trans, X_expected) + + # inverse_transform maps to None + X_inv = ohe.inverse_transform(X_expected) + assert_array_equal(X_inv, np.array([['a', None]], dtype=object)) + + +def test_ohe_drop_first_explicit_categories(): + """Check drop='first' and handle_unknown='ignore' during fit with + categories passed in.""" + + X = [['a', 0], ['b', 2], ['b', 1]] + + ohe = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore', + categories=[['b', 'a'], [1, 2]]) + ohe.fit(X) + + X_test = [['c', 1]] + X_expected = np.array([[0, 0]]) + + warn_msg = (r"Found unknown categories in columns \[0\] during transform. " + r"These unknown categories will be encoded as all zeros") + with pytest.warns(UserWarning, match=warn_msg): + X_trans = ohe.transform(X_test) + assert_allclose(X_trans, X_expected) + + def test_ordinal_encoder_passthrough_missing_values_float_errors_dtype(): """Test ordinal encoder with nan passthrough fails when dtype=np.int32.""" From 108dd7b00095a1265e6f0c4db0a69d620f590400 Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Thu, 1 Apr 2021 16:15:23 +0100 Subject: [PATCH 282/478] TST Changes assert to pytest style in tests/test_naive_bayes.py (#19768) Co-authored-by: Thomas J. Fan Co-authored-by: Alihan Zihna --- sklearn/tests/test_naive_bayes.py | 125 ++++++++++++++++++++---------- 1 file changed, 82 insertions(+), 43 deletions(-) diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index dcd4b07712357..251ba6698ab0f 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -1,3 +1,4 @@ +import re import numpy as np import scipy.sparse @@ -11,10 +12,6 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_raises -from sklearn.utils._testing import assert_raise_message -from sklearn.utils._testing import assert_warns -from sklearn.utils._testing import assert_no_warnings from sklearn.utils._testing import ignore_warnings from sklearn.naive_bayes import GaussianNB, BernoulliNB @@ -118,7 +115,10 @@ def test_gnb_sample_weight(): def test_gnb_neg_priors(): """Test whether an error is raised in case of negative priors""" clf = GaussianNB(priors=np.array([-1., 2.])) - assert_raises(ValueError, clf.fit, X, y) + + msg = 'Priors must be non-negative' + with pytest.raises(ValueError, match=msg): + clf.fit(X, y) def test_gnb_priors(): @@ -146,13 +146,19 @@ def test_gnb_wrong_nb_priors(): """ Test whether an error is raised if the number of prior is different from the number of class""" clf = GaussianNB(priors=np.array([.25, .25, .25, .25])) - assert_raises(ValueError, clf.fit, X, y) + + msg = 'Number of priors must match number of classes' + with pytest.raises(ValueError, match=msg): + clf.fit(X, y) def test_gnb_prior_greater_one(): """Test if an error is raised if the sum of prior greater than one""" clf = GaussianNB(priors=np.array([2., 1.])) - assert_raises(ValueError, clf.fit, X, y) + + msg = 'The sum of the priors should be 1' + with pytest.raises(ValueError, match=msg): + clf.fit(X, y) def test_gnb_prior_large_bias(): @@ -339,9 +345,13 @@ def test_discretenb_provide_prior(DiscreteNaiveBayes): assert_array_almost_equal(prior, np.array([.5, .5])) # Inconsistent number of classes with prior - 
assert_raises(ValueError, clf.fit, [[0], [1], [2]], [0, 1, 2]) - assert_raises(ValueError, clf.partial_fit, [[0], [1]], [0, 1], - classes=[0, 1, 1]) + msg = 'Number of priors must match number of classes' + with pytest.raises(ValueError, match=msg): + clf.fit([[0], [1], [2]], [0, 1, 2]) + + msg = 'is not the same as on last call to partial_fit' + with pytest.raises(ValueError, match=msg): + clf.partial_fit([[0], [1]], [0, 1], classes=[0, 1, 1]) @pytest.mark.parametrize('DiscreteNaiveBayes', DISCRETE_NAIVE_BAYES_CLASSES) @@ -470,7 +480,10 @@ def test_mnnb(kind): # Check the ability to predict the learning set. clf = MultinomialNB() - assert_raises(ValueError, clf.fit, -X, y2) + + msg = 'Negative values in data passed to' + with pytest.raises(ValueError, match=msg): + clf.fit(-X, y2) y_pred = clf.fit(X, y2).predict(X) assert_array_equal(y_pred, y2) @@ -518,18 +531,18 @@ def test_mnb_prior_unobserved_targets(): clf = MultinomialNB() - assert_no_warnings( - clf.partial_fit, X, y, classes=[0, 1, 2] - ) + with pytest.warns(None) as record: + clf.partial_fit(X, y, classes=[0, 1, 2]) + assert len(record) == 0 assert clf.predict([[0, 1]]) == 0 assert clf.predict([[1, 0]]) == 1 assert clf.predict([[1, 1]]) == 0 # add a training example with previously unobserved class - assert_no_warnings( - clf.partial_fit, [[1, 1]], [2] - ) + with pytest.warns(None) as record: + clf.partial_fit([[1, 1]], [2]) + assert len(record) == 0 assert clf.predict([[0, 1]]) == 0 assert clf.predict([[1, 0]]) == 1 @@ -666,7 +679,10 @@ def test_cnb(): # Verify inputs are nonnegative. clf = ComplementNB(alpha=1.0) - assert_raises(ValueError, clf.fit, -X, Y) + + msg = re.escape('Negative values in data passed to ComplementNB (input X)') + with pytest.raises(ValueError, match=msg): + clf.fit(-X, Y) clf.fit(X, Y) @@ -700,9 +716,13 @@ def test_categoricalnb(): # Check error is raised for X with negative entries X = np.array([[0, -1]]) y = np.array([1]) - error_msg = "Negative values in data passed to CategoricalNB (input X)" - assert_raise_message(ValueError, error_msg, clf.predict, X) - assert_raise_message(ValueError, error_msg, clf.fit, X, y) + error_msg = re.escape( + "Negative values in data passed to CategoricalNB (input X)" + ) + with pytest.raises(ValueError, match=error_msg): + clf.predict(X) + with pytest.raises(ValueError, match=error_msg): + clf.fit(X, y) # Test alpha X3_test = np.array([[2, 5]]) @@ -794,52 +814,67 @@ def test_alpha(): X = np.array([[1, 0], [1, 1]]) y = np.array([0, 1]) nb = BernoulliNB(alpha=0.) - assert_warns(UserWarning, nb.partial_fit, X, y, classes=[0, 1]) - assert_warns(UserWarning, nb.fit, X, y) + msg = ( + "alpha too small will result in numeric errors," + " setting alpha = 1.0e-10" + ) + with pytest.warns(UserWarning, match=msg): + nb.partial_fit(X, y, classes=[0, 1]) + with pytest.warns(UserWarning, match=msg): + nb.fit(X, y) prob = np.array([[1, 0], [0, 1]]) assert_array_almost_equal(nb.predict_proba(X), prob) nb = MultinomialNB(alpha=0.) - assert_warns(UserWarning, nb.partial_fit, X, y, classes=[0, 1]) - assert_warns(UserWarning, nb.fit, X, y) + with pytest.warns(UserWarning, match=msg): + nb.partial_fit(X, y, classes=[0, 1]) + with pytest.warns(UserWarning, match=msg): + nb.fit(X, y) prob = np.array([[2. / 3, 1. / 3], [0, 1]]) assert_array_almost_equal(nb.predict_proba(X), prob) nb = CategoricalNB(alpha=0.) 
- assert_warns(UserWarning, nb.fit, X, y) + with pytest.warns(UserWarning, match=msg): + nb.fit(X, y) prob = np.array([[1., 0.], [0., 1.]]) assert_array_almost_equal(nb.predict_proba(X), prob) # Test sparse X X = scipy.sparse.csr_matrix(X) nb = BernoulliNB(alpha=0.) - assert_warns(UserWarning, nb.fit, X, y) + with pytest.warns(UserWarning, match=msg): + nb.fit(X, y) prob = np.array([[1, 0], [0, 1]]) assert_array_almost_equal(nb.predict_proba(X), prob) nb = MultinomialNB(alpha=0.) - assert_warns(UserWarning, nb.fit, X, y) + with pytest.warns(UserWarning, match=msg): + nb.fit(X, y) prob = np.array([[2. / 3, 1. / 3], [0, 1]]) assert_array_almost_equal(nb.predict_proba(X), prob) # Test for alpha < 0 X = np.array([[1, 0], [1, 1]]) y = np.array([0, 1]) - expected_msg = ('Smoothing parameter alpha = -1.0e-01. ' - 'alpha should be > 0.') + expected_msg = re.escape( + 'Smoothing parameter alpha = -1.0e-01. alpha should be > 0.' + ) b_nb = BernoulliNB(alpha=-0.1) m_nb = MultinomialNB(alpha=-0.1) c_nb = CategoricalNB(alpha=-0.1) - assert_raise_message(ValueError, expected_msg, b_nb.fit, X, y) - assert_raise_message(ValueError, expected_msg, m_nb.fit, X, y) - assert_raise_message(ValueError, expected_msg, c_nb.fit, X, y) + with pytest.raises(ValueError, match=expected_msg): + b_nb.fit(X, y) + with pytest.raises(ValueError, match=expected_msg): + m_nb.fit(X, y) + with pytest.raises(ValueError, match=expected_msg): + c_nb.fit(X, y) b_nb = BernoulliNB(alpha=-0.1) m_nb = MultinomialNB(alpha=-0.1) - assert_raise_message(ValueError, expected_msg, b_nb.partial_fit, - X, y, classes=[0, 1]) - assert_raise_message(ValueError, expected_msg, m_nb.partial_fit, - X, y, classes=[0, 1]) + with pytest.raises(ValueError, match=expected_msg): + b_nb.partial_fit(X, y, classes=[0, 1]) + with pytest.raises(ValueError, match=expected_msg): + m_nb.partial_fit(X, y, classes=[0, 1]) def test_alpha_vector(): @@ -862,10 +897,12 @@ def test_alpha_vector(): # Test alpha non-negative alpha = np.array([1., -0.1]) - expected_msg = ('Smoothing parameter alpha = -1.0e-01. ' - 'alpha should be > 0.') m_nb = MultinomialNB(alpha=alpha) - assert_raise_message(ValueError, expected_msg, m_nb.fit, X, y) + expected_msg = ( + 'Smoothing parameter alpha = -1.0e-01. alpha should be > 0.' 
+ ) + with pytest.raises(ValueError, match=expected_msg): + m_nb.fit(X, y) # Test that too small pseudo-counts are replaced ALPHA_MIN = 1e-10 @@ -879,9 +916,11 @@ def test_alpha_vector(): # Test correct dimensions alpha = np.array([1., 2., 3.]) m_nb = MultinomialNB(alpha=alpha) - expected_msg = ('alpha should be a scalar or a numpy array ' - 'with shape [n_features]') - assert_raise_message(ValueError, expected_msg, m_nb.fit, X, y) + expected_msg = re.escape( + 'alpha should be a scalar or a numpy array with shape [n_features]' + ) + with pytest.raises(ValueError, match=expected_msg): + m_nb.fit(X, y) def test_check_accuracy_on_digits(): From bc7cd3189bc817545791071515693445e1e271db Mon Sep 17 00:00:00 2001 From: Frederick Robinson Date: Fri, 2 Apr 2021 01:30:03 -0700 Subject: [PATCH 283/478] ENH more efficient _num_combinations calculation in PolynomialFeatures (#19734) --- doc/whats_new/v1.0.rst | 4 +++ sklearn/preprocessing/_polynomial.py | 35 +++++++++++++++---- .../preprocessing/tests/test_polynomial.py | 21 +++++++++++ 3 files changed, 54 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 2aaecb6d9b438..979ed9096aba1 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -200,6 +200,10 @@ Changelog :pr:`19426` by :user:`Alexandre Gramfort ` and :user:`Maria Telenczuk `. +- |Efficiency| The implementation of `fit` for `PolynomialFeatures` transformer + is now faster. This is especially noticeable on large sparse input. + :pr:`19734` by :user:`Fred Robinson `. + :mod:`sklearn.manifold` ....................... diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 3f4ccc2fa05d4..d1ec49d7539bf 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -8,6 +8,7 @@ import numpy as np from scipy import sparse from scipy.interpolate import BSpline +from scipy.special import comb from ..base import BaseEstimator, TransformerMixin from ..utils import check_array @@ -113,6 +114,29 @@ def _combinations(n_features, degree, interaction_only, include_bias): return chain.from_iterable(comb(range(n_features), i) for i in range(start, degree + 1)) + @staticmethod + def _num_combinations(n_features, degree, interaction_only, include_bias): + """Calculate number of terms in polynomial expansion + + This should be equivalent to counting the number of terms returned by + _combinations(...) but much faster. + """ + + if interaction_only: + combinations = sum( + [ + comb(n_features, i, exact=True) + for i in range(1, min(degree + 1, n_features + 1)) + ] + ) + else: + combinations = comb(n_features + degree, degree, exact=True) - 1 + + if include_bias: + combinations += 1 + + return combinations + @property def powers_(self): check_is_fitted(self) @@ -170,13 +194,12 @@ def fit(self, X, y=None): self : object Fitted transformer. 
""" - n_samples, n_features = self._validate_data( - X, accept_sparse=True).shape - combinations = self._combinations(n_features, self.degree, - self.interaction_only, - self.include_bias) + _, n_features = self._validate_data(X, accept_sparse=True).shape self.n_input_features_ = n_features - self.n_output_features_ = sum(1 for _ in combinations) + self.n_output_features_ = self._num_combinations( + n_features, self.degree, self.interaction_only, self.include_bias + ) + return self def transform(self, X): diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py index 5068a8c7d8bdd..59c3a59df8873 100644 --- a/sklearn/preprocessing/tests/test_polynomial.py +++ b/sklearn/preprocessing/tests/test_polynomial.py @@ -552,6 +552,27 @@ def test_polynomial_features_csr_X(deg, include_bias, interaction_only, dtype): assert_array_almost_equal(Xt_csr.A, Xt_dense) +@pytest.mark.parametrize("n_features", [1, 4, 5]) +@pytest.mark.parametrize("degree", range(1, 5)) +@pytest.mark.parametrize("interaction_only", [True, False]) +@pytest.mark.parametrize("include_bias", [True, False]) +def test_num_combinations(n_features, degree, interaction_only, include_bias): + """ + Test that n_output_features_ is calculated correctly. + """ + x = sparse.csr_matrix(([1], ([0], [n_features - 1]))) + est = PolynomialFeatures( + degree, interaction_only=interaction_only, include_bias=include_bias + ) + est.fit(x) + num_combos = est.n_output_features_ + + combos = PolynomialFeatures._combinations( + n_features, degree, interaction_only, include_bias + ) + assert num_combos == sum([1 for _ in combos]) + + @pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], [(2, True, False, np.float32), (2, True, False, np.float64), From a9ae69397e114d8b4df0f3f1cfb1f25525b43fc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=20Dupr=C3=A9=20la=20Tour?= Date: Fri, 2 Apr 2021 01:44:06 -0700 Subject: [PATCH 284/478] FIX Approximate nearest neighbors in TSNE example (#19809) --- .../approximate_nearest_neighbors.py | 64 ++++++++----------- 1 file changed, 28 insertions(+), 36 deletions(-) diff --git a/examples/neighbors/approximate_nearest_neighbors.py b/examples/neighbors/approximate_nearest_neighbors.py index b7f09d3127b98..78f5f184a0da7 100644 --- a/examples/neighbors/approximate_nearest_neighbors.py +++ b/examples/neighbors/approximate_nearest_neighbors.py @@ -8,11 +8,6 @@ replace KNeighborsTransformer and perform approximate nearest neighbors. These packages can be installed with `pip install annoy nmslib`. -Note: Currently `TSNE(metric='precomputed')` does not modify the precomputed -distances, and thus assumes that precomputed euclidean distances are squared. -In future versions, a parameter in TSNE will control the optional squaring of -precomputed distances (see #12401). 
- Note: In KNeighborsTransformer we use the definition which includes each training point as its own neighbor in the count of `n_neighbors`, and for compatibility reasons, one extra neighbor is computed when @@ -91,7 +86,6 @@ def fit(self, X): # see more metric in the manual # https://github.com/nmslib/nmslib/tree/master/manual space = { - 'sqeuclidean': 'l2', 'euclidean': 'l2', 'cosine': 'cosinesimil', 'l1': 'l1', @@ -115,9 +109,6 @@ def transform(self, X): indices, distances = zip(*results) indices, distances = np.vstack(indices), np.vstack(distances) - if self.metric == 'sqeuclidean': - distances **= 2 - indptr = np.arange(0, n_samples_transform * n_neighbors + 1, n_neighbors) kneighbors_graph = csr_matrix((distances.ravel(), indices.ravel(), @@ -139,8 +130,7 @@ def __init__(self, n_neighbors=5, metric='euclidean', n_trees=10, def fit(self, X): self.n_samples_fit_ = X.shape[0] - metric = self.metric if self.metric != 'sqeuclidean' else 'euclidean' - self.annoy_ = annoy.AnnoyIndex(X.shape[1], metric=metric) + self.annoy_ = annoy.AnnoyIndex(X.shape[1], metric=self.metric) for i, x in enumerate(X): self.annoy_.add_item(i, x.tolist()) self.annoy_.build(self.n_trees) @@ -177,9 +167,6 @@ def _transform(self, X): x.tolist(), n_neighbors, self.search_k, include_distances=True) - if self.metric == 'sqeuclidean': - distances **= 2 - indptr = np.arange(0, n_samples_transform * n_neighbors + 1, n_neighbors) kneighbors_graph = csr_matrix((distances.ravel(), indices.ravel(), @@ -209,7 +196,7 @@ def test_transformers(): def load_mnist(n_samples): """Load MNIST, shuffle the data, and return only n_samples.""" - mnist = fetch_openml("mnist_784") + mnist = fetch_openml("mnist_784", as_frame=False) X, y = shuffle(mnist.data, mnist.target, random_state=2) return X[:n_samples] / 255, y[:n_samples] @@ -222,34 +209,39 @@ def run_benchmark(): n_iter = 500 perplexity = 30 + metric = "euclidean" # TSNE requires a certain number of neighbors which depends on the # perplexity parameter. # Add one since we include each sample as its own neighbor. n_neighbors = int(3. 
* perplexity + 1) + 1 + tsne_params = dict(perplexity=perplexity, method="barnes_hut", + random_state=42, n_iter=n_iter, + square_distances=True) + transformers = [ - ('AnnoyTransformer', AnnoyTransformer(n_neighbors=n_neighbors, - metric='sqeuclidean')), - ('NMSlibTransformer', NMSlibTransformer(n_neighbors=n_neighbors, - metric='sqeuclidean')), - ('KNeighborsTransformer', KNeighborsTransformer( - n_neighbors=n_neighbors, mode='distance', metric='sqeuclidean')), - ('TSNE with AnnoyTransformer', make_pipeline( - AnnoyTransformer(n_neighbors=n_neighbors, metric='sqeuclidean'), - TSNE(metric='precomputed', perplexity=perplexity, - method="barnes_hut", random_state=42, n_iter=n_iter), )), - ('TSNE with NMSlibTransformer', make_pipeline( - NMSlibTransformer(n_neighbors=n_neighbors, metric='sqeuclidean'), - TSNE(metric='precomputed', perplexity=perplexity, - method="barnes_hut", random_state=42, n_iter=n_iter), )), - ('TSNE with KNeighborsTransformer', make_pipeline( - KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance', - metric='sqeuclidean'), - TSNE(metric='precomputed', perplexity=perplexity, - method="barnes_hut", random_state=42, n_iter=n_iter), )), + ('AnnoyTransformer', + AnnoyTransformer(n_neighbors=n_neighbors, metric=metric)), + ('NMSlibTransformer', + NMSlibTransformer(n_neighbors=n_neighbors, metric=metric)), + ('KNeighborsTransformer', + KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance', + metric=metric)), + ('TSNE with AnnoyTransformer', + make_pipeline( + AnnoyTransformer(n_neighbors=n_neighbors, metric=metric), + TSNE(metric='precomputed', **tsne_params))), + ('TSNE with NMSlibTransformer', + make_pipeline( + NMSlibTransformer(n_neighbors=n_neighbors, metric=metric), + TSNE(metric='precomputed', **tsne_params))), + ('TSNE with KNeighborsTransformer', + make_pipeline( + KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance', + metric=metric), + TSNE(metric='precomputed', **tsne_params))), ('TSNE with internal NearestNeighbors', - TSNE(metric='sqeuclidean', perplexity=perplexity, method="barnes_hut", - random_state=42, n_iter=n_iter)), + TSNE(metric=metric, **tsne_params)), ] # init the plot From 309f135c3284d7db6e23ca81a87948c7066a3949 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 2 Apr 2021 15:40:20 +0100 Subject: [PATCH 285/478] MNT Remove HistGradientBoosting from experimental (#19799) --- asv_benchmarks/benchmarks/ensemble.py | 1 - benchmarks/bench_hist_gradient_boosting.py | 2 - .../bench_hist_gradient_boosting_adult.py | 1 - ...hist_gradient_boosting_categorical_only.py | 1 - ...bench_hist_gradient_boosting_higgsboson.py | 2 - .../bench_hist_gradient_boosting_threading.py | 2 - doc/conf.py | 1 - doc/developers/maintainer.rst | 29 ++++++++++--- doc/modules/ensemble.rst | 16 +------ doc/whats_new/v0.21.rst | 5 +++ doc/whats_new/v1.0.rst | 5 +++ .../plot_gradient_boosting_categorical.py | 1 - .../ensemble/plot_monotonic_constraints.py | 1 - examples/ensemble/plot_stack_predictors.py | 1 - .../inspection/plot_partial_dependence.py | 1 - ...plot_poisson_regression_non_normal_loss.py | 1 - .../plot_release_highlights_0_22_0.py | 1 - .../plot_release_highlights_0_23_0.py | 2 - sklearn/ensemble/__init__.py | 13 ++---- .../gradient_boosting.py | 26 ----------- .../tests/test_compare_lightgbm.py | 2 - .../tests/test_gradient_boosting.py | 2 - .../tests/test_monotonic_contraints.py | 1 - .../tests/test_warm_start.py | 2 - .../enable_hist_gradient_boosting.py | 43 ++++++------------- .../test_enable_hist_gradient_boosting.py | 41 
+++--------------- .../tests/test_from_model.py | 1 - .../tests/test_sequential.py | 1 - .../tests/test_partial_dependence.py | 1 - sklearn/model_selection/tests/test_search.py | 1 - sklearn/tests/test_pipeline.py | 1 - 31 files changed, 59 insertions(+), 149 deletions(-) diff --git a/asv_benchmarks/benchmarks/ensemble.py b/asv_benchmarks/benchmarks/ensemble.py index c46ac07c84475..8977eb0d10f20 100644 --- a/asv_benchmarks/benchmarks/ensemble.py +++ b/asv_benchmarks/benchmarks/ensemble.py @@ -1,4 +1,3 @@ -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier) diff --git a/benchmarks/bench_hist_gradient_boosting.py b/benchmarks/bench_hist_gradient_boosting.py index 82eb64faeb462..533861b1b63e4 100644 --- a/benchmarks/bench_hist_gradient_boosting.py +++ b/benchmarks/bench_hist_gradient_boosting.py @@ -4,8 +4,6 @@ import matplotlib.pyplot as plt import numpy as np from sklearn.model_selection import train_test_split -# To use this experimental feature, we need to explicitly ask for it: -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.datasets import make_classification diff --git a/benchmarks/bench_hist_gradient_boosting_adult.py b/benchmarks/bench_hist_gradient_boosting_adult.py index 5b47fcb3a6678..49109cfc049bb 100644 --- a/benchmarks/bench_hist_gradient_boosting_adult.py +++ b/benchmarks/bench_hist_gradient_boosting_adult.py @@ -4,7 +4,6 @@ from sklearn.model_selection import train_test_split from sklearn.datasets import fetch_openml from sklearn.metrics import accuracy_score, roc_auc_score -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.utils import ( get_equivalent_estimator) diff --git a/benchmarks/bench_hist_gradient_boosting_categorical_only.py b/benchmarks/bench_hist_gradient_boosting_categorical_only.py index 6c69b32eff26f..d3d7a871b41d2 100644 --- a/benchmarks/bench_hist_gradient_boosting_categorical_only.py +++ b/benchmarks/bench_hist_gradient_boosting_categorical_only.py @@ -3,7 +3,6 @@ from sklearn.preprocessing import KBinsDiscretizer from sklearn.datasets import make_classification -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.utils import ( get_equivalent_estimator) diff --git a/benchmarks/bench_hist_gradient_boosting_higgsboson.py b/benchmarks/bench_hist_gradient_boosting_higgsboson.py index 2c74bb8818343..4e795a18ae2ce 100644 --- a/benchmarks/bench_hist_gradient_boosting_higgsboson.py +++ b/benchmarks/bench_hist_gradient_boosting_higgsboson.py @@ -9,8 +9,6 @@ from joblib import Memory from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, roc_auc_score -# To use this experimental feature, we need to explicitly ask for it: -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.utils import ( get_equivalent_estimator) diff --git a/benchmarks/bench_hist_gradient_boosting_threading.py b/benchmarks/bench_hist_gradient_boosting_threading.py index 61803fb5cb9cc..6ab5de294dced 100644 --- 
a/benchmarks/bench_hist_gradient_boosting_threading.py +++ b/benchmarks/bench_hist_gradient_boosting_threading.py @@ -7,8 +7,6 @@ from threadpoolctl import threadpool_limits import sklearn from sklearn.model_selection import train_test_split -# To use this experimental feature, we need to explicitly ask for it: -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.datasets import make_classification diff --git a/doc/conf.py b/doc/conf.py index 6768aab208a99..ba6b0595a7d44 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -358,7 +358,6 @@ def __call__(self, directory): # enable experimental module so that experimental estimators can be # discovered properly by sphinx -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.experimental import enable_iterative_imputer # noqa from sklearn.experimental import enable_halving_search_cv # noqa diff --git a/doc/developers/maintainer.rst b/doc/developers/maintainer.rst index e4115e87025c7..8fd439c984660 100644 --- a/doc/developers/maintainer.rst +++ b/doc/developers/maintainer.rst @@ -363,10 +363,17 @@ deprecation cycle. To create an experimental module, you can just copy and modify the content of `enable_hist_gradient_boosting.py -`_, +`__, or `enable_iterative_imputer.py -`_. +`_. + +.. note:: + + These are permalink as in 0.24, where these estimators are still + experimental. They might be stable at the time of reading - hence the + permalink. See below for instructions on the transition from experimental + to stable. Note that the public import path must be to a public subpackage (like ``sklearn/ensemble`` or ``sklearn/impute``), not just a ``.py`` module. @@ -379,14 +386,15 @@ in the future when the features aren't experimental anymore. To avoid type checker (e.g. mypy) errors a direct import of experimental estimators should be done in the parent module, protected by the ``if typing.TYPE_CHECKING`` check. See `sklearn/ensemble/__init__.py -`_, +`_, or `sklearn/impute/__init__.py -`_ +`_ for an example. Please also write basic tests following those in `test_enable_hist_gradient_boosting.py -`_. +`__. + Make sure every user-facing code you write explicitly mentions that the feature is experimental, and add a ``# noqa`` comment to avoid pep8-related warnings:: @@ -402,3 +410,14 @@ sklearn.experimental import *`` **does not work**. Note that some experimental classes / functions are not included in the :mod:`sklearn.experimental` module: ``sklearn.datasets.fetch_openml``. + +Once the feature become stable, remove all `enable_my_experimental_feature` +in the scikit-learn code (even feature highlights etc.) and make the +`enable_my_experimental_feature` a no-op that just raises a warning: +`enable_hist_gradient_boosting.py +`__. +The file should stay there indefinitely as we don't want to break users code: +we just incentivize them to remove that import with the warning. + +Also update the tests accordingly: `test_enable_hist_gradient_boosting.py +`__. diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index c891b4d275b9a..329215406c39c 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -467,7 +467,7 @@ trees. .. 
note:: - Scikit-learn 0.21 introduces two new experimental implementations of + Scikit-learn 0.21 introduces two new implementations of gradient boosting trees, namely :class:`HistGradientBoostingClassifier` and :class:`HistGradientBoostingRegressor`, inspired by `LightGBM `__ (See [LightGBM]_). @@ -898,7 +898,7 @@ based on permutation of the features. Histogram-Based Gradient Boosting ================================= -Scikit-learn 0.21 introduced two new experimental implementations of +Scikit-learn 0.21 introduced two new implementations of gradient boosting trees, namely :class:`HistGradientBoostingClassifier` and :class:`HistGradientBoostingRegressor`, inspired by `LightGBM `__ (See [LightGBM]_). @@ -920,15 +920,6 @@ estimators is slightly different, and some of the features from :class:`GradientBoostingClassifier` and :class:`GradientBoostingRegressor` are not yet supported, for instance some loss functions. -These estimators are still **experimental**: their predictions -and their API might change without any deprecation cycle. To use them, you -need to explicitly import ``enable_hist_gradient_boosting``:: - - >>> # explicitly require this experimental feature - >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa - >>> # now you can import normally from ensemble - >>> from sklearn.ensemble import HistGradientBoostingClassifier - .. topic:: Examples: * :ref:`sphx_glr_auto_examples_inspection_plot_partial_dependence.py` @@ -941,7 +932,6 @@ Most of the parameters are unchanged from One exception is the ``max_iter`` parameter that replaces ``n_estimators``, and controls the number of iterations of the boosting process:: - >>> from sklearn.experimental import enable_hist_gradient_boosting >>> from sklearn.ensemble import HistGradientBoostingClassifier >>> from sklearn.datasets import make_hastie_10_2 @@ -992,7 +982,6 @@ with missing values should go to the left or right child, based on the potential gain. When predicting, samples with missing values are assigned to the left or right child consequently:: - >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa >>> from sklearn.ensemble import HistGradientBoostingClassifier >>> import numpy as np @@ -1146,7 +1135,6 @@ You can specify a monotonic constraint on each feature using the constraint, while -1 and 1 indicate a negative and positive constraint, respectively:: - >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa >>> from sklearn.ensemble import HistGradientBoostingRegressor ... # positive, negative, and no constraint on the 3 features diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index cf9886a6636af..8012fd02b4733 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -420,6 +420,11 @@ Support for Python 3.4 and below has been officially dropped. >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa >>> # now you can import normally from sklearn.ensemble >>> from sklearn.ensemble import HistGradientBoostingClassifier + + .. note:: + Update: since version 1.0, these estimators are not experimental + anymore and you don't need to use `from sklearn.experimental import + enable_hist_gradient_boosting`. :pr:`12807` by :user:`Nicolas Hug`. diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 979ed9096aba1..f75c29586efca 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -137,6 +137,11 @@ Changelog target. Additional private refactoring was performed. :pr:`19162` by :user:`Guillaume Lemaitre `. 
+- |Enhancement| :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and + :class:`~sklearn.ensemble.HistGradientBoostingRegressor` are no longer + experimental. They are now considered stable and are subject to the same + deprecation cycles as all other estimators. :pr:`19799` by `Nicolas Hug`_. + :mod:`sklearn.feature_extraction` ................................. diff --git a/examples/ensemble/plot_gradient_boosting_categorical.py b/examples/ensemble/plot_gradient_boosting_categorical.py index 820a508f4de3c..876a1ca21ec4c 100644 --- a/examples/ensemble/plot_gradient_boosting_categorical.py +++ b/examples/ensemble/plot_gradient_boosting_categorical.py @@ -45,7 +45,6 @@ # As a baseline, we create an estimator where the categorical features are # dropped: -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.pipeline import make_pipeline from sklearn.compose import make_column_transformer diff --git a/examples/ensemble/plot_monotonic_constraints.py b/examples/ensemble/plot_monotonic_constraints.py index 8b3f69f1d542e..c173ef35cf311 100644 --- a/examples/ensemble/plot_monotonic_constraints.py +++ b/examples/ensemble/plot_monotonic_constraints.py @@ -18,7 +18,6 @@ This example was inspired by the `XGBoost documentation `_. """ -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.inspection import plot_partial_dependence import numpy as np diff --git a/examples/ensemble/plot_stack_predictors.py b/examples/ensemble/plot_stack_predictors.py index c07068b060c57..afa48c62d8d0b 100644 --- a/examples/ensemble/plot_stack_predictors.py +++ b/examples/ensemble/plot_stack_predictors.py @@ -160,7 +160,6 @@ def load_ames_housing(): rf_pipeline # %% -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor gbdt_pipeline = make_pipeline( diff --git a/examples/inspection/plot_partial_dependence.py b/examples/inspection/plot_partial_dependence.py index 927857d845f9e..ac8d20ec9f155 100644 --- a/examples/inspection/plot_partial_dependence.py +++ b/examples/inspection/plot_partial_dependence.py @@ -134,7 +134,6 @@ # Let's now fit a :class:`~sklearn.ensemble.HistGradientBoostingRegressor` and # compute the partial dependence on the same features. -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor print("Training HistGradientBoostingRegressor...") diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 9541be1f62b24..7ebda543b4059 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -258,7 +258,6 @@ def score_estimator(estimator, df_test): # least-squares loss. Here we only fit trees with the Poisson loss to keep this # example concise. 
-from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.preprocessing import OrdinalEncoder diff --git a/examples/release_highlights/plot_release_highlights_0_22_0.py b/examples/release_highlights/plot_release_highlights_0_22_0.py index d9efc9a520af1..cc0cfe674c61d 100644 --- a/examples/release_highlights/plot_release_highlights_0_22_0.py +++ b/examples/release_highlights/plot_release_highlights_0_22_0.py @@ -131,7 +131,6 @@ # support for missing values (NaNs). This means that there is no need for # imputing data when training or predicting. -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier X = np.array([0, 1, 2, np.nan]).reshape(-1, 1) diff --git a/examples/release_highlights/plot_release_highlights_0_23_0.py b/examples/release_highlights/plot_release_highlights_0_23_0.py index a34c23b4912be..364cd7958003e 100644 --- a/examples/release_highlights/plot_release_highlights_0_23_0.py +++ b/examples/release_highlights/plot_release_highlights_0_23_0.py @@ -36,7 +36,6 @@ import numpy as np from sklearn.model_selection import train_test_split from sklearn.linear_model import PoissonRegressor -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor n_samples, n_features = 1000, 20 @@ -124,7 +123,6 @@ from matplotlib import pyplot as plt from sklearn.model_selection import train_test_split from sklearn.inspection import plot_partial_dependence -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor n_samples = 500 diff --git a/sklearn/ensemble/__init__.py b/sklearn/ensemble/__init__.py index ae86349ad9af0..0a78a774cca36 100644 --- a/sklearn/ensemble/__init__.py +++ b/sklearn/ensemble/__init__.py @@ -2,8 +2,6 @@ The :mod:`sklearn.ensemble` module includes ensemble-based methods for classification, regression and anomaly detection. """ -import typing - from ._base import BaseEnsemble from ._forest import RandomForestClassifier from ._forest import RandomForestRegressor @@ -21,13 +19,9 @@ from ._voting import VotingRegressor from ._stacking import StackingClassifier from ._stacking import StackingRegressor - -if typing.TYPE_CHECKING: - # Avoid errors in type checkers (e.g. mypy) for experimental estimators. - # TODO: remove this check once the estimator is no longer experimental. 
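
To make the net effect of this patch concrete, a minimal usage sketch (illustrative, not part of the diff): from 1.0 on the histogram-based estimators are imported directly from ``sklearn.ensemble``, and the old experimental import is kept only as a no-op that emits a warning (see the rewritten ``enable_hist_gradient_boosting.py`` further down):

    # No longer needed:
    #   from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.datasets import load_iris
    from sklearn.ensemble import HistGradientBoostingClassifier

    X, y = load_iris(return_X_y=True)
    clf = HistGradientBoostingClassifier(max_iter=100).fit(X, y)
    print(clf.score(X, y))   # close to 1.0 on the training data
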
- from ._hist_gradient_boosting.gradient_boosting import ( # noqa - HistGradientBoostingRegressor, HistGradientBoostingClassifier - ) +from ._hist_gradient_boosting.gradient_boosting import ( + HistGradientBoostingRegressor, HistGradientBoostingClassifier +) __all__ = ["BaseEnsemble", "RandomForestClassifier", "RandomForestRegressor", @@ -37,4 +31,5 @@ "GradientBoostingRegressor", "AdaBoostClassifier", "AdaBoostRegressor", "VotingClassifier", "VotingRegressor", "StackingClassifier", "StackingRegressor", + 'HistGradientBoostingClassifier', 'HistGradientBoostingRegressor', ] diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index c35f79bd79251..d3b62a5df784a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -887,17 +887,6 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): This implementation is inspired by `LightGBM `_. - .. note:: - - This estimator is still **experimental** for now: the predictions - and the API might change without any deprecation cycle. To use it, - you need to explicitly import ``enable_hist_gradient_boosting``:: - - >>> # explicitly require this experimental feature - >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa - >>> # now you can import normally from ensemble - >>> from sklearn.ensemble import HistGradientBoostingRegressor - Read more in the :ref:`User Guide `. .. versionadded:: 0.21 @@ -1040,8 +1029,6 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): Examples -------- - >>> # To use this experimental feature, we need to explicitly ask for it: - >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa >>> from sklearn.ensemble import HistGradientBoostingRegressor >>> from sklearn.datasets import load_diabetes >>> X, y = load_diabetes(return_X_y=True) @@ -1156,17 +1143,6 @@ class HistGradientBoostingClassifier(ClassifierMixin, This implementation is inspired by `LightGBM `_. - .. note:: - - This estimator is still **experimental** for now: the predictions - and the API might change without any deprecation cycle. To use it, - you need to explicitly import ``enable_hist_gradient_boosting``:: - - >>> # explicitly require this experimental feature - >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa - >>> # now you can import normally from ensemble - >>> from sklearn.ensemble import HistGradientBoostingClassifier - Read more in the :ref:`User Guide `. .. 
versionadded:: 0.21 @@ -1304,8 +1280,6 @@ class HistGradientBoostingClassifier(ClassifierMixin, Examples -------- - >>> # To use this experimental feature, we need to explicitly ask for it: - >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa >>> from sklearn.ensemble import HistGradientBoostingClassifier >>> from sklearn.datasets import load_iris >>> X, y = load_iris(return_X_y=True) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py index 4a6c4dbbb32c7..f34dffab2671c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py @@ -4,8 +4,6 @@ import numpy as np import pytest -# To use this experimental feature, we need to explicitly ask for it: -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 265b4cf20f8f3..b2322f29f85d1 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -13,8 +13,6 @@ from sklearn.exceptions import NotFittedError from sklearn.compose import make_column_transformer -# To use this experimental feature, we need to explicitly ask for it: -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py index 29fc95d4bb070..725f9f6537865 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py @@ -10,7 +10,6 @@ compute_node_value ) from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py index 2417de4f6cc63..044a6237bc54d 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py @@ -7,8 +7,6 @@ from sklearn.base import clone from sklearn.datasets import make_classification, make_regression -# To use this experimental feature, we need to explicitly ask for it: -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.metrics import check_scoring diff --git a/sklearn/experimental/enable_hist_gradient_boosting.py b/sklearn/experimental/enable_hist_gradient_boosting.py index d7ceefbd58a2f..f0416ac013e96 100644 --- 
a/sklearn/experimental/enable_hist_gradient_boosting.py +++ b/sklearn/experimental/enable_hist_gradient_boosting.py @@ -1,36 +1,21 @@ -"""Enables histogram-based gradient boosting estimators. +"""This is now a no-op and can be safely removed from your code. -The API and results of these estimators might change without any deprecation -cycle. - -Importing this file dynamically sets the +It used to enable the use of :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and -:class:`~sklearn.ensemble.HistGradientBoostingRegressor` as attributes of the -ensemble module:: - - >>> # explicitly require this experimental feature - >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa - >>> # now you can import normally from ensemble - >>> from sklearn.ensemble import HistGradientBoostingClassifier - >>> from sklearn.ensemble import HistGradientBoostingRegressor - - -The ``# noqa`` comment comment can be removed: it just tells linters like -flake8 to ignore the import, which appears as unused. +:class:`~sklearn.ensemble.HistGradientBoostingRegressor` when they were still +:term:`experimental`, but these estimators are now stable and can be imported +normally from `sklearn.ensemble`. """ +# Don't remove this file, we don't want to break users code just because the +# feature isn't experimental anymore. -from ..ensemble._hist_gradient_boosting.gradient_boosting import ( - HistGradientBoostingClassifier, - HistGradientBoostingRegressor -) -from .. import ensemble +import warnings -# use settattr to avoid mypy errors when monkeypatching -setattr(ensemble, "HistGradientBoostingClassifier", - HistGradientBoostingClassifier) -setattr(ensemble, "HistGradientBoostingRegressor", - HistGradientBoostingRegressor) -ensemble.__all__ += ['HistGradientBoostingClassifier', - 'HistGradientBoostingRegressor'] +warnings.warn( + "Since version 1.0, " + "it is not needed to import enable_hist_gradient_boosting anymore. " + "HistGradientBoostingClassifier and HistGradientBoostingRegressor are now " + "stable and can be normally imported from sklearn.ensemble." +) diff --git a/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py b/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py index 06c0976d95a1f..8ea365fed6e59 100644 --- a/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py +++ b/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py @@ -5,41 +5,10 @@ from sklearn.utils._testing import assert_run_python_script -def test_imports_strategies(): - # Make sure different import strategies work or fail as expected. - - # Since Python caches the imported modules, we need to run a child process - # for every test case. Else, the tests would not be independent - # (manually removing the imports from the cache (sys.modules) is not - # recommended and can lead to many complications). 
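As an aside to the new module docstring above, a minimal sketch of the behaviour it describes (assuming scikit-learn >= 1.0; the load_iris fit is only an illustrative choice, not part of the patch):

import warnings

# The estimators now import directly from sklearn.ensemble:
from sklearn.datasets import load_iris
from sklearn.ensemble import HistGradientBoostingClassifier

X, y = load_iris(return_X_y=True)
clf = HistGradientBoostingClassifier().fit(X, y)
print(clf.score(X, y))

# The legacy experimental import is kept as a no-op that only emits a warning
# (visible in a fresh interpreter, since Python caches modules after first import):
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
if caught:
    print(caught[0].message)

The test rewritten below in this patch (test_import_raises_warning) checks the same warning with pytest.warns inside a subprocess started by assert_run_python_script.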
- - good_import = """ - from sklearn.experimental import enable_hist_gradient_boosting - from sklearn.ensemble import GradientBoostingClassifier - from sklearn.ensemble import GradientBoostingRegressor - """ - assert_run_python_script(textwrap.dedent(good_import)) - - good_import_with_ensemble_first = """ - import sklearn.ensemble - from sklearn.experimental import enable_hist_gradient_boosting - from sklearn.ensemble import GradientBoostingClassifier - from sklearn.ensemble import GradientBoostingRegressor - """ - assert_run_python_script(textwrap.dedent(good_import_with_ensemble_first)) - - bad_imports = """ +def test_import_raises_warning(): + code = """ import pytest - - with pytest.raises(ImportError): - from sklearn.ensemble import HistGradientBoostingClassifier - - with pytest.raises(ImportError): - from sklearn.ensemble._hist_gradient_boosting import ( - HistGradientBoostingClassifier) - - import sklearn.experimental - with pytest.raises(ImportError): - from sklearn.ensemble import HistGradientBoostingClassifier + with pytest.warns(UserWarning, match="it is not needed to import"): + from sklearn.experimental import enable_hist_gradient_boosting # noqa """ - assert_run_python_script(textwrap.dedent(bad_imports)) + assert_run_python_script(textwrap.dedent(code)) diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index 37b5c105e1daa..17488b397b0c8 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -10,7 +10,6 @@ from sklearn.linear_model import LogisticRegression, SGDClassifier, Lasso from sklearn.svm import LinearSVC from sklearn.feature_selection import SelectFromModel -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import (RandomForestClassifier, HistGradientBoostingClassifier) from sklearn.linear_model import PassiveAggressiveClassifier diff --git a/sklearn/feature_selection/tests/test_sequential.py b/sklearn/feature_selection/tests/test_sequential.py index 2ca22517ef956..163f7acba6ce1 100644 --- a/sklearn/feature_selection/tests/test_sequential.py +++ b/sklearn/feature_selection/tests/test_sequential.py @@ -8,7 +8,6 @@ from sklearn.feature_selection import SequentialFeatureSelector from sklearn.datasets import make_regression from sklearn.linear_model import LinearRegression -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 51dd6e53e4304..f79b2aca3beae 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -15,7 +15,6 @@ from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble import RandomForestRegressor -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.linear_model import LinearRegression diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 25c4ce8cc22f7..b74e250e94192 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -69,7 +69,6 @@ from sklearn.impute import SimpleImputer from 
sklearn.pipeline import Pipeline from sklearn.linear_model import Ridge, SGDClassifier, LinearRegression -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.model_selection.tests.common import OneTimeSplitter diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 85d2f7b6e07ca..93f19cdb8a93f 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -36,7 +36,6 @@ from sklearn.datasets import load_iris from sklearn.preprocessing import StandardScaler from sklearn.feature_extraction.text import CountVectorizer -from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.impute import SimpleImputer From 26e688d31e86461b978ca5cf7d23c279ac3f7299 Mon Sep 17 00:00:00 2001 From: Luca Bittarello <15511539+lbittarello@users.noreply.github.com> Date: Fri, 2 Apr 2021 22:08:16 +0200 Subject: [PATCH 286/478] ENH Record output of transformers in ColumnTransformer (#18393) Co-authored-by: Nicolas Hug Co-authored-by: Joel Nothman <78827+jnothman@users.noreply.github.com> --- doc/whats_new/v1.0.rst | 7 ++ sklearn/compose/_column_transformer.py | 29 ++++++ .../compose/tests/test_column_transformer.py | 88 ++++++++++++++++++- 3 files changed, 123 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index f75c29586efca..9eb49b0139a6b 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -109,6 +109,13 @@ Changelog - |Enhancement| :func:`datasets.fetch_kddcup99` raises a better message when the cached file is invalid. :pr:`19669` `Thomas Fan`_. +:mod:`sklearn.compose` +...................... + +- |Enhancement| :class:`compose.ColumnTransformer` now records the output + of each transformer in `output_indices_`. :pr:`18393` by + :user:`Luca Bittarello `. + :mod:`sklearn.decomposition` ............................ diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index c0444fe2d6cda..da4a2dd93507c 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -134,6 +134,12 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): sparse matrix or a dense numpy array, which depends on the output of the individual transformers and the `sparse_threshold` keyword. + output_indices_ : dict + A dictionary from each transformer name to a slice, where the slice + corresponds to indices in the transformed output. This is useful to + inspect which transformer is responsible for which transformed + feature(s). + Notes ----- The order of the columns in the transformed feature matrix follows the @@ -408,6 +414,28 @@ def _validate_output(self, result): "The output of the '{0}' transformer should be 2D (scipy " "matrix, array, or pandas DataFrame).".format(name)) + def _record_output_indices(self, Xs): + """ + Record which transformer produced which column. + """ + idx = 0 + self.output_indices_ = {} + + for transformer_idx, (name, _, _, _) in enumerate( + self._iter(fitted=True, replace_strings=True) + ): + n_columns = Xs[transformer_idx].shape[1] + self.output_indices_[name] = slice(idx, idx + n_columns) + idx += n_columns + + # `_iter` only generates transformers that have a non empty + # selection. 
Here we set empty slices for transformers that + # generate no output, which are safe for indexing + all_names = [t[0] for t in self.transformers] + ['remainder'] + for name in all_names: + if name not in self.output_indices_: + self.output_indices_[name] = slice(0, 0) + def _log_message(self, name, idx, total): if not self.verbose: return None @@ -518,6 +546,7 @@ def fit_transform(self, X, y=None): self._update_fitted_transformers(transformers) self._validate_output(Xs) + self._record_output_indices(Xs) return self._hstack(list(Xs)) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index ae2e25b68210f..f7c1874d4a1b7 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -225,7 +225,7 @@ def test_column_transformer_dataframe(): assert len(both.transformers_) == 1 assert both.transformers_[-1][0] != 'remainder' - # ensure pandas object is passes through + # ensure pandas object is passed through class TransAssert(BaseEstimator): @@ -310,6 +310,92 @@ def test_column_transformer_empty_columns(pandas, column_selection, assert isinstance(ct.transformers_[0][1], TransRaise) +def test_column_transformer_output_indices(): + # Checks for the output_indices_ attribute + X_array = np.arange(6).reshape(3, 2) + + ct = ColumnTransformer([('trans1', Trans(), [0]), + ('trans2', Trans(), [1])]) + X_trans = ct.fit_transform(X_array) + assert ct.output_indices_ == {'trans1': slice(0, 1), + 'trans2': slice(1, 2), + 'remainder': slice(0, 0)} + assert_array_equal(X_trans[:, [0]], + X_trans[:, ct.output_indices_['trans1']]) + assert_array_equal(X_trans[:, [1]], + X_trans[:, ct.output_indices_['trans2']]) + + # test with transformer_weights and multiple columns + ct = ColumnTransformer([('trans', Trans(), [0, 1])], + transformer_weights={'trans': .1}) + X_trans = ct.fit_transform(X_array) + assert ct.output_indices_ == {'trans': slice(0, 2), + 'remainder': slice(0, 0)} + assert_array_equal(X_trans[:, [0, 1]], + X_trans[:, ct.output_indices_['trans']]) + assert_array_equal(X_trans[:, []], + X_trans[:, ct.output_indices_['remainder']]) + + # test case that ensures that the attribute does also work when + # a given transformer doesn't have any columns to work on + ct = ColumnTransformer([('trans1', Trans(), [0, 1]), + ('trans2', TransRaise(), [])]) + X_trans = ct.fit_transform(X_array) + assert ct.output_indices_ == {'trans1': slice(0, 2), + 'trans2': slice(0, 0), + 'remainder': slice(0, 0)} + assert_array_equal(X_trans[:, [0, 1]], + X_trans[:, ct.output_indices_['trans1']]) + assert_array_equal(X_trans[:, []], + X_trans[:, ct.output_indices_['trans2']]) + assert_array_equal(X_trans[:, []], + X_trans[:, ct.output_indices_['remainder']]) + + ct = ColumnTransformer([('trans', TransRaise(), [])], + remainder='passthrough') + X_trans = ct.fit_transform(X_array) + assert ct.output_indices_ == {'trans': slice(0, 0), + 'remainder': slice(0, 2)} + assert_array_equal(X_trans[:, []], + X_trans[:, ct.output_indices_['trans']]) + assert_array_equal(X_trans[:, [0, 1]], + X_trans[:, ct.output_indices_['remainder']]) + + +def test_column_transformer_output_indices_df(): + # Checks for the output_indices_ attribute with data frames + pd = pytest.importorskip('pandas') + + X_df = pd.DataFrame(np.arange(6).reshape(3, 2), + columns=['first', 'second']) + + ct = ColumnTransformer([('trans1', Trans(), ['first']), + ('trans2', Trans(), ['second'])]) + X_trans = ct.fit_transform(X_df) + assert ct.output_indices_ 
== {'trans1': slice(0, 1), + 'trans2': slice(1, 2), + 'remainder': slice(0, 0)} + assert_array_equal(X_trans[:, [0]], + X_trans[:, ct.output_indices_['trans1']]) + assert_array_equal(X_trans[:, [1]], + X_trans[:, ct.output_indices_['trans2']]) + assert_array_equal(X_trans[:, []], + X_trans[:, ct.output_indices_['remainder']]) + + ct = ColumnTransformer([('trans1', Trans(), [0]), + ('trans2', Trans(), [1])]) + X_trans = ct.fit_transform(X_df) + assert ct.output_indices_ == {'trans1': slice(0, 1), + 'trans2': slice(1, 2), + 'remainder': slice(0, 0)} + assert_array_equal(X_trans[:, [0]], + X_trans[:, ct.output_indices_['trans1']]) + assert_array_equal(X_trans[:, [1]], + X_trans[:, ct.output_indices_['trans2']]) + assert_array_equal(X_trans[:, []], + X_trans[:, ct.output_indices_['remainder']]) + + def test_column_transformer_sparse_array(): X_sparse = sparse.eye(3, 2).tocsr() From f0576399d9cfb41c1f3cd4a0a2332578b1c0b573 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 2 Apr 2021 18:00:45 -0400 Subject: [PATCH 287/478] DOC Adds version added to output_indices_ in ColumnTransformer (#19815) --- sklearn/compose/_column_transformer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index da4a2dd93507c..5006663331a40 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -140,6 +140,8 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): inspect which transformer is responsible for which transformed feature(s). + .. versionadded:: 1.0 + Notes ----- The order of the columns in the transformed feature matrix follows the From 26b6f60cd40f682570a80a02eb6484c69de88354 Mon Sep 17 00:00:00 2001 From: Christopher Yeh Date: Sat, 3 Apr 2021 20:22:51 -0600 Subject: [PATCH 288/478] DOC Use the canonical Wikipedia link (#19819) --- sklearn/neighbors/_unsupervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index 822a30f503bd2..a6af48d9ed341 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -108,7 +108,7 @@ class NearestNeighbors(KNeighborsMixin, See :ref:`Nearest Neighbors ` in the online documentation for a discussion of the choice of ``algorithm`` and ``leaf_size``. - https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm + https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm """ @_deprecate_positional_args From f47926999d35686ff2190c3940c82d7cc7f3e691 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Sun, 4 Apr 2021 21:53:44 +0200 Subject: [PATCH 289/478] DOC Fix order of whatsnew entries (#19822) --- doc/whats_new/v1.0.rst | 57 ++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 33 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 9eb49b0139a6b..1dd809a94240c 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -76,15 +76,15 @@ Changelog - For :class:`tree.ExtraTreeRegressor`, `criterion="mse"` is deprecated, use `"squared_error"` instead which is now the default. -:mod:`sklearn.cluster` -...................... +:mod:`sklearn.calibration` +.......................... -:mod:`sklearn.preprocessing` -............................ +- |Fix| The predict and predict_proba methods of + :class:`calibration.CalibratedClassifierCV` can now properly be used on + prefitted pipelines. 
:pr:`19641` by :user:`Alek Lefebvre `. -- |Feature| :class:`preprocessing.OneHotEncoder` now supports - `handle_unknown='ignore'` and dropping categories. :pr:`19041` by - `Thomas Fan`_. +:mod:`sklearn.cluster` +...................... - |Efficiency| The "k-means++" initialization of :class:`cluster.KMeans` and :class:`cluster.MiniBatchKMeans` is now faster, especially in multicore @@ -98,6 +98,13 @@ Changelog - |API| :class:`cluster.Birch` attributes, `fit_` and `partial_fit_`, are deprecated and will be removed in 1.2. :pr:`19297` by `Thomas Fan`_. +:mod:`sklearn.compose` +...................... + +- |Enhancement| :class:`compose.ColumnTransformer` now records the output + of each transformer in `output_indices_`. :pr:`18393` by + :user:`Luca Bittarello `. + :mod:`sklearn.datasets` ....................... @@ -109,13 +116,6 @@ Changelog - |Enhancement| :func:`datasets.fetch_kddcup99` raises a better message when the cached file is invalid. :pr:`19669` `Thomas Fan`_. -:mod:`sklearn.compose` -...................... - -- |Enhancement| :class:`compose.ColumnTransformer` now records the output - of each transformer in `output_indices_`. :pr:`18393` by - :user:`Luca Bittarello `. - :mod:`sklearn.decomposition` ............................ @@ -169,7 +169,7 @@ Changelog - |Feature| The new :class:`linear_model.SGDOneClassSVM` provides an SGD implementation of the linear One-Class SVM. Combined with kernel approximation techniques, this implementation approximates the solution of - a kernelized One Class SVM while benefitting from a linear + a kernelized One Class SVM while benefitting from a linear complexity in the number of samples. :pr:`10027` by :user:`Albert Thomas `. @@ -188,12 +188,6 @@ Changelog not corresponding to their objective. :pr:`19172` by :user:`Mathurin Massias ` -:mod:`sklearn.preprocessing` -............................ - -- |Feature| :class:`preprocessing.OrdinalEncoder` supports passing through - missing values by default. :pr:`19069` by `Thomas Fan`_. - - |API|: The parameter ``normalize`` of :class:`linear_model.LinearRegression` is deprecated and will be removed in 1.2. Motivation for this deprecation: ``normalize`` parameter did not take any @@ -284,6 +278,9 @@ Changelog splines via the ``extrapolation`` argument. :pr:`19483` by :user:`Malte Londschien `. +- |Feature| :class:`preprocessing.OrdinalEncoder` supports passing through + missing values by default. :pr:`19069` by `Thomas Fan`_. + - |Fix| :func:`preprocessing.scale`, :class:`preprocessing.StandardScaler` and similar scalers detect near-constant features to avoid scaling them to very large values. This problem happens in particular when using a scaler on @@ -294,6 +291,10 @@ Changelog - |Fix| :meth:`preprocessing.StandardScaler.inverse_transform` now correctly handles integer dtypes. :pr:`19356` by :user:`makoeppel`. +- |Feature| :class:`preprocessing.OneHotEncoder` now supports + `handle_unknown='ignore'` and dropping categories. :pr:`19041` by + `Thomas Fan`_. + :mod:`sklearn.tree` ................... @@ -304,22 +305,12 @@ Changelog :mod:`sklearn.utils` .................... -- |Enhancement| Deprecated the default value of the `random_state=0` in +- |Enhancement| Deprecated the default value of the `random_state=0` in :func:`~sklearn.utils.extmath.randomized_svd`. Starting in 1.2, the default value of `random_state` will be set to `None`. - :pr:`19459` by :user:`Cindy Bezuidenhout ` and + :pr:`19459` by :user:`Cindy Bezuidenhout ` and :user:`Clifford Akai-Nettey`. 
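To make the `output_indices_` entry above concrete, a minimal sketch (the transformers and toy data are illustrative assumptions, not part of the patch): after fitting, each transformer name maps to a slice into the columns of the transformed output, and transformers that produce no columns get an empty slice.

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler

X = np.arange(12, dtype=float).reshape(4, 3)
ct = ColumnTransformer([
    ("scale", StandardScaler(), [0, 1]),   # produces 2 output columns
    ("minmax", MinMaxScaler(), [2]),       # produces 1 output column
])
Xt = ct.fit_transform(X)

print(ct.output_indices_)
# {'scale': slice(0, 2, None), 'minmax': slice(2, 3, None), 'remainder': slice(0, 0, None)}

# The recorded slice recovers the columns a given transformer produced:
print(Xt[:, ct.output_indices_["minmax"]])

This mirrors the checks added in test_column_transformer_output_indices earlier in this series, where a transformer with an empty column selection is recorded as slice(0, 0).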
-:mod:`sklearn.calibration` -.......................... - -- |Fix| The predict and predict_proba methods of - :class:`calibration.CalibratedClassifierCV` can now properly be used on - prefitted pipelines. :pr:`19641` by :user:`Alek Lefebvre ` - -:mod:`sklearn.utils` -.................... - - |Fix| Fixed a bug in :func:`utils.sparsefuncs.mean_variance_axis` where the precision of the computed variance was very poor when the real variance is exactly zero. :pr:`19766` by :user:`Jérémie du Boisberranger `. From 141123270a39c52a60e98017ca52795215dc2ce1 Mon Sep 17 00:00:00 2001 From: Flynn Date: Mon, 5 Apr 2021 01:24:05 -0400 Subject: [PATCH 290/478] API Adds predict_params for Pipeline proba delegates (#19790) --- doc/whats_new/v1.0.rst | 8 ++++++++ sklearn/pipeline.py | 18 ++++++++++++++---- sklearn/tests/test_pipeline.py | 20 ++++++++++++++++---- 3 files changed, 38 insertions(+), 8 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 1dd809a94240c..4ccb4dd14b6a4 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -266,6 +266,14 @@ Changelog Use ``var_`` instead. :pr:`18842` by :user:`Hong Shao Yang `. +:mod:`sklearn.pipeline` +....................... + +- |API| The `predict_proba` and `predict_log_proba` methods of the + :class:`Pipeline` class now support passing prediction kwargs to + the final estimator. + :pr:`19790` by :user:`Christopher Flynn `. + :mod:`sklearn.preprocessing` ............................ diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index f466b735c4fa6..1c9a62d02b7d0 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -456,7 +456,7 @@ def fit_predict(self, X, y=None, **fit_params): return y_pred @if_delegate_has_method(delegate='_final_estimator') - def predict_proba(self, X): + def predict_proba(self, X, **predict_proba_params): """Apply transforms, and predict_proba of the final estimator Parameters @@ -465,6 +465,10 @@ def predict_proba(self, X): Data to predict on. Must fulfill input requirements of first step of the pipeline. + **predict_proba_params : dict of string -> object + Parameters to the ``predict_proba`` called at the end of all + transformations in the pipeline. + Returns ------- y_proba : array-like of shape (n_samples, n_classes) @@ -472,7 +476,7 @@ def predict_proba(self, X): Xt = X for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt) - return self.steps[-1][-1].predict_proba(Xt) + return self.steps[-1][-1].predict_proba(Xt, **predict_proba_params) @if_delegate_has_method(delegate='_final_estimator') def decision_function(self, X): @@ -513,7 +517,7 @@ def score_samples(self, X): return self.steps[-1][-1].score_samples(Xt) @if_delegate_has_method(delegate='_final_estimator') - def predict_log_proba(self, X): + def predict_log_proba(self, X, **predict_log_proba_params): """Apply transforms, and predict_log_proba of the final estimator Parameters @@ -522,6 +526,10 @@ def predict_log_proba(self, X): Data to predict on. Must fulfill input requirements of first step of the pipeline. + **predict_log_proba_params : dict of string -> object + Parameters to the ``predict_log_proba`` called at the end of all + transformations in the pipeline. 
+ Returns ------- y_score : array-like of shape (n_samples, n_classes) @@ -529,7 +537,9 @@ def predict_log_proba(self, X): Xt = X for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt) - return self.steps[-1][-1].predict_log_proba(Xt) + return self.steps[-1][-1].predict_log_proba( + Xt, **predict_log_proba_params + ) @property def transform(self): diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 93f19cdb8a93f..2ed5e37444bfc 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -159,6 +159,14 @@ def predict(self, X, got_attribute=False): self.got_attribute = got_attribute return self + def predict_proba(self, X, got_attribute=False): + self.got_attribute = got_attribute + return self + + def predict_log_proba(self, X, got_attribute=False): + self.got_attribute = got_attribute + return self + def test_pipeline_init(): # Test the various init parameters of the pipeline. @@ -448,12 +456,16 @@ def test_fit_predict_with_intermediate_fit_params(): assert 'should_succeed' not in pipe.named_steps['transf'].fit_params -def test_predict_with_predict_params(): - # tests that Pipeline passes predict_params to the final estimator - # when predict is invoked +@pytest.mark.parametrize("method_name", [ + "predict", "predict_proba", "predict_log_proba" +]) +def test_predict_methods_with_predict_params(method_name): + # tests that Pipeline passes predict_* to the final estimator + # when predict_* is invoked pipe = Pipeline([('transf', Transf()), ('clf', DummyEstimatorParams())]) pipe.fit(None, None) - pipe.predict(X=None, got_attribute=True) + method = getattr(pipe, method_name) + method(X=None, got_attribute=True) assert pipe.named_steps['clf'].got_attribute From c957eb37b5988e6e2a4692c1356e8689294404c5 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 5 Apr 2021 18:28:44 +0200 Subject: [PATCH 291/478] FIX Ignore zero sample weights in precision recall curve (#18328) Co-authored-by: Alonso Silva Allende --- doc/whats_new/v1.0.rst | 12 ++- sklearn/metrics/_ranking.py | 19 ++-- sklearn/metrics/tests/test_ranking.py | 121 ++++++++++++++------------ sklearn/utils/validation.py | 3 +- 4 files changed, 89 insertions(+), 66 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 4ccb4dd14b6a4..ce683958d913f 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -241,6 +241,12 @@ Changelog are integral. :pr:`9843` by :user:`Jon Crall `. +- |Fix| Samples with zero `sample_weight` values do not affect the results + from :func:`metrics.det_curve`, :func:`metrics.precision_recall_curve` + and :func:`metrics.roc_curve`. + :pr:`18328` by :user:`Albert Villanova del Moral ` and + :user:`Alonso Silva Allende `. + :mod:`sklearn.model_selection` .............................. @@ -319,9 +325,9 @@ Changelog :pr:`19459` by :user:`Cindy Bezuidenhout ` and :user:`Clifford Akai-Nettey`. - - |Fix| Fixed a bug in :func:`utils.sparsefuncs.mean_variance_axis` where the - precision of the computed variance was very poor when the real variance is - exactly zero. :pr:`19766` by :user:`Jérémie du Boisberranger `. +- |Fix| Fixed a bug in :func:`utils.sparsefuncs.mean_variance_axis` where the + precision of the computed variance was very poor when the real variance is + exactly zero. :pr:`19766` by :user:`Jérémie du Boisberranger `. 
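To make the zero-weight entry above concrete, a minimal sketch (the toy labels and scores are illustrative assumptions): with the fix, giving a sample a weight of exactly zero yields the same curve as dropping that sample altogether.

import numpy as np
from sklearn.metrics import roc_curve

y_true = np.array([0, 0, 1, 1, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8, 0.9])
weights = np.array([1.0, 1.0, 1.0, 1.0, 0.0])  # the last sample should not count

fpr_w, tpr_w, thr_w = roc_curve(y_true, y_score, sample_weight=weights)
fpr_d, tpr_d, thr_d = roc_curve(y_true[:-1], y_score[:-1])  # same data, sample dropped

assert np.allclose(fpr_w, fpr_d)
assert np.allclose(tpr_w, tpr_d)
assert np.allclose(thr_w, thr_d)

The same holds for precision_recall_curve and det_curve, since all three go through _binary_clf_curve, where the patch filters out zero-weighted samples before computing the cumulative counts.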
Code and Documentation Contributors ----------------------------------- diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index 0364fbba52f63..f1627e84fbcfe 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -27,6 +27,7 @@ from ..utils import assert_all_finite from ..utils import check_consistent_length +from ..utils.validation import _check_sample_weight from ..utils import column_or_1d, check_array from ..utils.multiclass import type_of_target from ..utils.extmath import stable_cumsum @@ -291,14 +292,14 @@ def det_curve(y_true, y_score, pos_label=None, sample_weight=None): >>> thresholds array([0.35, 0.4 , 0.8 ]) """ - if len(np.unique(y_true)) != 2: - raise ValueError("Only one class present in y_true. Detection error " - "tradeoff curve is not defined in that case.") - fps, tps, thresholds = _binary_clf_curve( y_true, y_score, pos_label=pos_label, sample_weight=sample_weight ) + if len(np.unique(y_true)) != 2: + raise ValueError("Only one class present in y_true. Detection error " + "tradeoff curve is not defined in that case.") + fns = tps[-1] - tps p_count = tps[-1] n_count = fps[-1] @@ -696,8 +697,14 @@ def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): assert_all_finite(y_true) assert_all_finite(y_score) + # Filter out zero-weighted samples, as they should not impact the result if sample_weight is not None: sample_weight = column_or_1d(sample_weight) + sample_weight = _check_sample_weight(sample_weight, y_true) + nonzero_weight_mask = sample_weight != 0 + y_true = y_true[nonzero_weight_mask] + y_score = y_score[nonzero_weight_mask] + sample_weight = sample_weight[nonzero_weight_mask] pos_label = _check_pos_label_consistency(pos_label, y_true) @@ -759,7 +766,9 @@ def precision_recall_curve(y_true, probas_pred, *, pos_label=None, pos_label should be explicitly given. probas_pred : ndarray of shape (n_samples,) - Estimated probabilities or output of a decision function. + Target scores, can either be probability estimates of the positive + class, or non-thresholded measure of decisions (as returned by + `decision_function` on some classifiers). pos_label : int or str, default=None The label of the positive class. diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index fd32e2cc0b860..c37ff34feddec 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -41,6 +41,13 @@ ############################################################################### # Utilities for testing +CURVE_FUNCS = [ + det_curve, + precision_recall_curve, + roc_curve, +] + + def make_prediction(dataset=None, binary=False): """Make some classification predictions on a toy dataset using a SVC @@ -73,16 +80,16 @@ def make_prediction(dataset=None, binary=False): # run classifier, get class probabilities and label predictions clf = svm.SVC(kernel='linear', probability=True, random_state=0) - probas_pred = clf.fit(X[:half], y[:half]).predict_proba(X[half:]) + y_score = clf.fit(X[:half], y[:half]).predict_proba(X[half:]) if binary: # only interested in probabilities of the positive case # XXX: do we really want a special API for the binary case? 
- probas_pred = probas_pred[:, 1] + y_score = y_score[:, 1] y_pred = clf.predict(X[half:]) y_true = y[half:] - return y_true, y_pred, probas_pred + return y_true, y_pred, y_score ############################################################################### @@ -183,14 +190,14 @@ def _partial_roc(y_true, y_predict, max_fpr): @pytest.mark.parametrize('drop', [True, False]) def test_roc_curve(drop): # Test Area under Receiver Operating Characteristic (ROC) curve - y_true, _, probas_pred = make_prediction(binary=True) - expected_auc = _auc(y_true, probas_pred) + y_true, _, y_score = make_prediction(binary=True) + expected_auc = _auc(y_true, y_score) - fpr, tpr, thresholds = roc_curve(y_true, probas_pred, + fpr, tpr, thresholds = roc_curve(y_true, y_score, drop_intermediate=drop) roc_auc = auc(fpr, tpr) assert_array_almost_equal(roc_auc, expected_auc, decimal=2) - assert_almost_equal(roc_auc, roc_auc_score(y_true, probas_pred)) + assert_almost_equal(roc_auc, roc_auc_score(y_true, y_score)) assert fpr.shape == tpr.shape assert fpr.shape == thresholds.shape @@ -211,13 +218,13 @@ def test_roc_curve_end_points(): def test_roc_returns_consistency(): # Test whether the returned threshold matches up with tpr # make small toy dataset - y_true, _, probas_pred = make_prediction(binary=True) - fpr, tpr, thresholds = roc_curve(y_true, probas_pred) + y_true, _, y_score = make_prediction(binary=True) + fpr, tpr, thresholds = roc_curve(y_true, y_score) # use the given thresholds to determine the tpr tpr_correct = [] for t in thresholds: - tp = np.sum((probas_pred >= t) & y_true) + tp = np.sum((y_score >= t) & y_true) p = np.sum(y_true) tpr_correct.append(1.0 * tp / p) @@ -229,17 +236,17 @@ def test_roc_returns_consistency(): def test_roc_curve_multi(): # roc_curve not applicable for multi-class problems - y_true, _, probas_pred = make_prediction(binary=False) + y_true, _, y_score = make_prediction(binary=False) with pytest.raises(ValueError): - roc_curve(y_true, probas_pred) + roc_curve(y_true, y_score) def test_roc_curve_confidence(): # roc_curve for confidence scores - y_true, _, probas_pred = make_prediction(binary=True) + y_true, _, y_score = make_prediction(binary=True) - fpr, tpr, thresholds = roc_curve(y_true, probas_pred - 0.5) + fpr, tpr, thresholds = roc_curve(y_true, y_score - 0.5) roc_auc = auc(fpr, tpr) assert_array_almost_equal(roc_auc, 0.90, decimal=2) assert fpr.shape == tpr.shape @@ -248,7 +255,7 @@ def test_roc_curve_confidence(): def test_roc_curve_hard(): # roc_curve for hard decisions - y_true, pred, probas_pred = make_prediction(binary=True) + y_true, pred, y_score = make_prediction(binary=True) # always predict one trivial_pred = np.ones(y_true.shape) @@ -668,23 +675,17 @@ def test_auc_score_non_binary_class(): roc_auc_score(y_true, y_pred) -def test_binary_clf_curve_multiclass_error(): +@pytest.mark.parametrize("curve_func", CURVE_FUNCS) +def test_binary_clf_curve_multiclass_error(curve_func): rng = check_random_state(404) y_true = rng.randint(0, 3, size=10) y_pred = rng.rand(10) msg = "multiclass format is not supported" - with pytest.raises(ValueError, match=msg): - precision_recall_curve(y_true, y_pred) - - with pytest.raises(ValueError, match=msg): - roc_curve(y_true, y_pred) + curve_func(y_true, y_pred) -@pytest.mark.parametrize("curve_func", [ - precision_recall_curve, - roc_curve, -]) +@pytest.mark.parametrize("curve_func", CURVE_FUNCS) def test_binary_clf_curve_implicit_pos_label(curve_func): # Check that using string class labels raises an informative # error for any 
supported string dtype: @@ -693,10 +694,10 @@ def test_binary_clf_curve_implicit_pos_label(curve_func): "value in {0, 1} or {-1, 1} or pass pos_label " "explicitly.") with pytest.raises(ValueError, match=msg): - roc_curve(np.array(["a", "b"], dtype=' Date: Mon, 5 Apr 2021 16:19:41 -0600 Subject: [PATCH 292/478] MNT Improve Nearest Neighbor documentation + code consistency (#19793) --- sklearn/neighbors/_base.py | 59 +++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index eb14e8ef0a900..9a222762ec615 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -444,20 +444,19 @@ def _fit(self, X, y=None): self.n_samples_fit_ = X.data.shape[0] return self - if self.effective_metric_ == 'precomputed': + if self.metric == 'precomputed': X = _check_precomputed(X) + # Precomputed matrix X must be squared + if X.shape[0] != X.shape[1]: + raise ValueError("Precomputed matrix must be square." + " Input is a {}x{} matrix." + .format(X.shape[0], X.shape[1])) self.n_features_in_ = X.shape[1] n_samples = X.shape[0] if n_samples == 0: raise ValueError("n_samples must be greater than 0") - # Precomputed matrix X must be squared - if self.metric == 'precomputed' and X.shape[0] != X.shape[1]: - raise ValueError("Precomputed matrix must be a square matrix." - " Input is a {}x{} matrix." - .format(X.shape[0], X.shape[1])) - if issparse(X): if self.algorithm not in ('auto', 'brute'): warnings.warn("cannot use tree with sparse input: " @@ -514,14 +513,12 @@ def _fit(self, X, y=None): if self.n_neighbors <= 0: raise ValueError( "Expected n_neighbors > 0. Got %d" % - self.n_neighbors - ) - else: - if not isinstance(self.n_neighbors, numbers.Integral): - raise TypeError( - "n_neighbors does not take %s value, " - "enter integer value" % - type(self.n_neighbors)) + self.n_neighbors) + elif not isinstance(self.n_neighbors, numbers.Integral): + raise TypeError( + "n_neighbors does not take %s value, " + "enter integer value" % + type(self.n_neighbors)) return self @@ -654,18 +651,16 @@ class from an array representing our data set and ask who's elif n_neighbors <= 0: raise ValueError( "Expected n_neighbors > 0. Got %d" % - n_neighbors - ) - else: - if not isinstance(n_neighbors, numbers.Integral): - raise TypeError( - "n_neighbors does not take %s value, " - "enter integer value" % - type(n_neighbors)) + n_neighbors) + elif not isinstance(n_neighbors, numbers.Integral): + raise TypeError( + "n_neighbors does not take %s value, " + "enter integer value" % + type(n_neighbors)) if X is not None: query_is_train = False - if self.effective_metric_ == 'precomputed': + if self.metric == 'precomputed': X = _check_precomputed(X) else: X = self._validate_data(X, accept_sparse='csr', reset=False) @@ -687,7 +682,7 @@ class from an array representing our data set and ask who's n_jobs = effective_n_jobs(self.n_jobs) chunked_results = None if (self._fit_method == 'brute' and - self.effective_metric_ == 'precomputed' and issparse(X)): + self.metric == 'precomputed' and issparse(X)): results = _kneighbors_from_graph( X, n_neighbors=n_neighbors, return_distance=return_distance) @@ -793,8 +788,8 @@ def kneighbors_graph(self, X=None, n_neighbors=None, Returns ------- A : sparse-matrix of shape (n_queries, n_samples_fit) - `n_samples_fit` is the number of samples in the fitted data - `A[i, j]` is assigned the weight of edge that connects `i` to `j`. + `n_samples_fit` is the number of samples in the fitted data. 
+ `A[i, j]` gives the weight of the edge connecting `i` to `j`. The matrix is of CSR format. Examples @@ -980,7 +975,7 @@ class from an array representing our data set and ask who's if X is not None: query_is_train = False - if self.effective_metric_ == 'precomputed': + if self.metric == 'precomputed': X = _check_precomputed(X) else: X = self._validate_data(X, accept_sparse='csr', reset=False) @@ -992,7 +987,7 @@ class from an array representing our data set and ask who's radius = self.radius if (self._fit_method == 'brute' and - self.effective_metric_ == 'precomputed' and issparse(X)): + self.metric == 'precomputed' and issparse(X)): results = _radius_neighbors_from_graph( X, radius=radius, return_distance=return_distance) @@ -1116,9 +1111,9 @@ def radius_neighbors_graph(self, X=None, radius=None, mode='connectivity', Returns ------- A : sparse-matrix of shape (n_queries, n_samples_fit) - `n_samples_fit` is the number of samples in the fitted data - `A[i, j]` is assigned the weight of edge that connects `i` to `j`. - The matrix if of format CSR. + `n_samples_fit` is the number of samples in the fitted data. + `A[i, j]` gives the weight of the edge connecting `i` to `j`. + The matrix is of CSR format. Examples -------- From 2b505bf019fc18393a3791a953360cc94679c5ec Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 6 Apr 2021 15:39:57 +0200 Subject: [PATCH 293/478] Add APHP to the Consortium sponsors (#19823) --- doc/about.rst | 6 ++++++ doc/images/logo_APHP.png | Bin 0 -> 16452 bytes doc/images/logo_APHP_text.png | Bin 0 -> 30396 bytes doc/templates/index.html | 1 + 4 files changed, 7 insertions(+) create mode 100644 doc/images/logo_APHP.png create mode 100644 doc/images/logo_APHP_text.png diff --git a/doc/about.rst b/doc/about.rst index fdfe8241b8aec..6b389d47d791b 100644 --- a/doc/about.rst +++ b/doc/about.rst @@ -160,6 +160,10 @@ Grisel, Guillaume Lemaitre, Jérémie du Boisberranger and Chiara Marmo. :width: 70pt :target: https://www.dataiku.com/ +.. |aphp| image:: images/logo_APHP_text.png + :width: 150pt + :target: https://aphp.fr/ + .. |inria| image:: images/inria-logo.jpg :width: 100pt :target: https://www.inria.fr @@ -185,6 +189,8 @@ Grisel, Guillaume Lemaitre, Jérémie du Boisberranger and Chiara Marmo. +---------+----------+ | |dataiku| | +---------+----------+ + | |aphp| | + +---------+----------+ | | +---------+----------+ | |inria| | diff --git a/doc/images/logo_APHP.png b/doc/images/logo_APHP.png new file mode 100644 index 0000000000000000000000000000000000000000..99813a042b1d4ee2a0d60e4bcf35863a689acb37 GIT binary patch literal 16452 zcmeHtWmud|((d5y?ry;bhr!+52?U$L-Q5Z9P6+M<0s(@<5Fki!4-Nr>1PLB2hva>C z_ucdD`M$H)bZ*R|r=J)NHF*qFQd9r{fT5@$qxJMR?dg+=jQI4u z#N1vE0FaRT=@@uunR`>axH((dIzXsBd|e>a5FcAB0KjLdHrLvVT*yE2rzOD)ST_UA zEupp0tdpesPz5_@uqu7o|$qKyL1l|$HvoXoM>%p`5%JKw<9KO;he<$5e0M7}u(6BW1MUZbp? 
zY=rH!U_FvZOp04eeO2x3!9!}-IPsrwrdlhPkqC-SdjLK==VPt-nU`_xZJLUEcHXub zKG8dR@>V?Pcsgssl!#ZLpX!)=_wKcLU{EHjX~p@~NuKVQ&|Qv6g~;>Kw~Ae=JKbNP zl0$u>?C07ySw5!6>R&2ItG!dtAQcq%Dhqi`?1Qo`#@>TRcc4{p^pvxy&t*_5bD*B| z_pu2x4ByyW*TJmG5484TD$`^)vpq=zt9S4HJNBOmyhx<$`eLD0W_EEX{{4Iz?R812 z_xQYiUi;ahZUnk?&Wv*+EzNP~oMt;r(A|&Qf-}+~vv%`3mHOrH#uoI~)$DVctudJ7 z**<}OV;Pt20Gh3}nBGvKy9MRRS%2&sSKeYjP$O_K^bnUN(STF6yN<_% z239Qt&n5=V2xAmzqNMDM_O_Lvau4YSRYmzyu`FeIO4U_m`EZL_ZsQDS_kJi0nL2JP zWtn<@7<}_O?(EIEOLorP2vDx(6{VWm=CwB4uOqJ;g^@1A?vd#ddC!T-CWHB?23RS(zkIj%Fjb^Np!z7MCiB96TW)ZX{HgRA!I_@hBWk|6yXh9#9F(YTL~TED_+g+ zz-$%svKNQMCE7sxSDI8ATr_WJbhmmM=^`5rIV3DMX&og49!d!cY@NOgh5u}!`O#PN zW0i>RE1#7D1Rk{!u$nN1Q$p>Ynn?}nJxZbqLNH9>)I+VMPF9ahkSg!nFVa8t;6Sk*6dP81&i zNH{{(PLxWJOtFQ}!uakYH~srX+9{tQR=Tn67Gr|B_OIMd0-JlkJ8H)|-H3%1dc|~1 z=MP`Cs;(aM0^jH1w?1lCs4CInyQzyy&Q&GBsvo;ntJmBq&)DEhm{-)vu;1kFZ4N~&l(t(GlNz;LQ&jd91m#E;aeNrBXG`c#lhxp0qb9yFc?=P4_|8#f%+y>i4`g-bF`$6Z$btk)Z`G zye2{p=6!R898R9d!6pFFXp48Cpqr?r+)@r8Gj~0}W9Jez^X2_ug=Ec&5%p5_mlwhNECXstFxC75D8b-+?b55zm)# zn#$Y^+(BeJ#Bn;o2B@2v>k<7dk9~m-6z82K3sC%d3HSAxglv=yX>#T6lBMcdvX^(B zq<9{Ah~eoBY@t8NUUBl`<_L+X##h-dNTtd%E7(>t85Us;0hFh^uKAiR&?y)x*n2$M zM6;QfLbGB5$vazN3QCl<^6RvYxld&!{1puyStd7+RT}?sjsJTCyefC$MJ~%t;L9lf z3a!D-fjX~U$^;7%bSD$3k@U@Pk=?7|YPv1UDylxhDwU*6KafBb(x7T7WYmEKFA_Yd z?<5)xzSZ6OJH#VYV6#jEs$$p-U5+Mht_%N*{_pq^uY~&y7L(l4nat+o#95+t@iCMF zCm9~lw>cCgr(kzJPriVgwoeWbb`{VVsuQuGZ2Pd+T^lB-5>Sq&h%-p0yT?eWw2j7%C+M$|DqJYZQ<_|UNU$2e zO&n8-Vo)e@YD0s)>s5!d|FJW`8@uxm-jZ+9);kUKI@JY{Ju|QquIszCm{CY#j9qnY z7(Q8NYH+g4>EvUqMtH@eO?nz*`dGS&_BZ}))MQvAv-7DG5*1?nmAsf~rq6KWvf+RO znV-forA4W7N!1vxum~=Sb?5X-Uf24>TmA)pcGxeE2*1DuhNpMb<%>d5u3PuIio}gH zr}PV7;uf;v9YOFU(imQwmGRGkPDzQ;7Z^_?cGWx%X)=9lj#RLb%iHc@5{L@K!kJ(5 ze7Ot3eg^gJBIn3{!*i=$FWXll>IP_};ZaE=*ezol!5)SK!y`wNC;CnPOqVW2;?$cO zs_B#1BN0wvui`As#L;^f0+sUfeb%hU2Pe;6ipV$%v-C#hds{~D&TO(-=6DO=IH@xM zm)VzAaKx*A*|J2Gv=7Z2nGDkJO#*w+KD>o-gnGFWx8W$fa0hlmWAtK%E6;KeHxivw zUN*6C5F@i4T@Wd;A|mZPBSYri=#N~P|L`Sk4_gjqGCIl-hhoD!ufFZA@RA7=?6aLv z;>jY)(9auVK`cLKVV(G@HykjqeB0CfmLiM=*xTZgGvet?jUpTH>g7(qN}BQXqP@;H zV~4q@eJ6eO(D9zzRM>@yp8TCptwNCcOPxyW_(dGpR#*GT1L_^VlDIv**j}MD#%LIE zU6HHp_--$93Ch+8_weo1QlS-mqvCrTb2${GB6M|)?>|O zX&C0s?fwTzFJYpgpxAf&>{`wFsF{f`x8073!Tv8YimJQXm}N9%YtTN0mY3v3pP9gi zDFoo5Ovd9WTHj$r1-=+tChL;>yijhcQ)uF^WqN_bRs7f-pUlun3VNMG!%2qT2Etb} z%|R1`jE zKFQA;&y9+vDka+<^g&8i(=bTIhM2^_W#JOlU_U_!Ndt|DLMxL*Wf_+^Rfj;NPby>^ zdXa_^`8_LEFrDKWHKXy1Ze#08SRcKxaZ5>npU#x_6|$%cwZcHL%J62rli^6#L3=!W zroJZbig=V{R*|h{5WY)c=`yBWzb6}ygIXd?gA~h>D6U)J#(nvAGlGI{Q*xqQHG=Bm zVDOAY6%FwoeYGc-3>*6j4k9&r4D2gpm`>bUl^>ON=w@%`=tT;cbQz9Uc&-qE18J1D zQXR{Zh(d||I$Xq5Q!h8@!(!$?58LyN01K)%>)_q{j8Kpz+|V5mK=JR9BusRp@F?0> zDGf*}WsIodh6Jc|hX$cY{6r`#5<;HjG30DpB*5_5XjG}MKy~`LWHmSlv2HMcIvi8I z=M~i`4hUfck+CSKh_a&H6EQiC@a55Ws^>~XoqeB*Y(*FNq?DS;fX_8nmZjm)a@ZZ4 zP67Dj{I1k9pfO(yx)rY8>e06mHS+w;NiRoX;-j8mpcZEA55hBb4VnBl{>!$mpPw+44PKQuUQ8? zR3q5;B@{WHvJTgx6taTKIn2)YZ?*{u%c=TVwnzpOK*f{H=qN&aLtBtg_9~_q^9Rm%&!}1T&&Q>CuL@TG__GCe4b3ni~J#-M(v0Vp*nJsP8Q50ld;nqibrlh@prT3&a=Luc?kP_BH`+2u zc}OirvB9+-qG+@6ir=0q_Q`s#(`i|@ z)|2+H0+LCdFBy9WdKINjhF;u-R=Q&QfgDQ)+{dc-FV%b8_%S3J z6A^5s@ZarOpoTliP)wd@dVHB@zGhb}mc=>E{E$&8b?ZxhGw?B%!t8bg*vN7HX}Rh8 z&U9U`XH$XvZpRh88)lE6HmHtQTb-F|Ie9eg4jX7wo6&Z8kIIXf#62 zWsJ4gyEyHS_YiiBVvKqOXYpYHjwjis#L9-t#W&7jHWa^yFd1JAs{{6v4i7%hR1&u` z0y~XG6FQx(D(M&yPC{v((MSNBhm>KHfyIu6)B+Lj2eopqN(y7sb+bHiVF*teOnE>R z6a8(D96N_`NOKyiz17R3fl`>jrFlil{v%n9<$ecv;sjA5P{Gnt5#0_@2PI60RiGF! 
   [... remaining base85 image data for doc/images/logo_APHP.png omitted ...]

literal 0
HcmV?d00001

diff --git a/doc/images/logo_APHP_text.png b/doc/images/logo_APHP_text.png
new file mode 100644
index 0000000000000000000000000000000000000000..1194b92f88ad47e3831a5f5f635bd2501d3166ef
GIT binary patch
literal 30396
   [... base85 image data for doc/images/logo_APHP_text.png omitted ...]
za(x$w+ZkX8=@XBlK(?#{WcWeF0Jdr!Xw^BQY`Lo^txUy?Dxik7Dtz@=dCC|zI&0|^ zB$Tw9@)MXiEevjxh!66o_3OY{a8aDXC<{$llr6f2sIj{*=^*241l^5BD<$e5Y3Je+ zF{h2bi22D1?i+9WTm1Q9ENkmM?D+ieffchVZ$}k@7g=@=r16&)2KIC}Q%d(cY~uPf zd1u3~F}8wOus-yt?HM~7!tCPIQByBYil{MCpYIsenot9T*^20-X5<#kJdJc4`O+CcSO?X(pv6S{xvZ}!QA*$)I#kz#%?0`-Xw8H!>RX|q9pIhz z>RP-P+3xNMvLvs))69P_kwj1=f%ZDyFc$!}%*f)AqQRU)Lpm0Eb7Rb~W=O53~b{IjbXb_$@GQL z>wf8}(={`D4~Fx?T|TA%U5@7Y2*5DhU*IGTo2#TU9%y58pZ22Im_FhhFZn5A9MS*z zO1PXweea!ONvU87@b~9$0KyO2WXOnfG>$l#F1|lXF;iAjAPuZf>ic2OZ;B{t;6PVb zn9*43>ooiY{Jc&=v9lu8q}bN38a*X-EQz$5<V-h{luTocK#lVEjG9Uft!l&yxAP8>#Y68X|G!0osCjBd(qj~b z0l@>s2EI&ySd)F(%6OEMj(RGHbJiwi4n6N8h${ZHVuU2H>LcUdLeE&L{U>FE zSV~E6z)(#8-~F`|w{?n1_1%pMF#17hSGmUQ-6Wf zcJR{P;Dd~9SxPap>diJ=W087||MhQB&z3qEjH7!$;DS0fVxS-WLJ-l5m4oXHnGqf| z9hVE0s@;*JmOkyz|M@)r8$|zxAVd!YX zQ-$(>bP9^GKXkudD`;P77VpSgVgmqRCDn{Uja0-`Wy0Y#5WO)P6_6tnTA_=L%>81= zu%a4a4kL9VmDX9Ly=6ehUtx*pp@PNp_eG_^Y%ioxVN}_Vd0(K57=11E0`GB3XdJ1T zC}Ca){4;zg9m=I^v~(5w-MWM137$;jW`}r$E>!;9 zwd#;*MQcMfUCu_{#WWh~?K~bdUa1^Sq)}k8(ybwkMl2t+8U^HmROf$HcXR204- zj5s*#1ZI_~h4rB!FlWoO+nysm4N?b54NyDK`mPdO<=*0uC9i^o^}$td#AxO7zo7&0 zLN!LTxDZE4sMgNdd7;QQILS zbkIMeb)k!$reK-g)k zL^4V;wu7)A;mdbWj!)^9z96I&#k!K!W0OdEdwCT{bM&0dy=lyj{6qwTU|Zps5#L=f z41QL{?I|wAyW=Yzb|2~zhMb*ynYrsLb_-8G()PUNPNifFZS0AWG`NF>oEB8uuFw<- zn4v>3v(q$Mv#f%-(5V+>ei$G5pdc{Znj13FTcBp}8rA;I6SBrPwGw(JcL7DdOLr7hdZaya#|@Wp1DtS9_!{O{7&n|CV_cG>c;;%_ z^1wLQn!qFMaM~u;+DcM==^Q*syu0QFcY~WpJQNfWb+7cAyaT^$J+F8X@sQH=-jhZd zoj$ZrhVI{Omq{7=oSXCN89RQI1&Y)CSL#HumE_$uOxLkAp&qMiN#-w+!-bvds>6wW-Gpn|lhig$8| zu+9;xgk4RhwdihA-K~d!%=*rTP#AdT`@J^kl+(FSpmEI@64Slrcow3LyHh-LG<0MR+eF;kc2d>6>RUVY3&=;*je-BqUi#;-VytzuSXcCkpzHi)G zq92WkX2vIeI2057DZ@VdmQRC(U;ba1z7LIKC5kdstZ`7*^4Z5~QI1W%tK<1enF=i* zq}pYME#sQgG=k;HUjUDqPSrhS!Vj*QQK)?KV2SF2XTd!3Jpzi&KI_ORREM#y>bv+< z69@ct%xiTZ6LveyCoF99d|KG?rKO{)eTk?$OIN7gX06A3u&67}PUlDT=Q zOSvxWGsT=0iSbrq%N9}ZH1j9Mx|M};T%(^1*OH?31)5G(iZf;P>ti$oOtjk~y(33) zr2bDIu<^#slK_L~@3iyAS|L+-3xR3Z0JmTx6r(f^0siwhbYXKPC+l&@Pj;;sDB2Be zs>Uvb)`suu-un=pAb@ga;{4blX#BA0Vus}mLg-T~6&HF}$>i#~H?u>BAVZ7#?SY5^0ncC7S54B@hhd=UO@-DMa~+mE3~+QRDc{CaJ1}Af}v#TM0q_ z0UtxfnghAXK}pX{te+#sH&bovh7(1z!^p|&eBj#|UV7hR^_kp85AhRWkVBntv-l6R zq89X?+z8<#cO$*tb_BPpIBi0D-OQDH2@O^Ou~~ROy|Z%8wkn8n+r;kMnhmy!BbIJW zi43b(Cyy3&vC6Qfpi)invFPWm5N4RAnN_++4C&(`D_UM_@RbJcPH;~T;x`W2sZ!YJ zGN)pzl#3Wu$(xCvpHp#iw*eDS}Q82V2UbP#(0V ztRp-c*_PvejLw8TlV46&wwh5{HGWUWiK!tut0Q?W+)od#0K^%a_@YB;O@rr}Aa!`W znr?njASvCgM^pPl!Ro7n!_T(}jCNr)>i6XJm@ot$-sNp<+JF-+y^uBby-es7H0MC* zU{s&#+j+!*lcqYPHEYk~P;8-qPBEm7vLRc>mS0(Zq9J^C{j4gYnGFomc?M9@Wr%8Gt^KS~9;Ex&TJHMyK^vL+zMWzQ7{hd>it64l4m?s}~b4uGmjFfNbSZra-1VeA38rP>fID zpyLN%fp19->ay9YzkCwG1fNyQI7O{*d9X zuH%vfhTf>xWy^(y2BK9UFZxER9lcmqV0XTkWM2d@iW$`i?HGFQD~fN+Q!lwQfd})? 
zPx6m>T+#qXTNn@}{ABqx^W&k}?yF<#xgRw=A1cAE1#uAnJ>^>xsvOGB|LpT}pj}1U zRb_D3!q!Qj>fUiZxJYI7vKB`#h)jg0ca>m1D)GyL-*IPbQ7;}qnRg+RtH?*$pAD#W~BFr1w_maQSaK2^1rioPiF~zLpqVEdfeUw04Gw|1*dX zsyw$1atEiw{P^gEDIXKGvLkv?tF;!r{Cf=x@_f`JUBhkYysiQN;9h0!TOUcs64)fv zz9D;KWiN{9Eb6FP@J_IfrQJzr5YA7o#QXKPemG=`+!llOIEvq^RL@T zxA|(2^{|Gf8muUe3Ttt`Wcmj^NfSt7<8j7Zf{$5yxYGxE5_zf+u^33@$XCN zo^?p5VQwXn_$TeB>j32DE}4j%y-BQ$?#-|bg~qha8oir~)buP*m53R7Ba!dM?HX%V zP={t2@o2sy-KYL;S0joN567tX5qx3Qiohg-I+2%ED_frFJQ@|W%!IpshlD{RH+xN( z$$u!9GS*v6wA}1l1|*qif0dmv&Z5ZS9h!#TakSWblr&Th-i8h52j}(JVp4>SyY9l~@-9T}x{K-=QnR(UvA~qye^OML z`NKI?2yY)_$?DQVW=Gq~>c8d>f1ND7WN>;fj2%@7(K%)aPvfR=K!Rq~WS0b^X7mRH z0M^YeK26o@3CU|0;Ay1%=Ivj;RGp@CK zG(sb6x4)I9+)m~;AS25RAz-JVqry#*gEoiKvulFrqx8dr6}jIZV}WcQzK(LEtf>6K zB+spPkncDDYMfQELu2jCDa~cGhYX8p4#;wClm^BfNv=(hja^ylqWzU&Q-5&H+->v- z&1~W`{G(3LC_A6HTXC=Dwe=YXEomFH0 zPd0XhJ>RyjBP!cfcI24tBdsW^ezr9ao5i-qdR8ROi`y@_6hcj^`!5L^&D>}r8|Rd~ z{dQ+9&dWZ`qw=?WF2+xt+h@P16_BK*Bb^+}Q49lL1OKDHg*2^wXV zTTr!Tt>yz7-$o{ZL9rPSt9UI|i_j~Y@=p;D)dmxa-+z9`@xPTh^SY>c`zl%7Xn7t|=*pNinoo9=$LnY3cBB5L&m_LX~O`_Sp$6FQ^#v-c!vBn^Aro5xEG zf6&#mfDDfYY`tJi46L0zTQTPr+s50}BkqINx{qa<21b36{O)vP_OF~%j%&l~puew5qWa^|lvi$zuWoR!@0 zVjz*?)_E7g7e3cD3hVq!%PKw50U(=0P=IL(a^(!(&Z{fNn13Nm_rwoEbWA;;M&Z!a z;JnA2_$y8`iS%u2dW`mX+05-sVM#s`HEkZu&O8pWJWnmuvle7D7wr+ z6Oi!%rOBh6}NeV2#I2&5u?aU>DuD)OuQ_fhZPKP!AyOdOoVb!Qz13TYIkY?9g_N2%H3tRahs`*SeFdXIBQdgL4O{}+)IDXZNcds;OgZ{Y&uPmH@S$FF!K#+y<03j(6CgFTi4)~ z54u4Axp`XL{!lX{ClT_Ahc4$4_SgsdE>lyu$C5DZ`JE_ZOh z^p6C=y+DD1#BG;Y6c!jESODQI<=05;7Jvu=F+cWC>Zr{J_fSX3s@cFx3vcA4WVj9b zXlUJGjraIlFHA{9!ZD&p@b`={-O%uOOhGM8-1v8I0yt5zm%Z&N6Hh2NL)h8=_h4ZsYmfj&Eus_@$@#zMe4$Wu)IrYWBHch zlF^%0a&Aqh3lb2@_bry=ancx%7MqN;5%aKXf?`i%1LO)UeE+Mv=k#fw$WrpIe8R@a z!2Np>A!?XA1Iuw5+^6YE-s7sI2U} z>Iz4eIHIdI`5K>Km=-#))Gv(4h{6k%&w!Y-=R2y)qPP@1Ubyvw5HJUx{voeJ?OzR{ zh8LfOnIdc)h(~kbK3gWYk|*%gFa4>E=Ct^9lwA^|7*>`bm#2W=9jBPZ<2-1;N#4b2 zEj@sKt@Op+xPjR{vO=-YJ^P)rEEAW1xPSH~myFqmYQuG)*TZdgtCLU-ieIFLD9&lr zR?>Kdyi~YWNFs}M42BN7j%KCd{76g(>|N?DIrc>%*6mE4i>t(W1&nT_jY_PE*KIsT z7btaHMo3ardC|{+6tx&G6@FX$u8~j&GDMkNgY7hU7n}#5aDJ<|aoVi?M3T;T%;uQO z*4O%`toN0*vAB${d}*ujf!Ix+SM-I$+czk9*nZA98OVQ^L&P?htxFGLEcmB1O;F8G zj*hd|!;KG~gNE8e~&MF5JeAc7aorIV0koOI#v)nEavaI#b^ zWSKP_1kFKi>lsa%pR+*_FX2X&5{him2nIri=>CQ$q(M>Z@oe$+2Y+n` zv=5px7#;CMO+%3M5?X<_xE{uixaa`PXhV3}a8FZ9Ov23hkK(Ok?w8hIoREnkfl|LG zqr~oleyhKc0FNo6WxSyAG{o1lrMO*p1zElR*;`L;)lP~^;WPP)l(jC{Vac0X16#-K z7+uWVpIz?H@jScgRCMUAVe>MZw2B2_hFKQk!CO}2JfZ^R?_k*-@bbL`slkQ}Y1W%H zBv>jaq<#68vi7NLcogwrGukr^;}WV1U5DC2*G?V+b;FVhcv*K`hVU9Iuj`}S#I@>X znr|sN#qbkvWac0^8zOomA4sv!#6hoioB#;-&taAr1!~7CHIJdl8xnpCBVO+~6nQC% zM>x!9z?0OQeY3+Q!wq)1a317@8|sZ181sv`Tfq&{1DT971XQ1IF^2sCpCOn{GYs6M z?*@IgUP4Gpiiv%v^(nXH!}>$y;&}hstqT7Irlq=Qyxm)5j6#Unaif68=vr9q&u}*b zD(oKY#6(VhsuGIwMVSB)9uzNx@wdyEtTleuCtjP_eh*(OJNrE27@Hom?Up&PRpL?f z$M`4@s_%^>MBGjY>!WDP*ueVhY|NALc|>JiMYDmdUVE6^Xr|rQYrh$)Y|v8xLx`1c)B1$ z*Gz{Kp0DVQ8VXBQ>EbaZRGM?2d|a>If5xL{c7I7??N zC%xG4$0zNg?7%!AAXoJgo+)#IvOj5zmCOASIv8hvDAx@tXc9e&toL1W@BZ z`;`X$aemlz+z~~=m%*ZNw)YblPf5E7%s1q1L?18-2-u70my@4I>|l?bJiG=jo3t+& z5LRslEs${L6b|}kFuBFRlqu=P{7P4LmUOrIjC9#eCQPw!Zjh!s>{R?)`eAb5CRJ19zQDQ`F-2o(_blfv{3Jq_J#YmU~m zy!b}2V==cERMHKDZ7Z)gGhxq{VvpLn^||KwAmCovzCwhhBlut zf~ani8JlbKo$c$+A&jIl ze^&l9VA05`bm015gnOp^X zLPZ@eY|UoZ?w-UVLEv}Jy1#xRSxFJ5#YeA8)GCQ^$>-R5MS@r2;dk6w35N6GHuuDe z7D;f%?u7;Z#>a)kKPjZ;UZ|UEmGf*`|i-w7wJ`CB{Vo`W&5geD)4BuU9K&Tbf zsQi8H&#H`=rlGI>p152KR{@`{9sL`yZl0f@>x~q;NuNdT&Xs7GZL)=QGSL?BcDOIb zTm;1a_GLYO4Ym>+eZh@QtWj(h4Tj(1^V!r>O8BA9{8-C}$m026%VB^0aehpn3JHaX 
zG?DwwpG_qzXz{{SFat_wZ8%S8;r2-F9@Imx22UaHMzU>1(?wvGQoLY4c}eRB1HrPq1St^uqhl4)IH{7%@eBkczi|WJBC$0qpdT^i(+v)!4<#xgn1q(IEP&+D5pNBR zKc+1Y_hcvcm>Jmoofl0a?1n6PCe(x-tZV1FP1082{gGr1DG+IhWK`MRye%75cv2Kj zcTadz#7RHh50G!kdMojVhN91ipYD(O6|AILB#yL_nZ@_7hy)LZBG>-xEhAjwQ~Huw z%v!kc7>+TG>G+$*6|E_fFtXG&RT$lQO6lS`!U#IYaUrfLUR(vpx{O7>bA-w-b)-3Q z*D9}zFk^mRGM9UEROA4chQH>%l@=(4b5oFAEJlL(Hq?(!q1zNj4OUc*lQW<{I|S-G zNPCw!?JeWJALR@}j@1UjSxl6n1Hbjnk$hE;Iuy_+ckw93TjcWrRXAE}7?b(C3I)th zC(U-XXs_KeMvo5_(nn?d#ZpyZuA?1KF{+gCh0Go6iAS%w0As*1&%jTiM)07HQ3=K? z2_0qZz*08$7TUmfOuQCTCZR#Hp@13es1PradM-%cm;SRBoOY}zw8QMvN77-u-t=#V zTDq$#YN@EscxM0N~lHMAKxaNN`iF! z^+;`NQCBIO=DIZMb;ekYkkFhJ=k(|FR1G&(`+GT}<#Axj2LXcT!=;z6;1+6634n?Z zj=iEA+D>Yi{+A97z(9@cH7vSx`>)mFr(i+XgYtT9yu@3R-E$4DGOxSH~*(;-HUPf|%J( zrtUc&^HFr(H!@wt5gN9=lSKEGmpq&~pCa_GrA{8`0N_O9x2xh%;tdCxfz>2l{r1lu z9Yv4yy1|GPbQsZmWY7LLKMG>TtfmXz#!P8Umpv=CswdIer#}2kYQJtAzTgVdqsoVZ zwkdhm%B4{E()#6-9sp+4b4Fv&O?&%9E*+8;+8*`{zHN>8{Yty;?nsXp8e0VHEHB)@ zovUprzM)Ul45ocCA8&G@lBUvn(w8;!9&myM*8p{iYtAA)27IA*+1k*T4~`|yTz%LR z3fniGUM4Ym6M_GBW`n4Pu;ob`M*aeoJn$r^hV5BDc{h$M&u~BEA~ng$2O9T3fBE(> zWl<)wLFNPzy6JlKKI>Z8!T*+Os^eL?5Xv)+XZlD<-;Yv{6ieWO_}RAfk4|+CBlIE# z^N1WFZyCSca6|n_d4h*jkXJR|x$(s+9qt3Mb0mn`y1kvLeJKh-KcJ>4% z)=!33)x8Zpr5YG-@HH0apI9|GiI(*49;F zox7`8^Q=jiW=0*Zwq;TNIeE`n>;uL01#eam4h%u$AspX9w5k3(bH~^3iLvtN?CNU{ z1)mE$as(#>ZNYR@`V)NE$0rS>V zRH3~f5i+01|3My`>G^xywGySP&&AJv)_pJNIu9Fry=YeDZA&~6OJ{V}@`)yD|J-r* z35;ui;GtRq(o7n&_DV8H!kj0sW4OF&_vfE7CWGHZ5Cm9?ppTD+q+il)W&L@U#cv>( zjH|+BDX82LPl^|cv+_2;6e|W5*)X6pzQb03MEmvIn+Wr~GG)7a6uT7wRy#4wGO-&= z=5W_m({mo1qqN{%B=r`XQ`0z*oGkW~v-_6QYj$Z(m9+dgcc+EYIIfL`R%2di9EGU) zG3i8BDs*5 z;-}|J@t)3^ox6le0XHpm3x_h1q}@_LJd!Ib=M-H^yJlf2??SsTO)VnViWZTza=P#8 z%hS6Gn#vwUx?oL0nB5u|Nz%teHRaq;nEKCiN=PaV>6cX4VKHbGG$RBQah&w$!;|_E z5f!P*T)EXk4(xOLh8ZO_S1k)W{*LOWh4$w%a?RNh)xY2IE+FmF;meIoXWUx$Z@tA~ z(aq-F;O3^MmyVZ$c|fsT)^*0=JLYXvGq?MTa$W|oM{x+7 z7sW55^l!-dCA!c(yLIYUdX6y~e1%o6C@9+4=d#BnL;WUd*U~VP`jf}AL>HtM6+y($ zSLG2`#muDWKH#5l+G6jC2PG)AZP^c%Ldt3lJ#55FPLiCsllfh(i;_*)itMo$0@#Ug z*=6+8^0}Hebnv|bwS|UQ3SQUeq-6xXnu4g9UOFc(Grxh}yDP?d7@^@Oki?@maX;7R z>2|n9gZ}~WuaN`QUuUL^jpO{?VWaqtpjmh*PTSKZ#XIsGX0QJq`y+sT*jzk$%Ex*l zu|sl6*8t)YhHMKmmkC@glHeNx$&ffJ#|$#CkGU%uIJ7SMIlCVDB%}@?z20UdB`Cl1 zT={vRHo{WgsSXgzj5uPMeSEFY#VrJ#)kgQ%$TvH(te{3h4v~vm+Z;pv@x9c{(8F(cQ3okshHBtTp%Q8`YZygv+73Yq}ea@e7=s@3JSD zX&S&>JvGTKpK&x{>*Q^VVQ25R3?z9re6>+SOcfTY-t%xKmbV!kcxJ)_s85`~848=@ zIp(ZG7pxbbMj3vPzWIju7SaojcTxJW>?B_4>wLaGM)<^!T zIY&yEO@6cNo~zDB=W(&TyWdlT>T|5UMkh`(K1xwxrhf9?k^+oR%*m#yJ>NghroBbr zBUcR)zm`(ppdZBS7KLsk&80op(e(Z8-4zIH_Wg(xuz^2o%P5L75LGKBlYBdo}rnSFY|ORV&(@;HXC8dig*;JsVr#v5d9 zDh4p@tDLxDp%b#+QpZAkyYM|?^{wyVqg2w%2WJeCOfcK0tuTT`uwqB_1qj7%D&GYR zNZUKf;o;7-=Yk+SFtb7hT#g?M;ELhDlBNB;8gji^TsF`sj*ulrk+#LaXiP`TEra%4 zvvg?#<2@P*b_mP$WQq58qnxDoNu+b82WvAg%@aEr02KTB`?-r8oCSwciR@uYvH-gm z$pIC?Vfd#HIY2;gr-J_&~tYTlh*;UE)R$n zVK6jv^_BQl9>ez)hxj9NaqN^Oj^9k8#}y@!ci?S`Rj!ZH3q&_wsjXe1EfI1&y8fud zsLaaS*K)DQrEkRj(4Xdn8$3|3!%Qmi?bNm`lUAzauU&eBwQrx$0y381x+@KO$WlX^_BIS*L6+S} zjGgb!hhn0EAj{6BC27GJjZADVlGOkUe>1&ad?6+~s2TUh-Z-RgWyecjca~?|6luu; zXmD!LPRS0)fy2#BKH@1>j0ZRI#CH8^()QX}4`%WB!t(+QJ(tML9rRvk?{l*SBa1~Z zrUmXax7(%#(i|utzFbur&6BHK^_`E9z50S?X_(0?gs| z1~db?J3-zr$ZCvJ#1wlyx!>T`gsehSwcL3g=6mJ&49h3c;n$pDUvPVr7q@ILQ7q?3KQF7KO_zlsb8Ylvz1W{B z#}&)1ene%S-r579IdsOs=CQFvYT4In{DbAPDZV%qvDU)h2d4Z&)}q-9JMfVRkl;HD zGFJPI>Q?H$nBBVM6+yb`BbD6{ZS2omX>;upl2%meG zi`IDa_}r0unljIJNX!+{^(kr`dWKK7q<K5U%9wI6tdI>UN&@<=N&(H@bE`7Pze0=Nk!+=nKy;D zSjAw|FQV=!(AgTgpSdE{iDjuUN%ehQQDXC19D zQRiLI+wh!YOBtdDmX}16!G4m#DUc8;_Fm^K+rZXb>X3{;8#_z?`DiJ|TL$dXAT_~o 
zN#BzCeOQSNQw@jTjtA)N=Idc}l8WNh=(FjgG3VxVwimbtO?PHgUO_R3WffkA-XDGx z*%rypUD-HHsB)dfTP6kQ)dXpM1A>#z)9fEZbGi6iFUOyQ=Ff@Mq^v7gbyQd%y2oP? z5zw{Jm?jQ$;#%=xBBJteo73F|EKBDY6;P=^`#!l4V>=xZeEG8FAQ47{IcnaN6%kD8 zkG`t1ZqRv0uf;8PhNqug$kob4=56$=QvlX?D6*-Hf2QK5WRXqD9T)PR_C24o?)(6L zWw`#AQ~szd&RVM9WkS=KcE8P16u4IGf0&MmCC3dnJ(lIJIlWAy7)W4>7!F{M2ZL$0 z3+zM*63dSPv2+w<3yGr`T#Y}s-<PraW;sN5a{%P0n3k+TZ%F zPvx)uyoYsm>Ia@!;Or`MKDFZBY=5UyfTU41wGBD{xKCYQ#pf#=NIWN}nUvz5JJj-D zV@B3%EgEmP5YbV>+?U=#j>yA%F-Sdi;VbOXNBfKeA`2`u2 z8}TX{(Gm^S87I6Wov>>2z5IWNbXoSV0hux+7i^w)JNIsD68~5&PJ;0>iY7~^w=E%3 z$b)9!vp}8o#}uVm1R*U^LDaE6i9d&a&8G89;KJ=r&&jpzG>GsfhaG_g$BEF<|B>TB zegIgEH`oWu;}hva9PJ?gyGXFG#%L$$jDnj_#xAhql0$Xc15H^gL>ap%KF&%YSwkg=~6NKx>;SL``W2)!m>`P0F$CnEL4 zvESIcIMRfi%74)P0+C`p8^w-Vs^X!vXE7opOLlO6khr%Y<}kvx^4qv_8l%X+9)SPb z1P~7N;V35kQ2WH>A?6&b@7#=zKq_dI?2&s?YlaOaT_y>;>CB)*I;d+t3@>CP2Y z+Df4cVTsxHg;Ix6{`bNTK%>w9+klhP@97!5r&9BfKVJ008ixArXv z{ElI>JnO@0r#UfitV2$B03QC|_H8s@fnO#iP;$`jcTaO(hw7mDA`4dSao+z}c~>MW zPbK~GUz5#}yHqUhs>rXq62&4VImNxG!Pex?LO$$Y>;h3?_)jhr^)ec&KFB}vsAOvX z%{n$Y;wJF#R8`^CKxLuFrw>^QreX04^d3!*fx<#Ldj+9l^;kCq9`6;mEGyB8u;rqp zZRa}5|9x#GFMl<<^r6ka!H}ap3)K&8fYyaN;sVfa6*QD(AYhq&;uCW?6oM@iLjN}B!(GwP(H}3BmZC9#+Yv@bP|~VhE~R@+68l* zSZ*2T5}Tn)o*n<6ldV{V@f9NkuRHP#e}~$@2^qFPUVDXa29Ez`L6e7UafIO^|HUpQ gpXlF^B@qZ>OSVV8w(!Tbf6ZxMz0_8zRk8~IA9JQ}%m4rY literal 0 HcmV?d00001 diff --git a/doc/templates/index.html b/doc/templates/index.html index d333530ef8376..c098fc05948af 100644 --- a/doc/templates/index.html +++ b/doc/templates/index.html @@ -245,6 +245,7 @@

Who uses scikit-learn?

+ From 877c6e6db42006445ccf0695c0dde3294ff4dd4a Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 6 Apr 2021 10:37:13 -0400 Subject: [PATCH 294/478] DOC Fixes style for versionadded (#19817) --- doc/themes/scikit-learn-modern/static/css/theme.css | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css index 5fa26391886e0..ed7a86a20fa3b 100644 --- a/doc/themes/scikit-learn-modern/static/css/theme.css +++ b/doc/themes/scikit-learn-modern/static/css/theme.css @@ -787,7 +787,7 @@ div.admonition p.admonition-title + p, div.deprecated p { } div.admonition, div.deprecated, -div.versionchanged, div.versionadded{ +div.versionchanged { margin-top: 0.5rem; padding: 0.5rem; border-radius: 0.5rem; @@ -795,6 +795,10 @@ div.versionchanged, div.versionadded{ border: 1px solid #ddd; } +div.versionadded { + margin: 1rem 0; +} + div.admonition { background-color: #eee; } From 9cfacf1540a991461b91617c779c69753a1ee4c0 Mon Sep 17 00:00:00 2001 From: Maria Telenczuk Date: Tue, 6 Apr 2021 21:33:28 +0200 Subject: [PATCH 295/478] DEP Deprecate 'normalize' in ridge models (#17772) Co-authored-by: Olivier Grisel Co-authored-by: Alexandre Gramfort --- doc/whats_new/v1.0.rst | 12 +++- examples/linear_model/plot_huber_vs_ridge.py | 4 +- sklearn/linear_model/_base.py | 65 ++++++++++++------- sklearn/linear_model/_glm/tests/test_glm.py | 2 + sklearn/linear_model/_ridge.py | 61 ++++++++++++----- sklearn/linear_model/tests/test_base.py | 28 -------- sklearn/linear_model/tests/test_common.py | 59 +++++++++++++++++ .../tests/test_coordinate_descent.py | 23 +++++-- sklearn/linear_model/tests/test_ridge.py | 13 ++-- 9 files changed, 186 insertions(+), 81 deletions(-) create mode 100644 sklearn/linear_model/tests/test_common.py diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index ce683958d913f..96bb2ddfa8f7d 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -193,12 +193,18 @@ Changelog Motivation for this deprecation: ``normalize`` parameter did not take any effect if ``fit_intercept`` was set to False and therefore was deemed confusing. - The behavior of the deprecated LinearRegression(normalize=True) can be + The behavior of the deprecated LinearModel(normalize=True) can be reproduced with :class:`~sklearn.pipeline.Pipeline` with - :class:`~sklearn.preprocessing.StandardScaler`as follows: - make_pipeline(StandardScaler(with_mean=False), LinearRegression()). + :class:`~sklearn.preprocessing.LinearModel` (where LinearModel is + LinearRegression, Ridge, RidgeClassifier, RidgeCV or RidgeClassifierCV) as + follows: + make_pipeline(StandardScaler(with_mean=False), LinearModel()). + LinearRegression was deprecated in: :pr:`17743` by :user:`Maria Telenczuk ` and :user:`Alexandre Gramfort `. + Ridge, RidgeClassifier, RidgeCV or RidgeClassifierCV were deprecated in: + :pr:`17772` by :user:`Maria Telenczuk ` and + :user:`Alexandre Gramfort `. 
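A minimal sketch of the recipe above (an editorial example, not part of the patch), assuming a
generic dense regression problem; ``alpha`` is rescaled by ``n_samples`` and ``sample_weight`` is
routed to every pipeline step, as spelled out in the deprecation message introduced by this patch::

    import numpy as np
    from sklearn.linear_model import Ridge
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    rng = np.random.RandomState(0)
    X, y = rng.randn(100, 3), rng.randn(100)
    sample_weight = rng.rand(100)

    # Stand-in for the deprecated Ridge(alpha=0.5, normalize=True);
    # the alpha value 0.5 is an arbitrary illustration.
    model = make_pipeline(
        StandardScaler(with_mean=False),
        Ridge(alpha=0.5 * X.shape[0]),  # original_alpha * n_samples
    )
    # Route sample_weight to each step of the pipeline.
    kwargs = {name + "__sample_weight": sample_weight
              for name, _ in model.steps}
    model.fit(X, y, **kwargs)

For ``Lasso`` the message instead recommends ``original_alpha * np.sqrt(n_samples)``, and for
``RidgeCV``/``RidgeClassifierCV`` the whole ``alphas`` grid is rescaled by ``n_samples``.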
- |Fix|: `sample_weight` are now fully taken into account in linear models when `normalize=True` for both feature centering and feature diff --git a/examples/linear_model/plot_huber_vs_ridge.py b/examples/linear_model/plot_huber_vs_ridge.py index 63abffe6be4ba..e5f71cc861d88 100644 --- a/examples/linear_model/plot_huber_vs_ridge.py +++ b/examples/linear_model/plot_huber_vs_ridge.py @@ -43,7 +43,7 @@ colors = ['r-', 'b-', 'y-', 'm-'] x = np.linspace(X.min(), X.max(), 7) -epsilon_values = [1.35, 1.5, 1.75, 1.9] +epsilon_values = [1, 1.5, 1.75, 1.9] for k, epsilon in enumerate(epsilon_values): huber = HuberRegressor(alpha=0.0, epsilon=epsilon) huber.fit(X, y) @@ -51,7 +51,7 @@ plt.plot(x, coef_, colors[k], label="huber loss, %s" % epsilon) # Fit a ridge regressor to compare it to huber regressor. -ridge = Ridge(alpha=0.0, random_state=0, normalize=True) +ridge = Ridge(alpha=0.0, random_state=0) ridge.fit(X, y) coef_ridge = ridge.coef_ coef_ = ridge.coef_ * x + ridge.intercept_ diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 1842620dfa105..c80c2db622921 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -101,38 +101,59 @@ def _deprecate_normalize(normalize, default, estimator_name): else: _normalize = normalize + pipeline_msg = ( + "If you wish to scale the data, use Pipeline with a StandardScaler " + "in a preprocessing stage. To reproduce the previous behavior:\n\n" + "from sklearn.pipeline import make_pipeline\n\n" + "model = make_pipeline(StandardScaler(with_mean=False), " + f"{estimator_name}())\n\n" + "If you wish to pass a sample_weight parameter, you need to pass it " + "as a fit parameter to each step of the pipeline as follows:\n\n" + "kwargs = {s[0] + '__sample_weight': sample_weight for s " + "in model.steps}\n" + "model.fit(X, y, **kwargs)\n\n" + ) + + if estimator_name == 'Ridge' or estimator_name == 'RidgeClassifier': + alpha_msg = 'Set parameter alpha to: original_alpha * n_samples. ' + elif 'Lasso' in estimator_name: + alpha_msg = ( + 'Set parameter alpha to: original_alpha * np.sqrt(n_samples). ' + ) + elif 'ElasticNet' in estimator_name: + alpha_msg = ( + 'Set parameter alpha to original_alpha * np.sqrt(n_samples) if ' + 'l1_ratio is 1, and to original_alpha * n_samples if l1_ratio is ' + '0. For other values of l1_ratio, no analytic formula is ' + 'available.' + ) + elif estimator_name == 'RidgeCV' or estimator_name == 'RidgeClassifierCV': + alpha_msg = 'Set parameter alphas to: original_alphas * n_samples. ' + else: + alpha_msg = "" + if default and normalize == 'deprecated': warnings.warn( "The default of 'normalize' will be set to False in version 1.2 " - "and deprecated in version 1.4. 
\nPass normalize=False and use " - "Pipeline with a StandardScaler in a preprocessing stage if you " - "wish to reproduce the previous behavior:\n" - "model = make_pipeline(StandardScaler(with_mean=False), \n" - f"{estimator_name}(normalize=False))\n" - "If you wish to use additional parameters in " - "the fit() you can include them as follows:\n" - "kwargs = {model.steps[-1][0] + " - "'__': }\n" - "model.fit(X, y, **kwargs)", FutureWarning + "and deprecated in version 1.4.\n" + + pipeline_msg + alpha_msg, + FutureWarning ) elif normalize != 'deprecated' and normalize and not default: warnings.warn( "'normalize' was deprecated in version 1.0 and will be " - "removed in 1.2 \nIf you still wish to normalize use " - "Pipeline with a StandardScaler in a preprocessing stage if you " - "wish to reproduce the previous behavior:\n" - "model = make_pipeline(StandardScaler(with_mean=False), " - f"{estimator_name}()). \nIf you wish to use additional " - "parameters in the fit() you can include them as follows: " - "kwargs = {model.steps[-1][0] + " - "'__': }\n" - "model.fit(X, y, **kwargs)", FutureWarning + "removed in 1.2.\n" + + pipeline_msg + alpha_msg, FutureWarning ) elif not normalize and not default: warnings.warn( - "'normalize' was deprecated in version 1.0 and will be" - " removed in 1.2 Don't set 'normalize' parameter" - " and leave it to its default value", FutureWarning + "'normalize' was deprecated in version 1.0 and will be " + "removed in 1.2. " + "Please leave the normalize parameter to its default value to " + "silence this warning. The default behavior of this estimator " + "is to not do any normalization. If normalization is needed " + "please use sklearn.preprocessing.StandardScaler instead.", + FutureWarning ) return _normalize diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index d6fc4e14b12fa..89d388a424492 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -294,6 +294,8 @@ def test_warm_start(fit_intercept): assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4) +# FIXME: 'normalize' to be removed in 1.2 in LinearRegression +@pytest.mark.filterwarnings("ignore:'normalize' was deprecated") @pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) @pytest.mark.parametrize('fit_intercept', [True, False]) @pytest.mark.parametrize('sample_weight', [None, True]) diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 2d360c6edbc58..343bc6a170c9b 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -17,7 +17,8 @@ from scipy import sparse from scipy.sparse import linalg as sp_linalg -from ._base import LinearClassifierMixin, LinearModel, _rescale_data +from ._base import LinearClassifierMixin, LinearModel +from ._base import _deprecate_normalize, _rescale_data from ._sag import sag_solver from ..base import RegressorMixin, MultiOutputMixin, is_classifier from ..utils.extmath import safe_sparse_dot @@ -521,9 +522,9 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', class _BaseRidge(LinearModel, metaclass=ABCMeta): @abstractmethod @_deprecate_positional_args - def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, - copy_X=True, max_iter=None, tol=1e-3, solver="auto", - random_state=None): + def __init__(self, alpha=1.0, *, fit_intercept=True, + normalize='deprecated', copy_X=True, max_iter=None, tol=1e-3, + solver="auto", random_state=None): self.alpha = 
alpha self.fit_intercept = fit_intercept self.normalize = normalize @@ -535,7 +536,11 @@ def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, def fit(self, X, y, sample_weight=None): - # all other solvers work at both float precision levels + self._normalize = _deprecate_normalize( + self.normalize, default=False, + estimator_name=self.__class__.__name__ + ) + _dtype = [np.float64, np.float32] _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) @@ -570,7 +575,7 @@ def fit(self, X, y, sample_weight=None): # when X is sparse we only remove offset from y X, y, X_offset, y_offset, X_scale = self._preprocess_data( - X, y, self.fit_intercept, self.normalize, self.copy_X, + X, y, self.fit_intercept, self._normalize, self.copy_X, sample_weight=sample_weight, return_mean=True) if solver == 'sag' and sparse.issparse(X) and self.fit_intercept: @@ -640,6 +645,10 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. + .. deprecated:: 1.0 + ``normalize`` was deprecated in version 1.0 and + will be removed in 1.2. + copy_X : bool, default=True If True, X will be copied; else, it may be overwritten. @@ -731,9 +740,9 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): Ridge() """ @_deprecate_positional_args - def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, - copy_X=True, max_iter=None, tol=1e-3, solver="auto", - random_state=None): + def __init__(self, alpha=1.0, *, fit_intercept=True, + normalize='deprecated', copy_X=True, max_iter=None, tol=1e-3, + solver="auto", random_state=None): super().__init__( alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X, @@ -794,6 +803,10 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. + .. deprecated:: 1.0 + ``normalize`` was deprecated in version 1.0 and + will be removed in 1.2. + copy_X : bool, default=True If True, X will be copied; else, it may be overwritten. @@ -889,9 +902,10 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): 0.9595... 
""" @_deprecate_positional_args - def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, - copy_X=True, max_iter=None, tol=1e-3, class_weight=None, - solver="auto", random_state=None): + def __init__(self, alpha=1.0, *, fit_intercept=True, + normalize='deprecated', copy_X=True, max_iter=None, + tol=1e-3, class_weight=None, solver="auto", + random_state=None): super().__init__( alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X, max_iter=max_iter, tol=tol, solver=solver, @@ -1115,7 +1129,7 @@ class _RidgeGCV(LinearModel): """ @_deprecate_positional_args def __init__(self, alphas=(0.1, 1.0, 10.0), *, - fit_intercept=True, normalize=False, + fit_intercept=True, normalize='deprecated', scoring=None, copy_X=True, gcv_mode=None, store_cv_values=False, is_clf=False, alpha_per_target=False): @@ -1451,6 +1465,11 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ + _normalize = _deprecate_normalize( + self.normalize, default=False, + estimator_name=self.__class__.__name__ + ) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=[np.float64], multi_output=True, y_numeric=True) @@ -1470,7 +1489,7 @@ def fit(self, X, y, sample_weight=None): "negative or null value instead.".format(self.alphas)) X, y, X_offset, y_offset, X_scale = LinearModel._preprocess_data( - X, y, self.fit_intercept, self.normalize, self.copy_X, + X, y, self.fit_intercept, _normalize, self.copy_X, sample_weight=sample_weight) gcv_mode = _check_gcv_mode(X, self.gcv_mode) @@ -1584,7 +1603,7 @@ def fit(self, X, y, sample_weight=None): class _BaseRidgeCV(LinearModel): @_deprecate_positional_args def __init__(self, alphas=(0.1, 1.0, 10.0), *, - fit_intercept=True, normalize=False, scoring=None, + fit_intercept=True, normalize='deprecated', scoring=None, cv=None, gcv_mode=None, store_cv_values=False, alpha_per_target=False): self.alphas = np.asarray(alphas) @@ -1699,6 +1718,10 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. + .. deprecated:: 1.0 + ``normalize`` was deprecated in version 1.0 and will be removed in + 1.2. + scoring : string, callable, default=None A string (see model evaluation documentation) or a scorer callable object / function with signature @@ -1828,6 +1851,10 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. + .. deprecated:: 1.0 + ``normalize`` was deprecated in version 1.0 and + will be removed in 1.2. 
+ scoring : string, callable, default=None A string (see model evaluation documentation) or a scorer callable object / function with signature @@ -1911,8 +1938,8 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): """ @_deprecate_positional_args def __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, - normalize=False, scoring=None, cv=None, class_weight=None, - store_cv_values=False): + normalize='deprecated', scoring=None, cv=None, + class_weight=None, store_cv_values=False): super().__init__( alphas=alphas, fit_intercept=fit_intercept, normalize=normalize, scoring=scoring, cv=cv, store_cv_values=store_cv_values) diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index bf7a2696fcda2..9fb35b389e33f 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -159,7 +159,6 @@ def test_error_on_wrong_normalize(): error_msg = "Leave 'normalize' to its default" with pytest.raises(ValueError, match=error_msg): _deprecate_normalize(normalize, default, 'estimator') - ValueError @pytest.mark.parametrize('normalize', [True, False, 'deprecated']) @@ -222,33 +221,6 @@ def test_linear_regression_sparse(random_state=0): assert_array_almost_equal(ols.predict(X) - y.ravel(), 0) -@pytest.mark.parametrize( - 'normalize, n_warnings, warning', - [(True, 1, FutureWarning), - (False, 1, FutureWarning), - ("deprecated", 0, None)] -) -# FIXME remove test in 1.4 -def test_linear_regression_normalize_deprecation( - normalize, n_warnings, warning -): - # check that we issue a FutureWarning when normalize was set in - # LinearRegression - rng = check_random_state(0) - n_samples = 200 - n_features = 2 - X = rng.randn(n_samples, n_features) - X[X < 0.1] = 0.0 - y = rng.rand(n_samples) - - model = LinearRegression(normalize=normalize) - with pytest.warns(warning) as record: - model.fit(X, y) - assert len(record) == n_warnings - if n_warnings: - assert "'normalize' was deprecated" in str(record[0].message) - - # FIXME: 'normalize' to be removed in 1.2 in LinearRegression @pytest.mark.filterwarnings("ignore:'normalize' was deprecated") @pytest.mark.parametrize('normalize', [True, False]) diff --git a/sklearn/linear_model/tests/test_common.py b/sklearn/linear_model/tests/test_common.py new file mode 100644 index 0000000000000..96a996d18dac7 --- /dev/null +++ b/sklearn/linear_model/tests/test_common.py @@ -0,0 +1,59 @@ +# Author: Maria Telenczuk +# +# License: BSD 3 clause + +import pytest + +import numpy as np + +from sklearn.base import is_classifier +from sklearn.linear_model import LinearRegression +from sklearn.linear_model import Ridge +from sklearn.linear_model import RidgeCV +from sklearn.linear_model import RidgeClassifier +from sklearn.linear_model import RidgeClassifierCV + +from sklearn.utils import check_random_state + + +@pytest.mark.parametrize( + 'normalize, n_warnings, warning_category', + [(True, 1, FutureWarning), + (False, 1, FutureWarning), + ("deprecated", 0, None)] +) +@pytest.mark.parametrize( + "estimator", + [LinearRegression, Ridge, RidgeCV, RidgeClassifier, RidgeClassifierCV] +) +# FIXME remove test in 1.2 +def test_linear_model_normalize_deprecation_message( + estimator, + normalize, n_warnings, warning_category +): + # check that we issue a FutureWarning when normalize was set in + # linear model + rng = check_random_state(0) + n_samples = 200 + n_features = 2 + X = rng.randn(n_samples, n_features) + X[X < 0.1] = 0.0 + y = rng.rand(n_samples) + if is_classifier(estimator): + y = 
np.sign(y) + + model = estimator(normalize=normalize) + with pytest.warns(warning_category) as record: + model.fit(X, y) + # Filter record in case other unrelated warnings are raised + unwanted = [r for r in record if r.category != warning_category] + if len(unwanted): + msg = "unexpected warnings:\n" + for w in unwanted: + msg += str(w) + msg += "\n" + raise AssertionError(msg) + wanted = [r for r in record if r.category == warning_category] + if warning_category is not None: + assert "'normalize' was deprecated" in str(wanted[0].message) + assert len(wanted) == n_warnings diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index d63211d6050bc..8a269f28ebd62 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -17,11 +17,12 @@ from sklearn.preprocessing import StandardScaler from sklearn.exceptions import ConvergenceWarning from sklearn.utils._testing import assert_allclose -from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_almost_equal -from sklearn.utils._testing import ignore_warnings +from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal +from sklearn.utils._testing import ignore_warnings from sklearn.utils._testing import _convert_container + from sklearn.utils._testing import TempMemmap from sklearn.utils.fixes import parse_version from sklearn.utils.sparsefuncs import mean_variance_axis @@ -48,6 +49,7 @@ OrthogonalMatchingPursuit, Ridge, RidgeClassifier, + RidgeClassifierCV, RidgeCV, ) @@ -303,9 +305,13 @@ def _scale_alpha_inplace(estimator, n_samples): normalize set to True to when it is evoked in a Pipeline with normalize set to False and with a StandardScaler. 
""" - if 'alpha' not in estimator.get_params(): + if (('alpha' not in estimator.get_params()) and + ('alphas' not in estimator.get_params())): return + if isinstance(estimator, (RidgeCV, RidgeClassifierCV)): + alphas = estimator.alphas * n_samples + return estimator.set_params(alphas=alphas) if isinstance(estimator, (Lasso, LassoLars, MultiTaskLasso)): alpha = estimator.alpha * np.sqrt(n_samples) if isinstance(estimator, (Ridge, RidgeClassifier)): @@ -342,7 +348,9 @@ def _scale_alpha_inplace(estimator, n_samples): (MultiTaskLasso, {"tol": 1e-16, "alpha": 0.1}), (Lars, {}), (LinearRegression, {}), - (LassoLarsIC, {})] + (LassoLarsIC, {}), + (RidgeCV, {"alphas": [0.1, 0.4]}), + (RidgeClassifierCV, {"alphas": [0.1, 0.4]})] ) def test_model_pipeline_same_as_normalize_true(LinearModel, params): # Test that linear models (LinearModel) set with normalize set to True are @@ -404,6 +412,8 @@ def test_model_pipeline_same_as_normalize_true(LinearModel, params): (ElasticNet, {"tol": 1e-16, 'l1_ratio': 0, "alpha": 0.1}), (Ridge, {"solver": 'sparse_cg', 'tol': 1e-12, "alpha": 0.1}), (LinearRegression, {}), + (RidgeCV, {"alphas": [0.1, 0.4]}), + (RidgeClassifierCV, {"alphas": [0.1, 0.4]}) ] ) @pytest.mark.parametrize( @@ -494,7 +504,8 @@ def test_linear_model_sample_weights_normalize_in_pipeline( (ElasticNet, {"tol": 1e-16, 'l1_ratio': 0, "alpha": 0.01}), (Ridge, {"solver": 'sparse_cg', 'tol': 1e-12, "alpha": 0.1}), (LinearRegression, {}), - (RidgeCV, {})] + (RidgeCV, {}), + (RidgeClassifierCV, {})] ) def test_model_pipeline_same_dense_and_sparse(LinearModel, params): # Test that linear model preceeded by StandardScaler in the pipeline and @@ -1421,6 +1432,8 @@ def test_enet_sample_weight_does_not_overwrite_sample_weight(check_input): assert_array_equal(sample_weight, sample_weight_1_25) +# FIXME: 'normalize' to be removed in 1.2 +@pytest.mark.filterwarnings("ignore:'normalize' was deprecated") @pytest.mark.parametrize("ridge_alpha", [1e-1, 1., 1e6]) @pytest.mark.parametrize("normalize", [True, False]) def test_enet_ridge_consistency(normalize, ridge_alpha): diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index 01839fe0ba457..b812788239b14 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -409,6 +409,8 @@ def _make_sparse_offset_regression( return X, y +# FIXME: 'normalize' to be removed in 1.2 +@pytest.mark.filterwarnings("ignore:'normalize' was deprecated") @pytest.mark.parametrize( 'solver, sparse_X', ((solver, sparse_X) for @@ -452,6 +454,8 @@ def test_solver_consistency( ridge.intercept_, svd_ridge.intercept_, atol=1e-3, rtol=1e-3) +# FIXME: 'normalize' to be removed in 1.2 +@pytest.mark.filterwarnings("ignore:'normalize' was deprecated") @pytest.mark.parametrize('gcv_mode', ['svd', 'eigen']) @pytest.mark.parametrize('X_constructor', [np.asarray, sp.csr_matrix]) @pytest.mark.parametrize('X_shape', [(11, 8), (11, 20)]) @@ -504,12 +508,10 @@ def test_ridge_loo_cv_asym_scoring(): alphas = [1e-3, .1, 1., 10., 1e3] loo_ridge = RidgeCV(cv=n_samples, fit_intercept=True, - alphas=alphas, scoring=scoring, - normalize=True) + alphas=alphas, scoring=scoring) gcv_ridge = RidgeCV(fit_intercept=True, - alphas=alphas, scoring=scoring, - normalize=True) + alphas=alphas, scoring=scoring) loo_ridge.fit(X, y) gcv_ridge.fit(X, y) @@ -658,6 +660,7 @@ def func(x, y): return ret +# FIXME: 'normalize' to be removed in 1.2 def _test_ridge_cv_normalize(filter_): ridge_cv = RidgeCV(normalize=True, cv=3) 
ridge_cv.fit(filter_(10. * X_diabetes), y_diabetes) @@ -871,6 +874,8 @@ def check_dense_sparse(test_func): assert_array_almost_equal(ret_dense, ret_sparse, decimal=3) +# FIXME: 'normalize' to be removed in 1.2 +@pytest.mark.filterwarnings("ignore:'normalize' was deprecated") @pytest.mark.parametrize( 'test_func', (_test_ridge_loo, _test_ridge_cv, _test_ridge_cv_normalize, From 3d7fbda709230f9f733978f8608c64820162baa3 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Wed, 7 Apr 2021 21:13:17 +0200 Subject: [PATCH 296/478] CI Add a check for milestones. (#19833) --- .github/workflows/check-milestone.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .github/workflows/check-milestone.yml diff --git a/.github/workflows/check-milestone.yml b/.github/workflows/check-milestone.yml new file mode 100644 index 0000000000000..6b71a62dd5740 --- /dev/null +++ b/.github/workflows/check-milestone.yml @@ -0,0 +1,21 @@ +name: Check Milestone (when failing needs Triage intervention) +# This check makes sure that the milestone is properly set. +# To bypass this check, label the PR with "Long Term". +on: + pull_request: + types: [opened, edited, labeled, unlabeled, synchronize] + +jobs: + check: + runs-on: ubuntu-latest + if: ${{ contains(github.event.pull_request.labels.*.name, 'Long Term') == 0 }} + steps: + - name: Check the milestone + run: | + set -xe + if [ ${{ github.event.pull_request.milestone.title }} == "" ] + then + echo "No milestone has been set." + exit 1 + fi + From 36c635b77f9744b627248f96f15f3e73e97d3571 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Wed, 7 Apr 2021 23:47:41 +0200 Subject: [PATCH 297/478] CI Fix string comparison in milestone workflow (#19840) --- .github/workflows/check-milestone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check-milestone.yml b/.github/workflows/check-milestone.yml index 6b71a62dd5740..8ed3ac4ef0b8d 100644 --- a/.github/workflows/check-milestone.yml +++ b/.github/workflows/check-milestone.yml @@ -13,7 +13,7 @@ jobs: - name: Check the milestone run: | set -xe - if [ ${{ github.event.pull_request.milestone.title }} == "" ] + if [ "${{ github.event.pull_request.milestone.title }}" == "" ] then echo "No milestone has been set." exit 1 From 4b53fc3f67fa6d7966bd51db7c9d754cd187d48f Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 8 Apr 2021 03:34:33 -0400 Subject: [PATCH 298/478] CI Removes check milestone (#19843) --- .github/workflows/check-milestone.yml | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 .github/workflows/check-milestone.yml diff --git a/.github/workflows/check-milestone.yml b/.github/workflows/check-milestone.yml deleted file mode 100644 index 8ed3ac4ef0b8d..0000000000000 --- a/.github/workflows/check-milestone.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: Check Milestone (when failing needs Triage intervention) -# This check makes sure that the milestone is properly set. -# To bypass this check, label the PR with "Long Term". -on: - pull_request: - types: [opened, edited, labeled, unlabeled, synchronize] - -jobs: - check: - runs-on: ubuntu-latest - if: ${{ contains(github.event.pull_request.labels.*.name, 'Long Term') == 0 }} - steps: - - name: Check the milestone - run: | - set -xe - if [ "${{ github.event.pull_request.milestone.title }}" == "" ] - then - echo "No milestone has been set." - exit 1 - fi - From 246795f214ec31874aa1d1e89c90c7007ab60642 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Thu, 8 Apr 2021 10:01:49 -0400 Subject: [PATCH 299/478] TST Fixes test_partial_fit_oneclass (#19814) --- sklearn/linear_model/tests/test_sgd.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index f943592c02005..8465631828613 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -1499,12 +1499,11 @@ def test_partial_fit_oneclass(klass): assert clf.coef_.shape == (X.shape[1], ) assert clf.offset_.shape == (1,) assert clf.predict([[0, 0]]).shape == (1, ) - id1 = id(clf.coef_.data) + previous_coefs = clf.coef_ clf.partial_fit(X[third:]) - id2 = id(clf.coef_.data) # check that coef_ haven't been re-allocated - assert id1 == id2 + assert clf.coef_ is previous_coefs # raises ValueError if number of features does not match previous data with pytest.raises(ValueError): From dff37c4a33ecca991ab72590211384bac260d5c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Sad=C5=82ocha?= Date: Thu, 8 Apr 2021 17:13:49 +0200 Subject: [PATCH 300/478] DOC Fix incorrect 0-1 scaling in the RBM example (#19363) --- examples/neural_networks/plot_rbm_logistic_classification.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/neural_networks/plot_rbm_logistic_classification.py b/examples/neural_networks/plot_rbm_logistic_classification.py index 6994d3850f2f5..f7ad3513499ca 100644 --- a/examples/neural_networks/plot_rbm_logistic_classification.py +++ b/examples/neural_networks/plot_rbm_logistic_classification.py @@ -37,6 +37,7 @@ from sklearn.model_selection import train_test_split from sklearn.neural_network import BernoulliRBM from sklearn.pipeline import Pipeline +from sklearn.preprocessing import minmax_scale from sklearn.base import clone @@ -79,7 +80,7 @@ def shift(x, w): X, y = datasets.load_digits(return_X_y=True) X = np.asarray(X, 'float32') X, Y = nudge_dataset(X, y) -X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001) # 0-1 scaling +X = minmax_scale(X, feature_range=(0, 1)) # 0-1 scaling X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.2, random_state=0) From 1ce17151bcd9bafadd94524ce3acd52c4b665696 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Thu, 8 Apr 2021 17:31:18 +0200 Subject: [PATCH 301/478] TST Add a test for meta-estimators with non tabular data (#19755) Co-authored-by: Olivier Grisel --- sklearn/tests/test_metaestimators.py | 119 ++++++++++++++++++++++++++- 1 file changed, 117 insertions(+), 2 deletions(-) diff --git a/sklearn/tests/test_metaestimators.py b/sklearn/tests/test_metaestimators.py index 2caa01d71c444..ad716c3e4cd2f 100644 --- a/sklearn/tests/test_metaestimators.py +++ b/sklearn/tests/test_metaestimators.py @@ -1,19 +1,26 @@ """Common tests for metaestimators""" import functools +from inspect import signature import numpy as np import pytest from sklearn.base import BaseEstimator +from sklearn.base import is_regressor from sklearn.datasets import make_classification - +from sklearn.utils import all_estimators +from sklearn.utils.estimator_checks import _enforce_estimator_tags_x +from sklearn.utils.estimator_checks import _enforce_estimator_tags_y from sklearn.utils.validation import check_is_fitted -from sklearn.pipeline import Pipeline +from sklearn.utils._testing import set_random_state +from sklearn.pipeline import Pipeline, make_pipeline from sklearn.model_selection import GridSearchCV, 
RandomizedSearchCV +from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_selection import RFE, RFECV from sklearn.ensemble import BaggingClassifier from sklearn.exceptions import NotFittedError from sklearn.semi_supervised import SelfTrainingClassifier +from sklearn.linear_model import Ridge, LogisticRegression class DelegatorData: @@ -151,3 +158,111 @@ def score(self, X, y, *args, **kwargs): assert not hasattr(delegator, method), ( "%s has method %r when its delegate does not" % (delegator_data.name, method)) + + +def _generate_meta_estimator_instances_with_pipeline(): + """Generate instances of meta-estimators fed with a pipeline + + Are considered meta-estimators all estimators accepting one of "estimator", + "base_estimator" or "estimators". + """ + for _, Estimator in sorted(all_estimators()): + sig = set(signature(Estimator).parameters) + + if "estimator" in sig or "base_estimator" in sig: + if is_regressor(Estimator): + estimator = make_pipeline(TfidfVectorizer(), Ridge()) + param_grid = {"ridge__alpha": [0.1, 1.0]} + else: + estimator = make_pipeline(TfidfVectorizer(), + LogisticRegression()) + param_grid = {"logisticregression__C": [0.1, 1.0]} + + if "param_grid" in sig or "param_distributions" in sig: + # SearchCV estimators + extra_params = {"n_iter": 2} if "n_iter" in sig else {} + yield Estimator(estimator, param_grid, **extra_params) + else: + yield Estimator(estimator) + + elif "estimators" in sig: + # stacking, voting + if is_regressor(Estimator): + estimator = [ + ("est1", make_pipeline(TfidfVectorizer(), + Ridge(alpha=0.1))), + ("est2", make_pipeline(TfidfVectorizer(), + Ridge(alpha=1))), + ] + else: + estimator = [ + ("est1", make_pipeline(TfidfVectorizer(), + LogisticRegression(C=0.1))), + ("est2", make_pipeline(TfidfVectorizer(), + LogisticRegression(C=1))), + ] + yield Estimator(estimator) + + else: + continue + + +# TODO: remove data validation for the following estimators +# They should be able to work on any data and delegate data validation to +# their inner estimator(s). +DATA_VALIDATION_META_ESTIMATORS_TO_IGNORE = [ + "AdaBoostClassifier", + "AdaBoostRegressor", + "BaggingClassifier", + "BaggingRegressor", + "ClassifierChain", + "IterativeImputer", + "MultiOutputClassifier", + "MultiOutputRegressor", + "OneVsOneClassifier", + "OutputCodeClassifier", + "RANSACRegressor", + "RFE", + "RFECV", + "RegressorChain", + "SelfTrainingClassifier", + "SequentialFeatureSelector" # not applicable (2D data mandatory) +] + +DATA_VALIDATION_META_ESTIMATORS = [ + est for est in _generate_meta_estimator_instances_with_pipeline() if + est.__class__.__name__ not in DATA_VALIDATION_META_ESTIMATORS_TO_IGNORE +] + + +def _get_meta_estimator_id(estimator): + return estimator.__class__.__name__ + + +@pytest.mark.parametrize( + "estimator", DATA_VALIDATION_META_ESTIMATORS, ids=_get_meta_estimator_id +) +def test_meta_estimators_delegate_data_validation(estimator): + # Check that meta-estimators delegate data validation to the inner + # estimator(s). 
+ rng = np.random.RandomState(0) + set_random_state(estimator) + + n_samples = 30 + X = rng.choice(np.array(["aa", "bb", "cc"], dtype=object), size=n_samples) + + if is_regressor(estimator): + y = rng.normal(size=n_samples) + else: + y = rng.randint(3, size=n_samples) + + X = _enforce_estimator_tags_x(estimator, X) + y = _enforce_estimator_tags_y(estimator, y) + + # Calling fit should not raise any data validation exception since X is a + # valid input datastructure for the first step of the pipeline passed as + # base estimator to the meta estimator. + estimator.fit(X, y) + + # n_features_in_ should not be defined since data is not tabular data. + assert not hasattr(estimator, "n_features_in_") From ee524f455dbf0285f7b121a08f1e9613a518abcf Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 8 Apr 2021 20:27:37 +0200 Subject: [PATCH 302/478] ENH Improve the creation of KDTree and BallTree on their worst-case time complexity (#19473) Co-authored-by: jiefangxuanyan <505745416@qq.com> Co-authored-by: "Thomas J. Fan" --- doc/whats_new/v1.0.rst | 10 ++ sklearn/neighbors/_binary_tree.pxi | 69 +------------- sklearn/neighbors/_partition_nodes.pxd | 9 ++ sklearn/neighbors/_partition_nodes.pyx | 122 +++++++++++++++++++++++++ sklearn/neighbors/setup.py | 6 ++ 5 files changed, 149 insertions(+), 67 deletions(-) create mode 100644 sklearn/neighbors/_partition_nodes.pxd create mode 100644 sklearn/neighbors/_partition_nodes.pyx diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 96bb2ddfa8f7d..ce7da3139d140 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -278,6 +278,16 @@ Changelog Use ``var_`` instead. :pr:`18842` by :user:`Hong Shao Yang `. + +:mod:`sklearn.neighbors` +.......................... + +- |Enhancement| The creation of :class:`neighbors.KDTree` and + :class:`neighbors.BallTree` has been improved for their worst-cases time + complexity from :math:`\mathcal{O}(n^2)` to :math:`\mathcal{O}(n)`. + :pr:`19473` by :user:`jiefangxuanyan ` and + :user:`Julien Jerphanion `. + :mod:`sklearn.pipeline` ....................... diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index 1acff082c7d76..cabad951c4975 100755 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -159,6 +159,8 @@ from ._typedefs import DTYPE, ITYPE from ._dist_metrics cimport (DistanceMetric, euclidean_dist, euclidean_rdist, euclidean_dist_to_rdist, euclidean_rdist_to_dist) +from ._partition_nodes cimport partition_node_indices + cdef extern from "numpy/arrayobject.h": void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) @@ -776,73 +778,6 @@ cdef ITYPE_t find_node_split_dim(DTYPE_t* data, return j_max -cdef int partition_node_indices(DTYPE_t* data, - ITYPE_t* node_indices, - ITYPE_t split_dim, - ITYPE_t split_index, - ITYPE_t n_features, - ITYPE_t n_points) except -1: - """Partition points in the node into two equal-sized groups. - - Upon return, the values in node_indices will be rearranged such that - (assuming numpy-style indexing): - - data[node_indices[0:split_index], split_dim] - <= data[node_indices[split_index], split_dim] - - and - - data[node_indices[split_index], split_dim] - <= data[node_indices[split_index:n_points], split_dim] - - The algorithm is essentially a partial in-place quicksort around a - set pivot. - - Parameters - ---------- - data : double pointer - Pointer to a 2D array of the training data, of shape [N, n_features]. - N must be greater than any of the values in node_indices. 
- node_indices : int pointer - Pointer to a 1D array of length n_points. This lists the indices of - each of the points within the current node. This will be modified - in-place. - split_dim : int - the dimension on which to split. This will usually be computed via - the routine ``find_node_split_dim`` - split_index : int - the index within node_indices around which to split the points. - - Returns - ------- - status : int - integer exit status. On return, the contents of node_indices are - modified as noted above. - """ - cdef ITYPE_t left, right, midindex, i - cdef DTYPE_t d1, d2 - left = 0 - right = n_points - 1 - - while True: - midindex = left - for i in range(left, right): - d1 = data[node_indices[i] * n_features + split_dim] - d2 = data[node_indices[right] * n_features + split_dim] - if d1 < d2: - swap(node_indices, i, midindex) - midindex += 1 - swap(node_indices, midindex, right) - if midindex == split_index: - break - elif midindex < split_index: - left = midindex + 1 - else: - right = midindex - 1 - - return 0 - - ###################################################################### # NodeHeap : min-heap used to keep track of nodes during # breadth-first query diff --git a/sklearn/neighbors/_partition_nodes.pxd b/sklearn/neighbors/_partition_nodes.pxd new file mode 100644 index 0000000000000..522e826632824 --- /dev/null +++ b/sklearn/neighbors/_partition_nodes.pxd @@ -0,0 +1,9 @@ +from ._typedefs cimport DTYPE_t, ITYPE_t + +cdef int partition_node_indices( + DTYPE_t *data, + ITYPE_t *node_indices, + ITYPE_t split_dim, + ITYPE_t split_index, + ITYPE_t n_features, + ITYPE_t n_points) except -1 diff --git a/sklearn/neighbors/_partition_nodes.pyx b/sklearn/neighbors/_partition_nodes.pyx new file mode 100644 index 0000000000000..508e9560ae8c2 --- /dev/null +++ b/sklearn/neighbors/_partition_nodes.pyx @@ -0,0 +1,122 @@ +# distutils : language = c++ + +# BinaryTrees rely on partial sorts to partition their nodes during their +# initialisation. +# +# The C++ std library exposes nth_element, an efficient partial sort for this +# situation which has a linear time complexity as well as the best performances. +# +# To use std::algorithm::nth_element, a few fixture are defined using Cython: +# - partition_node_indices, a Cython function used in BinaryTrees, that calls +# - partition_node_indices_inner, a C++ function that wraps nth_element and uses +# - an IndexComparator to state how to compare KDTrees' indices +# +# IndexComparator has been defined so that partial sorts are stable with +# respect to the nodes initial indices. +# +# See for reference: +# - https://en.cppreference.com/w/cpp/algorithm/nth_element. +# - https://github.com/scikit-learn/scikit-learn/pull/11103 +# - https://github.com/scikit-learn/scikit-learn/pull/19473 + +cdef extern from *: + """ + #include + + template + class IndexComparator { + private: + const D *data; + I split_dim, n_features; + public: + IndexComparator(const D *data, const I &split_dim, const I &n_features): + data(data), split_dim(split_dim), n_features(n_features) {} + + bool operator()(const I &a, const I &b) const { + D a_value = data[a * n_features + split_dim]; + D b_value = data[b * n_features + split_dim]; + return a_value == b_value ? 
a < b : a_value < b_value; + } + }; + + template + void partition_node_indices_inner( + const D *data, + I *node_indices, + const I &split_dim, + const I &split_index, + const I &n_features, + const I &n_points) { + IndexComparator index_comparator(data, split_dim, n_features); + std::nth_element( + node_indices, + node_indices + split_index, + node_indices + n_points, + index_comparator); + } + """ + void partition_node_indices_inner[D, I]( + D *data, + I *node_indices, + I split_dim, + I split_index, + I n_features, + I n_points) except + + + +cdef int partition_node_indices( + DTYPE_t *data, + ITYPE_t *node_indices, + ITYPE_t split_dim, + ITYPE_t split_index, + ITYPE_t n_features, + ITYPE_t n_points) except -1: + """Partition points in the node into two equal-sized groups. + + Upon return, the values in node_indices will be rearranged such that + (assuming numpy-style indexing): + + data[node_indices[0:split_index], split_dim] + <= data[node_indices[split_index], split_dim] + + and + + data[node_indices[split_index], split_dim] + <= data[node_indices[split_index:n_points], split_dim] + + The algorithm is essentially a partial in-place quicksort around a + set pivot. + + Parameters + ---------- + data : double pointer + Pointer to a 2D array of the training data, of shape [N, n_features]. + N must be greater than any of the values in node_indices. + node_indices : int pointer + Pointer to a 1D array of length n_points. This lists the indices of + each of the points within the current node. This will be modified + in-place. + split_dim : int + the dimension on which to split. This will usually be computed via + the routine ``find_node_split_dim``. + split_index : int + the index within node_indices around which to split the points. + n_features: int + the number of features (i.e columns) in the 2D array pointed by data. + n_points : int + the length of node_indices. This is also the number of points in + the original dataset. + Returns + ------- + status : int + integer exit status. On return, the contents of node_indices are + modified as noted above. 
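For intuition, the invariant documented above can be reproduced in plain
NumPy, with np.argpartition standing in for std::nth_element on a single
split dimension (an illustrative sketch only, not part of the Cython code;
the array shapes and the split point are arbitrary):

import numpy as np

rng = np.random.RandomState(0)
data = rng.random_sample((30, 2))
node_indices = np.arange(30)
split_dim, split_index = 1, 15

# argpartition places the split_index-th value in its sorted position;
# everything before it is <= the pivot, everything after is >= the pivot.
order = np.argpartition(data[node_indices, split_dim], split_index)
node_indices = node_indices[order]
pivot = data[node_indices[split_index], split_dim]
assert np.all(data[node_indices[:split_index], split_dim] <= pivot)
assert np.all(data[node_indices[split_index:], split_dim] >= pivot)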
+ """ + partition_node_indices_inner( + data, + node_indices, + split_dim, + split_index, + n_features, + n_points) + return 0 diff --git a/sklearn/neighbors/setup.py b/sklearn/neighbors/setup.py index 9264044678193..996b855d2d45a 100644 --- a/sklearn/neighbors/setup.py +++ b/sklearn/neighbors/setup.py @@ -20,6 +20,12 @@ def configuration(parent_package='', top_path=None): include_dirs=[numpy.get_include()], libraries=libraries) + config.add_extension('_partition_nodes', + sources=['_partition_nodes.pyx'], + include_dirs=[numpy.get_include()], + language="c++", + libraries=libraries) + config.add_extension('_dist_metrics', sources=['_dist_metrics.pyx'], include_dirs=[numpy.get_include(), From 132627e28b5be807b1e4b7d58bedf42b529d7800 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 8 Apr 2021 23:21:20 +0200 Subject: [PATCH 303/478] FIX Let ColumnTransformer.get_feature_names handle transformers with non-string feature names (#18459) Co-authored-by: Alonso Silva Allende Co-authored-by: Roman Yurchak --- doc/whats_new/v1.0.rst | 5 ++++ sklearn/compose/_column_transformer.py | 2 +- .../compose/tests/test_column_transformer.py | 28 ++++++++++++------- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index ce7da3139d140..602d4b1246878 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -105,6 +105,11 @@ Changelog of each transformer in `output_indices_`. :pr:`18393` by :user:`Luca Bittarello `. +- |FIX| :meth:`compose.ColumnTransformer.get_feature_names` supports + non-string feature names returned by any of its transformers. + :pr:`18459` by :user:`Albert Villanova del Moral ` and + :user:`Alonso Silva Allende `. + :mod:`sklearn.datasets` ....................... diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 5006663331a40..2f2da882652c0 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -376,7 +376,7 @@ def get_feature_names(self): raise AttributeError("Transformer %s (type %s) does not " "provide get_feature_names." 
% (str(name), type(trans).__name__)) - feature_names.extend([name + "__" + f for f in + feature_names.extend([f"{name}__{f}" for f in trans.get_feature_names()]) return feature_names diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index f7c1874d4a1b7..549292ab51445 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -748,7 +748,7 @@ def test_column_transformer_cloning(): assert hasattr(ct.transformers_[0][1], 'mean_') -def test_column_transformer_get_feature_names(): +def test_column_transformer_get_feature_names_raises(): X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T ct = ColumnTransformer([('trans', Trans(), [0, 1])]) # raise correct error when not fitted @@ -756,23 +756,30 @@ def test_column_transformer_get_feature_names(): ct.get_feature_names() # raise correct error when no feature names are available ct.fit(X_array) - assert_raise_message(AttributeError, - "Transformer trans (type Trans) does not provide " - "get_feature_names", ct.get_feature_names) + msg = r"Transformer trans \(type Trans\) does not provide " \ + r"get_feature_names" + with pytest.raises(AttributeError, match=msg): + ct.get_feature_names() - # working example - X = np.array([[{'a': 1, 'b': 2}, {'a': 3, 'b': 4}], - [{'c': 5}, {'c': 6}]], dtype=object).T + +@pytest.mark.parametrize("X, keys", [ + (np.array([[{'a': 1, 'b': 2}, {'a': 3, 'b': 4}], + [{'c': 5}, {'c': 6}]], dtype=object).T, ('a', 'b', 'c')), + (np.array([[{1: 1, 2: 2}, {1: 3, 2: 4}], + [{3: 5}, {3: 6}]], dtype=object).T, ('1', '2', '3')), +]) +def test_column_transformer_get_feature_names(X, keys): ct = ColumnTransformer( [('col' + str(i), DictVectorizer(), i) for i in range(2)]) ct.fit(X) - assert ct.get_feature_names() == ['col0__a', 'col0__b', 'col1__c'] + assert ct.get_feature_names() == [f'col0__{key}' for key in keys[:2]] + \ + [f'col1__{keys[2]}'] # drop transformer ct = ColumnTransformer( [('col0', DictVectorizer(), 0), ('col1', 'drop', 1)]) ct.fit(X) - assert ct.get_feature_names() == ['col0__a', 'col0__b'] + assert ct.get_feature_names() == [f'col0__{key}' for key in keys[:2]] # passthrough transformer ct = ColumnTransformer([('trans', 'passthrough', [0, 1])]) @@ -782,7 +789,8 @@ def test_column_transformer_get_feature_names(): ct = ColumnTransformer([('trans', DictVectorizer(), 0)], remainder='passthrough') ct.fit(X) - assert ct.get_feature_names() == ['trans__a', 'trans__b', 'x1'] + assert ct.get_feature_names() == [f'trans__{key}' for key in keys[:2]] + \ + ['x1'] ct = ColumnTransformer([('trans', 'passthrough', [1])], remainder='passthrough') From 1f91b873e420fcfb5f1d84b821d27ab54bd76144 Mon Sep 17 00:00:00 2001 From: qdeffense Date: Fri, 9 Apr 2021 00:17:49 +0200 Subject: [PATCH 304/478] TST Remove redundant max iter in sklearn/linear_model/tests (#14622) Co-authored-by: Thomas J. 
Fan --- sklearn/linear_model/tests/test_coordinate_descent.py | 10 +++++----- sklearn/linear_model/tests/test_huber.py | 5 ++--- sklearn/linear_model/tests/test_logistic.py | 10 +++++----- sklearn/linear_model/tests/test_passive_aggressive.py | 6 +++--- .../tests/test_sparse_coordinate_descent.py | 4 ++-- 5 files changed, 17 insertions(+), 18 deletions(-) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index 8a269f28ebd62..830cf32139b08 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -668,11 +668,11 @@ def test_lasso_positive_constraint(): X = [[-1], [0], [1]] y = [1, 0, -1] # just a straight line with negative slope - lasso = Lasso(alpha=0.1, max_iter=1000, positive=True) + lasso = Lasso(alpha=0.1, positive=True) lasso.fit(X, y) assert min(lasso.coef_) >= 0 - lasso = Lasso(alpha=0.1, max_iter=1000, precompute=True, positive=True) + lasso = Lasso(alpha=0.1, precompute=True, positive=True) lasso.fit(X, y) assert min(lasso.coef_) >= 0 @@ -681,7 +681,7 @@ def test_enet_positive_constraint(): X = [[-1], [0], [1]] y = [1, 0, -1] # just a straight line with negative slope - enet = ElasticNet(alpha=0.1, max_iter=1000, positive=True) + enet = ElasticNet(alpha=0.1, positive=True) enet.fit(X, y) assert min(enet.coef_) >= 0 @@ -1255,7 +1255,7 @@ def test_convergence_warnings(): # check that the model converges w/o warnings with pytest.warns(None) as record: - MultiTaskElasticNet(max_iter=1000).fit(X, y) + MultiTaskElasticNet().fit(X, y) assert not record.list @@ -1269,7 +1269,7 @@ def test_sparse_input_convergence_warning(): # check that the model converges w/o warnings with pytest.warns(None) as record: - Lasso(max_iter=1000).fit(sparse.csr_matrix(X, dtype=np.float32), y) + Lasso().fit(sparse.csr_matrix(X, dtype=np.float32), y) assert not record.list diff --git a/sklearn/linear_model/tests/test_huber.py b/sklearn/linear_model/tests/test_huber.py index 156cd4b57dbc8..7aa69e68f5136 100644 --- a/sklearn/linear_model/tests/test_huber.py +++ b/sklearn/linear_model/tests/test_huber.py @@ -128,7 +128,7 @@ def test_huber_sparse(): def test_huber_scaling_invariant(): # Test that outliers filtering is scaling independent. X, y = make_regression_with_outliers() - huber = HuberRegressor(fit_intercept=False, alpha=0.0, max_iter=100) + huber = HuberRegressor(fit_intercept=False, alpha=0.0) huber.fit(X, y) n_outliers_mask_1 = huber.outliers_ assert not np.all(n_outliers_mask_1) @@ -149,8 +149,7 @@ def test_huber_and_sgd_same_results(): # Fit once to find out the scale parameter. 
Scale down X and y by scale # so that the scale parameter is optimized to 1.0 - huber = HuberRegressor(fit_intercept=False, alpha=0.0, max_iter=100, - epsilon=1.35) + huber = HuberRegressor(fit_intercept=False, alpha=0.0, epsilon=1.35) huber.fit(X, y) X_scale = X / huber.scale_ y_scale = y / huber.scale_ diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index bdc9a4a24914b..5ec4a434f857a 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -1234,14 +1234,14 @@ def test_n_iter(solver): n_classes = 1 if solver == 'liblinear' else np.unique(y).shape[0] clf = LogisticRegression(tol=1e-2, multi_class='ovr', solver=solver, C=1., - random_state=42, max_iter=100) + random_state=42) clf.fit(X, y) assert clf.n_iter_.shape == (n_classes,) n_classes = np.unique(y).shape[0] clf = LogisticRegressionCV(tol=1e-2, multi_class='ovr', solver=solver, Cs=n_Cs, cv=n_cv_fold, - random_state=42, max_iter=100) + random_state=42) clf.fit(X, y) assert clf.n_iter_.shape == (n_classes, n_cv_fold, n_Cs) clf.fit(X, y_bin) @@ -1254,13 +1254,13 @@ def test_n_iter(solver): clf = LogisticRegression(tol=1e-2, multi_class='multinomial', solver=solver, C=1., - random_state=42, max_iter=100) + random_state=42) clf.fit(X, y) assert clf.n_iter_.shape == (n_classes,) clf = LogisticRegressionCV(tol=1e-2, multi_class='multinomial', solver=solver, Cs=n_Cs, cv=n_cv_fold, - random_state=42, max_iter=100) + random_state=42) clf.fit(X, y) assert clf.n_iter_.shape == (n_classes, n_cv_fold, n_Cs) clf.fit(X, y_bin) @@ -1280,7 +1280,7 @@ def test_warm_start(solver, warm_start, fit_intercept, multi_class): clf = LogisticRegression(tol=1e-4, multi_class=multi_class, warm_start=warm_start, solver=solver, - random_state=42, max_iter=100, + random_state=42, fit_intercept=fit_intercept) with ignore_warnings(category=ConvergenceWarning): clf.fit(X, y) diff --git a/sklearn/linear_model/tests/test_passive_aggressive.py b/sklearn/linear_model/tests/test_passive_aggressive.py index f2403773277a7..d0d099eeacc8d 100644 --- a/sklearn/linear_model/tests/test_passive_aggressive.py +++ b/sklearn/linear_model/tests/test_passive_aggressive.py @@ -165,16 +165,16 @@ def test_equal_class_weight(): X2 = [[1, 0], [1, 0], [0, 1], [0, 1]] y2 = [0, 0, 1, 1] clf = PassiveAggressiveClassifier( - C=0.1, max_iter=1000, tol=None, class_weight=None) + C=0.1, tol=None, class_weight=None) clf.fit(X2, y2) # Already balanced, so "balanced" weights should have no effect clf_balanced = PassiveAggressiveClassifier( - C=0.1, max_iter=1000, tol=None, class_weight="balanced") + C=0.1, tol=None, class_weight="balanced") clf_balanced.fit(X2, y2) clf_weighted = PassiveAggressiveClassifier( - C=0.1, max_iter=1000, tol=None, class_weight={0: 0.5, 1: 0.5}) + C=0.1, tol=None, class_weight={0: 0.5, 1: 0.5}) clf_weighted.fit(X2, y2) # should be similar up to some epsilon due to learning rate schedule diff --git a/sklearn/linear_model/tests/test_sparse_coordinate_descent.py b/sklearn/linear_model/tests/test_sparse_coordinate_descent.py index 23b57a699a655..c4364cc31a80d 100644 --- a/sklearn/linear_model/tests/test_sparse_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_sparse_coordinate_descent.py @@ -63,7 +63,7 @@ def test_enet_toy_list_input(): assert_array_almost_equal(pred, [2, 3, 4]) assert_almost_equal(clf.dual_gap_, 0) - clf = ElasticNet(alpha=0.5, l1_ratio=0.3, max_iter=1000) + clf = ElasticNet(alpha=0.5, l1_ratio=0.3) clf.fit(X, Y) pred = clf.predict(T) 
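Stepping back to the ColumnTransformer.get_feature_names fix earlier in this
series (PATCH 303), a minimal sketch that mirrors its new test, assuming a
build that includes the fix (the transformer names and integer keys are
arbitrary):

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer

# DictVectorizer reports its feature names as the (integer) dict keys.
X = np.array([[{1: 1, 2: 2}, {1: 3, 2: 4}],
              [{3: 5}, {3: 6}]], dtype=object).T
ct = ColumnTransformer([("col0", DictVectorizer(), 0),
                        ("col1", DictVectorizer(), 1)]).fit(X)
print(ct.get_feature_names())  # ['col0__1', 'col0__2', 'col1__3']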
assert_array_almost_equal(clf.coef_, [0.50819], decimal=3) @@ -102,7 +102,7 @@ def test_enet_toy_explicit_sparse_input(): assert_array_almost_equal(pred, [2, 3, 4]) assert_almost_equal(clf.dual_gap_, 0) - clf = ElasticNet(alpha=0.5, l1_ratio=0.3, max_iter=1000) + clf = ElasticNet(alpha=0.5, l1_ratio=0.3) clf.fit(X, Y) pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [0.50819], decimal=3) From d88ffabb6fe3152902c213133eb2bdd0a3c9ab86 Mon Sep 17 00:00:00 2001 From: Shooter23 <44271378+Shooter23@users.noreply.github.com> Date: Thu, 8 Apr 2021 20:54:51 -0400 Subject: [PATCH 305/478] DOC Update attribute docstrings in _multilayer_perceptron.py (#19595) Co-authored-by: Thomas J. Fan --- sklearn/neural_network/_multilayer_perceptron.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index 52c94a7129b9f..04822360791e7 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -884,7 +884,7 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): layer i + 1. n_iter_ : int - The number of iterations the solver has ran. + The number of iterations the solver has run. n_layers_ : int Number of layers. @@ -1292,10 +1292,13 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): The minimum loss reached by the solver throughout fitting. loss_curve_ : list of shape (`n_iter_`,) + Loss value evaluated at the end of each training step. The ith element in the list represents the loss at the ith iteration. t_ : int The number of training samples seen by the solver during fitting. + Mathematically equals `n_iters * X.shape[0]`, it means + `time_step` and it is used by optimizer's learning rate scheduler. coefs_ : list of shape (n_layers - 1,) The ith element in the list represents the weight matrix corresponding @@ -1306,7 +1309,7 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): layer i + 1. n_iter_ : int - The number of iterations the solver has ran. + The number of iterations the solver has run. n_layers_ : int Number of layers. @@ -1317,13 +1320,6 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): out_activation_ : str Name of the output activation function. - loss_curve_ : list of shape (n_iters,) - Loss value evaluated at the end of each training step. - - t_ : int - Mathematically equals `n_iters * X.shape[0]`, it means - `time_step` and it is used by optimizer's learning rate scheduler. - Examples -------- >>> from sklearn.neural_network import MLPRegressor From a80b99ca04a6e8df9fb838bb195432654b592263 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 9 Apr 2021 10:31:53 +0200 Subject: [PATCH 306/478] DOC Fix versionchanged/versionadded in OneHotEncoder (#16562) Co-authored-by: Thomas J. Fan Co-authored-by: Guillaume Lemaitre --- sklearn/preprocessing/_encoders.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 4344e010bba1a..d3f557d2993cb 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -193,8 +193,6 @@ class OneHotEncoder(_BaseEncoder): Read more in the :ref:`User Guide `. - .. 
versionchanged:: 0.20 - Parameters ---------- categories : 'auto' or a list of array-like, default='auto' @@ -230,8 +228,11 @@ class OneHotEncoder(_BaseEncoder): - array : ``drop[i]`` is the category in feature ``X[:, i]`` that should be dropped. + .. versionadded:: 0.21 + The parameter `drop` was added in 0.21. + .. versionchanged:: 0.23 - Added option 'if_binary'. + The option `drop='if_binary'` was added in 0.23. sparse : bool, default=True Will return sparse matrix if set True else will return an array. From 734ae1f2dfb320ea824478860dda1f4aa5736d05 Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Fri, 9 Apr 2021 09:37:15 +0100 Subject: [PATCH 307/478] DOC add explicit message regarding shuffling in default CV (#19776) Co-authored-by: Alihan Zihna --- sklearn/ensemble/_stacking.py | 4 ++++ sklearn/feature_selection/_sequential.py | 3 ++- sklearn/model_selection/_search.py | 6 ++++-- .../_search_successive_halving.py | 6 ++++-- sklearn/model_selection/_validation.py | 18 ++++++++++++------ 5 files changed, 26 insertions(+), 11 deletions(-) diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 303015cc9f751..09a460f7519d5 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -297,6 +297,8 @@ class StackingClassifier(ClassifierMixin, _BaseStacking): either binary or multiclass, :class:`~sklearn.model_selection.StratifiedKFold` is used. In all other cases, :class:`~sklearn.model_selection.KFold` is used. + These splitters are instantiated with `shuffle=False` so the splits + will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. @@ -570,6 +572,8 @@ class StackingRegressor(RegressorMixin, _BaseStacking): either binary or multiclass, :class:`~sklearn.model_selection.StratifiedKFold` is used. In all other cases, :class:`~sklearn.model_selection.KFold` is used. + These splitters are instantiated with `shuffle=False` so the splits + will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. diff --git a/sklearn/feature_selection/_sequential.py b/sklearn/feature_selection/_sequential.py index 7ee6b043a0df1..8e831b53e4983 100644 --- a/sklearn/feature_selection/_sequential.py +++ b/sklearn/feature_selection/_sequential.py @@ -60,7 +60,8 @@ class SequentialFeatureSelector(SelectorMixin, MetaEstimatorMixin, For integer/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index abe3b87488d8c..ebd085c08e68f 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -1065,7 +1065,8 @@ class GridSearchCV(BaseSearchCV): For integer/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. 
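A minimal sketch of the behaviour this added sentence documents (assuming
scikit-learn is importable; the estimator and grid are arbitrary): with a
classifier, an integer cv resolves to an unshuffled StratifiedKFold, so an
explicit splitter with default settings reproduces exactly the same folds.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold

X, y = make_classification(n_samples=100, random_state=0)
grid = {"C": [0.1, 1.0]}
est = LogisticRegression(max_iter=1000)

# Both searches see identical folds because the splits are not shuffled.
search_int = GridSearchCV(est, grid, cv=5).fit(X, y)
search_skf = GridSearchCV(est, grid, cv=StratifiedKFold(n_splits=5)).fit(X, y)
assert search_int.best_score_ == search_skf.best_score_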
@@ -1407,7 +1408,8 @@ class RandomizedSearchCV(BaseSearchCV): For integer/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index b522ce7fbda41..f4396920c1677 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -425,7 +425,8 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): For integer/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. @@ -712,7 +713,8 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): For integer/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 5f5338512a0f2..9765303a30b8d 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -94,7 +94,8 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, For int/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. + other cases, :class:`.Fold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. @@ -364,7 +365,8 @@ def cross_val_score(estimator, X, y=None, *, groups=None, scoring=None, For int/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. @@ -766,7 +768,8 @@ def cross_val_predict(estimator, X, y=None, *, groups=None, cv=None, For int/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. 
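For completeness, a short usage sketch of the halving searches whose cv
documentation is updated in the same way (illustrative only; the experimental
enable import matches the one used in the tests later in this series, and the
parameter grid is arbitrary):

from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=400, random_state=0)
# An integer cv here is likewise an unshuffled (Stratified)KFold.
search = HalvingGridSearchCV(LinearSVC(dual=False), {"C": [0.1, 1.0, 10.0]},
                             cv=5, factor=2, random_state=0).fit(X, y)
print(search.best_params_)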
@@ -1115,7 +1118,8 @@ def permutation_test_score(estimator, X, y, *, groups=None, cv=None, For int/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. @@ -1279,7 +1283,8 @@ def learning_curve(estimator, X, y, *, groups=None, For int/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. @@ -1579,7 +1584,8 @@ def validation_curve(estimator, X, y, *, param_name, param_range, groups=None, For int/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. From b8903dacee48a82512619a8a6ed0bf706c1ab909 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 9 Apr 2021 05:32:41 -0400 Subject: [PATCH 308/478] ENH Adds final_estimator in html repr for Stacking* (#19564) --- doc/whats_new/v1.0.rst | 4 ++++ sklearn/ensemble/_stacking.py | 11 ++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 602d4b1246878..1245193d76d89 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -154,6 +154,10 @@ Changelog experimental. They are now considered stable and are subject to the same deprecation cycles as all other estimators. :pr:`19799` by `Nicolas Hug`_. +- |Enhancement| Improve the HTML rendering of the + :class:`ensemble.StackingClassifier` and :class:`ensemble.StackingRegressor`. + :pr:`19564` by `Thomas Fan`_. + :mod:`sklearn.feature_extraction` ................................. 
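As a usage-level sketch of the rendering entry above (illustrative; the exact
markup is an implementation detail, but the label added by this patch should
appear in the generated HTML):

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import estimator_html_repr

clf = StackingClassifier(
    estimators=[("tree", DecisionTreeClassifier())],
    final_estimator=LogisticRegression(),
)
# With this change the final estimator gets its own labelled block.
html = estimator_html_repr(clf)
assert "final_estimator" in html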
diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 09a460f7519d5..3522b381389d3 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -250,9 +250,14 @@ def _sk_visual_block_(self, final_estimator): names, estimators = zip(*self.estimators) parallel = _VisualBlock('parallel', estimators, names=names, dash_wrapped=False) - serial = _VisualBlock('serial', (parallel, final_estimator), - dash_wrapped=False) - return _VisualBlock('serial', [serial]) + + # final estimator is wrapped in a parallel block to show the label: + # 'final_estimator' in the html repr + final_block = _VisualBlock('parallel', [final_estimator], + names=['final_estimator'], + dash_wrapped=False) + return _VisualBlock('serial', (parallel, final_block), + dash_wrapped=False) class StackingClassifier(ClassifierMixin, _BaseStacking): From a44653fb3438c80955e647c9d634c231de28a8c4 Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Fri, 9 Apr 2021 10:40:37 +0100 Subject: [PATCH 309/478] TST Changes assert to pytest style in test_config.py and test_kernel_approximation.py (#19845) --- sklearn/tests/test_config.py | 12 ++++++++---- sklearn/tests/test_kernel_approximation.py | 19 +++++++++++++++---- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/sklearn/tests/test_config.py b/sklearn/tests/test_config.py index eec349861258c..22ec862ef24a3 100644 --- a/sklearn/tests/test_config.py +++ b/sklearn/tests/test_config.py @@ -1,5 +1,5 @@ +import pytest from sklearn import get_config, set_config, config_context -from sklearn.utils._testing import assert_raises def test_config_context(): @@ -43,9 +43,12 @@ def test_config_context(): 'display': 'text'} # No positional arguments - assert_raises(TypeError, config_context, True) + with pytest.raises(TypeError): + config_context(True) + # No unknown arguments - assert_raises(TypeError, config_context(do_something_else=True).__enter__) + with pytest.raises(TypeError): + config_context(do_something_else=True).__enter__() def test_config_context_exception(): @@ -71,4 +74,5 @@ def test_set_config(): assert get_config()['assume_finite'] is False # No unknown arguments - assert_raises(TypeError, set_config, do_something_else=True) + with pytest.raises(TypeError): + set_config(do_something_else=True) diff --git a/sklearn/tests/test_kernel_approximation.py b/sklearn/tests/test_kernel_approximation.py index 0cee04f9f2d0a..cfd9c9671fc4d 100644 --- a/sklearn/tests/test_kernel_approximation.py +++ b/sklearn/tests/test_kernel_approximation.py @@ -1,9 +1,11 @@ +import re + import numpy as np from scipy.sparse import csr_matrix import pytest from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_array_almost_equal, assert_raises +from sklearn.utils._testing import assert_array_almost_equal from sklearn.metrics.pairwise import kernel_metrics from sklearn.kernel_approximation import RBFSampler @@ -90,11 +92,18 @@ def test_additive_chi2_sampler(): # test error is raised on negative input Y_neg = Y.copy() Y_neg[0, 0] = -1 - assert_raises(ValueError, transform.transform, Y_neg) + msg = 'Negative values in data passed to' + with pytest.raises(ValueError, match=msg): + transform.transform(Y_neg) # test error on invalid sample_steps transform = AdditiveChi2Sampler(sample_steps=4) - assert_raises(ValueError, transform.fit, X) + msg = re.escape( + "If sample_steps is not in [1, 2, 3]," + " you need to provide sample_interval" + ) + with pytest.raises(ValueError, match=msg): + transform.fit(X) # test that the 
sample interval is set correctly sample_steps_available = [1, 2, 3] @@ -154,7 +163,9 @@ def test_skewed_chi2_sampler(): # test error is raised on when inputs contains values smaller than -c Y_neg = Y.copy() Y_neg[0, 0] = -c * 2. - assert_raises(ValueError, transform.transform, Y_neg) + msg = 'X may not contain entries smaller than -skewedness' + with pytest.raises(ValueError, match=msg): + transform.transform(Y_neg) def test_additive_chi2_sampler_exceptions(): From 02e2a113e6cc63854f08349e054d4a3b3e045cb4 Mon Sep 17 00:00:00 2001 From: "Abdulelah S. Al Mesfer" <28743265+abdulelahsm@users.noreply.github.com> Date: Fri, 9 Apr 2021 13:04:54 +0300 Subject: [PATCH 310/478] TST replace assert_raises by pytest.raises in test_least_angle, test_omp, test_test_theil_sen (#19406) Co-authored-by: Olivier Grisel Co-authored-by: Olivier Grisel Co-authored-by: Guillaume Lemaitre Co-authored-by: Chiara Marmo --- .../linear_model/tests/test_least_angle.py | 10 +++---- sklearn/linear_model/tests/test_omp.py | 26 +++++++++---------- sklearn/linear_model/tests/test_theil_sen.py | 15 +++++++---- 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index a8b0e939c080d..4321c39b45e92 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -3,12 +3,10 @@ import numpy as np import pytest from scipy import linalg - from sklearn.base import clone from sklearn.model_selection import train_test_split from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_raises from sklearn.utils._testing import ignore_warnings from sklearn.utils._testing import TempMemmap from sklearn.utils.fixes import np_version, parse_version @@ -96,8 +94,8 @@ def test_lars_path_gram_equivalent(method, return_path): def test_x_none_gram_none_raises_value_error(): # Test that lars_path with no X and Gram raises exception Xy = np.dot(X.T, y) - assert_raises(ValueError, linear_model.lars_path, None, y, Gram=None, - Xy=Xy) + with pytest.raises(ValueError): + linear_model.lars_path(None, y, Gram=None, Xy=Xy) def test_all_precomputed(): @@ -489,7 +487,9 @@ def test_lasso_lars_ic(): # test error on unknown IC lars_broken = linear_model.LassoLarsIC('') - assert_raises(ValueError, lars_broken.fit, X, y) + + with pytest.raises(ValueError): + lars_broken.fit(X, y) def test_lars_path_readonly_data(): diff --git a/sklearn/linear_model/tests/test_omp.py b/sklearn/linear_model/tests/test_omp.py index 3cbda003f0148..1d2eb6a239786 100644 --- a/sklearn/linear_model/tests/test_omp.py +++ b/sklearn/linear_model/tests/test_omp.py @@ -4,7 +4,6 @@ import numpy as np import pytest -from sklearn.utils._testing import assert_raises from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import ignore_warnings @@ -33,16 +32,16 @@ def test_correct_shapes(): assert (orthogonal_mp(X, y[:, 0], n_nonzero_coefs=5).shape == - (n_features,)) + (n_features,)) assert (orthogonal_mp(X, y, n_nonzero_coefs=5).shape == - (n_features, 3)) + (n_features, 3)) def test_correct_shapes_gram(): assert (orthogonal_mp_gram(G, Xy[:, 0], n_nonzero_coefs=5).shape == - (n_features,)) + (n_features,)) assert (orthogonal_mp_gram(G, Xy, n_nonzero_coefs=5).shape == - (n_features, 3)) + (n_features, 3)) def test_n_nonzero_coefs(): @@ -88,15 +87,14 @@ def 
test_unreachable_accuracy(): n_nonzero_coefs=n_features)) -def test_bad_input(): - assert_raises(ValueError, orthogonal_mp, X, y, tol=-1) - assert_raises(ValueError, orthogonal_mp, X, y, n_nonzero_coefs=-1) - assert_raises(ValueError, orthogonal_mp, X, y, - n_nonzero_coefs=n_features + 1) - assert_raises(ValueError, orthogonal_mp_gram, G, Xy, tol=-1) - assert_raises(ValueError, orthogonal_mp_gram, G, Xy, n_nonzero_coefs=-1) - assert_raises(ValueError, orthogonal_mp_gram, G, Xy, - n_nonzero_coefs=n_features + 1) +@pytest.mark.parametrize("positional_params", [(X, y), (G, Xy)]) +@pytest.mark.parametrize( + "keyword_params", + [{"tol": -1}, {"n_nonzero_coefs": -1}, {"n_nonzero_coefs": n_features + 1}] +) +def test_bad_input(positional_params, keyword_params): + with pytest.raises(ValueError): + orthogonal_mp(*positional_params, **keyword_params) def test_perfect_signal_recovery(): diff --git a/sklearn/linear_model/tests/test_theil_sen.py b/sklearn/linear_model/tests/test_theil_sen.py index c670fc3979b80..125c89599af83 100644 --- a/sklearn/linear_model/tests/test_theil_sen.py +++ b/sklearn/linear_model/tests/test_theil_sen.py @@ -17,7 +17,7 @@ from sklearn.linear_model import LinearRegression, TheilSenRegressor from sklearn.linear_model._theil_sen import _spatial_median, _breakdown_point from sklearn.linear_model._theil_sen import _modified_weiszfeld_step -from sklearn.utils._testing import assert_almost_equal, assert_raises +from sklearn.utils._testing import assert_almost_equal @contextmanager @@ -209,19 +209,23 @@ def test_calc_breakdown_point(): def test_checksubparams_negative_subpopulation(): X, y, w, c = gen_toy_problem_1d() theil_sen = TheilSenRegressor(max_subpopulation=-1, random_state=0) - assert_raises(ValueError, theil_sen.fit, X, y) + + with pytest.raises(ValueError): + theil_sen.fit(X, y) def test_checksubparams_too_few_subsamples(): X, y, w, c = gen_toy_problem_1d() theil_sen = TheilSenRegressor(n_subsamples=1, random_state=0) - assert_raises(ValueError, theil_sen.fit, X, y) + with pytest.raises(ValueError): + theil_sen.fit(X, y) def test_checksubparams_too_many_subsamples(): X, y, w, c = gen_toy_problem_1d() theil_sen = TheilSenRegressor(n_subsamples=101, random_state=0) - assert_raises(ValueError, theil_sen.fit, X, y) + with pytest.raises(ValueError): + theil_sen.fit(X, y) def test_checksubparams_n_subsamples_if_less_samples_than_features(): @@ -230,7 +234,8 @@ def test_checksubparams_n_subsamples_if_less_samples_than_features(): X = random_state.normal(size=(n_samples, n_features)) y = random_state.normal(size=n_samples) theil_sen = TheilSenRegressor(n_subsamples=9, random_state=0) - assert_raises(ValueError, theil_sen.fit, X, y) + with pytest.raises(ValueError): + theil_sen.fit(X, y) def test_subpopulation(): From 80e985b5da06a835eecd9130abeed79a31e63200 Mon Sep 17 00:00:00 2001 From: LSturtew <56136443+LSturtew@users.noreply.github.com> Date: Fri, 9 Apr 2021 13:54:30 +0200 Subject: [PATCH 311/478] TST Changes assert to pytest style in test_random_projection.py (#19846) --- sklearn/tests/test_random_projection.py | 92 ++++++++++++------------- 1 file changed, 44 insertions(+), 48 deletions(-) diff --git a/sklearn/tests/test_random_projection.py b/sklearn/tests/test_random_projection.py index d01f318c3f1b1..79d2af5776859 100644 --- a/sklearn/tests/test_random_projection.py +++ b/sklearn/tests/test_random_projection.py @@ -14,12 +14,9 @@ from sklearn.random_projection import SparseRandomProjection from sklearn.random_projection import GaussianRandomProjection -from 
sklearn.utils._testing import assert_raises -from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_warns from sklearn.exceptions import DataDimensionalityWarning all_sparse_random_matrix: List[Any] = [_sparse_random_matrix] @@ -59,19 +56,21 @@ def densify(matrix): ############################################################################### # test on JL lemma ############################################################################### -def test_invalid_jl_domain(): - assert_raises(ValueError, johnson_lindenstrauss_min_dim, 100, eps=1.1) - assert_raises(ValueError, johnson_lindenstrauss_min_dim, 100, eps=0.0) - assert_raises(ValueError, johnson_lindenstrauss_min_dim, 100, eps=-0.1) - assert_raises(ValueError, johnson_lindenstrauss_min_dim, 0, eps=0.5) +@pytest.mark.parametrize("n_samples, eps", [ + (100, 1.1), + (100, 0.0), + (100, -0.1), + (0, 0.5) +]) +def test_invalid_jl_domain(n_samples, eps): + with pytest.raises(ValueError): + johnson_lindenstrauss_min_dim(n_samples, eps=eps) -def test_input_size_jl_min_dim(): - assert_raises(ValueError, johnson_lindenstrauss_min_dim, - 3 * [100], eps=2 * [0.9]) - assert_raises(ValueError, johnson_lindenstrauss_min_dim, 3 * [100], - eps=2 * [0.9]) +def test_input_size_jl_min_dim(): + with pytest.raises(ValueError): + johnson_lindenstrauss_min_dim(3 * [100], eps=2 * [0.9]) johnson_lindenstrauss_min_dim(np.random.randint(1, 10, size=(10, 10)), eps=np.full((10, 10), 0.5)) @@ -81,18 +80,17 @@ def test_input_size_jl_min_dim(): # tests random matrix generation ############################################################################### def check_input_size_random_matrix(random_matrix): - assert_raises(ValueError, random_matrix, 0, 0) - assert_raises(ValueError, random_matrix, -1, 1) - assert_raises(ValueError, random_matrix, 1, -1) - assert_raises(ValueError, random_matrix, 1, 0) - assert_raises(ValueError, random_matrix, -1, 0) + inputs = [(0, 0), (-1, 1), (1, -1), (1, 0), (-1, 0)] + for n_components, n_features in inputs: + with pytest.raises(ValueError): + random_matrix(n_components, n_features) def check_size_generated(random_matrix): - assert random_matrix(1, 5).shape == (1, 5) - assert random_matrix(5, 1).shape == (5, 1) - assert random_matrix(5, 5).shape == (5, 5) - assert random_matrix(1, 1).shape == (1, 1) + inputs = [(1, 5), (5, 1), (5, 5), (1, 1)] + for n_components, n_features in inputs: + assert random_matrix(n_components, n_features).shape == ( + n_components, n_features) def check_zero_mean_and_unit_norm(random_matrix): @@ -109,8 +107,8 @@ def check_input_with_sparse_random_matrix(random_matrix): n_components, n_features = 5, 10 for density in [-1., 0.0, 1.1]: - assert_raises(ValueError, - random_matrix, n_components, n_features, density=density) + with pytest.raises(ValueError): + random_matrix(n_components, n_features, density=density) @pytest.mark.parametrize("random_matrix", all_random_matrix) @@ -153,9 +151,9 @@ def test_sparse_random_matrix(): s = 1 / density A = _sparse_random_matrix(n_components, - n_features, - density=density, - random_state=0) + n_features, + density=density, + random_state=0) A = densify(A) # Check possible values @@ -196,31 +194,27 @@ def test_sparse_random_matrix(): ############################################################################### # tests on random projection transformer 
############################################################################### -def test_sparse_random_projection_transformer_invalid_density(): - for RandomProjection in all_SparseRandomProjection: - assert_raises(ValueError, - RandomProjection(density=1.1).fit, data) - assert_raises(ValueError, - RandomProjection(density=0).fit, data) - - assert_raises(ValueError, - RandomProjection(density=-0.1).fit, data) +@pytest.mark.parametrize("density", [1.1, 0, -0.1]) +def test_sparse_random_projection_transformer_invalid_density(density): + for RandomProjection in all_SparseRandomProjection: + with pytest.raises(ValueError): + RandomProjection(density=density).fit(data) -def test_random_projection_transformer_invalid_input(): +@pytest.mark.parametrize("n_components, fit_data", [ + ('auto', [[0, 1, 2]]), (-10, data)] +) +def test_random_projection_transformer_invalid_input(n_components, fit_data): for RandomProjection in all_RandomProjection: - assert_raises(ValueError, - RandomProjection(n_components='auto').fit, [[0, 1, 2]]) - - assert_raises(ValueError, - RandomProjection(n_components=-10).fit, data) + with pytest.raises(ValueError): + RandomProjection(n_components=n_components).fit(fit_data) def test_try_to_transform_before_fit(): for RandomProjection in all_RandomProjection: - assert_raises(ValueError, - RandomProjection(n_components='auto').transform, data) + with pytest.raises(ValueError): + RandomProjection(n_components='auto').transform(data) def test_too_many_samples_to_find_a_safe_embedding(): @@ -232,7 +226,8 @@ def test_too_many_samples_to_find_a_safe_embedding(): 'eps=0.100000 and n_samples=1000 lead to a target dimension' ' of 5920 which is larger than the original space with' ' n_features=100') - assert_raise_message(ValueError, expected_msg, rp.fit, data) + with pytest.raises(ValueError, match=expected_msg): + rp.fit(data) def test_random_projection_embedding_quality(): @@ -318,7 +313,8 @@ def test_correct_RandomProjection_dimensions_embedding(): assert_array_equal(projected_1, projected_3) # Try to transform with an input X of size different from fitted. - assert_raises(ValueError, rp.transform, data[:, 1:5]) + with pytest.raises(ValueError): + rp.transform(data[:, 1:5]) # it is also possible to fix the number of components and the density # level @@ -337,8 +333,8 @@ def test_warning_n_components_greater_than_n_features(): data, _ = make_sparse_random_data(5, n_features, int(n_features / 4)) for RandomProjection in all_RandomProjection: - assert_warns(DataDimensionalityWarning, - RandomProjection(n_components=n_features + 1).fit, data) + with pytest.warns(DataDimensionalityWarning): + RandomProjection(n_components=n_features + 1).fit(data) def test_works_with_sparse_data(): From da3c2d2a19ade5ca69adb6952ecace811ed122ff Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 9 Apr 2021 12:34:13 -0400 Subject: [PATCH 312/478] FIX MultiOutputRegressor correctly ducktypes fitted estimators (#19308) Co-authored-by: Olivier Grisel --- doc/whats_new/v0.24.rst | 7 +++++++ sklearn/multioutput.py | 2 +- sklearn/tests/test_multioutput.py | 18 ++++++++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 68ea8ba0f7a72..2cfe6970dd7b1 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -48,6 +48,13 @@ Changelog `'use_encoded_value'` strategies. :pr:`19234` by `Guillaume Lemaitre `. +:mod:`sklearn.multioutput` +.......................... 
+ +- |Fix| :class:`multioutput.MultiOutputRegressor` now works with estimators + that dynamically define `predict` during fitting, such as + :class:`ensemble.StackingRegressor`. :pr:`19308` by `Thomas Fan`_. + :mod:`sklearn.semi_supervised` .............................. diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 9987c01b13187..4cb01c524d59d 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -198,7 +198,7 @@ def predict(self, X): Note: Separate models are generated for each predictor. """ check_is_fitted(self) - if not hasattr(self.estimator, "predict"): + if not hasattr(self.estimators_[0], "predict"): raise ValueError("The base estimator should implement" " a predict method") diff --git a/sklearn/tests/test_multioutput.py b/sklearn/tests/test_multioutput.py index 87e5218e08e22..c20db084aa664 100644 --- a/sklearn/tests/test_multioutput.py +++ b/sklearn/tests/test_multioutput.py @@ -10,6 +10,7 @@ from sklearn import datasets from sklearn.base import clone from sklearn.datasets import make_classification +from sklearn.datasets import load_linnerud from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier from sklearn.exceptions import NotFittedError from sklearn.linear_model import Lasso @@ -30,6 +31,7 @@ from sklearn.dummy import DummyRegressor, DummyClassifier from sklearn.pipeline import make_pipeline from sklearn.impute import SimpleImputer +from sklearn.ensemble import StackingRegressor def test_multi_target_regression(): @@ -658,3 +660,19 @@ def test_classifier_chain_tuple_invalid_order(): with pytest.raises(ValueError, match='invalid order'): chain.fit(X, y) + + +def test_multioutputregressor_ducktypes_fitted_estimator(): + """Test that MultiOutputRegressor checks the fitted estimator for + predict. Non-regression test for #16549.""" + X, y = load_linnerud(return_X_y=True) + stacker = StackingRegressor( + estimators=[("sgd", SGDRegressor(random_state=1))], + final_estimator=Ridge(), + cv=2 + ) + + reg = MultiOutputRegressor(estimator=stacker).fit(X, y) + + # Does not raise + reg.predict(X) From 7d728d357e55253f30408ce68cafcc82d888393c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 9 Apr 2021 19:36:36 +0200 Subject: [PATCH 313/478] FIX missing space in import in svm/_base.py (#19852) --- sklearn/svm/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index 67808278cc59a..62710ec5157fb 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -6,7 +6,7 @@ # mypy error: error: Module 'sklearn.svm' has no attribute '_libsvm' # (and same for other imports) from . import _libsvm as libsvm # type: ignore -from .import _liblinear as liblinear # type: ignore +from . import _liblinear as liblinear # type: ignore from . 
import _libsvm_sparse as libsvm_sparse # type: ignore from ..base import BaseEstimator, ClassifierMixin from ..preprocessing import LabelEncoder From 3ff1267a7b74259dd0f0fdaf7da88b02e727e7c1 Mon Sep 17 00:00:00 2001 From: Oras Phongpanangam Date: Fri, 9 Apr 2021 11:19:04 -0700 Subject: [PATCH 314/478] FIX allows TransformedTargetRegressor to take nD target (#18898) Co-authored-by: Guillaume Lemaitre --- doc/whats_new/v1.0.rst | 4 ++++ sklearn/compose/_target.py | 2 +- sklearn/compose/tests/test_target.py | 21 +++++++++++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 1245193d76d89..b438ee16139f3 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -110,6 +110,10 @@ Changelog :pr:`18459` by :user:`Albert Villanova del Moral ` and :user:`Alonso Silva Allende `. +- |Fix| :class:`compose.TransformedTargetRegressor` now takes nD targets with + an adequate transformer. + :pr:`18898` by :user:`Oras Phongpanagnam `. + :mod:`sklearn.datasets` ....................... diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index 1d6695a808d81..1a80046c66376 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -176,7 +176,7 @@ def fit(self, X, y, **fit_params): self : object """ y = check_array(y, accept_sparse=False, force_all_finite=True, - ensure_2d=False, dtype='numeric') + ensure_2d=False, dtype='numeric', allow_nd=True) # store the number of dimension of the target to predict an array of # similar shape at predict diff --git a/sklearn/compose/tests/test_target.py b/sklearn/compose/tests/test_target.py index dc5d8d95743ef..1f3d6bc08e711 100644 --- a/sklearn/compose/tests/test_target.py +++ b/sklearn/compose/tests/test_target.py @@ -197,6 +197,27 @@ def test_transform_target_regressor_2d_transformer_multioutput(): assert_allclose(regr.regressor_.coef_, lr.coef_) +def test_transform_target_regressor_3d_target(): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/18866 + # Check with a 3D target with a transformer that reshapes the target + X = friedman[0] + y = np.tile(friedman[1].reshape(-1, 1, 1), [1, 3, 2]) + + def flatten_data(data): + return data.reshape(data.shape[0], -1) + + def unflatten_data(data): + return data.reshape(data.shape[0], -1, 2) + + transformer = FunctionTransformer(func=flatten_data, + inverse_func=unflatten_data) + regr = TransformedTargetRegressor(regressor=LinearRegression(), + transformer=transformer) + y_pred = regr.fit(X, y).predict(X) + assert y.shape == y_pred.shape + + def test_transform_target_regressor_multi_to_single(): X = friedman[0] y = np.transpose([friedman[1], (friedman[1] ** 2 + 1)]) From b1d686d07559fb83040cb085b752d86ebbb9b3ba Mon Sep 17 00:00:00 2001 From: Ana Pessoa <34238053+analuizaypessoa@users.noreply.github.com> Date: Sun, 11 Apr 2021 18:13:13 -0300 Subject: [PATCH 315/478] DOC Fixed typo in clustering.rst (#19863) --- doc/modules/clustering.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 17ae9eb2651c6..7f9fe2a7bd12e 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -202,7 +202,7 @@ As a result, the computation is often done several times, with different initializations of the centroids. One method to help address this issue is the k-means++ initialization scheme, which has been implemented in scikit-learn (use the ``init='k-means++'`` parameter). 
This initializes the centroids to be -(generally) distant from each other, leading to provably better results than +(generally) distant from each other, leading to probably better results than random initialization, as shown in the reference. K-means++ can also be called independently to select seeds for other From f1018c6af15711855e0e626a1c1d2a387ed8dbbb Mon Sep 17 00:00:00 2001 From: xiaoyuchai <39104103+xiaoyuchai@users.noreply.github.com> Date: Sun, 11 Apr 2021 23:54:31 -0700 Subject: [PATCH 316/478] FIX BaseSuccessiveHalving class groups support (#19847) Co-authored-by: Shawn Co-authored-by: Nicolas Hug --- doc/whats_new/v1.0.rst | 5 +++ .../_search_successive_halving.py | 2 +- .../tests/test_successive_halving.py | 36 +++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index b438ee16139f3..ba3f6d6d1110d 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -276,6 +276,11 @@ Changelog :pr:`18649` by `Leandro Hermida ` and `Rodion Martynov `. +- |Fix| The `fit` method of the successive halving parameter search + (:class:`model_selection.HalvingGridSearchCV`, and + :class:`model_selection.HalvingRandomSearchCV`) now correctly handles the + `groups` parameter. :pr:`19847` by :user:`Xiaoyu Chai `. + :mod:`sklearn.naive_bayes` .......................... diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index f4396920c1677..2f5c465d6cf41 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -210,7 +210,7 @@ def fit(self, X, y=None, groups=None, **fit_params): self._n_samples_orig = _num_samples(X) - super().fit(X, y=y, groups=None, **fit_params) + super().fit(X, y=y, groups=groups, **fit_params) # Set best_score_: BaseSearchCV does not set it, as refit is a callable self.best_score_ = ( diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 2c55f6aa6cd85..6660b35a934ba 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -7,9 +7,16 @@ from sklearn.datasets import make_classification from sklearn.dummy import DummyClassifier from sklearn.experimental import enable_halving_search_cv # noqa +from sklearn.model_selection import StratifiedKFold +from sklearn.model_selection import StratifiedShuffleSplit +from sklearn.model_selection import LeaveOneGroupOut +from sklearn.model_selection import LeavePGroupsOut +from sklearn.model_selection import GroupKFold +from sklearn.model_selection import GroupShuffleSplit from sklearn.model_selection import HalvingGridSearchCV from sklearn.model_selection import HalvingRandomSearchCV from sklearn.model_selection import KFold, ShuffleSplit +from sklearn.svm import LinearSVC from sklearn.model_selection._search_successive_halving import ( _SubsampleMetaSplitter, _top_k, _refit_callable) @@ -562,3 +569,32 @@ def set_params(self, **params): assert (cv_results_df['params'] == passed_params).all() assert (cv_results_df['n_resources'] == passed_n_samples).all() + + +@pytest.mark.parametrize('Est', (HalvingGridSearchCV, HalvingRandomSearchCV)) +def test_groups_support(Est): + # Check if ValueError (when groups is None) propagates to + # HalvingGridSearchCV and HalvingRandomSearchCV + # And also check if groups is correctly passed to the cv object + rng = 
np.random.RandomState(0) + + X, y = make_classification(n_samples=50, n_classes=2, random_state=0) + groups = rng.randint(0, 3, 50) + + clf = LinearSVC(random_state=0) + grid = {'C': [1]} + + group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), + GroupKFold(n_splits=3), GroupShuffleSplit(random_state=0)] + error_msg = "The 'groups' parameter should not be None." + for cv in group_cvs: + gs = Est(clf, grid, cv=cv) + with pytest.raises(ValueError, match=error_msg): + gs.fit(X, y) + gs.fit(X, y, groups=groups) + + non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit(random_state=0)] + for cv in non_group_cvs: + gs = Est(clf, grid, cv=cv) + # Should not raise an error + gs.fit(X, y) From 7b343ddd53e4efe97b6588b74a75f08c37d76f46 Mon Sep 17 00:00:00 2001 From: Christopher Yeh Date: Mon, 12 Apr 2021 05:51:22 -0600 Subject: [PATCH 317/478] CLN Improve doc/error consistency for GaussianProcessRegressor (#19687) Co-authored-by: Thomas J. Fan --- sklearn/gaussian_process/_gpr.py | 36 ++++++++++++++++---------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index b4ab0441efc71..4e8814dd69951 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -30,9 +30,9 @@ class GaussianProcessRegressor(MultiOutputMixin, GaussianProcessRegressor: * allows prediction without prior fitting (based on the GP prior) - * provides an additional method sample_y(X), which evaluates samples + * provides an additional method `sample_y(X)`, which evaluates samples drawn from the GPR (prior or posterior) at given inputs - * exposes a method log_marginal_likelihood(theta), which can be used + * exposes a method `log_marginal_likelihood(theta)`, which can be used externally for other ways of selecting hyperparameters, e.g., via Markov chain Monte Carlo. @@ -68,8 +68,8 @@ class GaussianProcessRegressor(MultiOutputMixin, must have the signature:: def optimizer(obj_func, initial_theta, bounds): - # * 'obj_func' is the objective function to be minimized, which - # takes the hyperparameters theta as parameter and an + # * 'obj_func': the objective function to be minimized, which + # takes the hyperparameters theta as a parameter and an # optional flag eval_gradient, which determines if the # gradient is returned additionally to the function value # * 'initial_theta': the initial value for theta, which can be @@ -80,7 +80,7 @@ def optimizer(obj_func, initial_theta, bounds): # the corresponding value of the target function. return theta_opt, func_min - Per default, the 'L-BGFS-B' algorithm from scipy.optimize.minimize + Per default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize is used. If None is passed, the kernel's parameters are kept fixed. Available internal optimizers are:: @@ -113,7 +113,7 @@ def optimizer(obj_func, initial_theta, bounds): random_state : int, RandomState instance or None, default=None Determines random number generation used to initialize the centers. Pass an int for reproducible results across multiple function calls. - See :term: `Glossary `. + See :term:`Glossary `. Attributes ---------- @@ -211,8 +211,8 @@ def fit(self, X, y): if self.alpha.shape[0] == 1: self.alpha = self.alpha[0] else: - raise ValueError("alpha must be a scalar or an array" - " with same number of entries as y.(%d != %d)" + raise ValueError("alpha must be a scalar or an array " + "with same number of entries as y. 
(%d != %d)" % (self.alpha.shape[0], y.shape[0])) self.X_train_ = np.copy(X) if self.copy_X_train else X @@ -283,9 +283,9 @@ def predict(self, X, return_std=False, return_cov=False): """Predict using the Gaussian process regression model We can also predict based on an unfitted model by using the GP prior. - In addition to the mean of the predictive distribution, also its - standard deviation (return_std=True) or covariance (return_cov=True). - Note that at most one of the two can be requested. + In addition to the mean of the predictive distribution, optionally also + returns its standard deviation (`return_std=True`) or covariance + (`return_cov=True`). Note that at most one of the two can be requested. Parameters ---------- @@ -302,7 +302,7 @@ def predict(self, X, return_std=False, return_cov=False): Returns ------- - y_mean : ndarray of shape (n_samples, [n_output_dims]) + y_mean : ndarray of shape (n_samples,) or (n_samples, n_targets) Mean of predictive distribution a query points. y_std : ndarray of shape (n_samples,), optional @@ -315,8 +315,7 @@ def predict(self, X, return_std=False, return_cov=False): """ if return_std and return_cov: raise RuntimeError( - "Not returning standard deviation of predictions when " - "returning full covariance.") + "At most one of return_std or return_cov can be requested.") if self.kernel is None or self.kernel.requires_vector_input: X = self._validate_data(X, ensure_2d=True, dtype="numeric", @@ -389,21 +388,22 @@ def sample_y(self, X, n_samples=1, random_state=0): Parameters ---------- - X : array-like of shape (n_samples, n_features) or list of object + X : array-like of shape (n_samples_X, n_features) or list of object Query points where the GP is evaluated. n_samples : int, default=1 - The number of samples drawn from the Gaussian process + Number of samples drawn from the Gaussian process per query point random_state : int, RandomState instance or None, default=0 Determines random number generation to randomly draw samples. Pass an int for reproducible results across multiple function calls. - See :term: `Glossary `. + See :term:`Glossary `. Returns ------- - y_samples : ndarray of shape (n_samples_X, [n_output_dims], n_samples) + y_samples : ndarray of shape (n_samples_X, n_samples), or \ + (n_samples_X, n_targets, n_samples) Values of n_samples samples drawn from Gaussian process and evaluated at query points. 
""" From e56d76a8da59f1d28f7887c8be4e55076da885b7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 12 Apr 2021 17:03:02 +0200 Subject: [PATCH 318/478] FIX Removes unecessary check in _BaseChain (#19865) --- sklearn/multioutput.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 4cb01c524d59d..9b64d28f41eb8 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -470,7 +470,6 @@ def fit(self, X, Y, **fit_params): X, Y = self._validate_data(X, Y, multi_output=True, accept_sparse=True) random_state = check_random_state(self.random_state) - check_array(X, accept_sparse=True) self.order_ = self.order if isinstance(self.order_, tuple): self.order_ = np.array(self.order_) From c09c654ed4d5833d73f557381f3d10f3d062e5d7 Mon Sep 17 00:00:00 2001 From: Vinicius Rios Fuck Date: Mon, 12 Apr 2021 16:59:41 -0300 Subject: [PATCH 319/478] DOC Fix typo in common_pitfalls.rst (#19867) --- doc/common_pitfalls.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/common_pitfalls.rst b/doc/common_pitfalls.rst index 6bc79fbc14c0d..ac5dccb3b5609 100644 --- a/doc/common_pitfalls.rst +++ b/doc/common_pitfalls.rst @@ -564,7 +564,7 @@ preformance by letting the estimator use a different RNG on each fold. This is done by passing a `RandomState` instance (or `None`) to the estimator initialization. -When we pass an integer, the estimator will use the same RNG on each fold: if +When we pass an integer, the estimator will use the same RNG on each fold: if the estimator performs well (or bad), as evaluated by CV, it might just be because we got lucky (or unlucky) with that specific seed. Passing instances leads to more robust CV results, and makes the comparison between various From c59a310e5eb4c3f72a00503a2643005551b9d3eb Mon Sep 17 00:00:00 2001 From: Vinicius Rios Fuck Date: Mon, 12 Apr 2021 23:18:20 -0300 Subject: [PATCH 320/478] DOC Fix typos plot_column_transformer_mixed_types.py (#19871) --- examples/compose/plot_column_transformer_mixed_types.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index a2937e041f186..401fe67b7f587 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -147,8 +147,8 @@ # %% # The resulting score is not exactly the same as the one from the previous -# pipeline becase the dtype-based selector treats the ``pclass`` columns as -# a numeric features instead of a categorical feature as previously: +# pipeline because the dtype-based selector treats the ``pclass`` column as +# a numeric feature instead of a categorical feature as previously: selector(dtype_exclude="category")(X_train) @@ -201,7 +201,7 @@ # %% # The best hyper-parameters have be used to re-fit a final model on the full # training set. We can evaluate that final model on held out test data that was -# not used for hyparameter tuning. +# not used for hyperparameter tuning. # print(("best logistic regression from grid search: %.3f" % grid_search.score(X_test, y_test))) From 926633c00f476f0fcbee9bac2dd275249feef444 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 13 Apr 2021 09:44:24 +0200 Subject: [PATCH 321/478] Update who may propose a new triage member. 
(#19870) --- doc/governance.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/governance.rst b/doc/governance.rst index 4ab968786cd79..1d971f16a566e 100644 --- a/doc/governance.rst +++ b/doc/governance.rst @@ -40,9 +40,10 @@ Similarly to what has been decided in the `python project any contributor may become a member of the scikit-learn triage team, after showing some continuity in participating to scikit-learn development (with pull requests and reviews). -Any core developer is welcome to propose a scikit-learn contributor to join the -triage team. Other core developers are then consulted: while it is expected -that most acceptances will be unanimous, a two-thirds majority is enough. +Any core developer or member of the triage team is welcome to propose a +scikit-learn contributor to join the triage team. Other core developers +are then consulted: while it is expected that most acceptances will be +unanimous, a two-thirds majority is enough. Every new triager will be announced in the mailing list. Triagers are welcome to participate in `monthly core developer meetings `_. From 767fd63c9ddddc46e288fdec2cca36a129529a8e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 13 Apr 2021 12:27:14 +0200 Subject: [PATCH 322/478] DOC make documentation consistent regarding types in _encoders.py (#19876) --- sklearn/preprocessing/_encoders.py | 33 +++++++++++++++++------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index d3f557d2993cb..65e86e512e381 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -406,7 +406,7 @@ def fit(self, X, y=None): Parameters ---------- - X : array-like, shape [n_samples, n_features] + X : array-like of shape (n_samples, n_features) The data to determine the categories of each feature. y : None @@ -431,7 +431,7 @@ def fit_transform(self, X, y=None): Parameters ---------- - X : array-like, shape [n_samples, n_features] + X : array-like of shape (n_samples, n_features) The data to encode. y : None @@ -440,8 +440,10 @@ def fit_transform(self, X, y=None): Returns ------- - X_out : sparse matrix if sparse=True else a 2-d array - Transformed input. + X_out : {ndarray, sparse matrix} of shape \ + (n_samples, n_encoded_features) + Transformed input. If `sparse=True`, a sparse matrix will be + returned. """ self._validate_keywords() return super().fit_transform(X, y) @@ -452,13 +454,15 @@ def transform(self, X): Parameters ---------- - X : array-like, shape [n_samples, n_features] + X : array-like of shape (n_samples, n_features) The data to encode. Returns ------- - X_out : sparse matrix if sparse=True else a 2-d array - Transformed input. + X_out : {ndarray, sparse matrix} of shape \ + (n_samples, n_encoded_features) + Transformed input. If `sparse=True`, a sparse matrix will be + returned. """ check_is_fitted(self) # validation of X happens in _check_X called by _transform @@ -522,12 +526,13 @@ def inverse_transform(self, X): Parameters ---------- - X : array-like or sparse matrix, shape [n_samples, n_encoded_features] + X : {array-like, sparse matrix} of shape \ + (n_samples, n_encoded_features) The transformed data. Returns ------- - X_tr : array-like, shape [n_samples, n_features] + X_tr : ndarray of shape (n_samples, n_features) Inverse transformed array. 
""" check_is_fitted(self) @@ -745,7 +750,7 @@ def fit(self, X, y=None): Parameters ---------- - X : array-like, shape [n_samples, n_features] + X : array-like of shape (n_samples, n_features) The data to determine the categories of each feature. y : None @@ -814,12 +819,12 @@ def transform(self, X): Parameters ---------- - X : array-like, shape [n_samples, n_features] + X : array-like of shape (n_samples, n_features) The data to encode. Returns ------- - X_out : sparse matrix or a 2-d array + X_out : ndarray of shape (n_samples, n_features) Transformed input. """ X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown, @@ -841,12 +846,12 @@ def inverse_transform(self, X): Parameters ---------- - X : array-like or sparse matrix, shape [n_samples, n_encoded_features] + X : {array-like, sparse matrix} of shape (n_samples, n_features) The transformed data. Returns ------- - X_tr : array-like, shape [n_samples, n_features] + X_tr : ndarray of shape (n_samples, n_features) Inverse transformed array. """ check_is_fitted(self) From 8a3939aa69a9faa45eefc4dfb37d5d3f39f425d3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 13 Apr 2021 13:59:30 +0200 Subject: [PATCH 323/478] FIX Error for sparse matrix in OrdinalEncoder.inverse_transform (#19879) --- doc/whats_new/v1.0.rst | 4 ++++ sklearn/preprocessing/_encoders.py | 4 ++-- sklearn/preprocessing/tests/test_encoders.py | 22 ++++++++++++++++++++ 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index ba3f6d6d1110d..23211cd3a95b1 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -343,6 +343,10 @@ Changelog `handle_unknown='ignore'` and dropping categories. :pr:`19041` by `Thomas Fan`_. +- |Fix| :meth:`preprocessing.OrdinalEncoder.inverse_transform` is not + supporting sparse matrix and raise the appropriate error message. + :pr:`19879` by :user:`Guillaume Lemaitre `. + :mod:`sklearn.tree` ................... diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 65e86e512e381..cd05dc89bb75d 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -846,7 +846,7 @@ def inverse_transform(self, X): Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) + X : array-like of shape (n_samples, n_encoded_features) The transformed data. Returns @@ -855,7 +855,7 @@ def inverse_transform(self, X): Inverse transformed array. """ check_is_fitted(self) - X = check_array(X, accept_sparse='csr', force_all_finite='allow-nan') + X = check_array(X, force_all_finite='allow-nan') n_samples, _ = X.shape n_features = len(self.categories_) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index eb776c4c25267..9f1e331f78fec 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -1110,3 +1110,25 @@ def test_ordinal_encoder_handle_missing_and_unknown( assert_allclose(X_trans, expected_X_trans) assert_allclose(oe.transform(X_test), [[-1.0]]) + + +def test_ordinal_encoder_sparse(): + """Check that we raise proper error with sparse input in OrdinalEncoder. 
+ Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/19878 + """ + X = np.array([[3, 2, 1], [0, 1, 1]]) + X_sparse = sparse.csr_matrix(X) + + encoder = OrdinalEncoder() + + err_msg = "A sparse matrix was passed, but dense data is required" + with pytest.raises(TypeError, match=err_msg): + encoder.fit(X_sparse) + with pytest.raises(TypeError, match=err_msg): + encoder.fit_transform(X_sparse) + + X_trans = encoder.fit_transform(X) + X_trans_sparse = sparse.csr_matrix(X_trans) + with pytest.raises(TypeError, match=err_msg): + encoder.inverse_transform(X_trans_sparse) From bbdd3bbbec6c28c03d2e7dbbf96039eaf3c64f97 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 13 Apr 2021 17:11:13 +0200 Subject: [PATCH 324/478] CI Add label on PRs modifying Cython code (#19850) Co-authored-by: "Thomas J. Fan" Co-authored-by: Nicolas Hug Co-authored-by: Olivier Grisel --- .github/labeler-file-extensions.yml | 8 ++++++++ .github/workflows/labeler-module.yml | 10 ++++++++++ 2 files changed, 18 insertions(+) create mode 100644 .github/labeler-file-extensions.yml diff --git a/.github/labeler-file-extensions.yml b/.github/labeler-file-extensions.yml new file mode 100644 index 0000000000000..63fcfcacfeb17 --- /dev/null +++ b/.github/labeler-file-extensions.yml @@ -0,0 +1,8 @@ +cython: +- sklearn/**/*.pyx +- sklearn/**/*.pxd +- sklearn/**/*.pxi +# Tempita templates +- sklearn/**/*.pyx.tp +- sklearn/**/*.pxd.tp +- sklearn/**/*.pxi.tp diff --git a/.github/workflows/labeler-module.yml b/.github/workflows/labeler-module.yml index 3a9ed8d364f79..eb1669443bb0d 100644 --- a/.github/workflows/labeler-module.yml +++ b/.github/workflows/labeler-module.yml @@ -12,3 +12,13 @@ jobs: repo-token: "${{ secrets.GITHUB_TOKEN }}" max-labels: "3" configuration-path: ".github/labeler-module.yml" + + triage_file_extensions: + runs-on: ubuntu-latest + steps: + - uses: thomasjpfan/labeler@v2.5.0 + continue-on-error: true + if: github.repository == 'scikit-learn/scikit-learn' + with: + repo-token: "${{ secrets.GITHUB_TOKEN }}" + configuration-path: ".github/labeler-file-extensions.yml" \ No newline at end of file From ab65c8b7d672164e43479f38a95788376229fed0 Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Tue, 13 Apr 2021 16:47:09 +0100 Subject: [PATCH 325/478] TST Changes assert to pytest style in tests/test_isotonic.py (#19864) Co-authored-by: Alihan Zihna --- sklearn/tests/test_isotonic.py | 69 +++++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 18 deletions(-) diff --git a/sklearn/tests/test_isotonic.py b/sklearn/tests/test_isotonic.py index af14f73cd1beb..a88c830256e73 100644 --- a/sklearn/tests/test_isotonic.py +++ b/sklearn/tests/test_isotonic.py @@ -9,10 +9,9 @@ IsotonicRegression, _make_unique) from sklearn.utils.validation import check_array -from sklearn.utils._testing import (assert_raises, assert_allclose, +from sklearn.utils._testing import (assert_allclose, assert_array_equal, - assert_array_almost_equal, - assert_warns_message, assert_no_warnings) + assert_array_almost_equal) from sklearn.utils import shuffle from scipy.special import expit @@ -37,7 +36,10 @@ def test_check_increasing_small_number_of_samples(): x = [0, 1, 2] y = [1, 1.1, 1.05] - is_increasing = assert_no_warnings(check_increasing, x, y) + with pytest.warns(None) as record: + is_increasing = check_increasing(x, y) + assert len(record) == 0 + assert is_increasing @@ -46,7 +48,10 @@ def test_check_increasing_up(): y = [0, 1.5, 2.77, 8.99, 8.99, 50] # Check that we got increasing=True and no 
warnings - is_increasing = assert_no_warnings(check_increasing, x, y) + with pytest.warns(None) as record: + is_increasing = check_increasing(x, y) + assert len(record) == 0 + assert is_increasing @@ -55,7 +60,10 @@ def test_check_increasing_up_extreme(): y = [0, 1, 2, 3, 4, 5] # Check that we got increasing=True and no warnings - is_increasing = assert_no_warnings(check_increasing, x, y) + with pytest.warns(None) as record: + is_increasing = check_increasing(x, y) + assert len(record) == 0 + assert is_increasing @@ -64,7 +72,10 @@ def test_check_increasing_down(): y = [0, -1.5, -2.77, -8.99, -8.99, -50] # Check that we got increasing=False and no warnings - is_increasing = assert_no_warnings(check_increasing, x, y) + with pytest.warns(None) as record: + is_increasing = check_increasing(x, y) + assert len(record) == 0 + assert not is_increasing @@ -73,7 +84,10 @@ def test_check_increasing_down_extreme(): y = [0, -1, -2, -3, -4, -5] # Check that we got increasing=False and no warnings - is_increasing = assert_no_warnings(check_increasing, x, y) + with pytest.warns(None) as record: + is_increasing = check_increasing(x, y) + assert len(record) == 0 + assert not is_increasing @@ -82,9 +96,9 @@ def test_check_ci_warn(): y = [0, -1, 2, -3, 4, -5] # Check that we got increasing=False and CI interval warning - is_increasing = assert_warns_message(UserWarning, "interval", - check_increasing, - x, y) + msg = "interval" + with pytest.warns(UserWarning, match=msg): + is_increasing = check_increasing(x, y) assert not is_increasing @@ -244,10 +258,21 @@ def test_isotonic_regression_auto_increasing(): def test_assert_raises_exceptions(): ir = IsotonicRegression() rng = np.random.RandomState(42) - assert_raises(ValueError, ir.fit, [0, 1, 2], [5, 7, 3], [0.1, 0.6]) - assert_raises(ValueError, ir.fit, [0, 1, 2], [5, 7]) - assert_raises(ValueError, ir.fit, rng.randn(3, 10), [0, 1, 2]) - assert_raises(ValueError, ir.transform, rng.randn(3, 10)) + + msg = "Found input variables with inconsistent numbers of samples" + with pytest.raises(ValueError, match=msg): + ir.fit([0, 1, 2], [5, 7, 3], [0.1, 0.6]) + + with pytest.raises(ValueError, match=msg): + ir.fit([0, 1, 2], [5, 7]) + + msg = 'X should be a 1d array' + with pytest.raises(ValueError, match=msg): + ir.fit(rng.randn(3, 10), [0, 1, 2]) + + msg = 'Isotonic regression input X should be a 1d array' + with pytest.raises(ValueError, match=msg): + ir.transform(rng.randn(3, 10)) def test_isotonic_sample_weight_parameter_default_value(): @@ -298,7 +323,9 @@ def test_isotonic_regression_oob_raise(): ir.fit(x, y) # Check that an exception is thrown - assert_raises(ValueError, ir.predict, [min(x) - 10, max(x) + 10]) + msg = 'A value in x_new is below the interpolation range' + with pytest.raises(ValueError, match=msg): + ir.predict([min(x) - 10, max(x) + 10]) def test_isotonic_regression_oob_clip(): @@ -340,7 +367,10 @@ def test_isotonic_regression_oob_bad(): ir = IsotonicRegression(increasing='auto', out_of_bounds="xyz") # Make sure that we throw an error for bad out_of_bounds value - assert_raises(ValueError, ir.fit, x, y) + msg = ("The argument ``out_of_bounds`` must be in 'nan', " + "'clip', 'raise'; got xyz") + with pytest.raises(ValueError, match=msg): + ir.fit(x, y) def test_isotonic_regression_oob_bad_after(): @@ -354,7 +384,10 @@ def test_isotonic_regression_oob_bad_after(): # Make sure that we throw an error for bad out_of_bounds value in transform ir.fit(x, y) ir.out_of_bounds = "xyz" - assert_raises(ValueError, ir.transform, x) + msg = ("The argument 
``out_of_bounds`` must be in 'nan', " + "'clip', 'raise'; got xyz") + with pytest.raises(ValueError, match=msg): + ir.transform(x) def test_isotonic_regression_pickle(): From 7fa2e6e2734b590d96e62d5932c648a9c1002f34 Mon Sep 17 00:00:00 2001 From: Christopher Yeh Date: Tue, 13 Apr 2021 15:00:39 -0600 Subject: [PATCH 326/478] DOC Clarify documentation for spectral clustering (#19795) --- sklearn/cluster/_spectral.py | 110 ++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 53 deletions(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index b86d5870025c3..e9a5d7a7b4302 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -191,7 +191,7 @@ def spectral_clustering(affinity, *, n_clusters=8, n_components=None, Number of clusters to extract. n_components : int, default=n_clusters - Number of eigen vectors to use for the spectral embedding + Number of eigenvectors to use for the spectral embedding eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'} The eigenvalue decomposition strategy to use. AMG requires pyamg @@ -201,15 +201,16 @@ def spectral_clustering(affinity, *, n_clusters=8, n_components=None, random_state : int, RandomState instance, default=None A pseudo random number generator used for the initialization of the - lobpcg eigen vectors decomposition when eigen_solver == 'amg' and by + lobpcg eigenvectors decomposition when eigen_solver == 'amg' and by the K-Means initialization. Use an int to make the randomness deterministic. See :term:`Glossary `. n_init : int, default=10 Number of time the k-means algorithm will be run with different - centroid seeds. The final results will be the best output of - n_init consecutive runs in terms of inertia. + centroid seeds. The final results will be the best output of n_init + consecutive runs in terms of inertia. Only used if + ``assign_labels='kmeans'``. eigen_tol : float, default=0.0 Stopping criterion for eigendecomposition of the Laplacian matrix @@ -217,7 +218,7 @@ def spectral_clustering(affinity, *, n_clusters=8, n_components=None, assign_labels : {'kmeans', 'discretize'}, default='kmeans' The strategy to use to assign labels in the embedding - space. There are two ways to assign labels after the laplacian + space. There are two ways to assign labels after the Laplacian embedding. k-means can be applied and is a popular choice. But it can also be sensitive to initialization. Discretization is another approach which is less sensitive to random initialization. See @@ -265,7 +266,7 @@ def spectral_clustering(affinity, *, n_clusters=8, n_components=None, random_state = check_random_state(random_state) n_components = n_clusters if n_components is None else n_components - # The first eigen vector is constant only for fully connected graphs + # The first eigenvector is constant only for fully connected graphs # and should be kept for spectral clustering (drop_first = False) # See spectral_embedding documentation. maps = spectral_embedding(affinity, n_components=n_components, @@ -288,24 +289,24 @@ class SpectralClustering(ClusterMixin, BaseEstimator): """Apply clustering to a projection of the normalized Laplacian. In practice Spectral Clustering is very useful when the structure of - the individual clusters is highly non-convex or more generally when + the individual clusters is highly non-convex, or more generally when a measure of the center and spread of the cluster is not a suitable - description of the complete cluster. 
For instance when clusters are + description of the complete cluster, such as when clusters are nested circles on the 2D plane. - If affinity is the adjacency matrix of a graph, this method can be - used to find normalized graph cuts. + If the affinity matrix is the adjacency matrix of a graph, this method + can be used to find normalized graph cuts. When calling ``fit``, an affinity matrix is constructed using either - kernel function such the Gaussian (aka RBF) kernel of the euclidean - distanced ``d(X, X)``:: + a kernel function such the Gaussian (aka RBF) kernel with Euclidean + distance ``d(X, X)``:: np.exp(-gamma * d(X,X) ** 2) or a k-nearest neighbors connectivity matrix. - Alternatively, using ``precomputed``, a user-provided affinity - matrix can be used. + Alternatively, a user-provided affinity matrix can be specified by + setting ``affinity='precomputed'``. Read more in the :ref:`User Guide `. @@ -321,19 +322,20 @@ class SpectralClustering(ClusterMixin, BaseEstimator): used. n_components : int, default=n_clusters - Number of eigen vectors to use for the spectral embedding + Number of eigenvectors to use for the spectral embedding random_state : int, RandomState instance, default=None A pseudo random number generator used for the initialization of the - lobpcg eigen vectors decomposition when ``eigen_solver='amg'`` and by + lobpcg eigenvectors decomposition when ``eigen_solver='amg'`` and by the K-Means initialization. Use an int to make the randomness deterministic. See :term:`Glossary `. n_init : int, default=10 Number of time the k-means algorithm will be run with different - centroid seeds. The final results will be the best output of - n_init consecutive runs in terms of inertia. + centroid seeds. The final results will be the best output of n_init + consecutive runs in terms of inertia. Only used if + ``assign_labels='kmeans'``. gamma : float, default=1.0 Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels. @@ -341,14 +343,15 @@ class SpectralClustering(ClusterMixin, BaseEstimator): affinity : str or callable, default='rbf' How to construct the affinity matrix. - - 'nearest_neighbors' : construct the affinity matrix by computing a + - 'nearest_neighbors': construct the affinity matrix by computing a graph of nearest neighbors. - - 'rbf' : construct the affinity matrix using a radial basis function + - 'rbf': construct the affinity matrix using a radial basis function (RBF) kernel. - - 'precomputed' : interpret ``X`` as a precomputed affinity matrix. - - 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph - of precomputed nearest neighbors, and constructs the affinity matrix - by selecting the ``n_neighbors`` nearest neighbors. + - 'precomputed': interpret ``X`` as a precomputed affinity matrix, + where larger values indicate greater similarity between instances. + - 'precomputed_nearest_neighbors': interpret ``X`` as a sparse graph + of precomputed distances, and construct a binary affinity matrix + from the ``n_neighbors`` nearest neighbors of each instance. - one of the kernels supported by :func:`~sklearn.metrics.pairwise_kernels`. @@ -365,11 +368,11 @@ class SpectralClustering(ClusterMixin, BaseEstimator): when ``eigen_solver='arpack'``. assign_labels : {'kmeans', 'discretize'}, default='kmeans' - The strategy to use to assign labels in the embedding - space. There are two ways to assign labels after the laplacian - embedding. k-means can be applied and is a popular choice. But it can - also be sensitive to initialization. 
Discretization is another approach - which is less sensitive to random initialization. + The strategy for assigning labels in the embedding space. There are two + ways to assign labels after the Laplacian embedding. k-means is a + popular choice, but it can be sensitive to initialization. + Discretization is another approach which is less sensitive to random + initialization. degree : float, default=3 Degree of the polynomial kernel. Ignored by other kernels. @@ -398,7 +401,7 @@ class SpectralClustering(ClusterMixin, BaseEstimator): Attributes ---------- affinity_matrix_ : array-like of shape (n_samples, n_samples) - Affinity matrix used for clustering. Available only if after calling + Affinity matrix used for clustering. Available only after calling ``fit``. labels_ : ndarray of shape (n_samples,) @@ -411,7 +414,7 @@ class SpectralClustering(ClusterMixin, BaseEstimator): >>> X = np.array([[1, 1], [2, 1], [1, 0], ... [4, 7], [3, 5], [3, 6]]) >>> clustering = SpectralClustering(n_clusters=2, - ... assign_labels="discretize", + ... assign_labels='discretize', ... random_state=0).fit(X) >>> clustering.labels_ array([1, 1, 1, 0, 0, 0]) @@ -421,19 +424,18 @@ class SpectralClustering(ClusterMixin, BaseEstimator): Notes ----- - If you have an affinity matrix, such as a distance matrix, - for which 0 means identical elements, and high values means - very dissimilar elements, it can be transformed in a - similarity matrix that is well suited for the algorithm by - applying the Gaussian (RBF, heat) kernel:: + A distance matrix for which 0 indicates identical elements and high values + indicate very dissimilar elements can be transformed into an affinity / + similarity matrix that is well-suited for the algorithm by + applying the Gaussian (aka RBF, heat) kernel:: np.exp(- dist_matrix ** 2 / (2. * delta ** 2)) - Where ``delta`` is a free parameter representing the width of the Gaussian + where ``delta`` is a free parameter representing the width of the Gaussian kernel. - Another alternative is to take a symmetric version of the k - nearest neighbors connectivity matrix of the points. + An alternative is to take a symmetric version of the k-nearest neighbors + connectivity matrix of the points. If the pyamg package is installed, it is used: this greatly speeds up computation. @@ -480,13 +482,14 @@ def fit(self, X, y=None): Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ - array-like of shape (n_samples, n_samples) - Training instances to cluster, or similarities / affinities between - instances if ``affinity='precomputed'``. If a sparse matrix is - provided in a format other than ``csr_matrix``, ``csc_matrix``, - or ``coo_matrix``, it will be converted into a sparse - ``csr_matrix``. + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) + Training instances to cluster, similarities / affinities between + instances if ``affinity='precomputed'``, or distances between + instances if ``affinity='precomputed_nearest_neighbors``. If a + sparse matrix is provided in a format other than ``csr_matrix``, + ``csc_matrix``, or ``coo_matrix``, it will be converted into a + sparse ``csr_matrix``. y : Ignored Not used, present here for API consistency by convention. 
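# Illustrative sketch (not part of the diff above): one way to use the
# precomputed-affinity path documented in ``fit``. The Gaussian-kernel
# transform mirrors the Notes section of the docstring; ``delta`` is an
# arbitrary illustrative width, not a library default.
import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.metrics import pairwise_distances

rng = np.random.RandomState(0)
X = rng.rand(20, 2)

delta = 1.0  # free parameter: width of the Gaussian (RBF) kernel
dist_matrix = pairwise_distances(X)          # distances, 0 means identical
affinity = np.exp(-dist_matrix ** 2 / (2.0 * delta ** 2))  # similarities

labels = SpectralClustering(
    n_clusters=2, affinity="precomputed", random_state=0
).fit_predict(affinity)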
@@ -549,13 +552,14 @@ def fit_predict(self, X, y=None): Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ - array-like of shape (n_samples, n_samples) - Training instances to cluster, or similarities / affinities between - instances if ``affinity='precomputed'``. If a sparse matrix is - provided in a format other than ``csr_matrix``, ``csc_matrix``, - or ``coo_matrix``, it will be converted into a sparse - ``csr_matrix``. + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) + Training instances to cluster, similarities / affinities between + instances if ``affinity='precomputed'``, or distances between + instances if ``affinity='precomputed_nearest_neighbors``. If a + sparse matrix is provided in a format other than ``csr_matrix``, + ``csc_matrix``, or ``coo_matrix``, it will be converted into a + sparse ``csr_matrix``. y : Ignored Not used, present here for API consistency by convention. From 872052b9ab471cb336c448cf4e0aa968b49f9199 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 14 Apr 2021 08:30:25 +0100 Subject: [PATCH 327/478] FIX convert cv_results_ values to numpy array in SuccessiveHalving (#19211) Co-authored-by: Thomas J. Fan --- doc/whats_new/v0.24.rst | 5 +++++ sklearn/model_selection/_search.py | 4 ++++ sklearn/model_selection/tests/test_successive_halving.py | 6 ++++++ 3 files changed, 15 insertions(+) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 2cfe6970dd7b1..09f3d9bdecd3e 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -40,6 +40,11 @@ Changelog :class:`model_selection.GridSearchCV` now correctly shows the score for single metrics and verbose > 2. :pr:`19659` by `Thomas Fan`_. +- |Fix| Some values in the `cv_results_` attribute of + :class:`model_selection.HalvingRandomSearchCV` and + :class:`model_selection.HalvingGridSearchCV` were not properly converted to + numpy arrays. :pr:`19211` by `Nicolas Hug`_. + :mod:`sklearn.preprocessing` ............................ 
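# Illustrative sketch (not part of the diff): the behaviour targeted by the
# changelog entry above. After the fix, the extra successive-halving keys in
# ``cv_results_`` are numpy arrays like every other entry, so they can be
# consumed uniformly (for instance, placed into a pandas DataFrame).
import numpy as np
from sklearn.datasets import make_classification
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=100, random_state=0)
search = HalvingGridSearchCV(
    LinearSVC(random_state=0), {"C": [0.1, 1]}, random_state=0
).fit(X, y)

assert isinstance(search.cv_results_["iter"], np.ndarray)
assert isinstance(search.cv_results_["n_resources"], np.ndarray)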
diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index ebd085c08e68f..6e837a2f97b24 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -897,6 +897,10 @@ def _format_results(self, candidate_params, n_splits, out, out = _aggregate_score_dicts(out) results = dict(more_results or {}) + for key, val in results.items(): + # each value is a list (as per evaluate_candidate's convention) + # we convert it to an array for consistency with the other keys + results[key] = np.asarray(val) def _store(key_name, array, weights=None, splits=False, rank=False): """A small helper to store the scores/times to the cv_results_""" diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index 6660b35a934ba..3abd7956938d1 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -445,6 +445,12 @@ def scorer(est, X, y): sh.set_params(n_candidates=2 * 30, min_resources='exhaust') sh.fit(X, y) + + # non-regression check for + # https://github.com/scikit-learn/scikit-learn/issues/19203 + assert isinstance(sh.cv_results_['iter'], np.ndarray) + assert isinstance(sh.cv_results_['n_resources'], np.ndarray) + cv_results_df = pd.DataFrame(sh.cv_results_) # just make sure we don't have ties From 684b7d1955e76c0621ca2e399df90e83e525a6f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Wed, 14 Apr 2021 13:34:22 +0200 Subject: [PATCH 328/478] FIX detect near constant feature in StandardScaler and linear models (#19788) Co-authored-by: Olivier Grisel --- doc/whats_new/v1.0.rst | 3 +- sklearn/linear_model/_base.py | 6 +- sklearn/preprocessing/_data.py | 19 ++++- sklearn/preprocessing/tests/test_data.py | 63 +++++++++++++++-- sklearn/utils/extmath.py | 42 +++++++++-- sklearn/utils/sparsefuncs_fast.pyx | 90 ++++++++++++++---------- 6 files changed, 166 insertions(+), 57 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 23211cd3a95b1..516af4b349c00 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -334,7 +334,8 @@ Changelog very large values. This problem happens in particular when using a scaler on sparse data with a constant column with sample weights, in which case centering is typically disabled. :pr:`19527` by :user:`Oliver Grisel - ` and :user:`Maria Telenczuk `. + ` and :user:`Maria Telenczuk ` and :pr:`19788` by + :user:`Jérémie du Boisberranger `. - |Fix| :meth:`preprocessing.StandardScaler.inverse_transform` now correctly handles integer dtypes. :pr:`19356` by :user:`makoeppel`. 
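# Illustrative sketch (not part of the diff): the scaling behaviour the
# changelog entries above describe. A column whose weighted variance is
# indistinguishable from zero is treated as constant, so its scale stays 1
# instead of becoming a tiny noisy value that would blow the feature up.
import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[1.0, 100.0],
              [2.0, 100.0],
              [3.0, 100.0]])
sample_weight = np.array([1.0, 2.0, 1.0])

# with_mean=False mimics the sparse-data setting where centering is disabled.
scaler = StandardScaler(with_mean=False).fit(X, sample_weight=sample_weight)

print(scaler.var_)    # second entry is ~0 (constant column)
print(scaler.scale_)  # second entry is exactly 1.0, so it is left unscaled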
diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index c80c2db622921..5783e4740a08c 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -28,6 +28,7 @@ from ..base import (BaseEstimator, ClassifierMixin, RegressorMixin, MultiOutputMixin) +from ..preprocessing._data import _is_constant_feature from ..utils import check_array from ..utils.validation import FLOAT_DTYPES from ..utils.validation import _deprecate_positional_args @@ -39,7 +40,6 @@ from ..utils._seq_dataset import ArrayDataset32, CSRDataset32 from ..utils._seq_dataset import ArrayDataset64, CSRDataset64 from ..utils.validation import check_is_fitted, _check_sample_weight - from ..utils.fixes import delayed # TODO: bayesian_ridge_regression and bayesian_regression_ard @@ -271,8 +271,8 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, X_var = X_var.astype(X.dtype, copy=False) # Detect constant features on the computed variance, before taking # the np.sqrt. Otherwise constant features cannot be detected with - # sample_weights. - constant_mask = X_var < 10 * np.finfo(X.dtype).eps + # sample weights. + constant_mask = _is_constant_feature(X_var, X_offset, X.shape[0]) X_var *= X.shape[0] X_scale = np.sqrt(X_var, out=X_var) X_scale[constant_mask] = 1. diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 6191fb2fd8bcd..80cb132174328 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -57,6 +57,22 @@ ] +def _is_constant_feature(var, mean, n_samples): + """Detect if a feature is indistinguishable from a constant feature. + + The detection is based on its computed variance and on the theoretical + error bounds of the '2 pass algorithm' for variance computation. + + See "Algorithms for computing the sample variance: analysis and + recommendations", by Chan, Golub, and LeVeque. + """ + # In scikit-learn, variance is always computed using float64 accumulators. + eps = np.finfo(np.float64).eps + + upper_bound = n_samples * eps * var + (n_samples * mean * eps)**2 + return var <= upper_bound + + def _handle_zeros_in_scale(scale, copy=True, constant_mask=None): """Set scales of near constant features to 1. @@ -863,7 +879,8 @@ def partial_fit(self, X, y=None, sample_weight=None): if self.with_std: # Extract the list of near constant features on the raw variances, # before taking the square root. - constant_mask = self.var_ < 10 * np.finfo(X.dtype).eps + constant_mask = _is_constant_feature( + self.var_, self.mean_, self.n_samples_seen_) self.scale_ = _handle_zeros_in_scale( np.sqrt(self.var_), copy=False, constant_mask=constant_mask) else: diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 5557562283850..45d967d5f39a2 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -224,13 +224,6 @@ def test_standard_scaler_dtype(add_sample_weight, sparse_constructor): @pytest.mark.parametrize("constant", [0, 1., 100.]) def test_standard_scaler_constant_features( scaler, add_sample_weight, sparse_constructor, dtype, constant): - if (isinstance(scaler, StandardScaler) - and constant > 1 - and sparse_constructor is not np.asarray - and add_sample_weight): - # https://github.com/scikit-learn/scikit-learn/issues/19546 - pytest.xfail("Computation of weighted variance is numerically unstable" - " for sparse data. 
See: #19546.") if isinstance(scaler, RobustScaler) and add_sample_weight: pytest.skip(f"{scaler.__class__.__name__} does not yet support" @@ -269,6 +262,62 @@ def test_standard_scaler_constant_features( assert_allclose(X_scaled_2, X_scaled_2) +@pytest.mark.parametrize("n_samples", [10, 100, 10_000]) +@pytest.mark.parametrize("average", [1e-10, 1, 1e10]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("array_constructor", + [np.asarray, sparse.csc_matrix, sparse.csr_matrix]) +def test_standard_scaler_near_constant_features(n_samples, array_constructor, + average, dtype): + # Check that when the variance is too small (var << mean**2) the feature + # is considered constant and not scaled. + + scale_min, scale_max = -30, 19 + scales = np.array([10**i for i in range(scale_min, scale_max + 1)], + dtype=dtype) + + n_features = scales.shape[0] + X = np.empty((n_samples, n_features), dtype=dtype) + # Make a dataset of known var = scales**2 and mean = average + X[:n_samples//2, :] = average + scales + X[n_samples//2:, :] = average - scales + X_array = array_constructor(X) + + scaler = StandardScaler(with_mean=False).fit(X_array) + + # StandardScaler uses float64 accumulators even if the data has a float32 + # dtype. + eps = np.finfo(np.float64).eps + + # if var < bound = N.eps.var + N².eps².mean², the feature is considered + # constant and the scale_ attribute is set to 1. + bounds = n_samples * eps * scales**2 + n_samples**2 * eps**2 * average**2 + within_bounds = scales**2 <= bounds + + # Check that scale_min is small enough to have some scales below the + # bound and therefore detected as constant: + assert np.any(within_bounds) + + # Check that such features are actually treated as constant by the scaler: + assert all(scaler.var_[within_bounds] <= bounds[within_bounds]) + assert_allclose(scaler.scale_[within_bounds], 1.) + + # Depending the on the dtype of X, some features might not actually be + # representable as non constant for small scales (even if above the + # precision bound of the float64 variance estimate). Such feature should + # be correctly detected as constants with 0 variance by StandardScaler. + representable_diff = X[0, :] - X[-1, :] != 0 + assert_allclose(scaler.var_[np.logical_not(representable_diff)], 0) + assert_allclose(scaler.scale_[np.logical_not(representable_diff)], 1) + + # The other features are scaled and scale_ is equal to sqrt(var_) assuming + # that scales are large enough for average + scale and average - scale to + # be distinct in X (depending on X's dtype). + common_mask = np.logical_and(scales**2 > bounds, representable_diff) + assert_allclose(scaler.scale_[common_mask], + np.sqrt(scaler.var_)[common_mask]) + + def test_scale_1d(): # 1-d inputs X_list = [1., 3., 5., 0.] diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 42a014dcd8ade..add8c5883a751 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -18,6 +18,7 @@ from . 
import check_random_state from ._logistic_sigmoid import _log_logistic_sigmoid +from .fixes import np_version, parse_version from .sparsefuncs_fast import csr_row_norms from .validation import check_array from .validation import _deprecate_positional_args @@ -767,10 +768,17 @@ def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count, # updated = the aggregated stats last_sum = last_mean * last_sample_count if sample_weight is not None: - new_sum = _safe_accumulator_op(np.nansum, X * sample_weight[:, None], - axis=0) - new_sample_count = np.sum(sample_weight[:, None] * (~np.isnan(X)), - axis=0) + if np_version >= parse_version("1.16.6"): + # equivalent to np.nansum(X * sample_weight, axis=0) + # safer because np.float64(X*W) != np.float64(X)*np.float64(W) + # dtype arg of np.matmul only exists since version 1.16 + new_sum = _safe_accumulator_op( + np.matmul, sample_weight, np.where(np.isnan(X), 0, X)) + else: + new_sum = _safe_accumulator_op( + np.nansum, X * sample_weight[:, None], axis=0) + new_sample_count = _safe_accumulator_op( + np.sum, sample_weight[:, None] * (~np.isnan(X)), axis=0) else: new_sum = _safe_accumulator_op(np.nansum, X, axis=0) new_sample_count = np.sum(~np.isnan(X), axis=0) @@ -784,10 +792,30 @@ def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count, else: T = new_sum / new_sample_count if sample_weight is not None: - new_unnormalized_variance = np.nansum(sample_weight[:, None] * - (X - T)**2, axis=0) + if np_version >= parse_version("1.16.6"): + # equivalent to np.nansum((X-T)**2 * sample_weight, axis=0) + # safer because np.float64(X*W) != np.float64(X)*np.float64(W) + # dtype arg of np.matmul only exists since version 1.16 + new_unnormalized_variance = _safe_accumulator_op( + np.matmul, sample_weight, + np.where(np.isnan(X), 0, (X - T)**2)) + correction = _safe_accumulator_op( + np.matmul, sample_weight, np.where(np.isnan(X), 0, X - T)) + else: + new_unnormalized_variance = _safe_accumulator_op( + np.nansum, (X - T)**2 * sample_weight[:, None], axis=0) + correction = _safe_accumulator_op( + np.nansum, (X - T) * sample_weight[:, None], axis=0) else: - new_unnormalized_variance = np.nansum((X - T)**2, axis=0) + new_unnormalized_variance = _safe_accumulator_op( + np.nansum, (X - T)**2, axis=0) + correction = _safe_accumulator_op(np.nansum, X - T, axis=0) + + # correction term of the corrected 2 pass algorithm. + # See "Algorithms for computing the sample variance: analysis + # and recommendations", by Chan, Golub, and LeVeque. + new_unnormalized_variance -= correction**2 / new_sample_count + last_unnormalized_variance = last_variance * last_sample_count with np.errstate(divide='ignore', invalid='ignore'): diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index 4a84c03eff86b..09677600cbbe4 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -57,6 +57,8 @@ def _csr_row_norms(np.ndarray[floating, ndim=1, mode="c"] X_data, def csr_mean_variance_axis0(X, weights=None, return_sum_weights=False): """Compute mean and variance along axis 0 on a CSR matrix + Uses a np.float64 accumulator. 
+ Parameters ---------- X : CSR sparse matrix, shape (n_samples, n_features) @@ -109,25 +111,18 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, np.npy_intp i unsigned long long row_ind integral col_ind - floating diff + np.float64_t diff # means[j] contains the mean of feature j - np.ndarray[floating, ndim=1] means + np.ndarray[np.float64_t, ndim=1] means = np.zeros(n_features) # variances[j] contains the variance of feature j - np.ndarray[floating, ndim=1] variances - - if floating is float: - dtype = np.float32 - else: - dtype = np.float64 + np.ndarray[np.float64_t, ndim=1] variances = np.zeros(n_features) - means = np.zeros(n_features, dtype=dtype) - variances = np.zeros_like(means, dtype=dtype) - - cdef: - np.ndarray[floating, ndim=1] sum_weights = np.full( - fill_value=np.sum(weights), shape=n_features, dtype=dtype) - np.ndarray[floating, ndim=1] sum_weights_nz = np.zeros( - shape=n_features, dtype=dtype) + np.ndarray[np.float64_t, ndim=1] sum_weights = np.full( + fill_value=np.sum(weights, dtype=np.float64), shape=n_features) + np.ndarray[np.float64_t, ndim=1] sum_weights_nz = np.zeros( + shape=n_features) + np.ndarray[np.float64_t, ndim=1] correction = np.zeros( + shape=n_features) np.ndarray[np.uint64_t, ndim=1] counts = np.full( fill_value=weights.shape[0], shape=n_features, dtype=np.uint64) @@ -138,7 +133,7 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, for i in range(X_indptr[row_ind], X_indptr[row_ind + 1]): col_ind = X_indices[i] if not isnan(X_data[i]): - means[col_ind] += (X_data[i] * weights[row_ind]) + means[col_ind] += (X_data[i]) * weights[row_ind] # sum of weights where X[:, col_ind] is non-zero sum_weights_nz[col_ind] += weights[row_ind] # number of non-zero elements of X[:, col_ind] @@ -157,21 +152,35 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, col_ind = X_indices[i] if not isnan(X_data[i]): diff = X_data[i] - means[col_ind] + # correction term of the corrected 2 pass algorithm. + # See "Algorithms for computing the sample variance: analysis + # and recommendations", by Chan, Golub, and LeVeque. + correction[col_ind] += diff * weights[row_ind] variances[col_ind] += diff * diff * weights[row_ind] for i in range(n_features): + if counts[i] != counts_nz[i]: + correction[i] -= (sum_weights[i] - sum_weights_nz[i]) * means[i] + correction[i] = correction[i]**2 / sum_weights[i] if counts[i] != counts_nz[i]: # only compute it when it's guaranteed to be non-zero to avoid # catastrophic cancellation. variances[i] += (sum_weights[i] - sum_weights_nz[i]) * means[i]**2 - variances[i] /= sum_weights[i] + variances[i] = (variances[i] - correction[i]) / sum_weights[i] - return means, variances, sum_weights + if floating is float: + return (np.array(means, dtype=np.float32), + np.array(variances, dtype=np.float32), + np.array(sum_weights, dtype=np.float32)) + else: + return means, variances, sum_weights def csc_mean_variance_axis0(X, weights=None, return_sum_weights=False): """Compute mean and variance along axis 0 on a CSC matrix + Uses a np.float64 accumulator. 
+ Parameters ---------- X : CSC sparse matrix, shape (n_samples, n_features) @@ -224,25 +233,18 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, np.npy_intp i unsigned long long col_ind integral row_ind - floating diff + np.float64_t diff # means[j] contains the mean of feature j - np.ndarray[floating, ndim=1] means + np.ndarray[np.float64_t, ndim=1] means = np.zeros(n_features) # variances[j] contains the variance of feature j - np.ndarray[floating, ndim=1] variances - - if floating is float: - dtype = np.float32 - else: - dtype = np.float64 + np.ndarray[np.float64_t, ndim=1] variances = np.zeros(n_features) - means = np.zeros(n_features, dtype=dtype) - variances = np.zeros_like(means, dtype=dtype) - - cdef: - np.ndarray[floating, ndim=1] sum_weights = np.full( - fill_value=np.sum(weights), shape=n_features, dtype=dtype) - np.ndarray[floating, ndim=1] sum_weights_nz = np.zeros( - shape=n_features, dtype=dtype) + np.ndarray[np.float64_t, ndim=1] sum_weights = np.full( + fill_value=np.sum(weights, dtype=np.float64), shape=n_features) + np.ndarray[np.float64_t, ndim=1] sum_weights_nz = np.zeros( + shape=n_features) + np.ndarray[np.float64_t, ndim=1] correction = np.zeros( + shape=n_features) np.ndarray[np.uint64_t, ndim=1] counts = np.full( fill_value=weights.shape[0], shape=n_features, dtype=np.uint64) @@ -253,7 +255,7 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, for i in range(X_indptr[col_ind], X_indptr[col_ind + 1]): row_ind = X_indices[i] if not isnan(X_data[i]): - means[col_ind] += (X_data[i] * weights[row_ind]) + means[col_ind] += (X_data[i]) * weights[row_ind] # sum of weights where X[:, col_ind] is non-zero sum_weights_nz[col_ind] += weights[row_ind] # number of non-zero elements of X[:, col_ind] @@ -272,16 +274,28 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, row_ind = X_indices[i] if not isnan(X_data[i]): diff = X_data[i] - means[col_ind] + # correction term of the corrected 2 pass algorithm. + # See "Algorithms for computing the sample variance: analysis + # and recommendations", by Chan, Golub, and LeVeque. + correction[col_ind] += diff * weights[row_ind] variances[col_ind] += diff * diff * weights[row_ind] for i in range(n_features): + if counts[i] != counts_nz[i]: + correction[i] -= (sum_weights[i] - sum_weights_nz[i]) * means[i] + correction[i] = correction[i]**2 / sum_weights[i] if counts[i] != counts_nz[i]: # only compute it when it's guaranteed to be non-zero to avoid # catastrophic cancellation. variances[i] += (sum_weights[i] - sum_weights_nz[i]) * means[i]**2 - variances[i] /= sum_weights[i] + variances[i] = (variances[i] - correction[i]) / sum_weights[i] - return means, variances, sum_weights + if floating is float: + return (np.array(means, dtype=np.float32), + np.array(variances, dtype=np.float32), + np.array(sum_weights, dtype=np.float32)) + else: + return means, variances, sum_weights def incr_mean_variance_axis0(X, last_mean, last_var, last_n, weights=None): From 138da7ea911274f34d28849337c2768d7e3a7a96 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 14 Apr 2021 17:21:57 +0200 Subject: [PATCH 329/478] MNT Use const memory views in DistanceMetric subclasses (#19883) Co-authored-by: Olivier Grisel Co-authored-by: Thomas J. 
Fan --- doc/whats_new/v1.0.rst | 6 ++ sklearn/cluster/_hierarchical_fast.pyx | 2 +- sklearn/cluster/tests/test_hierarchical.py | 44 +++++++++++- sklearn/neighbors/_dist_metrics.pxd | 10 +-- sklearn/neighbors/_dist_metrics.pyx | 71 ++++++++++---------- sklearn/neighbors/tests/test_dist_metrics.py | 35 +++++++--- 6 files changed, 117 insertions(+), 51 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 516af4b349c00..6c75ab511e21d 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -98,6 +98,9 @@ Changelog - |API| :class:`cluster.Birch` attributes, `fit_` and `partial_fit_`, are deprecated and will be removed in 1.2. :pr:`19297` by `Thomas Fan`_. +- |FIX| :class:`cluster.AgglomerativeClustering` now supports readonly + memory-mapped datasets. :pr:`19883` by `Julien Jerphanion `. + :mod:`sklearn.compose` ...................... @@ -306,6 +309,9 @@ Changelog :pr:`19473` by :user:`jiefangxuanyan ` and :user:`Julien Jerphanion `. +- |FIX| :class:`neighbors.DistanceMetric` subclasses now support readonly + memory-mapped datasets. :pr:`19883` by `Julien Jerphanion `. + :mod:`sklearn.pipeline` ....................... diff --git a/sklearn/cluster/_hierarchical_fast.pyx b/sklearn/cluster/_hierarchical_fast.pyx index ec8c96410c25c..2a58757ce327d 100644 --- a/sklearn/cluster/_hierarchical_fast.pyx +++ b/sklearn/cluster/_hierarchical_fast.pyx @@ -455,7 +455,7 @@ def single_linkage_label(L): @cython.boundscheck(False) @cython.nonecheck(False) def mst_linkage_core( - DTYPE_t [:, ::1] raw_data, + const DTYPE_t [:, ::1] raw_data, DistanceMetric dist_metric): """ Compute the necessary elements of a minimum spanning diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 1f835a52f0105..513dbf8e9218e 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -5,6 +5,7 @@ # Authors: Vincent Michel, 2010, Gael Varoquaux 2012, # Matteo Visconti di Oleggio Castello 2014 # License: BSD 3 clause +import itertools from tempfile import mkdtemp import shutil import pytest @@ -15,7 +16,11 @@ from scipy.cluster import hierarchy from sklearn.metrics.cluster import adjusted_rand_score -from sklearn.utils._testing import assert_almost_equal +from sklearn.neighbors.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS +from sklearn.utils._testing import ( + assert_almost_equal, + create_memmap_backed_data +) from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import ignore_warnings @@ -28,8 +33,12 @@ from sklearn.metrics.pairwise import PAIRED_DISTANCES, cosine_distances,\ manhattan_distances, pairwise_distances from sklearn.metrics.cluster import normalized_mutual_info_score -from sklearn.neighbors import kneighbors_graph -from sklearn.cluster._hierarchical_fast import average_merge, max_merge +from sklearn.neighbors import kneighbors_graph, DistanceMetric +from sklearn.cluster._hierarchical_fast import ( + average_merge, + max_merge, + mst_linkage_core +) from sklearn.utils._fast_dict import IntFloatDict from sklearn.utils._testing import assert_array_equal from sklearn.datasets import make_moons, make_circles @@ -264,6 +273,16 @@ def test_agglomerative_clustering(): assert_array_equal(clustering.labels_, clustering2.labels_) +def test_agglomerative_clustering_memory_mapped(): + """AgglomerativeClustering must work on mem-mapped dataset. + + Non-regression test for issue #19875. 
+ """ + rng = np.random.RandomState(0) + Xmm = create_memmap_backed_data(rng.randn(50, 100)) + AgglomerativeClustering(affinity="euclidean", linkage="single").fit(Xmm) + + def test_ward_agglomeration(): # Check that we obtain the correct solution in a simplistic case rng = np.random.RandomState(0) @@ -375,6 +394,25 @@ def test_vector_scikit_single_vs_scipy_single(seed): assess_same_labelling(cut, cut_scipy) +@pytest.mark.parametrize('metric', METRICS_DEFAULT_PARAMS) +def test_mst_linkage_core_memory_mapped(metric): + """The MST-LINKAGE-CORE algorithm must work on mem-mapped dataset. + + Non-regression test for issue #19875. + """ + rng = np.random.RandomState(seed=1) + X = rng.normal(size=(20, 4)) + Xmm = create_memmap_backed_data(X) + argdict = METRICS_DEFAULT_PARAMS[metric] + keys = argdict.keys() + for vals in itertools.product(*argdict.values()): + kwargs = dict(zip(keys, vals)) + distance_metric = DistanceMetric.get_metric(metric, **kwargs) + mst = mst_linkage_core(X, distance_metric) + mst_mm = mst_linkage_core(Xmm, distance_metric) + np.testing.assert_equal(mst, mst_mm) + + def test_identical_points(): # Ensure identical points are handled correctly when using mst with # a sparse connectivity matrix diff --git a/sklearn/neighbors/_dist_metrics.pxd b/sklearn/neighbors/_dist_metrics.pxd index 89c63cc46905f..856d5bb2dde5b 100644 --- a/sklearn/neighbors/_dist_metrics.pxd +++ b/sklearn/neighbors/_dist_metrics.pxd @@ -15,7 +15,7 @@ from ._typedefs import DTYPE, ITYPE # # We use these for the default (euclidean) case so that they can be # inlined. This leads to faster computation for the most common case -cdef inline DTYPE_t euclidean_dist(DTYPE_t* x1, DTYPE_t* x2, +cdef inline DTYPE_t euclidean_dist(const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef DTYPE_t tmp, d=0 cdef np.intp_t j @@ -25,7 +25,7 @@ cdef inline DTYPE_t euclidean_dist(DTYPE_t* x1, DTYPE_t* x2, return sqrt(d) -cdef inline DTYPE_t euclidean_rdist(DTYPE_t* x1, DTYPE_t* x2, +cdef inline DTYPE_t euclidean_rdist(const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef DTYPE_t tmp, d=0 cdef np.intp_t j @@ -35,11 +35,11 @@ cdef inline DTYPE_t euclidean_rdist(DTYPE_t* x1, DTYPE_t* x2, return d -cdef inline DTYPE_t euclidean_dist_to_rdist(DTYPE_t dist) nogil except -1: +cdef inline DTYPE_t euclidean_dist_to_rdist(const DTYPE_t dist) nogil except -1: return dist * dist -cdef inline DTYPE_t euclidean_rdist_to_dist(DTYPE_t dist) nogil except -1: +cdef inline DTYPE_t euclidean_rdist_to_dist(const DTYPE_t dist) nogil except -1: return sqrt(dist) @@ -61,7 +61,7 @@ cdef class DistanceMetric: cdef object func cdef object kwargs - cdef DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1 cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, diff --git a/sklearn/neighbors/_dist_metrics.pyx b/sklearn/neighbors/_dist_metrics.pyx index 4cc41d7136586..398591bcdf49f 100755 --- a/sklearn/neighbors/_dist_metrics.pyx +++ b/sklearn/neighbors/_dist_metrics.pyx @@ -300,7 +300,7 @@ cdef class DistanceMetric: """ return - cdef DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: """Compute the distance between vectors x1 and x2 @@ -308,7 +308,7 @@ cdef class DistanceMetric: """ return -999 - cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: """Compute 
the reduced distance between vectors x1 and x2. @@ -321,7 +321,7 @@ cdef class DistanceMetric: """ return self.dist(x1, x2, size) - cdef int pdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1: + cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1: """compute the pairwise distances between points in X""" cdef ITYPE_t i1, i2 for i1 in range(X.shape[0]): @@ -330,7 +330,7 @@ cdef class DistanceMetric: D[i2, i1] = D[i1, i2] return 0 - cdef int cdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] Y, + cdef int cdist(self, const DTYPE_t[:, ::1] X, const DTYPE_t[:, ::1] Y, DTYPE_t[:, ::1] D) except -1: """compute the cross-pairwise distances between arrays X and Y""" cdef ITYPE_t i1, i2 @@ -423,11 +423,11 @@ cdef class EuclideanDistance(DistanceMetric): def __init__(self): self.p = 2 - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: return euclidean_dist(x1, x2, size) - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: return euclidean_rdist(x1, x2, size) @@ -463,7 +463,7 @@ cdef class SEuclideanDistance(DistanceMetric): if X.shape[1] != self.size: raise ValueError('SEuclidean dist: size of V does not match') - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef DTYPE_t tmp, d=0 cdef np.intp_t j @@ -472,7 +472,7 @@ cdef class SEuclideanDistance(DistanceMetric): d += tmp * tmp / self.vec_ptr[j] return d - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: return sqrt(self.rdist(x1, x2, size)) @@ -501,7 +501,7 @@ cdef class ManhattanDistance(DistanceMetric): def __init__(self): self.p = 1 - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef DTYPE_t d = 0 cdef np.intp_t j @@ -534,7 +534,7 @@ cdef class ChebyshevDistance(DistanceMetric): def __init__(self): self.p = INF - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef DTYPE_t d = 0 cdef np.intp_t j @@ -565,7 +565,7 @@ cdef class MinkowskiDistance(DistanceMetric): "For p=inf, use ChebyshevDistance.") self.p = p - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef DTYPE_t d=0 cdef np.intp_t j @@ -573,7 +573,7 @@ cdef class MinkowskiDistance(DistanceMetric): d += pow(fabs(x1[j] - x2[j]), self.p) return d - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: return pow(self.rdist(x1, x2, size), 1. 
/ self.p) @@ -625,7 +625,7 @@ cdef class WMinkowskiDistance(DistanceMetric): raise ValueError('WMinkowskiDistance dist: ' 'size of w does not match') - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef DTYPE_t d=0 cdef np.intp_t j @@ -633,7 +633,7 @@ cdef class WMinkowskiDistance(DistanceMetric): d += pow(self.vec_ptr[j] * fabs(x1[j] - x2[j]), self.p) return d - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: return pow(self.rdist(x1, x2, size), 1. / self.p) @@ -690,7 +690,7 @@ cdef class MahalanobisDistance(DistanceMetric): if X.shape[1] != self.size: raise ValueError('Mahalanobis dist: size of V does not match') - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef DTYPE_t tmp, d = 0 cdef np.intp_t i, j @@ -706,7 +706,7 @@ cdef class MahalanobisDistance(DistanceMetric): d += tmp * self.vec_ptr[i] return d - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: return sqrt(self.rdist(x1, x2, size)) @@ -735,7 +735,7 @@ cdef class HammingDistance(DistanceMetric): .. math:: D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i} """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef int n_unequal = 0 cdef np.intp_t j @@ -757,7 +757,7 @@ cdef class CanberraDistance(DistanceMetric): .. math:: D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|} """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef DTYPE_t denom, d = 0 cdef np.intp_t j @@ -780,7 +780,7 @@ cdef class BrayCurtisDistance(DistanceMetric): .. math:: D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)} """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef DTYPE_t num = 0, denom = 0 cdef np.intp_t j @@ -806,7 +806,7 @@ cdef class JaccardDistance(DistanceMetric): .. math:: D(x, y) = \frac{N_{TF} + N_{FT}}{N_{TT} + N_{TF} + N_{FT}} """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef int tf1, tf2, n_eq = 0, nnz = 0 cdef np.intp_t j @@ -836,7 +836,7 @@ cdef class MatchingDistance(DistanceMetric): .. math:: D(x, y) = \frac{N_{TF} + N_{FT}}{N} """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef int tf1, tf2, n_neq = 0 cdef np.intp_t j @@ -860,7 +860,7 @@ cdef class DiceDistance(DistanceMetric): .. math:: D(x, y) = \frac{N_{TF} + N_{FT}}{2 * N_{TT} + N_{TF} + N_{FT}} """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef int tf1, tf2, n_neq = 0, ntt = 0 cdef np.intp_t j @@ -885,7 +885,7 @@ cdef class KulsinskiDistance(DistanceMetric): .. 
math:: D(x, y) = 1 - \frac{N_{TT}}{N + N_{TF} + N_{FT}} """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef int tf1, tf2, ntt = 0, n_neq = 0 cdef np.intp_t j @@ -910,7 +910,7 @@ cdef class RogersTanimotoDistance(DistanceMetric): .. math:: D(x, y) = \frac{2 (N_{TF} + N_{FT})}{N + N_{TF} + N_{FT}} """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef int tf1, tf2, n_neq = 0 cdef np.intp_t j @@ -934,7 +934,7 @@ cdef class RussellRaoDistance(DistanceMetric): .. math:: D(x, y) = \frac{N - N_{TT}}{N} """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef int tf1, tf2, ntt = 0 cdef np.intp_t j @@ -958,7 +958,7 @@ cdef class SokalMichenerDistance(DistanceMetric): .. math:: D(x, y) = \frac{2 (N_{TF} + N_{FT})}{N + N_{TF} + N_{FT}} """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef int tf1, tf2, n_neq = 0 cdef np.intp_t j @@ -982,7 +982,7 @@ cdef class SokalSneathDistance(DistanceMetric): .. math:: D(x, y) = \frac{N_{TF} + N_{FT}}{N_{TT} / 2 + N_{TF} + N_{FT}} """ - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef int tf1, tf2, ntt = 0, n_neq = 0 cdef np.intp_t j @@ -1016,13 +1016,13 @@ cdef class HaversineDistance(DistanceMetric): raise ValueError("Haversine distance only valid " "in 2 dimensions") - cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: cdef DTYPE_t sin_0 = sin(0.5 * (x1[0] - x2[0])) cdef DTYPE_t sin_1 = sin(0.5 * (x1[1] - x2[1])) return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1) - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: return 2 * asin(sqrt(self.rdist(x1, x2, size))) @@ -1047,7 +1047,8 @@ cdef class HaversineDistance(DistanceMetric): # [This is not a true metric, so we will leave it out.] # #cdef class YuleDistance(DistanceMetric): -# cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, ITYPE_t size): +# cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, +# ITYPE_t size): # cdef int tf1, tf2, ntf = 0, nft = 0, ntt = 0, nff = 0 # cdef np.intp_t j # for j in range(size): @@ -1066,7 +1067,8 @@ cdef class HaversineDistance(DistanceMetric): # [This is not a true metric, so we will leave it out.] # #cdef class CosineDistance(DistanceMetric): -# cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, ITYPE_t size): +# cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, +# ITYPE_t size): # cdef DTYPE_t d = 0, norm1 = 0, norm2 = 0 # cdef np.intp_t j # for j in range(size): @@ -1082,7 +1084,8 @@ cdef class HaversineDistance(DistanceMetric): # [This is not a true metric, so we will leave it out.] 
# #cdef class CorrelationDistance(DistanceMetric): -# cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, ITYPE_t size): +# cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, +# ITYPE_t size): # cdef DTYPE_t mu1 = 0, mu2 = 0, x1nrm = 0, x2nrm = 0, x1Tx2 = 0 # cdef DTYPE_t tmp1, tmp2 # @@ -1125,11 +1128,11 @@ cdef class PyFuncDistance(DistanceMetric): # allowed in cython >= 0.26 since it is a redundant GIL acquisition. The # only way to be back compatible is to inherit `dist` from the base class # without GIL and called an inline `_dist` which acquire GIL. - cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: return self._dist(x1, x2, size) - cdef inline DTYPE_t _dist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef inline DTYPE_t _dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) except -1 with gil: cdef np.ndarray x1arr cdef np.ndarray x2arr diff --git a/sklearn/neighbors/tests/test_dist_metrics.py b/sklearn/neighbors/tests/test_dist_metrics.py index 05e0f4294ebb6..07705e93c3390 100644 --- a/sklearn/neighbors/tests/test_dist_metrics.py +++ b/sklearn/neighbors/tests/test_dist_metrics.py @@ -10,6 +10,7 @@ from sklearn.neighbors import DistanceMetric from sklearn.neighbors import BallTree from sklearn.utils import check_random_state +from sklearn.utils._testing import create_memmap_backed_data from sklearn.utils.fixes import sp_version, parse_version @@ -24,10 +25,15 @@ def dist_func(x1, x2, p): X1 = rng.random_sample((n1, d)).astype('float64', copy=False) X2 = rng.random_sample((n2, d)).astype('float64', copy=False) +[X1_mmap, X2_mmap] = create_memmap_backed_data([X1, X2]) + # make boolean arrays: ones and zeros X1_bool = X1.round(0) X2_bool = X2.round(0) +[X1_bool_mmap, X2_bool_mmap] = create_memmap_backed_data([X1_bool, X2_bool]) + + V = rng.random_sample((d, d)) VI = np.dot(V, V.T) @@ -47,14 +53,18 @@ def dist_func(x1, x2, p): 'canberra': {}, 'braycurtis': {}} - @pytest.mark.parametrize('metric', METRICS_DEFAULT_PARAMS) -def test_cdist(metric): +@pytest.mark.parametrize('X1, X2', [(X1, X2), (X1_mmap, X2_mmap)]) +def test_cdist(metric, X1, X2): argdict = METRICS_DEFAULT_PARAMS[metric] keys = argdict.keys() for vals in itertools.product(*argdict.values()): kwargs = dict(zip(keys, vals)) - if metric == "wminkowski": + if metric == "mahalanobis": + # See: https://github.com/scipy/scipy/issues/13861 + pytest.xfail("scipy#13861: cdist with 'mahalanobis' fails on" + "memmap data") + elif metric == "wminkowski": if sp_version >= parse_version("1.8.0"): pytest.skip("wminkowski will be removed in SciPy 1.8.0") @@ -71,7 +81,9 @@ def test_cdist(metric): @pytest.mark.parametrize('metric', BOOL_METRICS) -def test_cdist_bool_metric(metric): +@pytest.mark.parametrize('X1_bool, X2_bool', [(X1_bool, X2_bool), + (X1_bool_mmap, X2_bool_mmap)]) +def test_cdist_bool_metric(metric, X1_bool, X2_bool): D_true = cdist(X1_bool, X2_bool, metric) check_cdist_bool(metric, D_true) @@ -89,12 +101,17 @@ def check_cdist_bool(metric, D_true): @pytest.mark.parametrize('metric', METRICS_DEFAULT_PARAMS) -def test_pdist(metric): +@pytest.mark.parametrize('X1, X2', [(X1, X2), (X1_mmap, X2_mmap)]) +def test_pdist(metric, X1, X2): argdict = METRICS_DEFAULT_PARAMS[metric] keys = argdict.keys() for vals in itertools.product(*argdict.values()): kwargs = dict(zip(keys, vals)) - if metric == "wminkowski": + if metric == "mahalanobis": + # See: 
https://github.com/scipy/scipy/issues/13861 + pytest.xfail("scipy#13861: pdist with 'mahalanobis' fails on" + "memmap data") + elif metric == "wminkowski": if sp_version >= parse_version("1.8.0"): pytest.skip("wminkowski will be removed in SciPy 1.8.0") @@ -111,7 +128,8 @@ def test_pdist(metric): @pytest.mark.parametrize('metric', BOOL_METRICS) -def test_pdist_bool_metrics(metric): +@pytest.mark.parametrize('X1_bool', [X1_bool, X1_bool_mmap]) +def test_pdist_bool_metrics(metric, X1_bool): D_true = cdist(X1_bool, X1_bool, metric) check_pdist_bool(metric, D_true) @@ -143,7 +161,8 @@ def test_pickle(metric): @pytest.mark.parametrize('metric', BOOL_METRICS) -def test_pickle_bool_metrics(metric): +@pytest.mark.parametrize('X1_bool', [X1_bool, X1_bool_mmap]) +def test_pickle_bool_metrics(metric, X1_bool): dm = DistanceMetric.get_metric(metric) D1 = dm.pairwise(X1_bool) dm2 = pickle.loads(pickle.dumps(dm)) From 8c4589b23c6481f978d4cfab511f25b77a805f13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Thu, 15 Apr 2021 08:32:22 +0200 Subject: [PATCH 330/478] ENH Scalable MiniBatchKMeans plus cln / fixes / refactoring (#17622) --- doc/whats_new/v1.0.rst | 22 +- ...{_k_means_fast.pxd => _k_means_common.pxd} | 0 ...{_k_means_fast.pyx => _k_means_common.pyx} | 118 +--- sklearn/cluster/_k_means_elkan.pyx | 14 +- sklearn/cluster/_k_means_lloyd.pyx | 10 +- sklearn/cluster/_k_means_minibatch.pyx | 228 ++++++ sklearn/cluster/_kmeans.py | 655 +++++++++--------- sklearn/cluster/setup.py | 9 +- sklearn/cluster/tests/test_k_means.py | 259 ++++--- 9 files changed, 736 insertions(+), 579 deletions(-) rename sklearn/cluster/{_k_means_fast.pxd => _k_means_common.pxd} (100%) rename sklearn/cluster/{_k_means_fast.pyx => _k_means_common.pyx} (67%) create mode 100644 sklearn/cluster/_k_means_minibatch.pyx diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 6c75ab511e21d..5975177f7a0c8 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -95,12 +95,30 @@ Changelog in multicore settings. :pr:`19052` by :user:`Yusuke Nagasaka `. -- |API| :class:`cluster.Birch` attributes, `fit_` and `partial_fit_`, are - deprecated and will be removed in 1.2. :pr:`19297` by `Thomas Fan`_. +- |Efficiency| :class:`cluster.MiniBatchKMeans` is now faster in multicore + settings. :pr:`17622` by :user:`Jérémie du Boisberranger `. + +- |Fix| Fixed a bug in :class:`cluster.MiniBatchKMeans` where the sample + weights were partially ignored when the input is sparse. :pr:`17622` by + :user:`Jérémie du Boisberranger `. +- |Fix| Improved convergence detection based on center change in + :class:`cluster.MiniBatchKMeans` which was almost never achievable. + :pr:`17622` by :user:`Jérémie du Boisberranger `. + - |FIX| :class:`cluster.AgglomerativeClustering` now supports readonly memory-mapped datasets. :pr:`19883` by `Julien Jerphanion `. +- |API| :class:`cluster.Birch` attributes, `fit_` and `partial_fit_`, are + deprecated and will be removed in 1.2. :pr:`19297` by `Thomas Fan`_. + +- |API| the default value for the `batch_size` parameter of + :class:`MiniBatchKMeans` was changed from 100 to 1024 due to efficiency + reasons. The `n_iter_` attribute of :class:`MiniBatchKMeans` now reports the + number of started epochs and the `n_steps_` attribute reports the number of + mini batches processed. :pr:`17622` + by :user:`Jérémie du Boisberranger `. + :mod:`sklearn.compose` ...................... 
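A minimal sketch of the user-facing effect of the reworked iteration counters described in the changelog entries above, assuming this branch is installed; the dataset and parameter values below are illustrative and not taken from the patch:

    import numpy as np
    from sklearn.cluster import MiniBatchKMeans

    rng = np.random.RandomState(0)
    X = rng.normal(size=(10000, 5))

    km = MiniBatchKMeans(n_clusters=8, batch_size=1024, random_state=0).fit(X)

    # n_steps_ counts the mini-batches actually processed, while n_iter_ now
    # counts started epochs over the full dataset, i.e. roughly
    # ceil(n_steps_ * batch_size / n_samples) as computed at the end of the
    # reworked fit loop.
    print(km.n_steps_, km.n_iter_)
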
diff --git a/sklearn/cluster/_k_means_fast.pxd b/sklearn/cluster/_k_means_common.pxd similarity index 100% rename from sklearn/cluster/_k_means_fast.pxd rename to sklearn/cluster/_k_means_common.pxd diff --git a/sklearn/cluster/_k_means_fast.pyx b/sklearn/cluster/_k_means_common.pyx similarity index 67% rename from sklearn/cluster/_k_means_fast.pyx rename to sklearn/cluster/_k_means_common.pyx index 21bf7dd9bf65a..373be241dd013 100644 --- a/sklearn/cluster/_k_means_fast.pyx +++ b/sklearn/cluster/_k_means_common.pyx @@ -14,8 +14,8 @@ import numpy as np cimport numpy as np -cimport cython from cython cimport floating +from cython.parallel cimport prange from libc.math cimport sqrt from ..utils.extmath import row_norms @@ -23,9 +23,6 @@ from ..utils.extmath import row_norms np.import_array() -ctypedef np.float64_t DOUBLE -ctypedef np.int32_t INT - # Number of samples per data chunk defined as a global constant. CHUNK_SIZE = 256 @@ -103,7 +100,8 @@ cpdef floating _inertia_dense( np.ndarray[floating, ndim=2, mode='c'] X, # IN floating[::1] sample_weight, # IN floating[:, ::1] centers, # IN - int[::1] labels): # IN + int[::1] labels, # IN + int n_threads): """Compute inertia for dense input data Sum of squared distance between each sample and its assigned center. @@ -116,7 +114,8 @@ cpdef floating _inertia_dense( floating sq_dist = 0.0 floating inertia = 0.0 - for i in range(n_samples): + for i in prange(n_samples, nogil=True, num_threads=n_threads, + schedule='static'): j = labels[i] sq_dist = _euclidean_dense_dense(&X[i, 0], ¢ers[j, 0], n_features, True) @@ -129,7 +128,8 @@ cpdef floating _inertia_sparse( X, # IN floating[::1] sample_weight, # IN floating[:, ::1] centers, # IN - int[::1] labels): # IN + int[::1] labels, # IN + int n_threads): """Compute inertia for sparse input data Sum of squared distance between each sample and its assigned center. @@ -148,7 +148,8 @@ cpdef floating _inertia_sparse( floating[::1] centers_squared_norms = row_norms(centers, squared=True) - for i in range(n_samples): + for i in prange(n_samples, nogil=True, num_threads=n_threads, + schedule='static'): j = labels[i] sq_dist = _euclidean_sparse_dense( X_data[X_indptr[i]: X_indptr[i + 1]], @@ -286,104 +287,3 @@ cdef void _center_shift( for j in range(n_clusters): center_shift[j] = _euclidean_dense_dense( ¢ers_new[j, 0], ¢ers_old[j, 0], n_features, False) - - -def _mini_batch_update_csr(X, np.ndarray[floating, ndim=1] sample_weight, - np.ndarray[floating, ndim=1] x_squared_norms, - np.ndarray[floating, ndim=2] centers, - np.ndarray[floating, ndim=1] weight_sums, - np.ndarray[INT, ndim=1] nearest_center, - np.ndarray[floating, ndim=1] old_center, - int compute_squared_diff): - """Incremental update of the centers for sparse MiniBatchKMeans. - - Parameters - ---------- - - X : CSR matrix, dtype float - The complete (pre allocated) training set as a CSR matrix. - - centers : array, shape (n_clusters, n_features) - The cluster centers - - counts : array, shape (n_clusters,) - The vector in which we keep track of the numbers of elements in a - cluster - - Returns - ------- - inertia : float - The inertia of the batch prior to centers update, i.e. the sum - of squared distances to the closest center for each sample. This - is the objective function being minimized by the k-means algorithm. - - squared_diff : float - The sum of squared update (squared norm of the centers position - change). If compute_squared_diff is 0, this computation is skipped and - 0.0 is returned instead. 
- - Both squared diff and inertia are commonly used to monitor the convergence - of the algorithm. - """ - cdef: - np.ndarray[floating, ndim=1] X_data = X.data - np.ndarray[int, ndim=1] X_indices = X.indices - np.ndarray[int, ndim=1] X_indptr = X.indptr - unsigned int n_samples = X.shape[0] - unsigned int n_clusters = centers.shape[0] - unsigned int n_features = centers.shape[1] - - unsigned int sample_idx, center_idx, feature_idx - unsigned int k - DOUBLE old_weight_sum, new_weight_sum - DOUBLE center_diff - DOUBLE squared_diff = 0.0 - - # move centers to the mean of both old and newly assigned samples - for center_idx in range(n_clusters): - old_weight_sum = weight_sums[center_idx] - new_weight_sum = old_weight_sum - - # count the number of samples assigned to this center - for sample_idx in range(n_samples): - if nearest_center[sample_idx] == center_idx: - new_weight_sum += sample_weight[sample_idx] - - if new_weight_sum == old_weight_sum: - # no new sample: leave this center as it stands - continue - - # rescale the old center to reflect it previous accumulated weight - # with regards to the new data that will be incrementally contributed - if compute_squared_diff: - old_center[:] = centers[center_idx] - centers[center_idx] *= old_weight_sum - - # iterate of over samples assigned to this cluster to move the center - # location by inplace summation - for sample_idx in range(n_samples): - if nearest_center[sample_idx] != center_idx: - continue - - # inplace sum with new samples that are members of this cluster - # and update of the incremental squared difference update of the - # center position - for k in range(X_indptr[sample_idx], X_indptr[sample_idx + 1]): - centers[center_idx, X_indices[k]] += X_data[k] - - # inplace rescale center with updated count - if new_weight_sum > old_weight_sum: - # update the count statistics for this center - weight_sums[center_idx] = new_weight_sum - - # re-scale the updated center with the total new counts - centers[center_idx] /= new_weight_sum - - # update the incremental computation of the squared total - # centers position change - if compute_squared_diff: - for feature_idx in range(n_features): - squared_diff += (old_center[feature_idx] - - centers[center_idx, feature_idx]) ** 2 - - return squared_diff diff --git a/sklearn/cluster/_k_means_elkan.pyx b/sklearn/cluster/_k_means_elkan.pyx index 1010e581f5e7f..84464d78fe244 100644 --- a/sklearn/cluster/_k_means_elkan.pyx +++ b/sklearn/cluster/_k_means_elkan.pyx @@ -18,13 +18,13 @@ from libc.stdlib cimport calloc, free from libc.string cimport memset, memcpy from ..utils.extmath import row_norms -from ._k_means_fast import CHUNK_SIZE -from ._k_means_fast cimport _relocate_empty_clusters_dense -from ._k_means_fast cimport _relocate_empty_clusters_sparse -from ._k_means_fast cimport _euclidean_dense_dense -from ._k_means_fast cimport _euclidean_sparse_dense -from ._k_means_fast cimport _average_centers -from ._k_means_fast cimport _center_shift +from ._k_means_common import CHUNK_SIZE +from ._k_means_common cimport _relocate_empty_clusters_dense +from ._k_means_common cimport _relocate_empty_clusters_sparse +from ._k_means_common cimport _euclidean_dense_dense +from ._k_means_common cimport _euclidean_sparse_dense +from ._k_means_common cimport _average_centers +from ._k_means_common cimport _center_shift np.import_array() diff --git a/sklearn/cluster/_k_means_lloyd.pyx b/sklearn/cluster/_k_means_lloyd.pyx index 255f4f470a648..7cb7d2abb728e 100644 --- a/sklearn/cluster/_k_means_lloyd.pyx +++ 
b/sklearn/cluster/_k_means_lloyd.pyx @@ -11,16 +11,16 @@ cimport numpy as np from cython cimport floating from cython.parallel import prange, parallel from libc.stdlib cimport malloc, calloc, free -from libc.string cimport memset, memcpy +from libc.string cimport memset from libc.float cimport DBL_MAX, FLT_MAX from ..utils.extmath import row_norms from ..utils._cython_blas cimport _gemm from ..utils._cython_blas cimport RowMajor, Trans, NoTrans -from ._k_means_fast import CHUNK_SIZE -from ._k_means_fast cimport _relocate_empty_clusters_dense -from ._k_means_fast cimport _relocate_empty_clusters_sparse -from ._k_means_fast cimport _average_centers, _center_shift +from ._k_means_common import CHUNK_SIZE +from ._k_means_common cimport _relocate_empty_clusters_dense +from ._k_means_common cimport _relocate_empty_clusters_sparse +from ._k_means_common cimport _average_centers, _center_shift np.import_array() diff --git a/sklearn/cluster/_k_means_minibatch.pyx b/sklearn/cluster/_k_means_minibatch.pyx new file mode 100644 index 0000000000000..ab5aee35ea075 --- /dev/null +++ b/sklearn/cluster/_k_means_minibatch.pyx @@ -0,0 +1,228 @@ +# cython: profile=True, boundscheck=False, wraparound=False, cdivision=True + +# TODO: We still need to use ndarrays instead of typed memoryviews when using +# fused types and when the array may be read-only (for instance when it's +# provided by the user). This will be fixed in cython >= 0.3. + +cimport numpy as np +from cython cimport floating +from cython.parallel cimport parallel, prange +from libc.stdlib cimport malloc, free + + +np.import_array() + + +def _minibatch_update_dense( + np.ndarray[floating, ndim=2, mode="c"] X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + int[::1] labels, # IN + int n_threads): + """Update of the centers for dense MiniBatchKMeans. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features), dtype=floating + The observations to cluster. + + sample_weight : ndarray of shape (n_samples,), dtype=floating + The weights for each observation in X. + + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. + + centers_new : ndarray of shape (n_clusters, n_features), dtype=floating + Centers after previous iteration, placeholder for the new centers + computed during this iteration. + + weight_sums : ndarray of shape (n_clusters,), dtype=floating + Current sums of the accumulated weights for each center. + + labels : ndarray of shape (n_samples,), dtype=int + labels assignment. + + n_threads : int + The number of threads to be used by openmp. 
+ """ + cdef: + int n_samples = X.shape[0] + int n_clusters = centers_old.shape[0] + int cluster_idx + + int *indices + + with nogil, parallel(num_threads=n_threads): + indices = malloc(n_samples * sizeof(int)) + + for cluster_idx in prange(n_clusters, schedule="static"): + update_center_dense(cluster_idx, &X[0, 0], sample_weight, + centers_old, centers_new, weight_sums, labels, + indices) + + free(indices) + + +cdef void update_center_dense( + int cluster_idx, + floating *X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + int[::1] labels, # IN + int *indices) nogil: # TMP + """Update of a single center for dense MinibatchKMeans""" + cdef: + int n_samples = sample_weight.shape[0] + int n_features = centers_old.shape[1] + floating alpha + int n_indices + int k, sample_idx, feature_idx + + floating wsum = 0 + + # indices = np.where(labels == cluster_idx)[0] + k = 0 + for sample_idx in range(n_samples): + if labels[sample_idx] == cluster_idx: + indices[k] = sample_idx + wsum += sample_weight[sample_idx] + k += 1 + n_indices = k + + if wsum > 0: + # Undo the previous count-based scaling for this cluster center + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] * weight_sums[cluster_idx] + + # Update cluster with new point members + for k in range(n_indices): + sample_idx = indices[k] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] += X[sample_idx * n_features + feature_idx] * sample_weight[sample_idx] + + # Update the count statistics for this center + weight_sums[cluster_idx] += wsum + + # Rescale to compute mean of all points (old and new) + alpha = 1 / weight_sums[cluster_idx] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] *= alpha + else: + # No sample was assigned to this cluster in this batch of data + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] + + +def _minibatch_update_sparse( + X, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + int[::1] labels, # IN + int n_threads): + """Update of the centers for sparse MiniBatchKMeans. + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features), dtype=floating + The observations to cluster. Must be in CSR format. + + sample_weight : ndarray of shape (n_samples,), dtype=floating + The weights for each observation in X. + + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. + + centers_new : ndarray of shape (n_clusters, n_features), dtype=floating + Centers after previous iteration, placeholder for the new centers + computed during this iteration. + + weight_sums : ndarray of shape (n_clusters,), dtype=floating + Current sums of the accumulated weights for each center. + + labels : ndarray of shape (n_samples,), dtype=int + labels assignment. + + n_threads : int + The number of threads to be used by openmp. 
+ """ + cdef: + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + int n_samples = X.shape[0] + int n_clusters = centers_old.shape[0] + int cluster_idx + + int *indices + + with nogil, parallel(num_threads=n_threads): + indices = malloc(n_samples * sizeof(int)) + + for cluster_idx in prange(n_clusters, schedule="static"): + update_center_sparse(cluster_idx, X_data, X_indices, X_indptr, + sample_weight, centers_old, centers_new, + weight_sums, labels, indices) + + free(indices) + + +cdef void update_center_sparse( + int cluster_idx, + floating[::1] X_data, # IN + int[::1] X_indices, # IN + int[::1] X_indptr, # IN + floating[::1] sample_weight, # IN + floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + int[::1] labels, # IN + int *indices) nogil: # TMP + """Update of a single center for sparse MinibatchKMeans""" + cdef: + int n_samples = sample_weight.shape[0] + int n_features = centers_old.shape[1] + floating alpha + int n_indices + int k, sample_idx, feature_idx + + floating wsum = 0 + + # indices = np.where(labels == cluster_idx)[0] + k = 0 + for sample_idx in range(n_samples): + if labels[sample_idx] == cluster_idx: + indices[k] = sample_idx + wsum += sample_weight[sample_idx] + k += 1 + n_indices = k + + if wsum > 0: + # Undo the previous count-based scaling for this cluster center: + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] * weight_sums[cluster_idx] + + # Update cluster with new point members + for k in range(n_indices): + sample_idx = indices[k] + for feature_idx in range(X_indptr[sample_idx], X_indptr[sample_idx + 1]): + centers_new[cluster_idx, X_indices[feature_idx]] += X_data[feature_idx] * sample_weight[sample_idx] + + # Update the count statistics for this center + weight_sums[cluster_idx] += wsum + + # Rescale to compute mean of all points (old and new) + alpha = 1 / weight_sums[cluster_idx] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] *= alpha + else: + # No sample was assigned to this cluster in this batch of data + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 17272858ae476..44c2837a8802a 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -26,16 +26,16 @@ from ..utils.sparsefuncs import mean_variance_axis from ..utils.validation import _deprecate_positional_args from ..utils import check_array -from ..utils import gen_batches from ..utils import check_random_state from ..utils import deprecated from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils._openmp_helpers import _openmp_effective_n_threads from ..exceptions import ConvergenceWarning -from ._k_means_fast import CHUNK_SIZE -from ._k_means_fast import _inertia_dense -from ._k_means_fast import _inertia_sparse -from ._k_means_fast import _mini_batch_update_csr +from ._k_means_common import CHUNK_SIZE +from ._k_means_common import _inertia_dense +from ._k_means_common import _inertia_sparse +from ._k_means_minibatch import _minibatch_update_dense +from ._k_means_minibatch import _minibatch_update_sparse from ._k_means_lloyd import lloyd_iter_chunked_dense from ._k_means_lloyd import lloyd_iter_chunked_sparse from ._k_means_elkan import init_bounds_dense @@ -488,7 +488,7 @@ def _kmeans_single_elkan(X, 
sample_weight, centers_init, max_iter=300, np.asarray(center_half_distances), kth=1, axis=0)[1] if verbose: - inertia = _inertia(X, sample_weight, centers, labels) + inertia = _inertia(X, sample_weight, centers, labels, n_threads) print(f"Iteration {i}, inertia {inertia}") centers, centers_new = centers_new, centers @@ -517,7 +517,7 @@ def _kmeans_single_elkan(X, sample_weight, centers_init, max_iter=300, upper_bounds, lower_bounds, labels, center_shift, n_threads, update_centers=False) - inertia = _inertia(X, sample_weight, centers, labels) + inertia = _inertia(X, sample_weight, centers, labels, n_threads) return labels, inertia, centers, i + 1 @@ -602,7 +602,8 @@ def _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300, weight_in_clusters, labels, center_shift, n_threads) if verbose: - inertia = _inertia(X, sample_weight, centers, labels) + inertia = _inertia(X, sample_weight, centers, labels, + n_threads) print(f"Iteration {i}, inertia {inertia}.") centers, centers_new = centers_new, centers @@ -630,13 +631,13 @@ def _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300, weight_in_clusters, labels, center_shift, n_threads, update_centers=False) - inertia = _inertia(X, sample_weight, centers, labels) + inertia = _inertia(X, sample_weight, centers, labels, n_threads) return labels, inertia, centers, i + 1 def _labels_inertia(X, sample_weight, x_squared_norms, centers, - n_threads=None): + n_threads=1): """E step of the K-means EM algorithm. Compute the labels and the inertia of the given samples and centers. @@ -657,7 +658,7 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, centers : ndarray of shape (n_clusters, n_features) The cluster centers. - n_threads : int, default=None + n_threads : int, default=1 The number of OpenMP threads to use for the computation. Parallelism is sample-wise on the main cython loop which assigns each sample to its closest center. @@ -673,8 +674,6 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, n_samples = X.shape[0] n_clusters = centers.shape[0] - n_threads = _openmp_effective_n_threads(n_threads) - labels = np.full(n_samples, -1, dtype=np.int32) weight_in_clusters = np.zeros(n_clusters, dtype=centers.dtype) center_shift = np.zeros_like(weight_in_clusters) @@ -690,7 +689,17 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, weight_in_clusters, labels, center_shift, n_threads, update_centers=False) - inertia = _inertia(X, sample_weight, centers, labels) + inertia = _inertia(X, sample_weight, centers, labels, n_threads) + + return labels, inertia + + +def _labels_inertia_threadpool_limit(X, sample_weight, x_squared_norms, + centers, n_threads=1): + """Same as _labels_inertia but in a threadpool_limits context.""" + with threadpool_limits(limits=1, user_api="blas"): + labels, inertia = _labels_inertia(X, sample_weight, x_squared_norms, + centers, n_threads) return labels, inertia @@ -806,7 +815,8 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): Labels of each point inertia_ : float - Sum of squared distances of samples to their closest cluster center. + Sum of squared distances of samples to their closest cluster center, + weighted by the sample weights if provided. n_iter_ : int Number of iterations run. @@ -1192,10 +1202,6 @@ def fit_transform(self, X, y=None, sample_weight=None): X_new : ndarray of shape (n_samples, n_clusters) X transformed in the new space. """ - # Currently, this just skips a copy of the data if it is not in - # np.array or CSR format already. 
- # XXX This skips _check_test_data, which may change the dtype; - # we should refactor the input validation. return self.fit(X, sample_weight=sample_weight)._transform(X) def transform(self, X): @@ -1251,8 +1257,9 @@ def predict(self, X, sample_weight=None): x_squared_norms = row_norms(X, squared=True) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - return _labels_inertia(X, sample_weight, x_squared_norms, - self.cluster_centers_, self._n_threads)[0] + return _labels_inertia_threadpool_limit( + X, sample_weight, x_squared_norms, self.cluster_centers_, + self._n_threads)[0] def score(self, X, y=None, sample_weight=None): """Opposite of the value of X on the K-means objective. @@ -1280,8 +1287,9 @@ def score(self, X, y=None, sample_weight=None): x_squared_norms = row_norms(X, squared=True) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - return -_labels_inertia(X, sample_weight, x_squared_norms, - self.cluster_centers_)[1] + return -_labels_inertia_threadpool_limit( + X, sample_weight, x_squared_norms, self.cluster_centers_, + self._n_threads)[1] def _more_tags(self): return { @@ -1292,50 +1300,42 @@ def _more_tags(self): } -def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, - old_center_buffer, compute_squared_diff, - distances, random_reassign=False, - random_state=None, reassignment_ratio=.01, - verbose=False): +def _mini_batch_step(X, x_squared_norms, sample_weight, centers, centers_new, + weight_sums, random_state, random_reassign=False, + reassignment_ratio=0.01, verbose=False, n_threads=1): """Incremental update of the centers for the Minibatch K-Means algorithm. Parameters ---------- - X : ndarray of shape (n_samples, n_features) - The original data array. - - sample_weight : array-like of shape (n_samples,) - The weights for each observation in X. + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The original data array. If sparse, must be in CSR format. x_squared_norms : ndarray of shape (n_samples,) Squared euclidean norm of each data point. - centers : ndarray of shape (k, n_features) - The cluster centers. This array is MODIFIED IN PLACE + sample_weight : ndarray of shape (n_samples,) + The weights for each observation in X. + + centers : ndarray of shape (n_clusters, n_features) + The cluster centers before the current iteration - old_center_buffer : int - Copy of old centers for monitoring convergence. + centers_new : ndarray of shape (n_clusters, n_features) + The cluster centers after the current iteration. Modified in-place. - compute_squared_diff : bool - If set to False, the squared diff computation is skipped. + weight_sums : ndarray of shape (n_clusters,) + The vector in which we keep track of the numbers of points in a + cluster. This array is modified in place. - distances : ndarray of shape (n_samples,), dtype=float, default=None - If not None, should be a pre-allocated array that will be used to store - the distances of each sample to its closest center. - May not be None when random_reassign is True. + random_state : RandomState instance + Determines random number generation for low count centers reassignment. + See :term:`Glossary `. - random_reassign : bool, default=False + random_reassign : boolean, default=False If True, centers with very low counts are randomly reassigned to observations. 
- random_state : int, RandomState instance or None, default=None - Determines random number generation for centroid initialization and to - pick new clusters amongst observations with uniform probability. Use - an int to make the randomness deterministic. - See :term:`Glossary `. - - reassignment_ratio : float, default=.01 + reassignment_ratio : float, default=0.01 Control the fraction of the maximum number of counts for a center to be reassigned. A higher value means that low count centers are more likely to be reassigned, which means that the @@ -1345,156 +1345,64 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums, verbose : bool, default=False Controls the verbosity. + n_threads : int, default=1 + The number of OpenMP threads to use for the computation. + Returns ------- inertia : float Sum of squared distances of samples to their closest cluster center. - - squared_diff : ndarray of shape (n_clusters,) - Squared distances between previous and updated cluster centers. - + The inertia is computed after finding the labels and before updating + the centers. """ # Perform label assignment to nearest centers - nearest_center, inertia = _labels_inertia(X, sample_weight, - x_squared_norms, centers) + # For better efficiency, it's better to run _mini_batch_step in a + # threadpool_limit context than using _labels_inertia_threadpool_limit here + labels, inertia = _labels_inertia(X, sample_weight, + x_squared_norms, centers, + n_threads=n_threads) + + # Update centers according to the labels + if sp.issparse(X): + _minibatch_update_sparse(X, sample_weight, centers, centers_new, + weight_sums, labels, n_threads) + else: + _minibatch_update_dense(X, sample_weight, centers, centers_new, + weight_sums, labels, n_threads) + # Reassign clusters that have very low weight if random_reassign and reassignment_ratio > 0: - random_state = check_random_state(random_state) - # Reassign clusters that have very low weight to_reassign = weight_sums < reassignment_ratio * weight_sums.max() + # pick at most .5 * batch_size samples as new centers if to_reassign.sum() > .5 * X.shape[0]: indices_dont_reassign = \ np.argsort(weight_sums)[int(.5 * X.shape[0]):] to_reassign[indices_dont_reassign] = False n_reassigns = to_reassign.sum() + if n_reassigns: # Pick new clusters amongst observations with uniform probability new_centers = random_state.choice(X.shape[0], replace=False, size=n_reassigns) if verbose: - print("[MiniBatchKMeans] Reassigning %i cluster centers." - % n_reassigns) + print(f"[MiniBatchKMeans] Reassigning {n_reassigns} " + f"cluster centers.") - if sp.issparse(X) and not sp.issparse(centers): + if sp.issparse(X): assign_rows_csr( X, new_centers.astype(np.intp, copy=False), np.where(to_reassign)[0].astype(np.intp, copy=False), - centers) + centers_new) else: - centers[to_reassign] = X[new_centers] + centers_new[to_reassign] = X[new_centers] + # reset counts of reassigned centers, but don't reset them too small # to avoid instant reassignment. This is a pretty dirty hack as it # also modifies the learning rates. 
weight_sums[to_reassign] = np.min(weight_sums[~to_reassign]) - # implementation for the sparse CSR representation completely written in - # cython - if sp.issparse(X): - return inertia, _mini_batch_update_csr( - X, sample_weight, x_squared_norms, centers, weight_sums, - nearest_center, old_center_buffer, compute_squared_diff) - - # dense variant in mostly numpy (not as memory efficient though) - k = centers.shape[0] - squared_diff = 0.0 - for center_idx in range(k): - # find points from minibatch that are assigned to this center - center_mask = nearest_center == center_idx - wsum = sample_weight[center_mask].sum() - - if wsum > 0: - if compute_squared_diff: - old_center_buffer[:] = centers[center_idx] - - # inplace remove previous count scaling - centers[center_idx] *= weight_sums[center_idx] - - # inplace sum with new points members of this cluster - centers[center_idx] += \ - np.sum(X[center_mask] * - sample_weight[center_mask, np.newaxis], axis=0) - - # update the count statistics for this center - weight_sums[center_idx] += wsum - - # inplace rescale to compute mean of all points (old and new) - # Note: numpy >= 1.10 does not support '/=' for the following - # expression for a mixture of int and float (see numpy issue #6464) - centers[center_idx] = centers[center_idx] / weight_sums[center_idx] - - # update the squared diff if necessary - if compute_squared_diff: - diff = centers[center_idx].ravel() - old_center_buffer.ravel() - squared_diff += np.dot(diff, diff) - - return inertia, squared_diff - - -def _mini_batch_convergence(model, iteration_idx, n_iter, tol, - n_samples, centers_squared_diff, batch_inertia, - context, verbose=0): - """Helper function to encapsulate the early stopping logic.""" - # Normalize inertia to be able to compare values when - # batch_size changes - batch_inertia /= model.batch_size - centers_squared_diff /= model.batch_size - - # Compute an Exponentially Weighted Average of the squared - # diff to monitor the convergence while discarding - # minibatch-local stochastic variability: - # https://en.wikipedia.org/wiki/Moving_average - ewa_diff = context.get('ewa_diff') - ewa_inertia = context.get('ewa_inertia') - if ewa_diff is None: - ewa_diff = centers_squared_diff - ewa_inertia = batch_inertia - else: - alpha = float(model.batch_size) * 2.0 / (n_samples + 1) - alpha = 1.0 if alpha > 1.0 else alpha - ewa_diff = ewa_diff * (1 - alpha) + centers_squared_diff * alpha - ewa_inertia = ewa_inertia * (1 - alpha) + batch_inertia * alpha - - # Log progress to be able to monitor convergence - if verbose: - progress_msg = ( - 'Minibatch iteration %d/%d:' - ' mean batch inertia: %f, ewa inertia: %f ' % ( - iteration_idx + 1, n_iter, batch_inertia, - ewa_inertia)) - print(progress_msg) - - # Early stopping based on absolute tolerance on squared change of - # centers position (using EWA smoothing) - if tol > 0.0 and ewa_diff <= tol: - if verbose: - print('Converged (small centers change) at iteration %d/%d' - % (iteration_idx + 1, n_iter)) - return True - - # Early stopping heuristic due to lack of improvement on smoothed inertia - ewa_inertia_min = context.get('ewa_inertia_min') - no_improvement = context.get('no_improvement', 0) - if ewa_inertia_min is None or ewa_inertia < ewa_inertia_min: - no_improvement = 0 - ewa_inertia_min = ewa_inertia - else: - no_improvement += 1 - - if (model.max_no_improvement is not None - and no_improvement >= model.max_no_improvement): - if verbose: - print('Converged (lack of improvement in inertia)' - ' at iteration %d/%d' - % 
(iteration_idx + 1, n_iter)) - return True - - # update the convergence context to maintain state across successive calls: - context['ewa_diff'] = ewa_diff - context['ewa_inertia'] = ewa_inertia - context['ewa_inertia_min'] = ewa_inertia_min - context['no_improvement'] = no_improvement - return False + return inertia class MiniBatchKMeans(KMeans): @@ -1531,8 +1439,13 @@ class MiniBatchKMeans(KMeans): Maximum number of iterations over the complete dataset before stopping independently of any early stopping criterion heuristics. - batch_size : int, default=100 + batch_size : int, default=1024 Size of the mini batches. + For faster compuations, you can set the ``batch_size`` greater than + 256 * number of cores to enable parallelism on all cores. + + .. versionchanged:: 1.0 + `batch_size` default changed from 100 to 1024. verbose : int, default=0 Verbosity mode. @@ -1570,7 +1483,8 @@ class MiniBatchKMeans(KMeans): only algorithm is initialized by running a batch KMeans on a random subset of the data. This needs to be larger than n_clusters. - If `None`, `init_size= 3 * batch_size`. + If `None`, the heuristic is `init_size = 3 * batch_size` if + `3 * batch_size < n_clusters`, else `init_size = 3 * n_clusters`. n_init : int, default=3 Number of random initializations that are tried. @@ -1578,11 +1492,12 @@ class MiniBatchKMeans(KMeans): best of the ``n_init`` initializations as measured by inertia. reassignment_ratio : float, default=0.01 - Control the fraction of the maximum number of counts for a - center to be reassigned. A higher value means that low count - centers are more easily reassigned, which means that the - model will take longer to converge, but should converge in a - better clustering. + Control the fraction of the maximum number of counts for a center to + be reassigned. A higher value means that low count centers are more + easily reassigned, which means that the model will take longer to + converge, but should converge in a better clustering. However, too high + a value may cause convergence issues, especially with a small batch + size. Attributes ---------- @@ -1590,17 +1505,24 @@ class MiniBatchKMeans(KMeans): cluster_centers_ : ndarray of shape (n_clusters, n_features) Coordinates of cluster centers. - labels_ : int + labels_ : ndarray of shape (n_samples,) Labels of each point (if compute_labels is set to True). inertia_ : float The value of the inertia criterion associated with the chosen - partition (if compute_labels is set to True). The inertia is - defined as the sum of square distances of samples to their nearest - neighbor. + partition if compute_labels is set to True. If compute_labels is set to + False, it's an approximation of the inertia based on an exponentially + weighted average of the batch inertiae. + The inertia is defined as the sum of square distances of samples to + their cluster center, weighted by the sample weights if provided. n_iter_ : int - Number of batches processed. + Number of iterations over the full dataset. + + n_steps_ : int + Number of minibatches processed. + + .. versionadded:: 1.0 counts_ : ndarray of shape (n_clusters,) Weigth sum of each cluster. @@ -1651,14 +1573,14 @@ class MiniBatchKMeans(KMeans): ... batch_size=6, ... 
max_iter=10).fit(X) >>> kmeans.cluster_centers_ - array([[3.95918367, 2.40816327], - [1.12195122, 1.3902439 ]]) + array([[1.19..., 1.22...], + [4.03..., 2.46...]]) >>> kmeans.predict([[0, 0], [4, 4]]) - array([1, 0], dtype=int32) + array([0, 1], dtype=int32) """ @_deprecate_positional_args def __init__(self, n_clusters=8, *, init='k-means++', max_iter=100, - batch_size=100, verbose=0, compute_labels=True, + batch_size=1024, verbose=0, compute_labels=True, random_state=None, tol=0.0, max_no_improvement=10, init_size=None, n_init=3, reassignment_ratio=0.01): @@ -1703,6 +1625,7 @@ def _check_params(self, X): if self.batch_size <= 0: raise ValueError( f"batch_size should be > 0, got {self.batch_size} instead.") + self._batch_size = min(self.batch_size, X.shape[0]) # init_size if self.init_size is not None and self.init_size <= 0: @@ -1710,7 +1633,7 @@ def _check_params(self, X): f"init_size should be > 0, got {self.init_size} instead.") self._init_size = self.init_size if self._init_size is None: - self._init_size = 3 * self.batch_size + self._init_size = 3 * self._batch_size if self._init_size < self.n_clusters: self._init_size = 3 * self.n_clusters elif self._init_size < self.n_clusters: @@ -1728,6 +1651,80 @@ def _check_params(self, X): f"reassignment_ratio should be >= 0, got " f"{self.reassignment_ratio} instead.") + def _mini_batch_convergence(self, step, n_steps, n_samples, + centers_squared_diff, batch_inertia): + """Helper function to encapsulate the early stopping logic""" + # Normalize inertia to be able to compare values when + # batch_size changes + batch_inertia /= self._batch_size + + # count steps starting from 1 for user friendly verbose mode. + step = step + 1 + + # Ignore first iteration because it's inertia from initialization. + if step == 1: + if self.verbose: + print(f"Minibatch step {step}/{n_steps}: mean batch " + f"inertia: {batch_inertia}") + return False + + # Compute an Exponentially Weighted Average of the inertia to + # monitor the convergence while discarding minibatch-local stochastic + # variability: https://en.wikipedia.org/wiki/Moving_average + if self._ewa_inertia is None: + self._ewa_inertia = batch_inertia + else: + alpha = self._batch_size * 2.0 / (n_samples + 1) + alpha = min(alpha, 1) + self._ewa_inertia = ( + self._ewa_inertia * (1 - alpha) + batch_inertia * alpha) + + # Log progress to be able to monitor convergence + if self.verbose: + print(f"Minibatch step {step}/{n_steps}: mean batch inertia: " + f"{batch_inertia}, ewa inertia: {self._ewa_inertia}") + + # Early stopping based on absolute tolerance on squared change of + # centers position + if self._tol > 0.0 and centers_squared_diff <= self._tol: + if self.verbose: + print(f"Converged (small centers change) at step " + f"{step}/{n_steps}") + return True + + # Early stopping heuristic due to lack of improvement on smoothed + # inertia + if (self._ewa_inertia_min is None or + self._ewa_inertia < self._ewa_inertia_min): + self._no_improvement = 0 + self._ewa_inertia_min = self._ewa_inertia + else: + self._no_improvement += 1 + + if (self.max_no_improvement is not None + and self._no_improvement >= self.max_no_improvement): + if self.verbose: + print(f"Converged (lack of improvement in inertia) at step " + f"{step}/{n_steps}") + return True + + return False + + def _random_reassign(self): + """Check if a random reassignment needs to be done. + + Do random reassignments each time 10 * n_clusters samples have been + processed. + + If there are empty clusters we always want to reassign. 
+ """ + self._n_since_last_reassign += self._batch_size + if ((self._counts == 0).any() or + self._n_since_last_reassign >= (10 * self.n_clusters)): + self._n_since_last_reassign = 0 + return True + return False + def fit(self, X, y=None, sample_weight=None): """Compute the centroids on X by chunking it into mini-batches. @@ -1737,13 +1734,15 @@ def fit(self, X, y=None, sample_weight=None): Training instances to cluster. It must be noted that the data will be converted to C ordering, which will cause a memory copy if the given data is not C-contiguous. + If a sparse matrix is passed, a copy will be made if it's not in + CSR format. y : Ignored Not used, present here for API consistency by convention. sample_weight : array-like of shape (n_samples,), default=None The weights for each observation in X. If None, all observations - are assigned equal weight (default: None). + are assigned equal weight. .. versionadded:: 0.20 @@ -1758,6 +1757,7 @@ def fit(self, X, y=None, sample_weight=None): self._check_params(X) random_state = check_random_state(self.random_state) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + n_samples, n_features = X.shape # Validate init array init = self.init @@ -1765,182 +1765,146 @@ def fit(self, X, y=None, sample_weight=None): init = check_array(init, dtype=X.dtype, copy=True, order='C') self._validate_center_shape(X, init) - n_samples, n_features = X.shape - x_squared_norms = row_norms(X, squared=True) - - if self.tol > 0.0: - tol = _tolerance(X, self.tol) + self._check_mkl_vcomp(X, self._batch_size) - # using tol-based early stopping needs the allocation of a - # dedicated before which can be expensive for high dim data: - # hence we allocate it outside of the main loop - old_center_buffer = np.zeros(n_features, dtype=X.dtype) - else: - tol = 0.0 - # no need for the center buffer if tol-based early stopping is - # disabled - old_center_buffer = np.zeros(0, dtype=X.dtype) - - distances = np.zeros(self.batch_size, dtype=X.dtype) - n_batches = int(np.ceil(float(n_samples) / self.batch_size)) - n_iter = int(self.max_iter * n_batches) - - self._check_mkl_vcomp(X, self.batch_size) + # precompute squared norms of data points + x_squared_norms = row_norms(X, squared=True) + # Validation set for the init validation_indices = random_state.randint(0, n_samples, self._init_size) X_valid = X[validation_indices] sample_weight_valid = sample_weight[validation_indices] x_squared_norms_valid = x_squared_norms[validation_indices] - # perform several inits with random sub-sets + # perform several inits with random subsets best_inertia = None for init_idx in range(self._n_init): if self.verbose: - print("Init %d/%d with method: %s" - % (init_idx + 1, self._n_init, init)) - weight_sums = np.zeros(self.n_clusters, dtype=sample_weight.dtype) - - # TODO: once the `k_means` function works with sparse input we - # should refactor the following init to use it instead. + print(f"Init {init_idx + 1}/{self._n_init} with method {init}") # Initialize the centers using only a fraction of the data as we - # expect n_samples to be very large when using MiniBatchKMeans + # expect n_samples to be very large when using MiniBatchKMeans. 
cluster_centers = self._init_centroids( - X, x_squared_norms=x_squared_norms, - init=init, - random_state=random_state, - init_size=self._init_size) - - # Compute the label assignment on the init dataset - _mini_batch_step( - X_valid, sample_weight_valid, - x_squared_norms[validation_indices], cluster_centers, - weight_sums, old_center_buffer, False, distances=None, - verbose=self.verbose) - - # Keep only the best cluster centers across independent inits on - # the common validation set - _, inertia = _labels_inertia(X_valid, sample_weight_valid, - x_squared_norms_valid, - cluster_centers) + X, x_squared_norms=x_squared_norms, init=init, + random_state=random_state, init_size=self._init_size) + + # Compute inertia on a validation set. + _, inertia = _labels_inertia_threadpool_limit( + X_valid, sample_weight_valid, x_squared_norms_valid, + cluster_centers, n_threads=self._n_threads) + if self.verbose: - print("Inertia for init %d/%d: %f" - % (init_idx + 1, self._n_init, inertia)) + print(f"Inertia for init {init_idx + 1}/{self._n_init}: " + f"{inertia}") if best_inertia is None or inertia < best_inertia: - self.cluster_centers_ = cluster_centers - self._counts = weight_sums + init_centers = cluster_centers best_inertia = inertia - # Empty context to be used inplace by the convergence check routine - convergence_context = {} - - # Perform the iterative optimization until the final convergence - # criterion - for iteration_idx in range(n_iter): - # Sample a minibatch from the full dataset - minibatch_indices = random_state.randint( - 0, n_samples, self.batch_size) - - # Perform the actual update step on the minibatch data - batch_inertia, centers_squared_diff = _mini_batch_step( - X[minibatch_indices], sample_weight[minibatch_indices], - x_squared_norms[minibatch_indices], - self.cluster_centers_, self._counts, - old_center_buffer, tol > 0.0, distances=distances, - # Here we randomly choose whether to perform - # random reassignment: the choice is done as a function - # of the iteration index, and the minimum number of - # counts, in order to force this reassignment to happen - # every once in a while - random_reassign=((iteration_idx + 1) - % (10 + int(self._counts.min())) == 0), - random_state=random_state, - reassignment_ratio=self.reassignment_ratio, - verbose=self.verbose) - - # Monitor convergence and do early stopping if necessary - if _mini_batch_convergence( - self, iteration_idx, n_iter, tol, n_samples, - centers_squared_diff, batch_inertia, convergence_context, - verbose=self.verbose): - break + centers = init_centers + centers_new = np.empty_like(centers) + + # Initialize counts + self._counts = np.zeros(self.n_clusters, dtype=X.dtype) + + # Attributes to monitor the convergence + self._ewa_inertia = None + self._ewa_inertia_min = None + self._no_improvement = 0 + + # Initialize number of samples seen since last reassignment + self._n_since_last_reassign = 0 + + n_steps = (self.max_iter * n_samples) // self._batch_size + + with threadpool_limits(limits=1, user_api="blas"): + # Perform the iterative optimization until convergence + for i in range(n_steps): + # Sample a minibatch from the full dataset + minibatch_indices = random_state.randint(0, n_samples, + self._batch_size) + + # Perform the actual update step on the minibatch data + batch_inertia = _mini_batch_step( + X=X[minibatch_indices], + x_squared_norms=x_squared_norms[minibatch_indices], + sample_weight=sample_weight[minibatch_indices], + centers=centers, + centers_new=centers_new, + weight_sums=self._counts, + 
random_state=random_state, + random_reassign=self._random_reassign(), + reassignment_ratio=self.reassignment_ratio, + verbose=self.verbose, + n_threads=self._n_threads) + + if self._tol > 0.0: + centers_squared_diff = np.sum((centers_new - centers)**2) + else: + centers_squared_diff = 0 + + centers, centers_new = centers_new, centers + + # Monitor convergence and do early stopping if necessary + if self._mini_batch_convergence( + i, n_steps, n_samples, centers_squared_diff, + batch_inertia): + break + + self.cluster_centers_ = centers - self.n_iter_ = iteration_idx + 1 + self.n_steps_ = i + 1 + self.n_iter_ = int(np.ceil(((i + 1) * self._batch_size) / n_samples)) if self.compute_labels: - self.labels_, self.inertia_ = \ - self._labels_inertia_minibatch(X, sample_weight) + self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( + X, sample_weight, x_squared_norms, self.cluster_centers_, + n_threads=self._n_threads) + else: + self.inertia_ = self._ewa_inertia * n_samples return self - def _labels_inertia_minibatch(self, X, sample_weight): - """Compute labels and inertia using mini batches. - - This is slightly slower than doing everything at once but prevents - memory errors / segfaults. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Input data. - - sample_weight : array-like of shape (n_samples,) - The weights for each observation in X. - - Returns - ------- - labels : ndarray of shape (n_samples,) - Cluster labels for each point. - - inertia : float - Sum of squared distances of points to nearest cluster. - """ - if self.verbose: - print('Computing label assignment and total inertia') - sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - x_squared_norms = row_norms(X, squared=True) - slices = gen_batches(X.shape[0], self.batch_size) - results = [_labels_inertia(X[s], sample_weight[s], x_squared_norms[s], - self.cluster_centers_) for s in slices] - labels, inertia = zip(*results) - return np.hstack(labels), np.sum(inertia) - def partial_fit(self, X, y=None, sample_weight=None): """Update k means estimate on a single mini-batch X. Parameters ---------- - X : array-like of shape (n_samples, n_features) - Coordinates of the data points to cluster. It must be noted that - X will be copied if it is not C-contiguous. + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training instances to cluster. It must be noted that the data + will be converted to C ordering, which will cause a memory copy + if the given data is not C-contiguous. + If a sparse matrix is passed, a copy will be made if it's not in + CSR format. y : Ignored Not used, present here for API consistency by convention. sample_weight : array-like of shape (n_samples,), default=None The weights for each observation in X. If None, all observations - are assigned equal weight (default: None). + are assigned equal weight. 
Returns ------- self """ - is_first_call_to_partial_fit = not hasattr(self, 'cluster_centers_') + has_centers = hasattr(self, 'cluster_centers_') X = self._validate_data(X, accept_sparse='csr', dtype=[np.float64, np.float32], order='C', accept_large_sparse=False, - reset=is_first_call_to_partial_fit) + reset=not has_centers) self._random_state = getattr(self, "_random_state", check_random_state(self.random_state)) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + self.n_steps_ = getattr(self, "n_steps_", 0) + # precompute squared norms of data points x_squared_norms = row_norms(X, squared=True) - if is_first_call_to_partial_fit: - # this is the first call to partial_fit on this object + if not has_centers: + # this instance has not been fitted yet (fit or partial_fit) self._check_params(X) # Validate init array @@ -1953,34 +1917,34 @@ def partial_fit(self, X, y=None, sample_weight=None): # initialize the cluster centers self.cluster_centers_ = self._init_centroids( - X, x_squared_norms=x_squared_norms, - init=init, - random_state=self._random_state, - init_size=self._init_size) - - self._counts = np.zeros(self.n_clusters, - dtype=sample_weight.dtype) - random_reassign = False - distances = None - else: - # The lower the minimum count is, the more we do random - # reassignment, however, we don't want to do random - # reassignment too often, to allow for building up counts - random_reassign = self._random_state.randint( - 10 * (1 + self._counts.min())) == 0 - distances = np.zeros(X.shape[0], dtype=X.dtype) - - _mini_batch_step(X, sample_weight, x_squared_norms, - self.cluster_centers_, self._counts, - np.zeros(0, dtype=X.dtype), 0, - random_reassign=random_reassign, distances=distances, - random_state=self._random_state, - reassignment_ratio=self.reassignment_ratio, - verbose=self.verbose) + X, x_squared_norms=x_squared_norms, init=init, + random_state=self._random_state, init_size=self._init_size) + + # Initialize counts + self._counts = np.zeros(self.n_clusters, dtype=X.dtype) + + # Initialize number of samples seen since last reassignment + self._n_since_last_reassign = 0 + + with threadpool_limits(limits=1, user_api="blas"): + _mini_batch_step(X, + x_squared_norms=x_squared_norms, + sample_weight=sample_weight, + centers=self.cluster_centers_, + centers_new=self.cluster_centers_, + weight_sums=self._counts, + random_state=self._random_state, + random_reassign=self._random_reassign(), + reassignment_ratio=self.reassignment_ratio, + verbose=self.verbose, + n_threads=self._n_threads) if self.compute_labels: - self.labels_, self.inertia_ = _labels_inertia( - X, sample_weight, x_squared_norms, self.cluster_centers_) + self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( + X, sample_weight, x_squared_norms, self.cluster_centers_, + n_threads=self._n_threads) + + self.n_steps_ += 1 return self @@ -1998,7 +1962,7 @@ def predict(self, X, sample_weight=None): sample_weight : array-like of shape (n_samples,), default=None The weights for each observation in X. If None, all observations - are assigned equal weight (default: None). + are assigned equal weight. 
Returns ------- @@ -2008,7 +1972,14 @@ def predict(self, X, sample_weight=None): check_is_fitted(self) X = self._check_test_data(X) - return self._labels_inertia_minibatch(X, sample_weight)[0] + x_squared_norms = row_norms(X, squared=True) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + labels, _ = _labels_inertia_threadpool_limit( + X, sample_weight, x_squared_norms, self.cluster_centers_, + n_threads=self._n_threads) + + return labels def _more_tags(self): return { diff --git a/sklearn/cluster/setup.py b/sklearn/cluster/setup.py index 48ed25c5c0eaf..9a85541731e5f 100644 --- a/sklearn/cluster/setup.py +++ b/sklearn/cluster/setup.py @@ -25,8 +25,8 @@ def configuration(parent_package='', top_path=None): include_dirs=[numpy.get_include()], libraries=libraries) - config.add_extension('_k_means_fast', - sources=['_k_means_fast.pyx'], + config.add_extension('_k_means_common', + sources=['_k_means_common.pyx'], include_dirs=[numpy.get_include()], libraries=libraries) @@ -40,6 +40,11 @@ def configuration(parent_package='', top_path=None): include_dirs=[numpy.get_include()], libraries=libraries) + config.add_extension('_k_means_minibatch', + sources=['_k_means_minibatch.pyx'], + include_dirs=[numpy.get_include()], + libraries=libraries) + config.add_subpackage('tests') return config diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 341b00c5c137f..248b2e1ddd498 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -9,9 +9,7 @@ import pytest from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose -from sklearn.utils._testing import assert_almost_equal from sklearn.utils.fixes import _astype_copy_false from sklearn.base import clone from sklearn.exceptions import ConvergenceWarning @@ -24,12 +22,12 @@ from sklearn.cluster import MiniBatchKMeans from sklearn.cluster._kmeans import _labels_inertia from sklearn.cluster._kmeans import _mini_batch_step -from sklearn.cluster._k_means_fast import _relocate_empty_clusters_dense -from sklearn.cluster._k_means_fast import _relocate_empty_clusters_sparse -from sklearn.cluster._k_means_fast import _euclidean_dense_dense_wrapper -from sklearn.cluster._k_means_fast import _euclidean_sparse_dense_wrapper -from sklearn.cluster._k_means_fast import _inertia_dense -from sklearn.cluster._k_means_fast import _inertia_sparse +from sklearn.cluster._k_means_common import _relocate_empty_clusters_dense +from sklearn.cluster._k_means_common import _relocate_empty_clusters_sparse +from sklearn.cluster._k_means_common import _euclidean_dense_dense_wrapper +from sklearn.cluster._k_means_common import _euclidean_sparse_dense_wrapper +from sklearn.cluster._k_means_common import _inertia_dense +from sklearn.cluster._k_means_common import _inertia_sparse from sklearn.datasets import make_blobs from io import StringIO @@ -176,68 +174,58 @@ def test_kmeans_convergence(algorithm): def test_minibatch_update_consistency(): # Check that dense and sparse minibatch update give the same results rng = np.random.RandomState(42) - old_centers = centers + rng.normal(size=centers.shape) - new_centers = old_centers.copy() - new_centers_csr = old_centers.copy() + centers_old = centers + rng.normal(size=centers.shape) + centers_old_csr = centers_old.copy() - weight_sums = np.zeros(new_centers.shape[0], dtype=np.double) - weight_sums_csr = np.zeros(new_centers.shape[0], 
dtype=np.double) + centers_new = np.zeros_like(centers_old) + centers_new_csr = np.zeros_like(centers_old_csr) + + weight_sums = np.zeros(centers_old.shape[0], dtype=X.dtype) + weight_sums_csr = np.zeros(centers_old.shape[0], dtype=X.dtype) x_squared_norms = (X ** 2).sum(axis=1) x_squared_norms_csr = row_norms(X_csr, squared=True) - buffer = np.zeros(centers.shape[1], dtype=np.double) - buffer_csr = np.zeros(centers.shape[1], dtype=np.double) + sample_weight = np.ones(X.shape[0], dtype=X.dtype) # extract a small minibatch X_mb = X[:10] X_mb_csr = X_csr[:10] x_mb_squared_norms = x_squared_norms[:10] x_mb_squared_norms_csr = x_squared_norms_csr[:10] - - sample_weight_mb = np.ones(X_mb.shape[0], dtype=np.double) + sample_weight_mb = sample_weight[:10] # step 1: compute the dense minibatch update - old_inertia, incremental_diff = _mini_batch_step( - X_mb, sample_weight_mb, x_mb_squared_norms, new_centers, weight_sums, - buffer, 1, None, random_reassign=False) + old_inertia = _mini_batch_step( + X_mb, x_mb_squared_norms, sample_weight_mb, centers_old, centers_new, + weight_sums, np.random.RandomState(0), random_reassign=False) assert old_inertia > 0.0 # compute the new inertia on the same batch to check that it decreased labels, new_inertia = _labels_inertia( - X_mb, sample_weight_mb, x_mb_squared_norms, new_centers) + X_mb, sample_weight_mb, x_mb_squared_norms, centers_new) assert new_inertia > 0.0 assert new_inertia < old_inertia - # check that the incremental difference computation is matching the - # final observed value - effective_diff = np.sum((new_centers - old_centers) ** 2) - assert_almost_equal(incremental_diff, effective_diff) - # step 2: compute the sparse minibatch update - old_inertia_csr, incremental_diff_csr = _mini_batch_step( - X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr, - weight_sums_csr, buffer_csr, 1, None, random_reassign=False) + old_inertia_csr = _mini_batch_step( + X_mb_csr, x_mb_squared_norms_csr, sample_weight_mb, centers_old_csr, + centers_new_csr, weight_sums_csr, np.random.RandomState(0), + random_reassign=False) assert old_inertia_csr > 0.0 # compute the new inertia on the same batch to check that it decreased labels_csr, new_inertia_csr = _labels_inertia( - X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr) + X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, centers_new_csr) assert new_inertia_csr > 0.0 assert new_inertia_csr < old_inertia_csr - # check that the incremental difference computation is matching the - # final observed value - effective_diff = np.sum((new_centers_csr - old_centers) ** 2) - assert_almost_equal(incremental_diff_csr, effective_diff) - # step 3: check that sparse and dense updates lead to the same results assert_array_equal(labels, labels_csr) - assert_array_almost_equal(new_centers, new_centers_csr) - assert_almost_equal(incremental_diff, incremental_diff_csr) - assert_almost_equal(old_inertia, old_inertia_csr) - assert_almost_equal(new_inertia, new_inertia_csr) + assert_allclose(centers_new, centers_new_csr) + assert_allclose(old_inertia, old_inertia_csr) + assert_allclose(new_inertia, new_inertia_csr) def _check_fitted_model(km): @@ -250,7 +238,7 @@ def _check_fitted_model(km): assert np.unique(labels).shape[0] == n_clusters # check that the labels assignment are perfect (up to a permutation) - assert v_measure_score(true_labels, labels) == 1.0 + assert_allclose(v_measure_score(true_labels, labels), 1.0) assert km.inertia_ > 0.0 @@ -412,66 +400,54 @@ def test_minibatch_sensible_reassign(): 
assert km.cluster_centers_.any(axis=1).sum() > 10 -def test_minibatch_reassign(): - # Give a perfect initialization, but a large reassignment_ratio, - # as a result all the centers should be reassigned and the model - # should no longer be good - sample_weight = np.ones(X.shape[0], dtype=X.dtype) - for this_X in (X, X_csr): - mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100, - random_state=42) - mb_k_means.fit(this_X) - - score_before = mb_k_means.score(this_X) - try: - old_stdout = sys.stdout - sys.stdout = StringIO() - # Turn on verbosity to smoke test the display code - _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), - mb_k_means.cluster_centers_, - mb_k_means._counts, - np.zeros(X.shape[1], np.double), - False, distances=np.zeros(X.shape[0]), - random_reassign=True, random_state=42, - reassignment_ratio=1, verbose=True) - finally: - sys.stdout = old_stdout - assert score_before > mb_k_means.score(this_X) +@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) +def test_minibatch_reassign(data): + # Check the reassignment part of the minibatch step with very high or very + # low reassignment ratio. + perfect_centers = np.empty((n_clusters, n_features)) + for i in range(n_clusters): + perfect_centers[i] = X[true_labels == i].mean(axis=0) + + x_squared_norms = row_norms(data, squared=True) + sample_weight = np.ones(n_samples) + centers_new = np.empty_like(perfect_centers) + + # Give a perfect initialization, but a large reassignment_ratio, as a + # result many centers should be reassigned and the model should no longer + # be good + score_before = - _labels_inertia(data, sample_weight, x_squared_norms, + perfect_centers, 1)[1] + + _mini_batch_step(data, x_squared_norms, sample_weight, perfect_centers, + centers_new, np.zeros(n_clusters), + np.random.RandomState(0), random_reassign=True, + reassignment_ratio=1) + + score_after = - _labels_inertia(data, sample_weight, x_squared_norms, + centers_new, 1)[1] + + assert score_before > score_after # Give a perfect initialization, with a small reassignment_ratio, - # no center should be reassigned - for this_X in (X, X_csr): - mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100, - init=centers.copy(), - random_state=42, n_init=1) - mb_k_means.fit(this_X) - clusters_before = mb_k_means.cluster_centers_ - # Turn on verbosity to smoke test the display code - _mini_batch_step(this_X, sample_weight, (X ** 2).sum(axis=1), - mb_k_means.cluster_centers_, - mb_k_means._counts, - np.zeros(X.shape[1], np.double), - False, distances=np.zeros(X.shape[0]), - random_reassign=True, random_state=42, - reassignment_ratio=1e-15) - assert_array_almost_equal(clusters_before, mb_k_means.cluster_centers_) + # no center should be reassigned. + _mini_batch_step(data, x_squared_norms, sample_weight, perfect_centers, + centers_new, np.zeros(n_clusters), + np.random.RandomState(0), random_reassign=True, + reassignment_ratio=1e-15) + + assert_allclose(centers_new, perfect_centers) def test_minibatch_with_many_reassignments(): # Test for the case that the number of clusters to reassign is bigger - # than the batch_size - n_samples = 550 - rnd = np.random.RandomState(42) - X = rnd.uniform(size=(n_samples, 10)) - # Check that the fit works if n_clusters is bigger than the batch_size. 
- # Run the test with 550 clusters and 550 samples, because it turned out - # that this values ensure that the number of clusters to reassign - # is always bigger than the batch_size - n_clusters = 550 - MiniBatchKMeans(n_clusters=n_clusters, - batch_size=100, + # than the batch_size. Run the test with 100 clusters and a batch_size of + # 10 because it turned out that these values ensure that the number of + # clusters to reassign is always bigger than the batch_size. + MiniBatchKMeans(n_clusters=100, + batch_size=10, init_size=n_samples, - random_state=42).fit(X) + random_state=42, + verbose=True).fit(X) def test_minibatch_kmeans_init_size(): @@ -491,6 +467,46 @@ def test_minibatch_kmeans_init_size(): assert km._init_size == n_samples +@pytest.mark.parametrize("tol, max_no_improvement", [(1e-4, None), (0, 10)]) +def test_minibatch_declared_convergence(capsys, tol, max_no_improvement): + # Check convergence detection based on ewa batch inertia or on + # small center change. + X, _, centers = make_blobs(centers=3, random_state=0, return_centers=True) + + km = MiniBatchKMeans(n_clusters=3, init=centers, batch_size=20, tol=tol, + random_state=0, max_iter=10, n_init=1, verbose=1, + max_no_improvement=max_no_improvement) + + km.fit(X) + assert 1 < km.n_iter_ < 10 + + captured = capsys.readouterr() + if max_no_improvement is None: + assert "Converged (small centers change)" in captured.out + if tol == 0: + assert "Converged (lack of improvement in inertia)" in captured.out + + +def test_minibatch_iter_steps(): + # Check consistency of n_iter_ and n_steps_ attributes. + batch_size = 30 + n_samples = X.shape[0] + km = MiniBatchKMeans(n_clusters=3, batch_size=batch_size, + random_state=0).fit(X) + + # n_iter_ is the number of started epochs + assert km.n_iter_ == np.ceil((km.n_steps_ * batch_size) / n_samples) + assert isinstance(km.n_iter_, int) + + # without stopping condition, max_iter should be reached + km = MiniBatchKMeans(n_clusters=3, batch_size=batch_size, random_state=0, + tol=0, max_no_improvement=None, max_iter=10).fit(X) + + assert km.n_iter_ == 10 + assert km.n_steps_ == (10 * n_samples) // batch_size + assert isinstance(km.n_steps_, int) + + def test_kmeans_copyx(): # Check that copy_x=False returns nearly equal X after de-centering. my_X = X.copy() @@ -584,6 +600,19 @@ def test_predict(Estimator, algorithm, init, dtype, array_constr): assert_allclose(v_measure_score(pred, np.arange(10)), 1) +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_dense_sparse(Estimator): + # Check that the results are the same for dense and sparse input. 
+ sample_weight = np.random.RandomState(0).random_sample((n_samples,)) + km_dense = Estimator(n_clusters=n_clusters, random_state=0, n_init=1) + km_dense.fit(X, sample_weight=sample_weight) + km_sparse = Estimator(n_clusters=n_clusters, random_state=0, n_init=1) + km_sparse.fit(X_csr, sample_weight=sample_weight) + + assert_array_equal(km_dense.labels_, km_sparse.labels_) + assert_allclose(km_dense.cluster_centers_, km_sparse.cluster_centers_) + + @pytest.mark.parametrize("init", ["random", "k-means++", centers], ids=["random", "k-means++", "ndarray"]) @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) @@ -801,17 +830,19 @@ def test_unit_weights_vs_no_weights(Estimator, data): assert_allclose(km_none.cluster_centers_, km_ones.cluster_centers_) -def test_scaled_weights(): - # scaling all sample weights by a common factor +@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"]) +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_scaled_weights(Estimator, data): + # Check that scaling all sample weights by a common factor # shouldn't change the result - sample_weight = np.ones(n_samples) - for estimator in [KMeans(n_clusters=n_clusters, random_state=42), - MiniBatchKMeans(n_clusters=n_clusters, random_state=42)]: - est_1 = clone(estimator).fit(X) - est_2 = clone(estimator).fit(X, sample_weight=0.5*sample_weight) - assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_), 1.0) - assert_almost_equal(_sort_centers(est_1.cluster_centers_), - _sort_centers(est_2.cluster_centers_)) + sample_weight = np.random.RandomState(0).uniform(n_samples) + + km = Estimator(n_clusters=n_clusters, random_state=42, n_init=1) + km_orig = clone(km).fit(data, sample_weight=sample_weight) + km_scaled = clone(km).fit(data, sample_weight=0.5 * sample_weight) + + assert_array_equal(km_orig.labels_, km_scaled.labels_) + assert_allclose(km_orig.cluster_centers_, km_scaled.cluster_centers_) def test_kmeans_elkan_iter_attribute(): @@ -837,18 +868,19 @@ def test_kmeans_empty_cluster_relocated(array_constr): assert_allclose(km.cluster_centers_, [[-1], [1]]) -def test_result_of_kmeans_equal_in_diff_n_threads(): - # Check that KMeans gives the same results in parallel mode than in - # sequential mode. +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_result_equal_in_diff_n_threads(Estimator): + # Check that KMeans/MiniBatchKMeans give the same results in parallel mode + # than in sequential mode. rnd = np.random.RandomState(0) X = rnd.normal(size=(50, 10)) with threadpool_limits(limits=1, user_api="openmp"): - result_1 = KMeans( - n_clusters=3, random_state=0).fit(X).labels_ + result_1 = Estimator( + n_clusters=n_clusters, random_state=0).fit(X).labels_ with threadpool_limits(limits=2, user_api="openmp"): - result_2 = KMeans( - n_clusters=3, random_state=0).fit(X).labels_ + result_2 = Estimator( + n_clusters=n_clusters, random_state=0).fit(X).labels_ assert_array_equal(result_1, result_2) @@ -954,6 +986,7 @@ def test_euclidean_distance(dtype, squared): @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_inertia(dtype): + # Check that the _inertia_(dense/sparse) helpers produce correct results. 
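    # i.e. inertia == sum_i sample_weight[i] * ||X[i] - centers[labels[i]]||**2,
    # which the explicit NumPy computation below reproduces.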
rng = np.random.RandomState(0) X_sparse = sp.random(100, 10, density=0.5, format="csr", random_state=rng, dtype=dtype) @@ -965,8 +998,10 @@ def test_inertia(dtype): distances = ((X_dense - centers[labels])**2).sum(axis=1) expected = np.sum(distances * sample_weight) - inertia_dense = _inertia_dense(X_dense, sample_weight, centers, labels) - inertia_sparse = _inertia_sparse(X_sparse, sample_weight, centers, labels) + inertia_dense = _inertia_dense( + X_dense, sample_weight, centers, labels, n_threads=1) + inertia_sparse = _inertia_sparse( + X_sparse, sample_weight, centers, labels, n_threads=1) assert_allclose(inertia_dense, inertia_sparse, rtol=1e-6) assert_allclose(inertia_dense, expected, rtol=1e-6) From 2c5ea4e6b3add57588fb35293b7dd25506c5fe06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Thu, 15 Apr 2021 14:33:22 +0200 Subject: [PATCH 331/478] DictionaryLearning: Fix several issues in the dict update (#19198) Co-authored-by: Olivier Grisel --- doc/whats_new/v1.0.rst | 14 ++ sklearn/decomposition/_dict_learning.py | 157 ++++++++---------- .../decomposition/tests/test_dict_learning.py | 28 ++++ 3 files changed, 115 insertions(+), 84 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 5975177f7a0c8..0494e5f29bf39 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -159,11 +159,25 @@ Changelog - |Fix| Fixes incorrect multiple data-conversion warnings when clustering boolean data. :pr:`19046` by :user:`Surya Prakash `. +:mod:`sklearn.decomposition` +............................ + - |Fix| Fixed :func:`dict_learning`, used by :class:`DictionaryLearning`, to ensure determinism of the output. Achieved by flipping signs of the SVD output which is used to initialize the code. :pr:`18433` by :user:`Bruno Charron `. +- |Fix| Fixed a bug in :class:`MiniBatchDictionaryLearning`, + :class:`MiniBatchSparsePCA` and :func:`dict_learning_online` where the + update of the dictionary was incorrect. :pr:`19198` by + :user:`Jérémie du Boisberranger `. + +- |Fix| Fixed a bug in :class:`DictionaryLearning`, :class:`SparsePCA`, + :class:`MiniBatchDictionaryLearning`, :class:`MiniBatchSparsePCA`, + :func:`dict_learning` and :func:`dict_learning_online` where the restart of + unused atoms during the dictionary update was not working as expected. + :pr:`19198` by :user:`Jérémie du Boisberranger `. + :mod:`sklearn.ensemble` ....................... diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index e2ae9f8355a54..bd8a95e37dbaf 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -355,28 +355,32 @@ def sparse_encode(X, dictionary, *, gram=None, cov=None, return code -def _update_dict(dictionary, Y, code, verbose=False, return_r2=False, +def _update_dict(dictionary, Y, code, A=None, B=None, verbose=False, random_state=None, positive=False): """Update the dense dictionary factor in place. Parameters ---------- - dictionary : ndarray of shape (n_features, n_components) + dictionary : ndarray of shape (n_components, n_features) Value of the dictionary at the previous iteration. - Y : ndarray of shape (n_features, n_samples) + Y : ndarray of shape (n_samples, n_features) Data matrix. - code : ndarray of shape (n_components, n_samples) + code : ndarray of shape (n_samples, n_components) Sparse coding of the data against which to optimize the dictionary. 
+ A : ndarray of shape (n_components, n_components), default=None + Together with `B`, sufficient stats of the online model to update the + dictionary. + + B : ndarray of shape (n_features, n_components), default=None + Together with `A`, sufficient stats of the online model to update the + dictionary. + verbose: bool, default=False Degree of output the procedure will print. - return_r2 : bool, default=False - Whether to compute and return the residual sum of squares corresponding - to the computed solution. - random_state : int, RandomState instance or None, default=None Used for randomly initializing the dictionary. Pass an int for reproducible results across multiple function calls. @@ -386,54 +390,41 @@ def _update_dict(dictionary, Y, code, verbose=False, return_r2=False, Whether to enforce positivity when finding the dictionary. .. versionadded:: 0.20 - - Returns - ------- - dictionary : ndarray of shape (n_features, n_components) - Updated dictionary. """ - n_components = len(code) - n_features = Y.shape[0] + n_samples, n_components = code.shape random_state = check_random_state(random_state) - # Get BLAS functions - gemm, = linalg.get_blas_funcs(('gemm',), (dictionary, code, Y)) - ger, = linalg.get_blas_funcs(('ger',), (dictionary, code)) - nrm2, = linalg.get_blas_funcs(('nrm2',), (dictionary,)) - # Residuals, computed with BLAS for speed and efficiency - # R <- -1.0 * U * V^T + 1.0 * Y - # Outputs R as Fortran array for efficiency - R = gemm(-1.0, dictionary, code, 1.0, Y) + + if A is None: + A = code.T @ code + if B is None: + B = Y.T @ code + + n_unused = 0 + for k in range(n_components): - # R <- 1.0 * U_k * V_k^T + R - R = ger(1.0, dictionary[:, k], code[k, :], a=R, overwrite_a=True) - dictionary[:, k] = np.dot(R, code[k, :]) - if positive: - np.clip(dictionary[:, k], 0, None, out=dictionary[:, k]) - # Scale k'th atom - # (U_k * U_k) ** 0.5 - atom_norm = nrm2(dictionary[:, k]) - if atom_norm < 1e-10: - if verbose == 1: - sys.stdout.write("+") - sys.stdout.flush() - elif verbose: - print("Adding new random atom") - dictionary[:, k] = random_state.randn(n_features) - if positive: - np.clip(dictionary[:, k], 0, None, out=dictionary[:, k]) - # Setting corresponding coefs to 0 - code[k, :] = 0.0 - # (U_k * U_k) ** 0.5 - atom_norm = nrm2(dictionary[:, k]) - dictionary[:, k] /= atom_norm + if A[k, k] > 1e-6: + # 1e-6 is arbitrary but consistent with the spams implementation + dictionary[k] += (B[:, k] - A[k] @ dictionary) / A[k, k] else: - dictionary[:, k] /= atom_norm - # R <- -1.0 * U_k * V_k^T + R - R = ger(-1.0, dictionary[:, k], code[k, :], a=R, overwrite_a=True) - if return_r2: - R = nrm2(R) ** 2.0 - return dictionary, R - return dictionary + # kth atom is almost never used -> sample a new one from the data + newd = Y[random_state.choice(n_samples)] + + # add small noise to avoid making the sparse coding ill conditioned + noise_level = 0.01 * (newd.std() or 1) # avoid 0 std + noise = random_state.normal(0, noise_level, size=len(newd)) + + dictionary[k] = newd + noise + code[:, k] = 0 + n_unused += 1 + + if positive: + np.clip(dictionary[k], 0, None, out=dictionary[k]) + + # Projection on the constraint set ||V_k|| == 1 + dictionary[k] /= linalg.norm(dictionary[k]) + + if verbose and n_unused > 0: + print(f"{n_unused} unused atoms resampled.") @_deprecate_positional_args @@ -579,10 +570,9 @@ def dict_learning(X, n_components, *, alpha, max_iter=100, tol=1e-8, dictionary = np.r_[dictionary, np.zeros((n_components - r, dictionary.shape[1]))] - # Fortran-order dict, as we are going 
to access its row vectors - dictionary = np.array(dictionary, order='F') - - residuals = 0 + # Fortran-order dict better suited for the sparse coding which is the + # bottleneck of this algorithm. + dictionary = np.asfortranarray(dictionary) errors = [] current_cost = np.nan @@ -607,15 +597,14 @@ def dict_learning(X, n_components, *, alpha, max_iter=100, tol=1e-8, code = sparse_encode(X, dictionary, algorithm=method, alpha=alpha, init=code, n_jobs=n_jobs, positive=positive_code, max_iter=method_max_iter, verbose=verbose) - # Update dictionary - dictionary, residuals = _update_dict(dictionary.T, X.T, code.T, - verbose=verbose, return_r2=True, - random_state=random_state, - positive=positive_dict) - dictionary = dictionary.T + + # Update dictionary in place + _update_dict(dictionary, X, code, verbose=verbose, + random_state=random_state, positive=positive_dict) # Cost function - current_cost = 0.5 * residuals + alpha * np.sum(np.abs(code)) + current_cost = (0.5 * np.sum((X - code @ dictionary)**2) + + alpha * np.sum(np.abs(code))) errors.append(current_cost) if ii > 0: @@ -807,7 +796,9 @@ def dict_learning_online(X, n_components=2, *, alpha=1, n_iter=100, else: X_train = X - dictionary = check_array(dictionary.T, order='F', dtype=np.float64, + # Fortran-order dict better suited for the sparse coding which is the + # bottleneck of this algorithm. + dictionary = check_array(dictionary, order='F', dtype=np.float64, copy=False) dictionary = np.require(dictionary, requirements='W') @@ -839,11 +830,11 @@ def dict_learning_online(X, n_components=2, *, alpha=1, n_iter=100, print("Iteration % 3i (elapsed time: % 3is, % 4.1fmn)" % (ii, dt, dt / 60)) - this_code = sparse_encode(this_X, dictionary.T, algorithm=method, + this_code = sparse_encode(this_X, dictionary, algorithm=method, alpha=alpha, n_jobs=n_jobs, check_input=False, positive=positive_code, - max_iter=method_max_iter, verbose=verbose).T + max_iter=method_max_iter, verbose=verbose) # Update the auxiliary variables if ii < batch_size - 1: @@ -853,15 +844,13 @@ def dict_learning_online(X, n_components=2, *, alpha=1, n_iter=100, beta = (theta + 1 - batch_size) / (theta + 1) A *= beta - A += np.dot(this_code, this_code.T) + A += np.dot(this_code.T, this_code) B *= beta - B += np.dot(this_X.T, this_code.T) + B += np.dot(this_X.T, this_code) - # Update dictionary - dictionary = _update_dict(dictionary, B, A, verbose=verbose, - random_state=random_state, - positive=positive_dict) - # XXX: Can the residuals be of any use? 
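        # Why the statistics A and B are sufficient: for a code matrix C and
        # data X, the batch objective
        #     0.5 * ||X - C @ D||_F**2
        #         = 0.5 * trace(A @ D @ D.T) - trace(B @ D) + const
        # with A = C.T @ C and B = X.T @ C, so the dictionary D can be
        # updated from the accumulated A and B alone.  Zeroing the gradient
        # for atom k (a row of D) gives the in-place coordinate update
        # D[k] += (B[:, k] - A[k] @ D) / A[k, k] used in `_update_dict`.
        # A small numeric sketch of one pass over the atoms (illustrative
        # names only):
        #
        #     import numpy as np
        #     rng = np.random.RandomState(0)
        #     X, C = rng.randn(20, 5), rng.randn(20, 3)
        #     A, B = C.T @ C, X.T @ C
        #     D = rng.randn(3, 5)
        #     for k in range(3):
        #         D[k] += (B[:, k] - A[k] @ D) / A[k, k]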
+ # Update dictionary in place + _update_dict(dictionary, this_X, this_code, A, B, verbose=verbose, + random_state=random_state, positive=positive_dict) # Maybe we need a stopping criteria based on the amount of # modification in the dictionary @@ -870,15 +859,15 @@ def dict_learning_online(X, n_components=2, *, alpha=1, n_iter=100, if return_inner_stats: if return_n_iter: - return dictionary.T, (A, B), ii - iter_offset + 1 + return dictionary, (A, B), ii - iter_offset + 1 else: - return dictionary.T, (A, B) + return dictionary, (A, B) if return_code: if verbose > 1: print('Learning code...', end=' ') elif verbose == 1: print('|', end=' ') - code = sparse_encode(X, dictionary.T, algorithm=method, alpha=alpha, + code = sparse_encode(X, dictionary, algorithm=method, alpha=alpha, n_jobs=n_jobs, check_input=False, positive=positive_code, max_iter=method_max_iter, verbose=verbose) @@ -886,14 +875,14 @@ def dict_learning_online(X, n_components=2, *, alpha=1, n_iter=100, dt = (time.time() - t0) print('done (total time: % 3is, % 4.1fmn)' % (dt, dt / 60)) if return_n_iter: - return code, dictionary.T, ii - iter_offset + 1 + return code, dictionary, ii - iter_offset + 1 else: - return code, dictionary.T + return code, dictionary if return_n_iter: - return dictionary.T, ii - iter_offset + 1 + return dictionary, ii - iter_offset + 1 else: - return dictionary.T + return dictionary class _BaseSparseCoding(TransformerMixin): @@ -1286,7 +1275,7 @@ class DictionaryLearning(_BaseSparseCoding, BaseEstimator): We can check the level of sparsity of `X_transformed`: >>> np.mean(X_transformed == 0) - 0.88... + 0.87... We can compare the average squared euclidean norm of the reconstruction error of the sparse coded signal relative to the squared euclidean norm of @@ -1294,7 +1283,7 @@ class DictionaryLearning(_BaseSparseCoding, BaseEstimator): >>> X_hat = X_transformed @ dict_learner.components_ >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1)) - 0.07... + 0.08... Notes ----- @@ -1523,7 +1512,7 @@ class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): We can check the level of sparsity of `X_transformed`: >>> np.mean(X_transformed == 0) - 0.87... + 0.86... We can compare the average squared euclidean norm of the reconstruction error of the sparse coded signal relative to the squared euclidean norm of @@ -1531,7 +1520,7 @@ class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): >>> X_hat = X_transformed @ dict_learner.components_ >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1)) - 0.10... + 0.07... 
Notes ----- diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py index a13c07a6ac728..4048450a5d486 100644 --- a/sklearn/decomposition/tests/test_dict_learning.py +++ b/sklearn/decomposition/tests/test_dict_learning.py @@ -10,6 +10,7 @@ from sklearn.utils import check_array +from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import ignore_warnings @@ -25,6 +26,8 @@ from sklearn.utils.estimator_checks import check_transformer_general from sklearn.utils.estimator_checks import check_transformers_unfitted +from sklearn.decomposition._dict_learning import _update_dict + rng_global = np.random.RandomState(0) n_samples, n_features = 10, 8 @@ -575,6 +578,31 @@ def test_sparse_coder_n_features_in(): assert sc.n_features_in_ == d.shape[1] +def test_update_dict(): + # Check the dict update in batch mode vs online mode + # Non-regression test for #4866 + rng = np.random.RandomState(0) + + code = np.array([[0.5, -0.5], + [0.1, 0.9]]) + dictionary = np.array([[1., 0.], + [0.6, 0.8]]) + + X = np.dot(code, dictionary) + rng.randn(2, 2) + + # full batch update + newd_batch = dictionary.copy() + _update_dict(newd_batch, X, code) + + # online update + A = np.dot(code.T, code) + B = np.dot(X.T, code) + newd_online = dictionary.copy() + _update_dict(newd_online, X, code, A, B) + + assert_allclose(newd_batch, newd_online) + + @pytest.mark.parametrize("Estimator", [DictionaryLearning, MiniBatchDictionaryLearning]) def test_warning_default_transform_alpha(Estimator): From 962bd9a401bfee1f2d8e7e832018a75424b5bbe2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 16 Apr 2021 14:19:43 +0200 Subject: [PATCH 332/478] DOC Adds consistence in docs for univariate selection metrics (#19904) Co-authored-by: Julien Jerphanion Co-authored-by: Olivier Grisel --- .../_univariate_selection.py | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 0656e27d6e30f..be3298387f612 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -51,15 +51,15 @@ def f_oneway(*args): Parameters ---------- - *args : array-like, sparse matrices + *args : {array-like, sparse matrix} sample1, sample2... The sample measurements should be given as arguments. Returns ------- - F-value : float + f_statistic : float The computed F-value of the test. - p-value : float + p_value : float The associated p-value from the F-distribution. Notes @@ -127,19 +127,19 @@ def f_classif(X, y): Parameters ---------- - X : {array-like, sparse matrix} shape = [n_samples, n_features] + X : {array-like, sparse matrix} of shape (n_samples, n_features) The set of regressors that will be tested sequentially. - y : array of shape(n_samples) - The data matrix. + y : ndarray of shape (n_samples,) + The target vector. Returns ------- - F : array, shape = [n_features,] - The set of F values. + f_statistic : ndarray of shape (n_features,) + F-statistic for each feature. - pval : array, shape = [n_features,] - The set of p-values. + p_values : ndarray of shape (n_features,) + P-values associated with the F-statistic. See Also -------- @@ -195,10 +195,11 @@ def chi2(X, y): Returns ------- - chi2 : array, shape = (n_features,) - chi2 statistics of each feature. 
- pval : array, shape = (n_features,) - p-values of each feature. + chi2 : ndarray of shape (n_features,) + Chi2 statistics for each feature. + + p_values : ndarray of shape (n_features,) + P-values for each feature. Notes ----- From 90b399269b83531191f5f244d48182014acbc4f5 Mon Sep 17 00:00:00 2001 From: Shao Yang Hong Date: Sat, 17 Apr 2021 00:41:59 +0800 Subject: [PATCH 333/478] DOC Fix the description of some features in load_diabetes (#19366) Co-authored-by: Guillaume Lemaitre --- sklearn/datasets/_base.py | 6 ++++++ sklearn/datasets/descr/diabetes.rst | 6 +++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index 17d2db9f2075b..948b4f7cba61e 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -765,6 +765,12 @@ def load_diabetes(*, return_X_y=False, as_frame=False): Features real, -.2 < x < .2 Targets integer 25 - 346 ============== ================== + + .. note:: + The meaning of each feature (i.e. `feature_names`) might be unclear + (especially for `ltg`) as the documentation of the original dataset is + not explicit. We provide information that seems correct in regard with + the scientific literature in this field of research. Read more in the :ref:`User Guide `. diff --git a/sklearn/datasets/descr/diabetes.rst b/sklearn/datasets/descr/diabetes.rst index 771b3e5fe282a..04651c0163307 100644 --- a/sklearn/datasets/descr/diabetes.rst +++ b/sklearn/datasets/descr/diabetes.rst @@ -21,11 +21,11 @@ quantitative measure of disease progression one year after baseline. - sex - bmi body mass index - bp average blood pressure - - s1 tc, T-Cells (a type of white blood cells) + - s1 tc, total serum cholesterol - s2 ldl, low-density lipoproteins - s3 hdl, high-density lipoproteins - - s4 tch, thyroid stimulating hormone - - s5 ltg, lamotrigine + - s4 tch, total cholesterol / HDL + - s5 ltg, possibly log of serum triglycerides level - s6 glu, blood sugar level Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1). From e1f879e8eed85c5018d888c9f87f168bc44085e1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 16 Apr 2021 21:07:13 +0200 Subject: [PATCH 334/478] DOC add FAQ entry for the many linear model classes (#19861) Co-authored-by: Chiara Marmo --- doc/faq.rst | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/doc/faq.rst b/doc/faq.rst index 0ebd4df759125..4038106bc93d7 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -396,3 +396,44 @@ and not at test time, for resampling and similar uses, like at `imbalanced-learn`. In general, these use cases can be solved with a custom meta estimator rather than a Pipeline + +Why are there so many different estimators for linear models? +------------------------------------------------------------- +Usually, there is one classifier and one regressor per model type, e.g. +:class:`~ensemble.GradientBoostingClassifier` and +:class:`~ensemble.GradientBoostingRegressor`. Both have similar options and +both have the parameter `loss`, which is especially useful in the regression +case as it enables the estimation of conditional mean as well as conditional +quantiles. + +For linear models, there are many estimator classes which are very close to +each other. 
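For instance, the penalized estimators listed below are used in exactly the
same way and differ only in their penalty term; a minimal sketch::

    >>> from sklearn.datasets import make_regression
    >>> from sklearn.linear_model import Ridge, Lasso, ElasticNet
    >>> X, y = make_regression(random_state=0)
    >>> for model in (Ridge(alpha=1.0), Lasso(alpha=0.1),
    ...               ElasticNet(alpha=0.1, l1_ratio=0.5)):
    ...     _ = model.fit(X, y)  # identical API, different penalty
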
Let us have a look at + +- :class:`~linear_model.LinearRegression`, no penalty +- :class:`~linear_model.Ridge`, L2 penalty +- :class:`~linear_model.Lasso`, L1 penalty (sparse models) +- :class:`~linear_model.ElasticNet`, L1 + L2 penalty (less sparse models) +- :class:`~linear_model.SGDRegressor` with `loss='squared_loss'` + +**Maintainer perspective:** +They all do in principle the same and are different only by the penalty they +impose. This, however, has a large impact on the way the underlying +optimization problem is solved. In the end, this amounts to usage of different +methods and tricks from linear algebra. A special case is `SGDRegressor` which +comprises all 4 previous models and is different by the optimization procedure. +A further side effect is that the different estimators favor different data +layouts (`X` c-contiguous or f-contiguous, sparse csr or csc). This complexity +of the seemingly simple linear models is the reason for having different +estimator classes for different penalties. + +**User perspective:** +First, the current design is inspired by the scientific literature where linear +regression models with different regularization/penalty were given different +names, e.g. *ridge regression*. Having different model classes with according +names makes it easier for users to find those regression models. +Secondly, if all the 5 above mentioned linear models were unified into a single +class, there would be parameters with a lot of options like the ``solver`` +parameter. On top of that, there would be a lot of exclusive interactions +between different parameters. For example, the possible options of the +parameters ``solver``, ``precompute`` and ``selection`` would depend on the +chosen values of the penalty parameters ``alpha`` and ``l1_ratio``. From 9605f3b586990c51a045838dd7464cc0ef3d3e18 Mon Sep 17 00:00:00 2001 From: Ishan Mishra <33893659+legitishan@users.noreply.github.com> Date: Sun, 18 Apr 2021 17:32:17 +0530 Subject: [PATCH 335/478] DOC Fixes links in outlier_detection.html (#19917) --- doc/modules/outlier_detection.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/outlier_detection.rst b/doc/modules/outlier_detection.rst index 14495bc558dab..496b840e0c6da 100644 --- a/doc/modules/outlier_detection.rst +++ b/doc/modules/outlier_detection.rst @@ -356,7 +356,7 @@ on new unseen data when LOF is applied for novelty detection, i.e. when the This strategy is illustrated below. .. figure:: ../auto_examples/neighbors/images/sphx_glr_plot_lof_outlier_detection_001.png - :target: ../auto_examples/neighbors/sphx_glr_plot_lof_outlier_detection.html + :target: ../auto_examples/neighbors/plot_lof_outlier_detection.html :align: center :scale: 75% @@ -401,6 +401,6 @@ Note that ``fit_predict`` is not available in this case. Novelty detection with Local Outlier Factor is illustrated below. .. figure:: ../auto_examples/neighbors/images/sphx_glr_plot_lof_novelty_detection_001.png - :target: ../auto_examples/neighbors/sphx_glr_plot_lof_novelty_detection.html + :target: ../auto_examples/neighbors/plot_lof_novelty_detection.html :align: center :scale: 75% From 5d8796b91db3e85975e6cab8b779aefa9502227a Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Sun, 18 Apr 2021 11:31:50 -0400 Subject: [PATCH 336/478] FIX Fixes unknown handling for str dtypes in OrdinalEncoder.transform (#19888) * FIX Fixes unknown handling for str X in OrdinalEncoder.transform * DOC Adds whats new * DOC Move to 0.24.2 * DOC Adds reasoning in comment --- doc/whats_new/v0.24.rst | 3 +++ doc/whats_new/v1.0.rst | 2 +- sklearn/preprocessing/_encoders.py | 6 ++++++ sklearn/preprocessing/tests/test_encoders.py | 20 ++++++++++++++++++++ 4 files changed, 30 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 09f3d9bdecd3e..880d1879637ed 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -53,6 +53,9 @@ Changelog `'use_encoded_value'` strategies. :pr:`19234` by `Guillaume Lemaitre `. +- |Fix| :meth:`preprocessing.OrdinalEncoder.transfrom` correctly handles + unknown values for string dtypes. :pr:`19888` by `Thomas Fan`_. + :mod:`sklearn.multioutput` .......................... diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 0494e5f29bf39..a78cbe69b746d 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -311,7 +311,7 @@ Changelog :pr:`18649` by `Leandro Hermida ` and `Rodion Martynov `. -- |Fix| The `fit` method of the successive halving parameter search +- |Fix| The `fit` method of the successive halving parameter search (:class:`model_selection.HalvingGridSearchCV`, and :class:`model_selection.HalvingRandomSearchCV`) now correctly handles the `groups` parameter. :pr:`19847` by :user:`Xiaoyu Chai `. diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index cd05dc89bb75d..7c62cbdcbc565 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -150,6 +150,12 @@ def _transform(self, X, handle_unknown='error', force_all_finite=True, if (self.categories_[i].dtype.kind in ('U', 'S') and self.categories_[i].itemsize > Xi.itemsize): Xi = Xi.astype(self.categories_[i].dtype) + elif (self.categories_[i].dtype.kind == 'O' and + Xi.dtype.kind == 'U'): + # categories are objects and Xi are numpy strings. + # Cast Xi to an object dtype to prevent truncation + # when setting invalid values. + Xi = Xi.astype('O') else: Xi = Xi.copy() diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 9f1e331f78fec..94e2c276dcd58 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -1132,3 +1132,23 @@ def test_ordinal_encoder_sparse(): X_trans_sparse = sparse.csr_matrix(X_trans) with pytest.raises(TypeError, match=err_msg): encoder.inverse_transform(X_trans_sparse) + + +@pytest.mark.parametrize("X_train", [ + [['AA', 'B']], + np.array([['AA', 'B']], dtype='O'), + np.array([['AA', 'B']], dtype='U'), +]) +@pytest.mark.parametrize("X_test", [ + [['A', 'B']], + np.array([['A', 'B']], dtype='O'), + np.array([['A', 'B']], dtype='U'), +]) +def test_ordinal_encoder_handle_unknown_string_dtypes(X_train, X_test): + """Checks that ordinal encoder transforms string dtypes. 
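    Fixed-width NumPy unicode arrays silently truncate longer assignments
    (e.g. ``np.array(['A'])[0] = 'AA'`` stores ``'A'``), which is why the
    encoder now casts such input to object dtype before writing unknown
    values.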
Non-regression
+    test for #19872."""
+    enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-9)
+    enc.fit(X_train)
+
+    X_trans = enc.transform(X_test)
+    assert_allclose(X_trans, [[-9, 0]])

From 66ec10f341087ca05156f701e0ba90717cd252d2 Mon Sep 17 00:00:00 2001
From: Maxwell
Date: Mon, 19 Apr 2021 15:50:10 +0800
Subject: [PATCH 337/478] MAINT Clean up code in FastICA (#19796)

* FIX code cleanup in FastICA

* keep syntax X_mean

Co-authored-by: Thomas J. Fan
---
 sklearn/decomposition/_fastica.py | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py
index 27f7f646ea579..a57ddada85694 100644
--- a/sklearn/decomposition/_fastica.py
+++ b/sklearn/decomposition/_fastica.py
@@ -427,9 +427,8 @@ def _fit(self, X, compute_sources=False):
         -------
         X_new : ndarray of shape (n_samples, n_components)
         """
-
-        X = self._validate_data(X, copy=self.whiten, dtype=FLOAT_DTYPES,
-                                ensure_min_samples=2).T
+        XT = self._validate_data(X, copy=self.whiten, dtype=FLOAT_DTYPES,
+                                 ensure_min_samples=2).T
         fun_args = {} if self.fun_args is None else self.fun_args
         random_state = check_random_state(self.random_state)
 
@@ -454,7 +453,7 @@ def g(x, fun_args):
                 % self.fun
             )
 
-        n_samples, n_features = X.shape
+        n_features, n_samples = XT.shape
 
         n_components = self.n_components
         if not self.whiten and n_components is not None:
@@ -471,24 +470,24 @@ def g(x, fun_args):
             )
 
         if self.whiten:
-            # Centering the columns (ie the variables)
-            X_mean = X.mean(axis=-1)
-            X -= X_mean[:, np.newaxis]
+            # Centering the features of X
+            X_mean = XT.mean(axis=-1)
+            XT -= X_mean[:, np.newaxis]
 
             # Whitening and preprocessing by PCA
-            u, d, _ = linalg.svd(X, full_matrices=False, check_finite=False)
+            u, d, _ = linalg.svd(XT, full_matrices=False, check_finite=False)
             del _
             K = (u / d).T[:n_components]  # see (6.33) p.140
             del u, d
-            X1 = np.dot(K, X)
+            X1 = np.dot(K, XT)
             # see (13.6) p.267 Here X1 is white and data
             # in X has been projected onto a subspace by PCA
-            X1 *= np.sqrt(n_features)
+            X1 *= np.sqrt(n_samples)
         else:
             # X must be casted to floats to avoid typing issues with numpy
             # 2.0 and the line below
-            X1 = as_float_array(X, copy=False)  # copy has been taken care of
+            X1 = as_float_array(XT, copy=False)  # copy has been taken care of
 
         w_init = self.w_init
         if w_init is None:
@@ -519,9 +518,9 @@ def g(x, fun_args):
 
         if compute_sources:
             if self.whiten:
-                S = np.linalg.multi_dot([W, K, X]).T
+                S = np.linalg.multi_dot([W, K, XT]).T
             else:
-                S = np.dot(W, X).T
+                S = np.dot(W, XT).T
         else:
             S = None
 

From 969ec32c9273d77641bc9591f44cd2ae3daf4434 Mon Sep 17 00:00:00 2001
From: Isaack Mungui <41724425+isaack-mungui@users.noreply.github.com>
Date: Mon, 19 Apr 2021 11:08:36 +0300
Subject: [PATCH 338/478] DOC fix early stopping description in MLP (#19818)

* Maintenance task: Moved PolynomialFeatures to _polynomial.py

* Updated docstring including behaviour of neural network when early stopping is activated

* Revert "Maintenance task: Moved PolynomialFeatures to _polynomial.py"

This reverts commit f76df548ecd31dbe6093d4e8329711197c830542.

* reverted failing commit

* Update sklearn/neural_network/_multilayer_perceptron.py

Co-authored-by: Thomas J. Fan

* Updated doc with proposed changes

* Fixed whitespace error

Co-authored-by: Thomas J. Fan
---
 sklearn/neural_network/_multilayer_perceptron.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py
index 04822360791e7..e349dfd844f96 100644
--- a/sklearn/neural_network/_multilayer_perceptron.py
+++ b/sklearn/neural_network/_multilayer_perceptron.py
@@ -825,6 +825,9 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron):
         validation score is not improving by at least tol for
         ``n_iter_no_change`` consecutive epochs. The split is stratified,
         except in a multilabel setting.
+        If early stopping is False, then the training stops when the training
+        loss does not improve by more than tol for n_iter_no_change consecutive
+        passes over the training set.
         Only effective when solver='sgd' or 'adam'
 
     validation_fraction : float, default=0.1

From 73e8b7d0a984fac8420cb4f948d53470ef9b5abf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Carlos=20Alfaro=20Jim=C3=A9nez?=
Date: Mon, 19 Apr 2021 10:43:14 +0200
Subject: [PATCH 339/478] DOC use math mode in r2_score function (#19921)

---
 sklearn/metrics/_regression.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py
index 7edf7924e50e1..c2a0e7f7f033b 100644
--- a/sklearn/metrics/_regression.py
+++ b/sklearn/metrics/_regression.py
@@ -670,12 +670,12 @@ def explained_variance_score(y_true, y_pred, *,
 
 @_deprecate_positional_args
 def r2_score(y_true, y_pred, *, sample_weight=None,
              multioutput="uniform_average"):
-    """R^2 (coefficient of determination) regression score function.
+    """:math:`R^2` (coefficient of determination) regression score function.
 
     Best possible score is 1.0 and it can be negative (because the
     model can be arbitrarily worse). A constant model that always
     predicts the expected value of y, disregarding the input features,
-    would get a R^2 score of 0.0.
+    would get a :math:`R^2` score of 0.0.
 
     Read more in the :ref:`User Guide `.
@@ -713,15 +713,15 @@ def r2_score(y_true, y_pred, *, sample_weight=None,
 
     Returns
     -------
     z : float or ndarray of floats
-        The R^2 score or ndarray of scores if 'multioutput' is
+        The :math:`R^2` score or ndarray of scores if 'multioutput' is
         'raw_values'.
 
     Notes
     -----
     This is not a symmetric function.
 
-    Unlike most other scores, R^2 score may be negative (it need not actually
-    be the square of a quantity R).
+    Unlike most other scores, :math:`R^2` score may be negative (it need not
+    actually be the square of a quantity R).
 
     This metric is not well-defined for single samples and will return a NaN
     value if n_samples is less than two.

From 5efcb10c3e9f87103c404c32a036beb623182c83 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan"
Date: Mon, 19 Apr 2021 11:06:38 -0400
Subject: [PATCH 340/478] MAINT Vendors packaging/version.py for pep440 versioning (#19826)

* MAINT Vendors packaging/version for pep440 versioning

* ENH Uses folder structure of packaging

* ENH Uses fixes parse_version

* ENH Uses packaging name

* MAINT Adds packaging

* BLD Use vendored version [cd build]
---
 setup.py                                    |   2 +-
 sklearn/externals/_packaging/__init__.py    |   0
 sklearn/externals/_packaging/_structures.py |  90 +++
 sklearn/externals/_packaging/version.py     | 527 ++++++++++++++++++
 .../preprocessing/tests/test_polynomial.py  |   4 +-
 sklearn/setup.py                            |   1 +
 sklearn/utils/fixes.py                      |   8 +-
 7 files changed, 621 insertions(+), 11 deletions(-)
 create mode 100644 sklearn/externals/_packaging/__init__.py
 create mode 100644 sklearn/externals/_packaging/_structures.py
 create mode 100644 sklearn/externals/_packaging/version.py

diff --git a/setup.py b/setup.py
index e44f941e0a114..9758f62de1301 100755
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,6 @@
 from distutils.command.clean import clean as Clean
 from distutils.command.sdist import sdist
 
-from pkg_resources import parse_version
 import traceback
 import importlib
 try:
@@ -51,6 +50,7 @@
 # does not need the compiled code
 import sklearn
 import sklearn._min_dependencies as min_deps  # noqa
+from sklearn.externals._packaging.version import parse as parse_version  # noqa
 
 VERSION = sklearn.__version__
 
diff --git a/sklearn/externals/_packaging/__init__.py b/sklearn/externals/_packaging/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/sklearn/externals/_packaging/_structures.py b/sklearn/externals/_packaging/_structures.py
new file mode 100644
index 0000000000000..837e3a7946d70
--- /dev/null
+++ b/sklearn/externals/_packaging/_structures.py
@@ -0,0 +1,90 @@
+"""Vendoered from
+https://github.com/pypa/packaging/blob/main/packaging/_structures.py
+"""
+# Copyright (c) Donald Stufft and individual contributors.
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice,
+#    this list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+class InfinityType:
+    def __repr__(self) -> str:
+        return "Infinity"
+
+    def __hash__(self) -> int:
+        return hash(repr(self))
+
+    def __lt__(self, other: object) -> bool:
+        return False
+
+    def __le__(self, other: object) -> bool:
+        return False
+
+    def __eq__(self, other: object) -> bool:
+        return isinstance(other, self.__class__)
+
+    def __ne__(self, other: object) -> bool:
+        return not isinstance(other, self.__class__)
+
+    def __gt__(self, other: object) -> bool:
+        return True
+
+    def __ge__(self, other: object) -> bool:
+        return True
+
+    def __neg__(self: object) -> "NegativeInfinityType":
+        return NegativeInfinity
+
+
+Infinity = InfinityType()
+
+
+class NegativeInfinityType:
+    def __repr__(self) -> str:
+        return "-Infinity"
+
+    def __hash__(self) -> int:
+        return hash(repr(self))
+
+    def __lt__(self, other: object) -> bool:
+        return True
+
+    def __le__(self, other: object) -> bool:
+        return True
+
+    def __eq__(self, other: object) -> bool:
+        return isinstance(other, self.__class__)
+
+    def __ne__(self, other: object) -> bool:
+        return not isinstance(other, self.__class__)
+
+    def __gt__(self, other: object) -> bool:
+        return False
+
+    def __ge__(self, other: object) -> bool:
+        return False
+
+    def __neg__(self: object) -> InfinityType:
+        return Infinity
+
+
+NegativeInfinity = NegativeInfinityType()
diff --git a/sklearn/externals/_packaging/version.py b/sklearn/externals/_packaging/version.py
new file mode 100644
index 0000000000000..ea83bbb8b5389
--- /dev/null
+++ b/sklearn/externals/_packaging/version.py
@@ -0,0 +1,527 @@
+"""Vendoered from
+https://github.com/pypa/packaging/blob/main/packaging/version.py
+"""
+# Copyright (c) Donald Stufft and individual contributors.
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice,
+#    this list of conditions and the following disclaimer.
+
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import collections
+import itertools
+import re
+import warnings
+from typing import Callable, Iterator, List, Optional, SupportsInt, Tuple, Union
+
+from ._structures import Infinity, InfinityType, NegativeInfinity, NegativeInfinityType
+
+__all__ = ["parse", "Version", "LegacyVersion", "InvalidVersion", "VERSION_PATTERN"]
+
+InfiniteTypes = Union[InfinityType, NegativeInfinityType]
+PrePostDevType = Union[InfiniteTypes, Tuple[str, int]]
+SubLocalType = Union[InfiniteTypes, int, str]
+LocalType = Union[
+    NegativeInfinityType,
+    Tuple[
+        Union[
+            SubLocalType,
+            Tuple[SubLocalType, str],
+            Tuple[NegativeInfinityType, SubLocalType],
+        ],
+        ...,
+    ],
+]
+CmpKey = Tuple[
+    int, Tuple[int, ...], PrePostDevType, PrePostDevType, PrePostDevType, LocalType
+]
+LegacyCmpKey = Tuple[int, Tuple[str, ...]]
+VersionComparisonMethod = Callable[
+    [Union[CmpKey, LegacyCmpKey], Union[CmpKey, LegacyCmpKey]], bool
+]
+
+_Version = collections.namedtuple(
+    "_Version", ["epoch", "release", "dev", "pre", "post", "local"]
+)
+
+
+def parse(version: str) -> Union["LegacyVersion", "Version"]:
+    """
+    Parse the given version string and return either a :class:`Version` object
+    or a :class:`LegacyVersion` object depending on if the given version is
+    a valid PEP 440 version or a legacy version.
+    """
+    try:
+        return Version(version)
+    except InvalidVersion:
+        return LegacyVersion(version)
+
+
+class InvalidVersion(ValueError):
+    """
+    An invalid version was found, users should refer to PEP 440.
+    """
+
+
+class _BaseVersion:
+    _key: Union[CmpKey, LegacyCmpKey]
+
+    def __hash__(self) -> int:
+        return hash(self._key)
+
+    # Please keep the duplicated `isinstance` check
+    # in the six comparisons hereunder
+    # unless you find a way to avoid adding overhead function calls.
+    def __lt__(self, other: "_BaseVersion") -> bool:
+        if not isinstance(other, _BaseVersion):
+            return NotImplemented
+
+        return self._key < other._key
+
+    def __le__(self, other: "_BaseVersion") -> bool:
+        if not isinstance(other, _BaseVersion):
+            return NotImplemented
+
+        return self._key <= other._key
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, _BaseVersion):
+            return NotImplemented
+
+        return self._key == other._key
+
+    def __ge__(self, other: "_BaseVersion") -> bool:
+        if not isinstance(other, _BaseVersion):
+            return NotImplemented
+
+        return self._key >= other._key
+
+    def __gt__(self, other: "_BaseVersion") -> bool:
+        if not isinstance(other, _BaseVersion):
+            return NotImplemented
+
+        return self._key > other._key
+
+    def __ne__(self, other: object) -> bool:
+        if not isinstance(other, _BaseVersion):
+            return NotImplemented
+
+        return self._key != other._key
+
+
+class LegacyVersion(_BaseVersion):
+    def __init__(self, version: str) -> None:
+        self._version = str(version)
+        self._key = _legacy_cmpkey(self._version)
+
+        warnings.warn(
+            "Creating a LegacyVersion has been deprecated and will be "
+            "removed in the next major release",
+            DeprecationWarning,
+        )
+
+    def __str__(self) -> str:
+        return self._version
+
+    def __repr__(self) -> str:
+        return f""
+
+    @property
+    def public(self) -> str:
+        return self._version
+
+    @property
+    def base_version(self) -> str:
+        return self._version
+
+    @property
+    def epoch(self) -> int:
+        return -1
+
+    @property
+    def release(self) -> None:
+        return None
+
+    @property
+    def pre(self) -> None:
+        return None
+
+    @property
+    def post(self) -> None:
+        return None
+
+    @property
+    def dev(self) -> None:
+        return None
+
+    @property
+    def local(self) -> None:
+        return None
+
+    @property
+    def is_prerelease(self) -> bool:
+        return False
+
+    @property
+    def is_postrelease(self) -> bool:
+        return False
+
+    @property
+    def is_devrelease(self) -> bool:
+        return False
+
+
+_legacy_version_component_re = re.compile(r"(\d+ | [a-z]+ | \.| -)", re.VERBOSE)
+
+_legacy_version_replacement_map = {
+    "pre": "c",
+    "preview": "c",
+    "-": "final-",
+    "rc": "c",
+    "dev": "@",
+}
+
+
+def _parse_version_parts(s: str) -> Iterator[str]:
+    for part in _legacy_version_component_re.split(s):
+        part = _legacy_version_replacement_map.get(part, part)
+
+        if not part or part == ".":
+            continue
+
+        if part[:1] in "0123456789":
+            # pad for numeric comparison
+            yield part.zfill(8)
+        else:
+            yield "*" + part
+
+    # ensure that alpha/beta/candidate are before final
+    yield "*final"
+
+
+def _legacy_cmpkey(version: str) -> LegacyCmpKey:
+
+    # We hardcode an epoch of -1 here. A PEP 440 version can only have a epoch
+    # greater than or equal to 0. This will effectively put the LegacyVersion,
+    # which uses the defacto standard originally implemented by setuptools,
+    # as before all PEP 440 versions.
+    epoch = -1
+
+    # This scheme is taken from pkg_resources.parse_version setuptools prior to
+    # it's adoption of the packaging library.
+    parts: List[str] = []
+    for part in _parse_version_parts(version.lower()):
+        if part.startswith("*"):
+            # remove "-" before a prerelease tag
+            if part < "*final":
+                while parts and parts[-1] == "*final-":
+                    parts.pop()
+
+            # remove trailing zeros from each series of numeric parts
+            while parts and parts[-1] == "00000000":
+                parts.pop()
+
+        parts.append(part)
+
+    return epoch, tuple(parts)
+
+
+# Deliberately not anchored to the start and end of the string, to make it
+# easier for 3rd party code to reuse
+VERSION_PATTERN = r"""
+    v?
+    (?:
+        (?:(?P[0-9]+)!)?                           # epoch
+        (?P[0-9]+(?:\.[0-9]+)*)                  # release segment
+        (?P                                          # pre-release
+            [-_\.]?
+            (?P(a|b|c|rc|alpha|beta|pre|preview))
+            [-_\.]?
+            (?P[0-9]+)?
+        )?
+        (?P                                         # post release
+            (?:-(?P[0-9]+))
+            |
+            (?:
+                [-_\.]?
+                (?Ppost|rev|r)
+                [-_\.]?
+                (?P[0-9]+)?
+            )
+        )?
+        (?P                                          # dev release
+            [-_\.]?
+            (?Pdev)
+            [-_\.]?
+            (?P[0-9]+)?
+        )?
+    )
+    (?:\+(?P[a-z0-9]+(?:[-_\.][a-z0-9]+)*))?       # local version
+"""
+
+
+class Version(_BaseVersion):
+
+    _regex = re.compile(r"^\s*" + VERSION_PATTERN + r"\s*$", re.VERBOSE | re.IGNORECASE)
+
+    def __init__(self, version: str) -> None:
+
+        # Validate the version and parse it into pieces
+        match = self._regex.search(version)
+        if not match:
+            raise InvalidVersion(f"Invalid version: '{version}'")
+
+        # Store the parsed out pieces of the version
+        self._version = _Version(
+            epoch=int(match.group("epoch")) if match.group("epoch") else 0,
+            release=tuple(int(i) for i in match.group("release").split(".")),
+            pre=_parse_letter_version(match.group("pre_l"), match.group("pre_n")),
+            post=_parse_letter_version(
+                match.group("post_l"), match.group("post_n1") or match.group("post_n2")
+            ),
+            dev=_parse_letter_version(match.group("dev_l"), match.group("dev_n")),
+            local=_parse_local_version(match.group("local")),
+        )
+
+        # Generate a key which will be used for sorting
+        self._key = _cmpkey(
+            self._version.epoch,
+            self._version.release,
+            self._version.pre,
+            self._version.post,
+            self._version.dev,
+            self._version.local,
+        )
+
+    def __repr__(self) -> str:
+        return f""
+
+    def __str__(self) -> str:
+        parts = []
+
+        # Epoch
+        if self.epoch != 0:
+            parts.append(f"{self.epoch}!")
+
+        # Release segment
+        parts.append(".".join(str(x) for x in self.release))
+
+        # Pre-release
+        if self.pre is not None:
+            parts.append("".join(str(x) for x in self.pre))
+
+        # Post-release
+        if self.post is not None:
+            parts.append(f".post{self.post}")
+
+        # Development release
+        if self.dev is not None:
+            parts.append(f".dev{self.dev}")
+
+        # Local version segment
+        if self.local is not None:
+            parts.append(f"+{self.local}")
+
+        return "".join(parts)
+
+    @property
+    def epoch(self) -> int:
+        _epoch: int = self._version.epoch
+        return _epoch
+
+    @property
+    def release(self) -> Tuple[int, ...]:
+        _release: Tuple[int, ...] = self._version.release
+        return _release
+
+    @property
+    def pre(self) -> Optional[Tuple[str, int]]:
+        _pre: Optional[Tuple[str, int]] = self._version.pre
+        return _pre
+
+    @property
+    def post(self) -> Optional[int]:
+        return self._version.post[1] if self._version.post else None
+
+    @property
+    def dev(self) -> Optional[int]:
+        return self._version.dev[1] if self._version.dev else None
+
+    @property
+    def local(self) -> Optional[str]:
+        if self._version.local:
+            return ".".join(str(x) for x in self._version.local)
+        else:
+            return None
+
+    @property
+    def public(self) -> str:
+        return str(self).split("+", 1)[0]
+
+    @property
+    def base_version(self) -> str:
+        parts = []
+
+        # Epoch
+        if self.epoch != 0:
+            parts.append(f"{self.epoch}!")
+
+        # Release segment
+        parts.append(".".join(str(x) for x in self.release))
+
+        return "".join(parts)
+
+    @property
+    def is_prerelease(self) -> bool:
+        return self.dev is not None or self.pre is not None
+
+    @property
+    def is_postrelease(self) -> bool:
+        return self.post is not None
+
+    @property
+    def is_devrelease(self) -> bool:
+        return self.dev is not None
+
+    @property
+    def major(self) -> int:
+        return self.release[0] if len(self.release) >= 1 else 0
+
+    @property
+    def minor(self) -> int:
+        return self.release[1] if len(self.release) >= 2 else 0
+
+    @property
+    def micro(self) -> int:
+        return self.release[2] if len(self.release) >= 3 else 0
+
+
+def _parse_letter_version(
+    letter: str, number: Union[str, bytes, SupportsInt]
+) -> Optional[Tuple[str, int]]:
+
+    if letter:
+        # We consider there to be an implicit 0 in a pre-release if there is
+        # not a numeral associated with it.
+        if number is None:
+            number = 0
+
+        # We normalize any letters to their lower case form
+        letter = letter.lower()
+
+        # We consider some words to be alternate spellings of other words and
+        # in those cases we want to normalize the spellings to our preferred
+        # spelling.
+        if letter == "alpha":
+            letter = "a"
+        elif letter == "beta":
+            letter = "b"
+        elif letter in ["c", "pre", "preview"]:
+            letter = "rc"
+        elif letter in ["rev", "r"]:
+            letter = "post"
+
+        return letter, int(number)
+    if not letter and number:
+        # We assume if we are given a number, but we are not given a letter
+        # then this is using the implicit post release syntax (e.g. 1.0-1)
+        letter = "post"
+
+        return letter, int(number)
+
+    return None
+
+
+_local_version_separators = re.compile(r"[\._-]")
+
+
+def _parse_local_version(local: str) -> Optional[LocalType]:
+    """
+    Takes a string like abc.1.twelve and turns it into ("abc", 1, "twelve").
+    """
+    if local is not None:
+        return tuple(
+            part.lower() if not part.isdigit() else int(part)
+            for part in _local_version_separators.split(local)
+        )
+    return None
+
+
+def _cmpkey(
+    epoch: int,
+    release: Tuple[int, ...],
+    pre: Optional[Tuple[str, int]],
+    post: Optional[Tuple[str, int]],
+    dev: Optional[Tuple[str, int]],
+    local: Optional[Tuple[SubLocalType]],
+) -> CmpKey:
+
+    # When we compare a release version, we want to compare it with all of the
+    # trailing zeros removed. So we'll use a reverse the list, drop all the now
+    # leading zeros until we come to something non zero, then take the rest
+    # re-reverse it back into the correct order and make it a tuple and use
+    # that for our sorting key.
+    _release = tuple(
+        reversed(list(itertools.dropwhile(lambda x: x == 0, reversed(release))))
+    )
+
+    # We need to "trick" the sorting algorithm to put 1.0.dev0 before 1.0a0.
+    # We'll do this by abusing the pre segment, but we _only_ want to do this
+    # if there is not a pre or a post segment. If we have one of those then
+    # the normal sorting rules will handle this case correctly.
+    if pre is None and post is None and dev is not None:
+        _pre: PrePostDevType = NegativeInfinity
+    # Versions without a pre-release (except as noted above) should sort after
+    # those with one.
+    elif pre is None:
+        _pre = Infinity
+    else:
+        _pre = pre
+
+    # Versions without a post segment should sort before those with one.
+    if post is None:
+        _post: PrePostDevType = NegativeInfinity
+
+    else:
+        _post = post
+
+    # Versions without a development segment should sort after those with one.
+    if dev is None:
+        _dev: PrePostDevType = Infinity
+
+    else:
+        _dev = dev
+
+    if local is None:
+        # Versions without a local segment should sort before those with one.
+        _local: LocalType = NegativeInfinity
+    else:
+        # Versions with a local segment need that segment parsed to implement
+        # the sorting rules in PEP440.
+        # - Alpha numeric segments sort before numeric segments
+        # - Alpha numeric segments sort lexicographically
+        # - Numeric segments sort numerically
+        # - Shorter versions sort before longer versions when the prefixes
+        #   match exactly
+        _local = tuple(
+            (i, "") if isinstance(i, int) else (NegativeInfinity, i) for i in local
+        )
+
+    return epoch, _release, _pre, _post, _dev, _local
diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py
index 59c3a59df8873..1f70ec9854a54 100644
--- a/sklearn/preprocessing/tests/test_polynomial.py
+++ b/sklearn/preprocessing/tests/test_polynomial.py
@@ -11,9 +11,7 @@
 from sklearn.preprocessing import (
     KBinsDiscretizer, PolynomialFeatures, SplineTransformer
 )
-from sklearn.utils.fixes import linspace, sp_version
-
-from pkg_resources import parse_version
+from sklearn.utils.fixes import linspace, sp_version, parse_version
 
 
 @pytest.mark.parametrize("est", (PolynomialFeatures, SplineTransformer))
diff --git a/sklearn/setup.py b/sklearn/setup.py
index e5d7e6e26b3ab..ae8a929d6b9cb 100644
--- a/sklearn/setup.py
+++ b/sklearn/setup.py
@@ -51,6 +51,7 @@ def configuration(parent_package='', top_path=None):
     config.add_subpackage('_loss/')
     config.add_subpackage('_loss/tests')
     config.add_subpackage('externals')
+    config.add_subpackage('externals/_packaging')
 
     # submodules which have their own setup.py
     config.add_subpackage('cluster')
diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py
index 593e0eb332a99..a5a455ee7b9a1 100644
--- a/sklearn/utils/fixes.py
+++ b/sklearn/utils/fixes.py
@@ -11,7 +11,6 @@
 # License: BSD 3 clause
 
 from functools import update_wrapper
-from distutils.version import LooseVersion
 import functools
 
 import numpy as np
@@ -23,12 +22,7 @@
 from .._config import config_context, get_config
 
 from .deprecation import deprecated
-
-try:
-    from pkg_resources import parse_version  # type: ignore
-except ImportError:
-    # setuptools not installed
-    parse_version = LooseVersion  # type: ignore
+from ..externals._packaging.version import parse as parse_version
 
 
 np_version = parse_version(np.__version__)
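For context, a minimal sketch of how the vendored parser is meant to be used for PEP 440-aware version comparisons (the version strings below are illustrative only; the import path is the one introduced by the diff above):

    from sklearn.externals._packaging.version import parse as parse_version

    # Pre-releases and dev releases sort before the corresponding final release.
    assert parse_version("1.7.0rc1") < parse_version("1.7.0")
    assert parse_version("0.24.dev0") < parse_version("0.24")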

From 0df9efe2c1407f3fb887c22056452c791fd83dc9 Mon Sep 17 00:00:00 2001
From: Helder Geovane Gomes de Lima 
Date: Mon, 19 Apr 2021 18:48:39 -0300
Subject: [PATCH 341/478] DOC Fixes typo in doc/modules/cross_validation.rst
 (#19925)

---
 doc/modules/cross_validation.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst
index 0b090fd7385b6..98b3c41ee5c72 100644
--- a/doc/modules/cross_validation.rst
+++ b/doc/modules/cross_validation.rst
@@ -813,7 +813,7 @@ samples that are part of the validation set, and to -1 for all other samples.
 Using cross-validation iterators to split train and test
 --------------------------------------------------------
 
-The above group cross-validation functions may also be useful for spitting a
+The above group cross-validation functions may also be useful for splitting a
 dataset into training and testing subsets. Note that the convenience
 function :func:`train_test_split` is a wrapper around :func:`ShuffleSplit`
 and thus only allows for stratified splitting (using the class labels)

From dd7b7e5ef950ac026ac44d758af9167eafcc9ee2 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion 
Date: Tue, 20 Apr 2021 11:14:39 +0200
Subject: [PATCH 342/478] MAINT Remove `get_memview_*` helpers in
 `neighbours.BinaryTree` (#19893)

Co-authored-by: Roman Yurchak 
---
 sklearn/neighbors/_ball_tree.pyx    |   2 +-
 sklearn/neighbors/_binary_tree.pxi  | 108 +++++++++-------------------
 sklearn/neighbors/_dist_metrics.pyx |  12 +---
 sklearn/neighbors/_kd_tree.pyx      |   2 +-
 sklearn/neighbors/tests/test_lof.py |   8 +--
 5 files changed, 40 insertions(+), 92 deletions(-)

diff --git a/sklearn/neighbors/_ball_tree.pyx b/sklearn/neighbors/_ball_tree.pyx
index 81ce9606f7b80..16e9407aa72ca 100644
--- a/sklearn/neighbors/_ball_tree.pyx
+++ b/sklearn/neighbors/_ball_tree.pyx
@@ -44,7 +44,7 @@ cdef int allocate_data(BinaryTree tree, ITYPE_t n_nodes,
                        ITYPE_t n_features) except -1:
     """Allocate arrays needed for the KD Tree"""
     tree.node_bounds_arr = np.zeros((1, n_nodes, n_features), dtype=DTYPE)
-    tree.node_bounds = get_memview_DTYPE_3D(tree.node_bounds_arr)
+    tree.node_bounds = tree.node_bounds_arr
     return 0
 
 
diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi
index cabad951c4975..de85ec49166ec 100755
--- a/sklearn/neighbors/_binary_tree.pxi
+++ b/sklearn/neighbors/_binary_tree.pxi
@@ -196,47 +196,6 @@ cdef NodeData_t nd_tmp
 NodeData = np.asarray((&nd_tmp)).dtype
 
 
-######################################################################
-# Numpy 1.3-1.4 compatibility utilities
-cdef DTYPE_t[::1] get_memview_DTYPE_1D(
-                               np.ndarray[DTYPE_t, ndim=1, mode='c'] X):
-    return  ( X.data)
-
-
-cdef DTYPE_t[:, ::1] get_memview_DTYPE_2D(
-                               np.ndarray[DTYPE_t, ndim=2, mode='c'] X):
-    return  ( X.data)
-
-
-cdef DTYPE_t[:, :, ::1] get_memview_DTYPE_3D(
-                               np.ndarray[DTYPE_t, ndim=3, mode='c'] X):
-    return \
-                                                       ( X.data)
-
-
-cdef ITYPE_t[::1] get_memview_ITYPE_1D(
-                               np.ndarray[ITYPE_t, ndim=1, mode='c'] X):
-    return  ( X.data)
-
-
-cdef ITYPE_t[:, ::1] get_memview_ITYPE_2D(
-                               np.ndarray[ITYPE_t, ndim=2, mode='c'] X):
-    return  ( X.data)
-
-
-cdef NodeHeapData_t[::1] get_memview_NodeHeapData_1D(
-                    np.ndarray[NodeHeapData_t, ndim=1, mode='c'] X):
-    return  ( X.data)
-
-
-cdef NodeData_t[::1] get_memview_NodeData_1D(
-                    np.ndarray[NodeData_t, ndim=1, mode='c'] X):
-    return  ( X.data)
-
-######################################################################
-
-
-
 ######################################################################
 # Define doc strings, substituting the appropriate class name using
 # the DOC_DICT variable defined in the pyx files.
@@ -574,15 +533,15 @@ cdef class NeighborsHeap:
     def __cinit__(self):
         self.distances_arr = np.zeros((1, 1), dtype=DTYPE, order='C')
         self.indices_arr = np.zeros((1, 1), dtype=ITYPE, order='C')
-        self.distances = get_memview_DTYPE_2D(self.distances_arr)
-        self.indices = get_memview_ITYPE_2D(self.indices_arr)
+        self.distances = self.distances_arr
+        self.indices = self.indices_arr
 
     def __init__(self, n_pts, n_nbrs):
         self.distances_arr = np.full((n_pts, n_nbrs), np.inf, dtype=DTYPE,
                                      order='C')
         self.indices_arr = np.zeros((n_pts, n_nbrs), dtype=ITYPE, order='C')
-        self.distances = get_memview_DTYPE_2D(self.distances_arr)
-        self.indices = get_memview_ITYPE_2D(self.indices_arr)
+        self.distances = self.distances_arr
+        self.indices = self.indices_arr
 
     def get_arrays(self, sort=True):
         """Get the arrays of distances and indices within the heap.
@@ -806,12 +765,12 @@ cdef class NodeHeap:
 
     def __cinit__(self):
         self.data_arr = np.zeros(1, dtype=NodeHeapData, order='C')
-        self.data = get_memview_NodeHeapData_1D(self.data_arr)
+        self.data = self.data_arr
 
     def __init__(self, size_guess=100):
         size_guess = max(size_guess, 1)  # need space for at least one item
         self.data_arr = np.zeros(size_guess, dtype=NodeHeapData, order='C')
-        self.data = get_memview_NodeHeapData_1D(self.data_arr)
+        self.data = self.data_arr
         self.n = size_guess
         self.clear()
 
@@ -823,8 +782,7 @@ cdef class NodeHeap:
         cdef ITYPE_t size = self.data.shape[0]
         cdef np.ndarray new_data_arr = np.zeros(new_size,
                                                 dtype=NodeHeapData)
-        cdef NodeHeapData_t[::1] new_data =\
-                                    get_memview_NodeHeapData_1D(new_data_arr)
+        cdef NodeHeapData_t[::1] new_data = new_data_arr
 
         if size > 0 and new_size > 0:
             data_ptr = &self.data[0]
@@ -933,8 +891,8 @@ cdef class BinaryTree:
     cdef np.ndarray node_data_arr
     cdef np.ndarray node_bounds_arr
 
-    cdef readonly DTYPE_t[:, ::1] data
-    cdef readonly DTYPE_t[::1] sample_weight
+    cdef readonly const DTYPE_t[:, ::1] data
+    cdef readonly const DTYPE_t[::1] sample_weight
     cdef public DTYPE_t sum_weight
     cdef public ITYPE_t[::1] idx_array
     cdef public NodeData_t[::1] node_data
@@ -964,11 +922,11 @@ cdef class BinaryTree:
         self.node_data_arr = np.empty(1, dtype=NodeData, order='C')
         self.node_bounds_arr = np.empty((1, 1, 1), dtype=DTYPE)
 
-        self.data = get_memview_DTYPE_2D(self.data_arr)
-        self.sample_weight = get_memview_DTYPE_1D(self.sample_weight_arr)
-        self.idx_array = get_memview_ITYPE_1D(self.idx_array_arr)
-        self.node_data = get_memview_NodeData_1D(self.node_data_arr)
-        self.node_bounds = get_memview_DTYPE_3D(self.node_bounds_arr)
+        self.data = self.data_arr
+        self.sample_weight = self.sample_weight_arr
+        self.idx_array = self.idx_array_arr
+        self.node_data = self.node_data_arr
+        self.node_bounds = self.node_bounds_arr
 
         self.leaf_size = 0
         self.n_levels = 0
@@ -1028,8 +986,7 @@ cdef class BinaryTree:
         if sample_weight is not None:
             self.sample_weight_arr = np.asarray(
                 sample_weight, dtype=DTYPE, order='C')
-            self.sample_weight = get_memview_DTYPE_1D(
-                self.sample_weight_arr)
+            self.sample_weight = self.sample_weight_arr
             self.sum_weight = np.sum(self.sample_weight)
         else:
             self.sample_weight = None
@@ -1037,10 +994,10 @@ cdef class BinaryTree:
             self.sum_weight =  n_samples
 
     def _update_memviews(self):
-        self.data = get_memview_DTYPE_2D(self.data_arr)
-        self.idx_array = get_memview_ITYPE_1D(self.idx_array_arr)
-        self.node_data = get_memview_NodeData_1D(self.node_data_arr)
-        self.node_bounds = get_memview_DTYPE_3D(self.node_bounds_arr)
+        self.data = self.data_arr
+        self.idx_array = self.idx_array_arr
+        self.node_data = self.node_data_arr
+        self.node_bounds = self.node_bounds_arr
 
 
     def __reduce__(self):
@@ -1279,7 +1236,7 @@ cdef class BinaryTree:
 
         # flatten X, and save original shape information
         np_Xarr = X.reshape((-1, self.data.shape[1]))
-        cdef DTYPE_t[:, ::1] Xarr = get_memview_DTYPE_2D(np_Xarr)
+        cdef const DTYPE_t[:, ::1] Xarr = np_Xarr
         cdef DTYPE_t reduced_dist_LB
         cdef ITYPE_t i
         cdef DTYPE_t* pt
@@ -1410,8 +1367,7 @@ cdef class BinaryTree:
             raise ValueError("query data dimension must "
                              "match training data dimension")
 
-        cdef DTYPE_t[:, ::1] Xarr =\
-                get_memview_DTYPE_2D(X.reshape((-1, self.data.shape[1])))
+        cdef const DTYPE_t[:, ::1] Xarr = X.reshape((-1, self.data.shape[1]))
 
         # prepare r for query
         r = np.asarray(r, dtype=DTYPE, order='C')
@@ -1423,7 +1379,7 @@ cdef class BinaryTree:
                 raise ValueError("r must be broadcastable to X.shape")
 
         rarr_np = r.reshape(-1)  # store explicitly to keep in scope
-        cdef DTYPE_t[::1] rarr = get_memview_DTYPE_1D(rarr_np)
+        cdef DTYPE_t[::1] rarr = rarr_np
 
         if not count_only:
             indices = calloc(Xarr.shape[0], sizeof(ITYPE_t*))
@@ -1436,13 +1392,13 @@ cdef class BinaryTree:
                     raise MemoryError()
 
         np_idx_arr = np.zeros(self.data.shape[0], dtype=ITYPE)
-        idx_arr_i = get_memview_ITYPE_1D(np_idx_arr)
+        idx_arr_i = np_idx_arr
 
         np_dist_arr = np.zeros(self.data.shape[0], dtype=DTYPE)
-        dist_arr_i = get_memview_DTYPE_1D(np_dist_arr)
+        dist_arr_i = np_dist_arr
 
         counts_arr = np.zeros(Xarr.shape[0], dtype=ITYPE)
-        counts = get_memview_ITYPE_1D(counts_arr)
+        counts = counts_arr
 
         pt = &Xarr[0, 0]
         memory_error = False
@@ -1609,10 +1565,10 @@ cdef class BinaryTree:
             raise ValueError("query data dimension must "
                              "match training data dimension")
         Xarr_np = X.reshape((-1, n_features))
-        cdef DTYPE_t[:, ::1] Xarr = get_memview_DTYPE_2D(Xarr_np)
+        cdef DTYPE_t[:, ::1] Xarr = Xarr_np
 
         log_density_arr = np.zeros(Xarr.shape[0], dtype=DTYPE)
-        cdef DTYPE_t[::1] log_density = get_memview_DTYPE_1D(log_density_arr)
+        cdef DTYPE_t[::1] log_density = log_density_arr
 
         cdef DTYPE_t* pt = &Xarr[0, 0]
 
@@ -1626,9 +1582,9 @@ cdef class BinaryTree:
         #       computed between node pairs.
         if breadth_first:
             node_log_min_bounds_arr = np.full(self.n_nodes, -np.inf)
-            node_log_min_bounds = get_memview_DTYPE_1D(node_log_min_bounds_arr)
+            node_log_min_bounds = node_log_min_bounds_arr
             node_bound_widths_arr = np.zeros(self.n_nodes)
-            node_bound_widths = get_memview_DTYPE_1D(node_bound_widths_arr)
+            node_bound_widths = node_bound_widths_arr
             for i in range(Xarr.shape[0]):
                 log_density[i] = self._kde_single_breadthfirst(
                                             pt, kernel_c, h_c,
@@ -1704,7 +1660,7 @@ cdef class BinaryTree:
                              "match training data dimension")
 
         np_Xarr = X.reshape((-1, self.data.shape[1]))
-        cdef DTYPE_t[:, ::1] Xarr = get_memview_DTYPE_2D(np_Xarr)
+        cdef DTYPE_t[:, ::1] Xarr = np_Xarr
 
         # prepare r for query
         r = np.asarray(r, dtype=DTYPE, order='C')
@@ -1713,11 +1669,11 @@ cdef class BinaryTree:
             raise ValueError("r must be a 1-dimensional array")
         i_rsort = np.argsort(r)
         rarr_np = r[i_rsort]  # needed to keep memory in scope
-        cdef DTYPE_t[::1] rarr = get_memview_DTYPE_1D(rarr_np)
+        cdef DTYPE_t[::1] rarr = rarr_np
 
         # create array to hold counts
         count = np.zeros(r.shape[0], dtype=ITYPE)
-        cdef ITYPE_t[::1] carr = get_memview_ITYPE_1D(count)
+        cdef ITYPE_t[::1] carr = count
 
         cdef DTYPE_t* pt = &Xarr[0, 0]
 
diff --git a/sklearn/neighbors/_dist_metrics.pyx b/sklearn/neighbors/_dist_metrics.pyx
index 398591bcdf49f..cf0c703a5d491 100755
--- a/sklearn/neighbors/_dist_metrics.pyx
+++ b/sklearn/neighbors/_dist_metrics.pyx
@@ -14,11 +14,6 @@ np.import_array()  # required in order to use C-API
 
 ######################################################################
 # Numpy 1.3-1.4 compatibility utilities
-cdef DTYPE_t[:, ::1] get_memview_DTYPE_2D(
-                               np.ndarray[DTYPE_t, ndim=2, mode='c'] X):
-    return  ( X.data)
-
-
 cdef DTYPE_t* get_vec_ptr(np.ndarray[DTYPE_t, ndim=1, mode='c'] vec):
     return &vec[0]
 
@@ -398,16 +393,13 @@ cdef class DistanceMetric:
         if Y is None:
             Darr = np.zeros((Xarr.shape[0], Xarr.shape[0]),
                          dtype=DTYPE, order='C')
-            self.pdist(get_memview_DTYPE_2D(Xarr),
-                       get_memview_DTYPE_2D(Darr))
+            self.pdist(Xarr, Darr)
         else:
             Yarr = np.asarray(Y, dtype=DTYPE, order='C')
             self._validate_data(Yarr)
             Darr = np.zeros((Xarr.shape[0], Yarr.shape[0]),
                          dtype=DTYPE, order='C')
-            self.cdist(get_memview_DTYPE_2D(Xarr),
-                       get_memview_DTYPE_2D(Yarr),
-                       get_memview_DTYPE_2D(Darr))
+            self.cdist(Xarr, Yarr, Darr)
         return Darr
 
 
diff --git a/sklearn/neighbors/_kd_tree.pyx b/sklearn/neighbors/_kd_tree.pyx
index bc1ab764a6fcf..175b61962da99 100644
--- a/sklearn/neighbors/_kd_tree.pyx
+++ b/sklearn/neighbors/_kd_tree.pyx
@@ -38,7 +38,7 @@ cdef int allocate_data(BinaryTree tree, ITYPE_t n_nodes,
                        ITYPE_t n_features) except -1:
     """Allocate arrays needed for the KD Tree"""
     tree.node_bounds_arr = np.zeros((2, n_nodes, n_features), dtype=DTYPE)
-    tree.node_bounds = get_memview_DTYPE_3D(tree.node_bounds_arr)
+    tree.node_bounds = tree.node_bounds_arr
     return 0
 
 
diff --git a/sklearn/neighbors/tests/test_lof.py b/sklearn/neighbors/tests/test_lof.py
index 5d479d5b141f7..ec67bddae29e8 100644
--- a/sklearn/neighbors/tests/test_lof.py
+++ b/sklearn/neighbors/tests/test_lof.py
@@ -15,8 +15,8 @@
 
 from sklearn.utils import check_random_state
 from sklearn.utils._testing import assert_array_almost_equal
-from sklearn.utils.estimator_checks import check_estimator
 from sklearn.utils.estimator_checks import check_outlier_corruption
+from sklearn.utils.estimator_checks import parametrize_with_checks
 
 from sklearn.datasets import load_iris
 
@@ -208,11 +208,11 @@ def test_hasattr_prediction():
     assert not hasattr(clf, 'score_samples')
 
 
-def test_novelty_true_common_tests():
-
+@parametrize_with_checks([neighbors.LocalOutlierFactor(novelty=True)])
+def test_novelty_true_common_tests(estimator, check):
     # the common tests are run for the default LOF (novelty=False).
     # here we run these common tests for LOF when novelty=True
-    check_estimator(neighbors.LocalOutlierFactor(novelty=True))
+    check(estimator)
 
 
 @pytest.mark.parametrize('expected_outliers', [30, 53])
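As a side note, the test update above replaces a single `check_estimator` call with pytest parametrization, so every common estimator check reports as its own test case. A minimal sketch of that pattern (the test name here is illustrative, not the one used in the suite):

    from sklearn.neighbors import LocalOutlierFactor
    from sklearn.utils.estimator_checks import parametrize_with_checks

    @parametrize_with_checks([LocalOutlierFactor(novelty=True)])
    def test_lof_novelty_true_common_checks(estimator, check):
        # pytest generates one test per (estimator, check) pair
        check(estimator)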

From 9b7ff272534f130893e95933db46a3ff295190b2 Mon Sep 17 00:00:00 2001
From: Bharat Raghunathan 
Date: Tue, 20 Apr 2021 15:11:17 +0530
Subject: [PATCH 343/478] DOC improve learning-rate AdaBoost estimator (#19919)

---
 sklearn/ensemble/_weight_boosting.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py
index d5354232a4385..92c5e15731d63 100644
--- a/sklearn/ensemble/_weight_boosting.py
+++ b/sklearn/ensemble/_weight_boosting.py
@@ -313,9 +313,9 @@ class AdaBoostClassifier(ClassifierMixin, BaseWeightBoosting):
         In case of perfect fit, the learning procedure is stopped early.
 
     learning_rate : float, default=1.
-        Learning rate shrinks the contribution of each classifier by
-        ``learning_rate``. There is a trade-off between ``learning_rate`` and
-        ``n_estimators``.
+        Weight applied to each classifier at each boosting iteration. A higher
+        learning rate increases the contribution of each classifier. There is
+        a trade-off between the `learning_rate` and `n_estimators` parameters.
 
     algorithm : {'SAMME', 'SAMME.R'}, default='SAMME.R'
         If 'SAMME.R' then use the SAMME.R real boosting algorithm.
@@ -898,9 +898,9 @@ class AdaBoostRegressor(RegressorMixin, BaseWeightBoosting):
         In case of perfect fit, the learning procedure is stopped early.
 
     learning_rate : float, default=1.
-        Learning rate shrinks the contribution of each regressor by
-        ``learning_rate``. There is a trade-off between ``learning_rate`` and
-        ``n_estimators``.
+        Weight applied to each regressor at each boosting iteration. A higher
+        learning rate increases the contribution of each regressor. There is
+        a trade-off between the `learning_rate` and `n_estimators` parameters.
 
     loss : {'linear', 'square', 'exponential'}, default='linear'
         The loss function to use when updating the weights after each
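To make the documented trade-off concrete, a small illustrative sketch (the dataset and parameter values are arbitrary and not prescribed by the patch):

    from sklearn.datasets import make_classification
    from sklearn.ensemble import AdaBoostClassifier

    X, y = make_classification(random_state=0)
    # A smaller learning_rate shrinks each stage's weight, so more estimators
    # are typically needed to reach a comparable fit.
    slow = AdaBoostClassifier(learning_rate=0.1, n_estimators=200).fit(X, y)
    fast = AdaBoostClassifier(learning_rate=1.0, n_estimators=50).fit(X, y)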

From 9099c796399e5c9653d5d307b4ad8a46047b1cdb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= 
Date: Tue, 20 Apr 2021 15:57:42 +0200
Subject: [PATCH 344/478] EXA Fix plot_map_data_to_normal.py example legend
 (#19930)

Lambda was meant to be on a different line.
---
 examples/preprocessing/plot_map_data_to_normal.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/preprocessing/plot_map_data_to_normal.py b/examples/preprocessing/plot_map_data_to_normal.py
index ff465df78b0df..581ca20a83a42 100644
--- a/examples/preprocessing/plot_map_data_to_normal.py
+++ b/examples/preprocessing/plot_map_data_to_normal.py
@@ -132,7 +132,7 @@
         ax.hist(X_trans, color=color, bins=BINS)
         title = 'After {}'.format(meth_name)
         if lmbda is not None:
-            title += r'\n$\lambda$ = {}'.format(lmbda)
+            title += '\n$\\lambda$ = {}'.format(lmbda)
         ax.set_title(title, fontsize=FONT_SIZE)
         ax.tick_params(axis='both', which='major', labelsize=FONT_SIZE)
         ax.set_xlim([-3.5, 3.5])
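The underlying gotcha, reduced to a self-contained sketch (variable names are illustrative): in a raw string the `\n` never becomes a newline, which is why the legend was rendered on one line.

    broken = r'\n$\lambda$ = 1'    # backslash and 'n' stay literal characters
    fixed = '\n$\\lambda$ = 1'     # a real line break, as the title intended
    assert '\n' not in broken and fixed.startswith('\n')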

From ce217db386aaddfa5b5dde3fe47d42a1964120a0 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" 
Date: Tue, 20 Apr 2021 11:22:13 -0400
Subject: [PATCH 345/478] FIX Fixes memory view bug in distance metrics
 (#19933)

---
 sklearn/neighbors/_dist_metrics.pxd | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/neighbors/_dist_metrics.pxd b/sklearn/neighbors/_dist_metrics.pxd
index 856d5bb2dde5b..30124c309bc49 100644
--- a/sklearn/neighbors/_dist_metrics.pxd
+++ b/sklearn/neighbors/_dist_metrics.pxd
@@ -67,9 +67,9 @@ cdef class DistanceMetric:
     cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2,
                        ITYPE_t size) nogil except -1
 
-    cdef int pdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1
+    cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1
 
-    cdef int cdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] Y,
+    cdef int cdist(self, const DTYPE_t[:, ::1] X, const DTYPE_t[:, ::1] Y,
                    DTYPE_t[:, ::1] D) except -1
 
     cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1

From 7ddd6e5d34911346afe6839c16fc06fc820fc013 Mon Sep 17 00:00:00 2001
From: Gleb Levitskiy <36483986+GLevV@users.noreply.github.com>
Date: Tue, 20 Apr 2021 19:39:19 +0000
Subject: [PATCH 346/478] KBinsDiscretizer efficiency improvement to 'kmeans'
 strategy (#19934)

* efficiency improvement

* update doc

* lint

* lint
---
 doc/whats_new/v1.0.rst                   | 4 ++++
 sklearn/preprocessing/_discretization.py | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index a78cbe69b746d..8a2351b04ecc2 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -386,6 +386,10 @@ Changelog
   supporting sparse matrix and raise the appropriate error message.
   :pr:`19879` by :user:`Guillaume Lemaitre `.
 
+- |Efficiency| Changed the ``algorithm`` argument of the :class:`cluster.KMeans`
+  used internally by :class:`preprocessing.KBinsDiscretizer` from ``auto`` to
+  ``full``. :pr:`19934` by :user:`Gleb Levitskiy `.
+
 :mod:`sklearn.tree`
 ...................
 
diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py
index 22fa236f3314e..9ce95a97544a5 100644
--- a/sklearn/preprocessing/_discretization.py
+++ b/sklearn/preprocessing/_discretization.py
@@ -205,7 +205,8 @@ def fit(self, X, y=None):
                 init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5
 
                 # 1D k-means procedure
-                km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)
+                km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1,
+                            algorithm='full')
                 centers = km.fit(column[:, None]).cluster_centers_[:, 0]
                 # Must sort, centers may be unsorted even with sorted init
                 centers.sort()
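For reference, a minimal usage sketch of the affected code path (the data is synthetic); the change above only switches the KMeans variant used internally for the per-feature 1-D clustering:

    import numpy as np
    from sklearn.preprocessing import KBinsDiscretizer

    X = np.random.RandomState(0).normal(size=(100, 1))
    est = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='kmeans')
    Xt = est.fit_transform(X)           # bin edges come from 1-D k-means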

From 1a601c0f553b5cd097c086c1f4fa12bd5afaed9c Mon Sep 17 00:00:00 2001
From: TFiFiE 
Date: Tue, 20 Apr 2021 23:05:14 +0200
Subject: [PATCH 347/478] DOC Remove misleading "linear kernel" statements
 (#19937)

---
 sklearn/svm/_classes.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py
index c402779f4eeb6..674fa294dcf3c 100644
--- a/sklearn/svm/_classes.py
+++ b/sklearn/svm/_classes.py
@@ -105,7 +105,7 @@ class LinearSVC(LinearClassifierMixin,
     coef_ : ndarray of shape (1, n_features) if n_classes == 2 \
             else (n_classes, n_features)
         Weights assigned to the features (coefficients in the primal
-        problem). This is only available in the case of a linear kernel.
+        problem).
 
         ``coef_`` is a readonly property derived from ``raw_coef_`` that
         follows the internal memory layout of liblinear.
@@ -326,7 +326,7 @@ class LinearSVR(RegressorMixin, LinearModel):
     coef_ : ndarray of shape (n_features) if n_classes == 2 \
             else (n_classes, n_features)
         Weights assigned to the features (coefficients in the primal
-        problem). This is only available in the case of a linear kernel.
+        problem).
 
         `coef_` is a readonly property derived from `raw_coef_` that
         follows the internal memory layout of liblinear.

From 4946bfcd72ab45e821eadbb260b7187116f7c1ae Mon Sep 17 00:00:00 2001
From: Kei Ishikawa <30857855+kstoneriv3@users.noreply.github.com>
Date: Tue, 20 Apr 2021 23:12:33 +0200
Subject: [PATCH 348/478] FIX fix a bug in KernelPCA.inverse_transform (#19732)

Co-authored-by: Olivier Grisel 
---
 doc/whats_new/v0.24.rst                       |  6 +++++
 sklearn/decomposition/_kernel_pca.py          |  2 --
 .../decomposition/tests/test_kernel_pca.py    | 23 +++++++++++--------
 3 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index 880d1879637ed..81953db29efa3 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -26,6 +26,12 @@ Changelog
   with `sample_weight` parameter and `least_absolute_deviation` loss function.
   :pr:`19407` by :user:`Vadim Ushtanit `.
 
+:mod:`sklearn.decomposition`
+............................
+
+- |Fix| Fixed a bug in :class:`decomposition.KernelPCA`'s
+  ``inverse_transform``.  :pr:`19732` by :user:`Kei Ishikawa `.
+
 :mod:`sklearn.linear_model`
 ...........................
 
diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py
index 5655eddb0bf31..7ea1d118e4391 100644
--- a/sklearn/decomposition/_kernel_pca.py
+++ b/sklearn/decomposition/_kernel_pca.py
@@ -364,8 +364,6 @@ def inverse_transform(self, X):
                                  "the inverse transform is not available.")
 
         K = self._get_kernel(X, self.X_transformed_fit_)
-        n_samples = self.X_transformed_fit_.shape[0]
-        K.flat[::n_samples + 1] += self.alpha
         return np.dot(K, self.dual_coef_)
 
     def _more_tags(self):
diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py
index 2acccb0df6781..adf68f1db1a6c 100644
--- a/sklearn/decomposition/tests/test_kernel_pca.py
+++ b/sklearn/decomposition/tests/test_kernel_pca.py
@@ -286,16 +286,19 @@ def test_kernel_conditioning():
     assert np.all(kpca.lambdas_ == _check_psd_eigenvalues(kpca.lambdas_))
 
 
-@pytest.mark.parametrize("kernel",
-                         ["linear", "poly", "rbf", "sigmoid", "cosine"])
-def test_kernel_pca_inverse_transform(kernel):
-    X, *_ = make_blobs(n_samples=100, n_features=4, centers=[[1, 1, 1, 1]],
-                       random_state=0)
-
-    kp = KernelPCA(n_components=2, kernel=kernel, fit_inverse_transform=True)
-    X_trans = kp.fit_transform(X)
-    X_inv = kp.inverse_transform(X_trans)
-    assert_allclose(X, X_inv)
+def test_kernel_pca_inverse_transform_reconstruction():
+    # Test if the reconstruction is a good approximation.
+    # Note that in general it is not possible to get an arbitrarily good
+    # reconstruction because of kernel centering that does not
+    # preserve all the information of the original data.
+    X, *_ = make_blobs(n_samples=100, n_features=4, random_state=0)
+
+    kpca = KernelPCA(
+        n_components=20, kernel='rbf', fit_inverse_transform=True, alpha=1e-3
+    )
+    X_trans = kpca.fit_transform(X)
+    X_reconst = kpca.inverse_transform(X_trans)
+    assert np.linalg.norm(X - X_reconst) / np.linalg.norm(X) < 1e-1
 
 
 def test_32_64_decomposition_shape():
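A short end-to-end sketch of the corrected code path, mirroring the test above (the parameter values are those of the test and are not canonical):

    from sklearn.datasets import make_blobs
    from sklearn.decomposition import KernelPCA

    X, _ = make_blobs(n_samples=100, n_features=4, random_state=0)
    kpca = KernelPCA(n_components=20, kernel='rbf',
                     fit_inverse_transform=True, alpha=1e-3)
    X_low = kpca.fit_transform(X)
    X_back = kpca.inverse_transform(X_low)   # approximate pre-images of X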

From 0bd7cedababab7bd70ebacb31d46eccd2371a3bd Mon Sep 17 00:00:00 2001
From: Kei Ishikawa <30857855+kstoneriv3@users.noreply.github.com>
Date: Tue, 20 Apr 2021 23:21:54 +0200
Subject: [PATCH 349/478] ENH Enrich docstring on `inverse_transform` of
 `KernelPCA` (#19910)

---
 sklearn/decomposition/_kernel_pca.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py
index 7ea1d118e4391..415ee034c1769 100644
--- a/sklearn/decomposition/_kernel_pca.py
+++ b/sklearn/decomposition/_kernel_pca.py
@@ -346,6 +346,26 @@ def transform(self, X):
     def inverse_transform(self, X):
         """Transform X back to original space.
 
+        ``inverse_transform`` approximates the inverse transformation using
+        a learned pre-image. The pre-image is learned by kernel ridge
+        regression of the original data on their low-dimensional representation
+        vectors.
+
+        .. note::
+            :meth:`~sklearn.decomposition.KernelPCA.fit` internally uses a centered
+            kernel. As the centered kernel no longer contains the information
+            of the mean of kernel features, such information is not taken into
+            account in reconstruction.
+
+        .. note::
+            When users want to compute inverse transformation for 'linear'
+            kernel, it is recommended that they use
+            :class:`~sklearn.decomposition.PCA` instead. Unlike
+            :class:`~sklearn.decomposition.PCA`,
+            :class:`~sklearn.decomposition.KernelPCA`'s ``inverse_transform``
+            does not reconstruct the mean of data when 'linear' kernel is used
+            due to the use of centered kernel.
+
         Parameters
         ----------
         X : {array-like, sparse matrix} of shape (n_samples, n_components)
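To see the documented difference in practice, a small comparison sketch (shapes and values chosen arbitrarily); as the note above explains, only PCA adds the data mean back in:

    from sklearn.datasets import make_blobs
    from sklearn.decomposition import PCA, KernelPCA

    X, _ = make_blobs(n_samples=50, n_features=3, random_state=0)
    pca = PCA(n_components=2).fit(X)
    kpca = KernelPCA(n_components=2, kernel='linear',
                     fit_inverse_transform=True).fit(X)
    X_pca = pca.inverse_transform(pca.transform(X))     # mean restored
    X_kpca = kpca.inverse_transform(kpca.transform(X))  # centered reconstruction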

From 004b44d007408aa2db1fdaf4428990d0d7b7f85a Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" 
Date: Tue, 20 Apr 2021 17:29:02 -0400
Subject: [PATCH 350/478] FIX OneHotEncoder.fit no longer alters the drop
 parameter (#19924)

---
 doc/whats_new/v0.24.rst                      |  3 +++
 doc/whats_new/v1.0.rst                       |  2 +-
 sklearn/preprocessing/_encoders.py           | 11 +++++------
 sklearn/preprocessing/tests/test_encoders.py |  2 ++
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index 81953db29efa3..5b0b753f0f294 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -62,6 +62,9 @@ Changelog
 - |Fix| :meth:`preprocessing.OrdinalEncoder.transfrom` correctly handles
   unknown values for string dtypes. :pr:`19888` by `Thomas Fan`_.
 
+- |Fix| :meth:`preprocessing.OneHotEncoder.fit` no longer alters the `drop`
+  parameter. :pr:`19924` by `Thomas Fan`_.
+
 :mod:`sklearn.multioutput`
 ..........................
 
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 8a2351b04ecc2..458bb8dfff8a0 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -105,7 +105,7 @@ Changelog
 - |Fix| Improved convergence detection based on center change in
   :class:`cluster.MiniBatchKMeans` which was almost never achievable.
   :pr:`17622` by :user:`Jérémie du Boisberranger `.
-  
+
 - |FIX| :class:`cluster.AgglomerativeClustering` now supports readonly
   memory-mapped datasets. :pr:`19883` by `Julien Jerphanion `.
 
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 7c62cbdcbc565..36ca74ac09cdb 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -363,22 +363,21 @@ def _compute_drop_idx(self):
 
         else:
             try:
-                self.drop = np.asarray(self.drop, dtype=object)
-                droplen = len(self.drop)
+                drop_array = np.asarray(self.drop, dtype=object)
+                droplen = len(drop_array)
             except (ValueError, TypeError):
                 msg = (
                     "Wrong input for parameter `drop`. Expected "
                     "'first', 'if_binary', None or array of objects, got {}"
                     )
-                raise ValueError(msg.format(type(self.drop)))
+                raise ValueError(msg.format(type(drop_array)))
             if droplen != len(self.categories_):
                 msg = ("`drop` should have length equal to the number "
                        "of features ({}), got {}")
-                raise ValueError(msg.format(len(self.categories_),
-                                            len(self.drop)))
+                raise ValueError(msg.format(len(self.categories_), droplen))
             missing_drops = []
             drop_indices = []
-            for col_idx, (val, cat_list) in enumerate(zip(self.drop,
+            for col_idx, (val, cat_list) in enumerate(zip(drop_array,
                                                           self.categories_)):
                 if not is_scalar_nan(val):
                     drop_idx = np.where(cat_list == val)[0]
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 94e2c276dcd58..72fa46544b198 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -748,6 +748,8 @@ def test_one_hot_encoder_drop_manual(missing_value):
            [0, 1, 0, 1, 1],
            [0, 0, 0, 0, 0]]
     assert_array_equal(trans, exp)
+    assert enc.drop is cats_to_drop
+
     dropped_cats = [cat[feature]
                     for cat, feature in zip(enc.categories_,
                                             enc.drop_idx_)]
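A minimal sketch of the behaviour being asserted (the toy categories are illustrative): after the fix, `fit` leaves the user-supplied `drop` object untouched and stores the resolved indices in `drop_idx_` instead.

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = np.array([['a', 'x'], ['b', 'y'], ['a', 'y']], dtype=object)
    cats_to_drop = ['b', 'x']
    enc = OneHotEncoder(drop=cats_to_drop).fit(X)
    assert enc.drop is cats_to_drop          # not overwritten by fit
    print(enc.drop_idx_)                     # resolved per-feature indices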

From d5ebdca662b0c43c283009e3aeb5bc270c6100a2 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion 
Date: Wed, 21 Apr 2021 10:49:18 +0200
Subject: [PATCH 351/478] [MRG] Refactor `feature_selection.f_regression` and
 introduce `feature_selection.r_regression` (#17169)

Co-authored-by: Dmytro S Lituiev 
Co-authored-by: Chiara Marmo 
Co-authored-by: Guillaume Lemaitre 
Co-authored-by: Olivier Grisel 
---
 doc/modules/classes.rst                       |   1 +
 doc/whats_new/v1.0.rst                        |   8 +
 sklearn/feature_selection/__init__.py         |   2 +
 .../_univariate_selection.py                  | 149 ++++++++++++------
 .../tests/test_feature_select.py              |  59 +++++--
 5 files changed, 160 insertions(+), 59 deletions(-)

diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 45195dcedec64..5462e06f81214 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -560,6 +560,7 @@ From text
    feature_selection.chi2
    feature_selection.f_classif
    feature_selection.f_regression
+   feature_selection.r_regression
    feature_selection.mutual_info_classif
    feature_selection.mutual_info_regression
 
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 458bb8dfff8a0..270ae456b5213 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -204,6 +204,14 @@ Changelog
   input strings would result in negative indices in the transformed data.
   :pr:`19035` by :user:`Liu Yu `.
 
+:mod:`sklearn.feature_selection`
+................................
+
+- |Feature| :func:`feature_selection.r_regression` computes Pearson's R
+  correlation coefficients between the features and the target.
+  :pr:`17169` by `Dmytro Lituiev `
+  and `Julien Jerphanion `.
+
 :mod:`sklearn.inspection`
 .........................
 
diff --git a/sklearn/feature_selection/__init__.py b/sklearn/feature_selection/__init__.py
index 86e8a2af39084..ef894b40065de 100644
--- a/sklearn/feature_selection/__init__.py
+++ b/sklearn/feature_selection/__init__.py
@@ -8,6 +8,7 @@
 from ._univariate_selection import f_classif
 from ._univariate_selection import f_oneway
 from ._univariate_selection import f_regression
+from ._univariate_selection import r_regression
 from ._univariate_selection import SelectPercentile
 from ._univariate_selection import SelectKBest
 from ._univariate_selection import SelectFpr
@@ -44,6 +45,7 @@
            'f_classif',
            'f_oneway',
            'f_regression',
+           'r_regression',
            'mutual_info_classif',
            'mutual_info_regression',
            'SelectorMixin']
diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py
index be3298387f612..d9db03e479163 100644
--- a/sklearn/feature_selection/_univariate_selection.py
+++ b/sklearn/feature_selection/_univariate_selection.py
@@ -230,60 +230,53 @@ def chi2(X, y):
     return _chisquare(observed, expected)
 
 
-@_deprecate_positional_args
-def f_regression(X, y, *, center=True):
-    """Univariate linear regression tests.
+def r_regression(X, y, *, center=True):
+    """Compute Pearson's r for each feature and the target.
+
+    Pearson's r is also known as the Pearson correlation coefficient.
+
+    .. versionadded:: 1.0
 
     Linear model for testing the individual effect of each of many regressors.
     This is a scoring function to be used in a feature selection procedure, not
     a free standing feature selection procedure.
 
-    This is done in 2 steps:
-
-    1. The correlation between each regressor and the target is computed,
-       that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *
-       std(y)).
-    2. It is converted to an F score then to a p-value.
+    The cross correlation between each regressor and the target is computed
+    as ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) * std(y)).
 
     For more on usage see the :ref:`User Guide `.
 
     Parameters
     ----------
-    X : {array-like, sparse matrix}  shape = (n_samples, n_features)
-        The set of regressors that will be tested sequentially.
+    X : {array-like, sparse matrix} of shape (n_samples, n_features)
+        The data matrix.
 
-    y : array of shape(n_samples).
-        The data matrix
+    y : array-like of shape (n_samples,)
+        The target vector.
 
     center : bool, default=True
-        If true, X and y will be centered.
+        Whether or not to center the data matrix `X` and the target vector `y`.
+        By default, `X` and `y` will be centered.
 
     Returns
     -------
-    F : array, shape=(n_features,)
-        F values of features.
-
-    pval : array, shape=(n_features,)
-        p-values of F-scores.
+    correlation_coefficient : ndarray of shape (n_features,)
+        Pearson's R correlation coefficients of features.
 
     See Also
     --------
-    mutual_info_regression : Mutual information for a continuous target.
-    f_classif : ANOVA F-value between label/feature for classification tasks.
-    chi2 : Chi-squared stats of non-negative features for classification tasks.
-    SelectKBest : Select features based on the k highest scores.
-    SelectFpr : Select features based on a false positive rate test.
-    SelectFdr : Select features based on an estimated false discovery rate.
-    SelectFwe : Select features based on family-wise error rate.
-    SelectPercentile : Select features based on percentile of the highest
-        scores.
+    f_regression: Univariate linear regression tests returning F-statistic
+        and p-values.
+    mutual_info_regression: Mutual information for a continuous target.
+    f_classif: ANOVA F-value between label/feature for classification tasks.
+    chi2: Chi-squared stats of non-negative features for classification tasks.
     """
     X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                      dtype=np.float64)
     n_samples = X.shape[0]
 
-    # compute centered values
-    # note that E[(x - mean(x))*(y - mean(y))] = E[x*(y - mean(y))], so we
+    # Compute centered values
+    # Note that E[(x - mean(x))*(y - mean(y))] = E[x*(y - mean(y))], so we
     # need not center X
     if center:
         y = y - np.mean(y)
@@ -291,22 +284,86 @@ def f_regression(X, y, *, center=True):
             X_means = X.mean(axis=0).getA1()
         else:
             X_means = X.mean(axis=0)
-        # compute the scaled standard deviations via moments
+        # Compute the scaled standard deviations via moments
         X_norms = np.sqrt(row_norms(X.T, squared=True) -
                           n_samples * X_means ** 2)
     else:
         X_norms = row_norms(X.T)
 
-    # compute the correlation
-    corr = safe_sparse_dot(y, X)
-    corr /= X_norms
-    corr /= np.linalg.norm(y)
+    correlation_coefficient = safe_sparse_dot(y, X)
+    correlation_coefficient /= X_norms
+    correlation_coefficient /= np.linalg.norm(y)
+    return correlation_coefficient
+
+
+@_deprecate_positional_args
+def f_regression(X, y, *, center=True):
+    """Univariate linear regression tests returning F-statistic and p-values.
+
+    Quick linear model for testing the effect of a single regressor,
+    sequentially for many regressors.
+
+    This is done in 2 steps:
+
+    1. The cross correlation between each regressor and the target is computed,
+       that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *
+       std(y)) using the :func:`r_regression` function.
+    2. It is converted to an F score and then to a p-value.
+
+    :func:`f_regression` is derived from :func:`r_regression` and will rank
+    features in the same order if all the features are positively correlated
+    with the target.
+
+    Note however that contrary to :func:`f_regression`, :func:`r_regression`
+    values lie in [-1, 1] and can thus be negative. :func:`f_regression` is
+    therefore recommended as a feature selection criterion to identify
+    potentially predictive features for a downstream classifier, irrespective
+    of the sign of the association with the target variable.
 
-    # convert to p-value
-    degrees_of_freedom = y.size - (2 if center else 1)
-    F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
-    pv = stats.f.sf(F, 1, degrees_of_freedom)
-    return F, pv
+    Furthermore :func:`f_regression` returns p-values while
+    :func:`r_regression` does not.
+
+    Read more in the :ref:`User Guide `.
+
+    Parameters
+    ----------
+    X : {array-like, sparse matrix} of shape (n_samples, n_features)
+        The data matrix.
+
+    y : array-like of shape (n_samples,)
+        The target vector.
+
+    center : bool, default=True
+        Whether or not to center the data matrix `X` and the target vector `y`.
+        By default, `X` and `y` will be centered.
+
+    Returns
+    -------
+    f_statistic : ndarray of shape (n_features,)
+        F-statistic for each feature.
+
+    p_values : ndarray of shape (n_features,)
+        P-values associated with the F-statistic.
+
+    See Also
+    --------
+    r_regression: Pearson's R between label/feature for regression tasks.
+    f_classif: ANOVA F-value between label/feature for classification tasks.
+    chi2: Chi-squared stats of non-negative features for classification tasks.
+    SelectKBest: Select features based on the k highest scores.
+    SelectFpr: Select features based on a false positive rate test.
+    SelectFdr: Select features based on an estimated false discovery rate.
+    SelectFwe: Select features based on family-wise error rate.
+    SelectPercentile: Select features based on percentile of the highest
+        scores.
+    """
+    correlation_coefficient = r_regression(X, y, center=center)
+    deg_of_freedom = y.size - (2 if center else 1)
+
+    corr_coef_squared = correlation_coefficient ** 2
+    f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom
+    p_values = stats.f.sf(f_statistic, 1, deg_of_freedom)
+    return f_statistic, p_values
 
 
 ######################################################################
@@ -503,12 +560,12 @@ class SelectKBest(_BaseFilter):
 
     See Also
     --------
-    f_classif : ANOVA F-value between label/feature for classification tasks.
-    mutual_info_classif : Mutual information for a discrete target.
-    chi2 : Chi-squared stats of non-negative features for classification tasks.
-    f_regression : F-value between label/feature for regression tasks.
-    mutual_info_regression : Mutual information for a continuous target.
-    SelectPercentile : Select features based on percentile of the highest
+    f_classif: ANOVA F-value between label/feature for classification tasks.
+    mutual_info_classif: Mutual information for a discrete target.
+    chi2: Chi-squared stats of non-negative features for classification tasks.
+    f_regression: F-value between label/feature for regression tasks.
+    mutual_info_regression: Mutual information for a continuous target.
+    SelectPercentile: Select features based on percentile of the highest
         scores.
     SelectFpr : Select features based on a false positive rate test.
     SelectFdr : Select features based on an estimated false discovery rate.
diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py
index 61f709094147e..852c8228b2a76 100644
--- a/sklearn/feature_selection/tests/test_feature_select.py
+++ b/sklearn/feature_selection/tests/test_feature_select.py
@@ -4,11 +4,12 @@
 import itertools
 import warnings
 import numpy as np
+from numpy.testing import assert_allclose
 from scipy import stats, sparse
 
 import pytest
 
-from sklearn.utils._testing import assert_almost_equal
+from sklearn.utils._testing import assert_almost_equal, _convert_container
 from sklearn.utils._testing import assert_array_equal
 from sklearn.utils._testing import assert_array_almost_equal
 from sklearn.utils._testing import assert_warns
@@ -18,9 +19,20 @@
 
 from sklearn.datasets import make_classification, make_regression
 from sklearn.feature_selection import (
-    chi2, f_classif, f_oneway, f_regression, mutual_info_classif,
-    mutual_info_regression, SelectPercentile, SelectKBest, SelectFpr,
-    SelectFdr, SelectFwe, GenericUnivariateSelect)
+    chi2,
+    f_classif,
+    f_oneway,
+    f_regression,
+    GenericUnivariateSelect,
+    mutual_info_classif,
+    mutual_info_regression,
+    r_regression,
+    SelectPercentile,
+    SelectKBest,
+    SelectFpr,
+    SelectFdr,
+    SelectFwe,
+)
 
 
 ##############################################################################
@@ -71,6 +83,27 @@ def test_f_classif():
     assert_array_almost_equal(pv_sparse, pv)
 
 
+@pytest.mark.parametrize("center", [True, False])
+def test_r_regression(center):
+    X, y = make_regression(n_samples=2000, n_features=20, n_informative=5,
+                           shuffle=False, random_state=0)
+
+    corr_coeffs = r_regression(X, y, center=center)
+    assert ((-1 < corr_coeffs).all())
+    assert ((corr_coeffs < 1).all())
+
+    sparse_X = _convert_container(X, "sparse")
+
+    sparse_corr_coeffs = r_regression(sparse_X, y, center=center)
+    assert_allclose(sparse_corr_coeffs, corr_coeffs)
+
+    # Testing against numpy for reference
+    Z = np.hstack((X, y[:, np.newaxis]))
+    correlation_matrix = np.corrcoef(Z, rowvar=False)
+    np_corr_coeffs = correlation_matrix[:-1, -1]
+    assert_array_almost_equal(np_corr_coeffs, corr_coeffs, decimal=3)
+
+
 def test_f_regression():
     # Test whether the F test yields meaningful results
     # on a simple simulated regression problem
@@ -87,14 +120,14 @@ def test_f_regression():
     # with centering, compare with sparse
     F, pv = f_regression(X, y, center=True)
     F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=True)
-    assert_array_almost_equal(F_sparse, F)
-    assert_array_almost_equal(pv_sparse, pv)
+    assert_allclose(F_sparse, F)
+    assert_allclose(pv_sparse, pv)
 
     # again without centering, compare with sparse
     F, pv = f_regression(X, y, center=False)
     F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=False)
-    assert_array_almost_equal(F_sparse, F)
-    assert_array_almost_equal(pv_sparse, pv)
+    assert_allclose(F_sparse, F)
+    assert_allclose(pv_sparse, pv)
 
 
 def test_f_regression_input_dtype():
@@ -106,8 +139,8 @@ def test_f_regression_input_dtype():
 
     F1, pv1 = f_regression(X, y)
     F2, pv2 = f_regression(X, y.astype(float))
-    assert_array_almost_equal(F1, F2, 5)
-    assert_array_almost_equal(pv1, pv2, 5)
+    assert_allclose(F1, F2, 5)
+    assert_allclose(pv1, pv2, 5)
 
 
 def test_f_regression_center():
@@ -123,7 +156,7 @@ def test_f_regression_center():
 
     F1, _ = f_regression(X, Y, center=True)
     F2, _ = f_regression(X, Y, center=False)
-    assert_array_almost_equal(F1 * (n_samples - 1.) / (n_samples - 2.), F2)
+    assert_allclose(F1 * (n_samples - 1.) / (n_samples - 2.), F2)
     assert_almost_equal(F2[0], 0.232558139)  # value from statsmodels OLS
 
 
@@ -262,7 +295,7 @@ def test_select_heuristics_classif():
             f_classif, mode=mode, param=0.01).fit(X, y).transform(X)
         assert_array_equal(X_r, X_r2)
         support = univariate_filter.get_support()
-        assert_array_almost_equal(support, gtruth)
+        assert_allclose(support, gtruth)
 
 
 ##############################################################################
@@ -272,7 +305,7 @@ def test_select_heuristics_classif():
 def assert_best_scores_kept(score_filter):
     scores = score_filter.scores_
     support = score_filter.get_support()
-    assert_array_almost_equal(np.sort(scores[support]),
+    assert_allclose(np.sort(scores[support]),
                               np.sort(scores)[-support.sum():])
 
 

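For reference, a minimal sketch of the computation introduced above
(illustrative only, not part of the patch; it assumes dense, centered data
rather than the library code paths): Pearson's r per feature, followed by the
conversion to an F score and a p-value as done by `f_regression`.

    import numpy as np
    from scipy import stats

    rng = np.random.RandomState(0)
    X = rng.randn(200, 3)
    y = X[:, 0] + 0.1 * rng.randn(200)

    # Pearson's r: centered cross product scaled by the norms
    Xc = X - X.mean(axis=0)
    yc = y - y.mean()
    r = (Xc * yc[:, None]).sum(axis=0) / (
        np.linalg.norm(Xc, axis=0) * np.linalg.norm(yc))

    # Convert r to an F statistic and then to a p-value
    dof = y.size - 2                  # 2 degrees lost because of centering
    F = r ** 2 / (1 - r ** 2) * dof
    p = stats.f.sf(F, 1, dof)
    print(r, F, p)
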
From b84afe541c679b0cb57b3b1f7f438400392d11ee Mon Sep 17 00:00:00 2001
From: Alexandr Fonari 
Date: Wed, 21 Apr 2021 18:39:07 +0000
Subject: [PATCH 352/478] FIX prevent division by zero with constant target in
 GPR (#19703)

Co-authored-by: Sasha Fonari 
Co-authored-by: Chiara Marmo 
Co-authored-by: Guillaume Lemaitre 
---
 doc/whats_new/v0.24.rst                    | 11 +++++++++++
 sklearn/gaussian_process/_gpr.py           |  5 ++++-
 sklearn/gaussian_process/tests/test_gpr.py | 23 ++++++++++++++++++++++
 3 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index 5b0b753f0f294..305648dbdcfc9 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -32,6 +32,17 @@ Changelog
 - |Fix| Fixed a bug in :class:`decomposition.KernelPCA`'s
   ``inverse_transform``.  :pr:`19732` by :user:`Kei Ishikawa `.
 
+:mod:`sklearn.gaussian_process`
+...............................
+
+- |Fix| Avoid a division by zero when scaling a constant target in
+  :class:`gaussian_process.GaussianProcessRegressor`. It was caused by a
+  standard deviation equal to 0. Such a case is now detected and the standard
+  deviation is set to 1, avoiding the division by zero and thus the presence
+  of NaN values in the normalized target.
+  :pr:`19703` by :user:`sobkevich`, :user:`Boris Villazón-Terrazas `
+  and :user:`Alexandr Fonari `.
+
 :mod:`sklearn.linear_model`
 ...........................
 
diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py
index 4e8814dd69951..8f9575ffe42df 100644
--- a/sklearn/gaussian_process/_gpr.py
+++ b/sklearn/gaussian_process/_gpr.py
@@ -14,6 +14,7 @@
 from ..base import BaseEstimator, RegressorMixin, clone
 from ..base import MultiOutputMixin
 from .kernels import RBF, ConstantKernel as C
+from ..preprocessing._data import _handle_zeros_in_scale
 from ..utils import check_random_state
 from ..utils.optimize import _check_optimize_result
 from ..utils.validation import _deprecate_positional_args
@@ -197,7 +198,9 @@ def fit(self, X, y):
         # Normalize target value
         if self.normalize_y:
             self._y_train_mean = np.mean(y, axis=0)
-            self._y_train_std = np.std(y, axis=0)
+            self._y_train_std = _handle_zeros_in_scale(
+                np.std(y, axis=0), copy=False
+            )
 
             # Remove mean and make unit variance
             y = (y - self._y_train_mean) / self._y_train_std
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index a5bfa05c47313..440e421cb95cc 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -546,3 +546,26 @@ def test_bound_check_fixed_hyperparameter():
                         periodicity_bounds="fixed")  # seasonal component
     kernel = k1 + k2
     GaussianProcessRegressor(kernel=kernel).fit(X, y)
+
+
+# FIXME: we should test for multitargets as well. However, GPR is broken:
+# see: https://github.com/scikit-learn/scikit-learn/pull/19706
+@pytest.mark.parametrize('kernel', kernels)
+def test_constant_target(kernel):
+    """Check that the std. dev. is set to 1 when normalizing a constant
+    target.
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/18318
+    NaN values were previously assigned to the target when scaling, due to a
+    null std. dev. with a constant target.
+    """
+    y_constant = np.ones(X.shape[0], dtype=np.float64)
+
+    gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True)
+    gpr.fit(X, y_constant)
+    assert gpr._y_train_std == pytest.approx(1.0)
+
+    y_pred, y_cov = gpr.predict(X, return_cov=True)
+    assert_allclose(y_pred, y_constant)
+    # set atol because we compare to zero
+    assert_allclose(np.diag(y_cov), 0., atol=1e-9)
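
For reference, a minimal sketch of the guard this fix introduces (illustrative
only, not part of the patch; `_handle_zeros_in_scale` additionally handles
array-valued scales): a zero std. dev. is replaced by 1 before normalizing.

    import numpy as np

    y = np.ones(20)                   # constant target
    y_mean = np.mean(y, axis=0)
    y_std = np.std(y, axis=0)
    if y_std == 0.0:                  # would otherwise produce NaN values
        y_std = 1.0
    y_norm = (y - y_mean) / y_std     # all zeros instead of NaN
    print(y_norm[:3])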

From a67b284f90299989c4cc03f848dc9cc1be57c623 Mon Sep 17 00:00:00 2001
From: Andrew Delong 
Date: Wed, 21 Apr 2021 17:34:28 -0400
Subject: [PATCH 353/478] FIX Encoder should accept categories having dtype='S'
 (#19727)

Co-authored-by: Guillaume Lemaitre 
---
 doc/whats_new/v0.24.rst                      | 21 +++++---
 sklearn/preprocessing/_encoders.py           |  2 +-
 sklearn/preprocessing/tests/test_encoders.py | 17 +++---
 sklearn/utils/_encode.py                     |  4 +-
 sklearn/utils/_testing.py                    | 48 +++++++++++++----
 sklearn/utils/tests/test_testing.py          | 54 ++++++++++++++++----
 6 files changed, 108 insertions(+), 38 deletions(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index 305648dbdcfc9..41dfcfbc4d1c9 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -62,6 +62,13 @@ Changelog
   :class:`model_selection.HalvingGridSearchCV` were not properly converted to
   numpy arrays. :pr:`19211` by `Nicolas Hug`_.
 
+:mod:`sklearn.multioutput`
+..........................
+
+- |Fix| :class:`multioutput.MultiOutputRegressor` now works with estimators
+  that dynamically define `predict` during fitting, such as
+  :class:`ensemble.StackingRegressor`. :pr:`19308` by `Thomas Fan`_.
+
 :mod:`sklearn.preprocessing`
 ............................
 
@@ -70,19 +77,17 @@ Changelog
   `'use_encoded_value'` strategies.
   :pr:`19234` by `Guillaume Lemaitre `.
 
+- |Fix| Fix the handling of categories having dtype='S' in
+  :class:`preprocessing.OneHotEncoder` and
+  :class:`preprocessing.OrdinalEncoder`.
+  :pr:`19727` by :user:`Andrew Delong `.
+
 - |Fix| :meth:`preprocessing.OrdinalEncoder.transform` correctly handles
   unknown values for string dtypes. :pr:`19888` by `Thomas Fan`_.
 
 - |Fix| :meth:`preprocessing.OneHotEncoder.fit` no longer alters the `drop`
   parameter. :pr:`19924` by `Thomas Fan`_.
 
-:mod:`sklearn.multioutput`
-..........................
-
-- |Fix| :class:`multioutput.MultiOutputRegressor` now works with estimators
-  that dynamically define `predict` during fitting, such as
-  :class:`ensemble.StackingRegressor`. :pr:`19308` by `Thomas Fan`_.
-
 :mod:`sklearn.semi_supervised`
 ..............................
 
@@ -91,7 +96,7 @@ Changelog
   :pr:`19271` by :user:`Zhaowei Wang `.
 
 :mod:`sklearn.tree`
-.......................
+...................
 
 - |Fix| Fix a bug in `fit` of :class:`tree.BaseDecisionTree` that caused
   segmentation faults under certain conditions. `fit` now deep copies the
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 36ca74ac09cdb..ba1d48df175ee 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -92,7 +92,7 @@ def _fit(self, X, handle_unknown='error', force_all_finite=True):
                 cats = _unique(Xi)
             else:
                 cats = np.array(self.categories[i], dtype=Xi.dtype)
-                if Xi.dtype.kind not in 'OU':
+                if Xi.dtype.kind not in 'OUS':
                     sorted_cats = np.sort(cats)
                     error_msg = ("Unsorted categories are not "
                                  "supported for numerical categories")
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 72fa46544b198..ef2ac000a0c83 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -692,7 +692,8 @@ def test_encoder_dtypes():
 
     for X in [np.array([[1, 2], [3, 4]], dtype='int64'),
               np.array([[1, 2], [3, 4]], dtype='float64'),
-              np.array([['a', 'b'], ['c', 'd']]),  # string dtype
+              np.array([['a', 'b'], ['c', 'd']]),      # unicode dtype
+              np.array([[b'a', b'b'], [b'c', b'd']]),  # byte string dtype
               np.array([[1, 'a'], [3, 'b']], dtype='object')]:
         enc.fit(X)
         assert all([enc.categories_[i].dtype == X.dtype for i in range(2)])
@@ -827,21 +828,25 @@ def test_encoders_has_categorical_tags(Encoder):
     assert 'categorical' in Encoder()._get_tags()['X_types']
 
 
-@pytest.mark.parametrize('input_dtype', ['O', 'U'])
-@pytest.mark.parametrize('category_dtype', ['O', 'U'])
+# deliberately omit 'OS' as an invalid combo
+@pytest.mark.parametrize('input_dtype, category_dtype', ['OO', 'OU',
+                                                         'UO', 'UU', 'US',
+                                                         'SO', 'SU', 'SS'])
 @pytest.mark.parametrize('array_type', ['list', 'array', 'dataframe'])
-def test_encoders_unicode_categories(input_dtype, category_dtype, array_type):
-    """Check that encoding work with string and object dtypes.
+def test_encoders_string_categories(input_dtype, category_dtype, array_type):
+    """Check that encoding works with object, unicode, and byte string dtypes.
     Non-regression test for:
     https://github.com/scikit-learn/scikit-learn/issues/15616
     https://github.com/scikit-learn/scikit-learn/issues/15726
+    https://github.com/scikit-learn/scikit-learn/issues/19677
     """
 
     X = np.array([['b'], ['a']], dtype=input_dtype)
     categories = [np.array(['b', 'a'], dtype=category_dtype)]
     ohe = OneHotEncoder(categories=categories, sparse=False).fit(X)
 
-    X_test = _convert_container([['a'], ['a'], ['b'], ['a']], array_type)
+    X_test = _convert_container([['a'], ['a'], ['b'], ['a']], array_type,
+                                dtype=input_dtype)
     X_trans = ohe.transform(X_test)
 
     expected = np.array([[0, 1], [0, 1], [1, 0], [0, 1]])
diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py
index b43afa998698b..2295150a6626b 100644
--- a/sklearn/utils/_encode.py
+++ b/sklearn/utils/_encode.py
@@ -173,7 +173,7 @@ def _encode(values, *, uniques, check_unknown=True):
     encoded : ndarray
         Encoded values
     """
-    if values.dtype.kind in 'OU':
+    if values.dtype.kind in 'OUS':
         try:
             return _map_to_integer(values, uniques)
         except KeyError as e:
@@ -214,7 +214,7 @@ def _check_unknown(values, known_values, return_mask=False):
     """
     valid_mask = None
 
-    if values.dtype.kind in 'UO':
+    if values.dtype.kind in 'OUS':
         values_set = set(values)
         values_set, missing_in_values = _extract_missing(values_set)
 
diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py
index 779e7b6574e3e..8fc77748740d5 100644
--- a/sklearn/utils/_testing.py
+++ b/sklearn/utils/_testing.py
@@ -758,30 +758,58 @@ def assert_run_python_script(source_code, timeout=60):
         os.unlink(source_file)
 
 
-def _convert_container(container, constructor_name, columns_name=None):
+def _convert_container(
+    container, constructor_name, columns_name=None, dtype=None
+):
+    """Convert a given container to a specific array-like with a dtype.
+
+    Parameters
+    ----------
+    container : array-like
+        The container to convert.
+    constructor_name : {"list", "tuple", "array", "sparse", "dataframe", \
+            "series", "index", "slice", "sparse_csr", "sparse_csc"}
+        The type of the returned container.
+    columns_name : index or array-like, default=None
+        For pandas containers supporting column names, it will be used to
+        set the column names.
+    dtype : dtype, default=None
+        Force the dtype of the container. Does not apply to `"slice"`
+        container.
+
+    Returns
+    -------
+    converted_container
+    """
     if constructor_name == 'list':
-        return list(container)
+        if dtype is None:
+            return list(container)
+        else:
+            return np.asarray(container, dtype=dtype).tolist()
     elif constructor_name == 'tuple':
-        return tuple(container)
+        if dtype is None:
+            return tuple(container)
+        else:
+            return tuple(np.asarray(container, dtype=dtype).tolist())
     elif constructor_name == 'array':
-        return np.asarray(container)
+        return np.asarray(container, dtype=dtype)
     elif constructor_name == 'sparse':
-        return sp.sparse.csr_matrix(container)
+        return sp.sparse.csr_matrix(container, dtype=dtype)
     elif constructor_name == 'dataframe':
         pd = pytest.importorskip('pandas')
-        return pd.DataFrame(container, columns=columns_name)
+        return pd.DataFrame(container, columns=columns_name, dtype=dtype)
     elif constructor_name == 'series':
         pd = pytest.importorskip('pandas')
-        return pd.Series(container)
+        return pd.Series(container, dtype=dtype)
     elif constructor_name == 'index':
         pd = pytest.importorskip('pandas')
-        return pd.Index(container)
+        return pd.Index(container, dtype=dtype)
     elif constructor_name == 'slice':
         return slice(container[0], container[1])
     elif constructor_name == 'sparse_csr':
-        return sp.sparse.csr_matrix(container)
+        return sp.sparse.csr_matrix(container, dtype=dtype)
     elif constructor_name == 'sparse_csc':
-        return sp.sparse.csc_matrix(container)
+        return sp.sparse.csc_matrix(container, dtype=dtype)
 
 
 def raises(expected_exc_type, match=None, may_pass=False, err_msg=None):
diff --git a/sklearn/utils/tests/test_testing.py b/sklearn/utils/tests/test_testing.py
index 1d4b3780953a7..8685409a4fd44 100644
--- a/sklearn/utils/tests/test_testing.py
+++ b/sklearn/utils/tests/test_testing.py
@@ -624,19 +624,51 @@ def test_create_memmap_backed_data(monkeypatch):
 
 @pytest.mark.parametrize(
     "constructor_name, container_type",
-    [('list', list),
-     ('tuple', tuple),
-     ('array', np.ndarray),
-     ('sparse', sparse.csr_matrix),
-     ('dataframe', pytest.importorskip('pandas').DataFrame),
-     ('series', pytest.importorskip('pandas').Series),
-     ('index', pytest.importorskip('pandas').Index),
-     ('slice', slice)]
+    [
+        ('list', list),
+        ('tuple', tuple),
+        ('array', np.ndarray),
+        ('sparse', sparse.csr_matrix),
+        ('sparse_csr', sparse.csr_matrix),
+        ('sparse_csc', sparse.csc_matrix),
+        ('dataframe', lambda: pytest.importorskip('pandas').DataFrame),
+        ('series', lambda: pytest.importorskip('pandas').Series),
+        ('index', lambda: pytest.importorskip('pandas').Index),
+        ('slice', slice),
+    ]
 )
-def test_convert_container(constructor_name, container_type):
+@pytest.mark.parametrize(
+    "dtype, superdtype",
+    [
+        (np.int32, np.integer),
+        (np.int64, np.integer),
+        (np.float32, np.floating),
+        (np.float64, np.floating),
+    ]
+)
+def test_convert_container(
+    constructor_name, container_type, dtype, superdtype,
+):
+    """Check that we convert the container to the right type of array with the
+    right data type."""
+    if constructor_name in ("dataframe", "series", "index"):
+        # delay the import of pandas within the function to only skip this test
+        # instead of the whole file
+        container_type = container_type()
     container = [0, 1]
-    assert isinstance(_convert_container(container, constructor_name),
-                      container_type)
+    container_converted = _convert_container(
+        container, constructor_name, dtype=dtype,
+    )
+    assert isinstance(container_converted, container_type)
+
+    if constructor_name in ("list", "tuple", "index"):
+        # list and tuple will use Python class dtype: int, float
+        # pandas index will always use high precision: np.int64 and np.float64
+        assert np.issubdtype(type(container_converted[0]), superdtype)
+    elif hasattr(container_converted, "dtype"):
+        assert container_converted.dtype == dtype
+    elif hasattr(container_converted, "dtypes"):
+        assert container_converted.dtypes[0] == dtype
 
 
 def test_raises():
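
For reference, a minimal usage sketch of the behaviour enabled by this fix
(illustrative only, mirroring the new test): byte-string (`dtype='S'`) inputs
and categories are now handled like object and unicode ones.

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = np.array([[b'b'], [b'a']], dtype='S')
    categories = [np.array([b'b', b'a'], dtype='S')]
    ohe = OneHotEncoder(categories=categories, sparse=False).fit(X)

    X_test = np.array([[b'a'], [b'a'], [b'b'], [b'a']], dtype='S')
    print(ohe.transform(X_test))      # columns follow the order ['b', 'a']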

From dbed806a7aad5d253cf1ca0a3bca9bda5e391456 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" 
Date: Thu, 22 Apr 2021 04:49:24 -0400
Subject: [PATCH 354/478] FIX Fixes regression in CCA due to change of cutoff
 values in SciPy  (#19646)

---
 doc/whats_new/v0.24.rst                       |  6 ++++++
 sklearn/cross_decomposition/_pls.py           | 21 +++++++++++++++++--
 sklearn/cross_decomposition/tests/test_pls.py | 16 ++++++++++++++
 3 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index 41dfcfbc4d1c9..dc1727b2264a5 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -19,6 +19,12 @@ Changelog
   :term:`get_feature_names` on transformers with an empty column selection.
   :pr:`19579` by `Thomas Fan`_.
 
+:mod:`sklearn.cross_decomposition`
+..................................
+
+- |Fix| Fixed a regression in :class:`cross_decomposition.CCA`. :pr:`19646`
+  by `Thomas Fan`_.
+
 :mod:`sklearn.ensemble`
 .......................
 
diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py
index 42d727b9ae2be..3c886a0dd0c1b 100644
--- a/sklearn/cross_decomposition/_pls.py
+++ b/sklearn/cross_decomposition/_pls.py
@@ -23,6 +23,24 @@
 __all__ = ['PLSCanonical', 'PLSRegression', 'PLSSVD']
 
 
+def _pinv2_old(a):
+    # Mimics the previous scipy pinv2 behavior, which was changed in:
+    # https://github.com/scipy/scipy/pull/10067
+    # We cannot simply set `cond` or `rcond` for pinv2 in scipy >= 1.3 to keep
+    # the behavior of pinv2 for scipy < 1.3, because the condition used to
+    # determine the rank depends on the output of svd.
+    u, s, vh = svd(a, full_matrices=False, check_finite=False)
+
+    t = u.dtype.char.lower()
+    factor = {'f': 1E3, 'd': 1E6}
+    cond = np.max(s) * factor[t] * np.finfo(t).eps
+    rank = np.sum(s > cond)
+
+    u = u[:, :rank]
+    u /= s[:rank]
+    return np.transpose(np.conjugate(np.dot(u, vh[:rank])))
+
+
 def _get_first_singular_vectors_power_method(X, Y, mode="A", max_iter=500,
                                              tol=1e-06, norm_y_weights=False):
     """Return the first left and right singular vectors of X'Y.
@@ -44,8 +62,7 @@ def _get_first_singular_vectors_power_method(X, Y, mode="A", max_iter=500,
         # As a result, and as detailed in the Wegelin's review, CCA (i.e. mode
         # B) will be unstable if n_features > n_samples or n_targets >
         # n_samples
-        X_pinv = pinv2(X, check_finite=False, cond=10*eps)
-        Y_pinv = pinv2(Y, check_finite=False, cond=10*eps)
+        X_pinv, Y_pinv = _pinv2_old(X), _pinv2_old(Y)
 
     for i in range(max_iter):
         if mode == "B":
diff --git a/sklearn/cross_decomposition/tests/test_pls.py b/sklearn/cross_decomposition/tests/test_pls.py
index 04c791fd4154a..1179161b8436c 100644
--- a/sklearn/cross_decomposition/tests/test_pls.py
+++ b/sklearn/cross_decomposition/tests/test_pls.py
@@ -552,3 +552,19 @@ def test_svd_flip_1d():
 
     assert_allclose(v, v_expected.ravel())
     assert_allclose(v, [-1, -2, -3])
+
+
+def test_loadings_converges():
+    """Test that CCA converges. Non-regression test for #19549."""
+    X, y = make_regression(n_samples=200, n_features=20, n_targets=20,
+                           random_state=20)
+
+    cca = CCA(n_components=10, max_iter=500)
+
+    with pytest.warns(None) as record:
+        cca.fit(X, y)
+    # ConvergenceWarning is not raised
+    assert not record
+
+    # Loadings converges to reasonable values
+    assert np.all(np.abs(cca.x_loadings_) < 1)
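
For reference, a minimal sketch of a pinv2-style pseudo-inverse with the
explicit rank cutoff used above (illustrative only, not the `_pinv2_old`
helper itself; `pinv_with_cutoff` is a made-up name).

    import numpy as np

    def pinv_with_cutoff(a):
        u, s, vh = np.linalg.svd(a, full_matrices=False)
        factor = {'f': 1e3, 'd': 1e6}[u.dtype.char.lower()]
        cond = np.max(s) * factor * np.finfo(u.dtype).eps
        rank = int(np.sum(s > cond))      # rank determined by the cutoff
        u = u[:, :rank] / s[:rank]
        return np.transpose(np.conjugate(u @ vh[:rank]))

    A = np.random.RandomState(0).randn(6, 3)
    np.testing.assert_allclose(A @ pinv_with_cutoff(A) @ A, A, atol=1e-10)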

From efc703cb0a2e7a5bbc224aa54910d2f67a5ffb16 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre 
Date: Thu, 22 Apr 2021 15:26:04 +0200
Subject: [PATCH 355/478] DOC order whats new 0.24.2

---
 doc/whats_new/v0.24.rst | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index dc1727b2264a5..b3768f92155eb 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -25,6 +25,12 @@ Changelog
 - |Fix| Fixed a regression in :class:`cross_decomposition.CCA`. :pr:`19646`
   by `Thomas Fan`_.
 
+:mod:`sklearn.decomposition`
+............................
+
+- |Fix| Fixed a bug in :class:`decomposition.KernelPCA`'s
+  ``inverse_transform``.  :pr:`19732` by :user:`Kei Ishikawa `.
+
 :mod:`sklearn.ensemble`
 .......................
 
@@ -32,12 +38,6 @@ Changelog
   with `sample_weight` parameter and `least_absolute_deviation` loss function.
   :pr:`19407` by :user:`Vadim Ushtanit `.
 
-:mod:`sklearn.decomposition`
-............................
-
-- |Fix| Fixed a bug in :class:`decomposition.KernelPCA`'s
-  ``inverse_transform``.  :pr:`19732` by :user:`Kei Ishikawa `.
-
 :mod:`sklearn.gaussian_process`
 ...............................
 

From c88c89cffd87c34299ebb8db6192c973823bd827 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre 
Date: Thu, 22 Apr 2021 18:50:46 +0200
Subject: [PATCH 356/478] DOC move whats new entry from 1.0 to 0.24

---
 doc/whats_new/v0.24.rst | 7 ++++++-
 doc/whats_new/v1.0.rst  | 5 -----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index b3768f92155eb..34744de8a6b91 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -7,7 +7,7 @@
 Version 0.24.2
 ==============
 
-**TBD 2021**
+**April 2021**
 
 Changelog
 ---------
@@ -68,6 +68,11 @@ Changelog
   :class:`model_selection.HalvingGridSearchCV` were not properly converted to
   numpy arrays. :pr:`19211` by `Nicolas Hug`_.
 
+- |Fix| The `fit` method of the successive halving parameter search
+  (:class:`model_selection.HalvingGridSearchCV`, and
+  :class:`model_selection.HalvingRandomSearchCV`) now correctly handles the
+  `groups` parameter. :pr:`19847` by :user:`Xiaoyu Chai `.
+
 :mod:`sklearn.multioutput`
 ..........................
 
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 270ae456b5213..3b3884e68e185 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -319,11 +319,6 @@ Changelog
   :pr:`18649` by `Leandro Hermida ` and
   `Rodion Martynov `.
 
-- |Fix| The `fit` method of the successive halving parameter search
-  (:class:`model_selection.HalvingGridSearchCV`, and
-  :class:`model_selection.HalvingRandomSearchCV`) now correctly handles the
-  `groups` parameter. :pr:`19847` by :user:`Xiaoyu Chai `.
-
 :mod:`sklearn.naive_bayes`
 ..........................
 

From 09684342745cfc3509432885396e7be776e64cee Mon Sep 17 00:00:00 2001
From: Chiara Marmo 
Date: Fri, 23 Apr 2021 13:02:14 +0200
Subject: [PATCH 357/478] MAINT Remove tests for metric configuration ignoring
 pos_label (#19961)

---
 sklearn/metrics/tests/test_common.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py
index dbf1bdd458f1a..66df47a778b38 100644
--- a/sklearn/metrics/tests/test_common.py
+++ b/sklearn/metrics/tests/test_common.py
@@ -342,16 +342,6 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs):
     "weighted_average_precision_score",
     "micro_average_precision_score",
     "samples_average_precision_score",
-
-    # pos_label support deprecated; to be removed in 0.18:
-    "weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score",
-    "weighted_precision_score", "weighted_recall_score",
-
-    "micro_f0.5_score", "micro_f1_score", "micro_f2_score",
-    "micro_precision_score", "micro_recall_score",
-
-    "macro_f0.5_score", "macro_f1_score", "macro_f2_score",
-    "macro_precision_score", "macro_recall_score",
 }
 
 # Metrics with a "labels" argument

From 2641baf16d9de5191316745ec46120cc8b57a666 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" 
Date: Fri, 23 Apr 2021 07:50:50 -0400
Subject: [PATCH 358/478] FIX Fixes PLSRegression regression for constant Yk
 (#19922)

---
 doc/whats_new/v0.24.rst                       |  4 ++++
 sklearn/cross_decomposition/_pls.py           | 21 ++++++++++++++-----
 sklearn/cross_decomposition/tests/test_pls.py | 15 +++++++++++++
 3 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index 34744de8a6b91..a14a649fc94a9 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -25,6 +25,10 @@ Changelog
 - |Fix| Fixed a regression in :class:`cross_decomposition.CCA`. :pr:`19646`
   by `Thomas Fan`_.
 
+- |Fix| :class:`cross_decomposition.PLSRegression` raises a warning for
+  constant y residuals instead of a `StopIteration` error. :pr:`19922`
+  by `Thomas Fan`_.
+
 :mod:`sklearn.decomposition`
 ............................
 
diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py
index 3c886a0dd0c1b..2f6e63d556388 100644
--- a/sklearn/cross_decomposition/_pls.py
+++ b/sklearn/cross_decomposition/_pls.py
@@ -52,7 +52,11 @@ def _get_first_singular_vectors_power_method(X, Y, mode="A", max_iter=500,
     """
 
     eps = np.finfo(X.dtype).eps
-    y_score = next(col for col in Y.T if np.any(np.abs(col) > eps))
+    try:
+        y_score = next(col for col in Y.T if np.any(np.abs(col) > eps))
+    except StopIteration as e:
+        raise StopIteration("Y residual is constant") from e
+
     x_weights_old = 100  # init to big value for first convergence check
 
     if mode == 'B':
@@ -256,10 +260,17 @@ def fit(self, X, Y):
                 Yk_mask = np.all(np.abs(Yk) < 10 * Y_eps, axis=0)
                 Yk[:, Yk_mask] = 0.0
 
-                x_weights, y_weights, n_iter_ = \
-                    _get_first_singular_vectors_power_method(
-                        Xk, Yk, mode=self.mode, max_iter=self.max_iter,
-                        tol=self.tol, norm_y_weights=norm_y_weights)
+                try:
+                    x_weights, y_weights, n_iter_ = \
+                        _get_first_singular_vectors_power_method(
+                            Xk, Yk, mode=self.mode, max_iter=self.max_iter,
+                            tol=self.tol, norm_y_weights=norm_y_weights)
+                except StopIteration as e:
+                    if str(e) != "Y residual is constant":
+                        raise
+                    warnings.warn(f"Y residual is constant at iteration {k}")
+                    break
+
                 self.n_iter_.append(n_iter_)
 
             elif self.algorithm == "svd":
diff --git a/sklearn/cross_decomposition/tests/test_pls.py b/sklearn/cross_decomposition/tests/test_pls.py
index 1179161b8436c..644e1418e3edc 100644
--- a/sklearn/cross_decomposition/tests/test_pls.py
+++ b/sklearn/cross_decomposition/tests/test_pls.py
@@ -568,3 +568,18 @@ def test_loadings_converges():
 
     # Loadings converges to reasonable values
     assert np.all(np.abs(cca.x_loadings_) < 1)
+
+
+def test_pls_constant_y():
+    """Checks warning when y is constant. Non-regression test for #19831"""
+    rng = np.random.RandomState(42)
+    x = rng.rand(100, 3)
+    y = np.zeros(100)
+
+    pls = PLSRegression()
+
+    msg = "Y residual is constant at iteration"
+    with pytest.warns(UserWarning, match=msg):
+        pls.fit(x, y)
+
+    assert_allclose(pls.x_rotations_, 0)
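
For reference, a minimal usage sketch of the new behaviour (illustrative only,
not part of the patch): a constant target now triggers a warning instead of an
uncaught `StopIteration`.

    import warnings
    import numpy as np
    from sklearn.cross_decomposition import PLSRegression

    x = np.random.RandomState(42).rand(100, 3)
    y = np.zeros(100)                 # constant target

    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        PLSRegression().fit(x, y)
    assert any("Y residual is constant" in str(w.message) for w in record)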

From 6927fa26aedf48162314b675016180e3356ad557 Mon Sep 17 00:00:00 2001
From: flyingdutchman23 
Date: Mon, 26 Apr 2021 14:22:59 +0200
Subject: [PATCH 359/478] FIX mislabelling multiclass target when labels is
 provided in top_k_accuracy_score (#19721)

---
 doc/whats_new/v0.24.rst               |  8 ++++++++
 sklearn/metrics/_ranking.py           |  4 +++-
 sklearn/metrics/tests/test_ranking.py | 24 ++++++++++++++++++++++++
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index a14a649fc94a9..79f6ecb15c3d0 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -59,6 +59,14 @@ Changelog
 - |Fix|: Fixed a bug in :class:`linear_model.LogisticRegression`: the
   sample_weight object is not modified anymore. :pr:`19182` by
   :user:`Yosuke KOBAYASHI `.
+
+:mod:`sklearn.metrics`
+......................
+
+- |Fix| :func:`metrics.top_k_accuracy_score` now supports multiclass
+  problems where only two classes appear in `y_true` and all the classes
+  are specified in `labels`.
+  :pr:`19721` by :user:`Joris Clement `.
 
 :mod:`sklearn.model_selection`
 ..............................
diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py
index f1627e84fbcfe..8c458ac81e529 100644
--- a/sklearn/metrics/_ranking.py
+++ b/sklearn/metrics/_ranking.py
@@ -1598,7 +1598,7 @@ def top_k_accuracy_score(y_true, y_score, *, k=2, normalize=True,
         non-thresholded decision values (as returned by
         :term:`decision_function` on some classifiers). The binary case expects
         scores with shape (n_samples,) while the multiclass case expects scores
-        with shape (n_samples, n_classes). In the nulticlass case, the order of
+        with shape (n_samples, n_classes). In the multiclass case, the order of
         the class scores must correspond to the order of ``labels``, if
         provided, or else to the numerical or lexicographical order of the
         labels in ``y_true``.
@@ -1655,6 +1655,8 @@ def top_k_accuracy_score(y_true, y_score, *, k=2, normalize=True,
     y_true = check_array(y_true, ensure_2d=False, dtype=None)
     y_true = column_or_1d(y_true)
     y_type = type_of_target(y_true)
+    if y_type == "binary" and labels is not None and len(labels) > 2:
+        y_type = "multiclass"
     y_score = check_array(y_score, ensure_2d=False)
     y_score = column_or_1d(y_score) if y_type == 'binary' else y_score
     check_consistent_length(y_true, y_score, sample_weight)
diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py
index c37ff34feddec..85a00ca520f7b 100644
--- a/sklearn/metrics/tests/test_ranking.py
+++ b/sklearn/metrics/tests/test_ranking.py
@@ -1657,6 +1657,30 @@ def test_top_k_accuracy_score_binary(y_score, k, true_score):
     assert score == score_acc == pytest.approx(true_score)
 
 
+@pytest.mark.parametrize('y_true, true_score, labels', [
+    (np.array([0, 1, 1, 2]), 0.75, [0, 1, 2, 3]),
+    (np.array([0, 1, 1, 1]), 0.5, [0, 1, 2, 3]),
+    (np.array([1, 1, 1, 1]), 0.5, [0, 1, 2, 3]),
+    (np.array(['a', 'e', 'e', 'a']), 0.75, ['a', 'b', 'd', 'e']),
+])
+@pytest.mark.parametrize("labels_as_ndarray", [True, False])
+def test_top_k_accuracy_score_multiclass_with_labels(
+        y_true, true_score, labels, labels_as_ndarray
+):
+    """Test when labels and y_score are multiclass."""
+    if labels_as_ndarray:
+        labels = np.asarray(labels)
+    y_score = np.array([
+        [0.4, 0.3, 0.2, 0.1],
+        [0.1, 0.3, 0.4, 0.2],
+        [0.4, 0.1, 0.2, 0.3],
+        [0.3, 0.2, 0.4, 0.1],
+    ])
+
+    score = top_k_accuracy_score(y_true, y_score, k=2, labels=labels)
+    assert score == pytest.approx(true_score)
+
+
 def test_top_k_accuracy_score_increasing():
     # Make sure increasing k leads to a higher score
     X, y = datasets.make_classification(n_classes=10, n_samples=1000,
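
For reference, a minimal usage sketch of the fixed case (illustrative only,
mirroring the new test): only two classes appear in `y_true`, but passing all
four classes through `labels` means the scores are interpreted as multiclass.

    import numpy as np
    from sklearn.metrics import top_k_accuracy_score

    y_true = np.array([0, 1, 1, 1])   # only classes 0 and 1 are present
    y_score = np.array([
        [0.4, 0.3, 0.2, 0.1],
        [0.1, 0.3, 0.4, 0.2],
        [0.4, 0.1, 0.2, 0.3],
        [0.3, 0.2, 0.4, 0.1],
    ])
    score = top_k_accuracy_score(y_true, y_score, k=2, labels=[0, 1, 2, 3])
    print(score)                      # 0.5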

From ff0949907cef8e2fc1236b92e2789620ccab820a Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" 
Date: Mon, 26 Apr 2021 09:40:29 -0400
Subject: [PATCH 360/478] CI Fixes MAC ar build error (#19968)

---
 build_tools/azure/install.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh
index fbe0c90a473ab..d2711d6bd610e 100755
--- a/build_tools/azure/install.sh
+++ b/build_tools/azure/install.sh
@@ -51,6 +51,10 @@ if [[ "$DISTRIB" == "conda" ]]; then
             # sklearn/svm/_libsvm.cpython-38-darwin.so,
             # 2): Symbol not found: _svm_check_parameter error
             TO_INSTALL="$TO_INSTALL compilers>=1.0.4,!=1.1.0 llvm-openmp"
+        else
+            # Without openmp, we use the system clang. Here we use /usr/bin/ar
+            # instead because llvm-ar errors
+            export AR=/usr/bin/ar
         fi
     fi
 	make_conda $TO_INSTALL

From d22fe3e922ba5ea063fa3afe0574e86884449539 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" 
Date: Mon, 26 Apr 2021 10:39:03 -0400
Subject: [PATCH 361/478] CI Lowers precision for doctest in LinearRegression
 (#19988)

---
 sklearn/linear_model/_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py
index 5783e4740a08c..808ec9f3b3bb0 100644
--- a/sklearn/linear_model/_base.py
+++ b/sklearn/linear_model/_base.py
@@ -591,7 +591,7 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel):
     >>> reg.coef_
     array([1., 2.])
     >>> reg.intercept_
-    3.0000...
+    3.0...
     >>> reg.predict(np.array([[3, 5]]))
     array([16.])
     """

From d852aa0825769d7623d662bfa1f5a2bb5dfbae6d Mon Sep 17 00:00:00 2001
From: Alihan Zihna 
Date: Mon, 26 Apr 2021 16:55:17 +0100
Subject: [PATCH 362/478] TST Changes assert to pytest style in /mixture/tests
 (#19983)

Co-authored-by: Alihan Zihna 
---
 .../mixture/tests/test_bayesian_mixture.py    |  95 ++++----
 .../mixture/tests/test_gaussian_mixture.py    | 203 ++++++++++--------
 2 files changed, 168 insertions(+), 130 deletions(-)

diff --git a/sklearn/mixture/tests/test_bayesian_mixture.py b/sklearn/mixture/tests/test_bayesian_mixture.py
index 1d061da908e3c..dc2cbda4b66e7 100644
--- a/sklearn/mixture/tests/test_bayesian_mixture.py
+++ b/sklearn/mixture/tests/test_bayesian_mixture.py
@@ -2,12 +2,12 @@
 #         Thierry Guillemot 
 # License: BSD 3 clause
 import copy
+import re
 
 import numpy as np
 from scipy.special import gammaln
 import pytest
 
-from sklearn.utils._testing import assert_raise_message
 from sklearn.utils._testing import assert_almost_equal
 from sklearn.utils._testing import assert_array_equal
 
@@ -66,11 +66,13 @@ def test_bayesian_mixture_covariance_type():
     covariance_type = 'bad_covariance_type'
     bgmm = BayesianGaussianMixture(covariance_type=covariance_type,
                                    random_state=rng)
-    assert_raise_message(ValueError,
-                         "Invalid value for 'covariance_type': %s "
-                         "'covariance_type' should be in "
-                         "['spherical', 'tied', 'diag', 'full']"
-                         % covariance_type, bgmm.fit, X)
+
+    msg = re.escape(
+        f"Invalid value for 'covariance_type': {covariance_type} "
+        "'covariance_type' should be in ['spherical', 'tied', 'diag', 'full']"
+    )
+    with pytest.raises(ValueError, match=msg):
+        bgmm.fit(X)
 
 
 def test_bayesian_mixture_weight_concentration_prior_type():
@@ -81,11 +83,13 @@ def test_bayesian_mixture_weight_concentration_prior_type():
     bad_prior_type = 'bad_prior_type'
     bgmm = BayesianGaussianMixture(
         weight_concentration_prior_type=bad_prior_type, random_state=rng)
-    assert_raise_message(ValueError,
-                         "Invalid value for 'weight_concentration_prior_type':"
-                         " %s 'weight_concentration_prior_type' should be in "
-                         "['dirichlet_process', 'dirichlet_distribution']"
-                         % bad_prior_type, bgmm.fit, X)
+    msg = re.escape(
+        "Invalid value for 'weight_concentration_prior_type':"
+        f" {bad_prior_type} 'weight_concentration_prior_type' should be in "
+        "['dirichlet_process', 'dirichlet_distribution']"
+    )
+    with pytest.raises(ValueError, match=msg):
+        bgmm.fit(X)
 
 
 def test_bayesian_mixture_weights_prior_initialisation():
@@ -98,11 +102,12 @@ def test_bayesian_mixture_weights_prior_initialisation():
     bgmm = BayesianGaussianMixture(
         weight_concentration_prior=bad_weight_concentration_prior_,
         random_state=0)
-    assert_raise_message(ValueError,
-                         "The parameter 'weight_concentration_prior' "
-                         "should be greater than 0., but got %.3f."
-                         % bad_weight_concentration_prior_,
-                         bgmm.fit, X)
+    msg = (
+        "The parameter 'weight_concentration_prior' should be greater "
+        f"than 0., but got {bad_weight_concentration_prior_:.3f}."
+    )
+    with pytest.raises(ValueError, match=msg):
+        bgmm.fit(X)
 
     # Check correct init for a given value of weight_concentration_prior
     weight_concentration_prior = rng.rand()
@@ -128,11 +133,12 @@ def test_bayesian_mixture_mean_prior_initialisation():
     bgmm = BayesianGaussianMixture(
         mean_precision_prior=bad_mean_precision_prior_,
         random_state=rng)
-    assert_raise_message(ValueError,
-                         "The parameter 'mean_precision_prior' should be "
-                         "greater than 0., but got %.3f."
-                         % bad_mean_precision_prior_,
-                         bgmm.fit, X)
+    msg = (
+        "The parameter 'mean_precision_prior' "
+        f"should be greater than 0., but got {bad_mean_precision_prior_:.3f}."
+    )
+    with pytest.raises(ValueError, match=msg):
+        bgmm.fit(X)
 
     # Check correct init for a given value of mean_precision_prior
     mean_precision_prior = rng.rand()
@@ -150,9 +156,9 @@ def test_bayesian_mixture_mean_prior_initialisation():
     bgmm = BayesianGaussianMixture(n_components=n_components,
                                    mean_prior=mean_prior,
                                    random_state=rng)
-    assert_raise_message(ValueError,
-                         "The parameter 'means' should have the shape of ",
-                         bgmm.fit, X)
+    msg = "The parameter 'means' should have the shape of "
+    with pytest.raises(ValueError, match=msg):
+        bgmm.fit(X)
 
     # Check correct init for a given value of mean_prior
     mean_prior = rng.rand(n_features)
@@ -177,11 +183,12 @@ def test_bayesian_mixture_precisions_prior_initialisation():
     bgmm = BayesianGaussianMixture(
         degrees_of_freedom_prior=bad_degrees_of_freedom_prior_,
         random_state=rng)
-    assert_raise_message(ValueError,
-                         "The parameter 'degrees_of_freedom_prior' should be "
-                         "greater than %d, but got %.3f."
-                         % (n_features - 1, bad_degrees_of_freedom_prior_),
-                         bgmm.fit, X)
+    msg = (
+        "The parameter 'degrees_of_freedom_prior' should be greater than"
+        f" {n_features - 1}, but got {bad_degrees_of_freedom_prior_:.3f}."
+    )
+    with pytest.raises(ValueError, match=msg):
+        bgmm.fit(X)
 
     # Check correct init for a given value of degrees_of_freedom_prior
     degrees_of_freedom_prior = rng.rand() + n_features - 1.
@@ -219,11 +226,12 @@ def test_bayesian_mixture_precisions_prior_initialisation():
     bgmm = BayesianGaussianMixture(covariance_type='spherical',
                                    covariance_prior=bad_covariance_prior_,
                                    random_state=rng)
-    assert_raise_message(ValueError,
-                         "The parameter 'spherical covariance_prior' "
-                         "should be greater than 0., but got %.3f."
-                         % bad_covariance_prior_,
-                         bgmm.fit, X)
+    msg = (
+        "The parameter 'spherical covariance_prior' "
+        f"should be greater than 0., but got {bad_covariance_prior_:.3f}."
+    )
+    with pytest.raises(ValueError, match=msg):
+        bgmm.fit(X)
 
     # Check correct init for the default value of covariance_prior
     covariance_prior_default = {
@@ -247,9 +255,10 @@ def test_bayesian_mixture_check_is_fitted():
     # Check raise message
     bgmm = BayesianGaussianMixture(random_state=rng)
     X = rng.rand(n_samples, n_features)
-    assert_raise_message(ValueError,
-                         'This BayesianGaussianMixture instance is not '
-                         'fitted yet.', bgmm.score, X)
+
+    msg = "This BayesianGaussianMixture instance is not fitted yet."
+    with pytest.raises(ValueError, match=msg):
+        bgmm.score(X)
 
 
 def test_bayesian_mixture_weights():
@@ -475,11 +484,13 @@ def test_bayesian_mixture_predict_predict_proba():
                 covariance_type=covar_type)
 
             # Check a warning message arrive if we don't do fit
-            assert_raise_message(NotFittedError,
-                                 "This BayesianGaussianMixture instance"
-                                 " is not fitted yet. Call 'fit' with "
-                                 "appropriate arguments before using "
-                                 "this estimator.", bgmm.predict, X)
+            msg = (
+                "This BayesianGaussianMixture instance is not fitted yet. "
+                "Call 'fit' with appropriate arguments before using this "
+                "estimator."
+            )
+            with pytest.raises(NotFittedError, match=msg):
+                bgmm.predict(X)
 
             bgmm.fit(X)
             Y_pred = bgmm.predict(X)
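
For reference, a minimal sketch of the pattern applied throughout this patch
(illustrative only; `fit_with_bad_param` is a stand-in for the estimator's
`fit` call): `match` is interpreted as a regular expression, hence `re.escape`
for messages containing brackets.

    import re
    import pytest

    def fit_with_bad_param():
        raise ValueError(
            "'covariance_type' should be in ['spherical', 'tied', 'diag', 'full']")

    msg = re.escape(
        "'covariance_type' should be in ['spherical', 'tied', 'diag', 'full']")
    with pytest.raises(ValueError, match=msg):
        fit_with_bad_param()
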
diff --git a/sklearn/mixture/tests/test_gaussian_mixture.py b/sklearn/mixture/tests/test_gaussian_mixture.py
index ea5ea0c2eb649..2d8dc81e54275 100644
--- a/sklearn/mixture/tests/test_gaussian_mixture.py
+++ b/sklearn/mixture/tests/test_gaussian_mixture.py
@@ -2,6 +2,7 @@
 #         Thierry Guillemot 
 # License: BSD 3 clause
 
+import re
 import sys
 import copy
 import warnings
@@ -29,8 +30,6 @@
 from sklearn.utils._testing import assert_almost_equal
 from sklearn.utils._testing import assert_array_almost_equal
 from sklearn.utils._testing import assert_array_equal
-from sklearn.utils._testing import assert_raise_message
-from sklearn.utils._testing import assert_warns_message
 from sklearn.utils._testing import ignore_warnings
 
 
@@ -105,55 +104,66 @@ def test_gaussian_mixture_attributes():
 
     n_components_bad = 0
     gmm = GaussianMixture(n_components=n_components_bad)
-    assert_raise_message(ValueError,
-                         "Invalid value for 'n_components': %d "
-                         "Estimation requires at least one component"
-                         % n_components_bad, gmm.fit, X)
+    msg = (
+        f"Invalid value for 'n_components': {n_components_bad} "
+        "Estimation requires at least one component"
+    )
+    with pytest.raises(ValueError, match=msg):
+        gmm.fit(X)
 
     # covariance_type should be in [spherical, diag, tied, full]
     covariance_type_bad = 'bad_covariance_type'
     gmm = GaussianMixture(covariance_type=covariance_type_bad)
-    assert_raise_message(ValueError,
-                         "Invalid value for 'covariance_type': %s "
-                         "'covariance_type' should be in "
-                         "['spherical', 'tied', 'diag', 'full']"
-                         % covariance_type_bad,
-                         gmm.fit, X)
+    msg = re.escape(
+        f"Invalid value for 'covariance_type': {covariance_type_bad} "
+        "'covariance_type' should be in ['spherical', 'tied', 'diag', 'full']"
+    )
+    with pytest.raises(ValueError, match=msg):
+        gmm.fit(X)
 
     tol_bad = -1
     gmm = GaussianMixture(tol=tol_bad)
-    assert_raise_message(ValueError,
-                         "Invalid value for 'tol': %.5f "
-                         "Tolerance used by the EM must be non-negative"
-                         % tol_bad, gmm.fit, X)
+    msg = (
+        f"Invalid value for 'tol': {tol_bad:.5f} "
+        "Tolerance used by the EM must be non-negative"
+    )
+    with pytest.raises(ValueError, match=msg):
+        gmm.fit(X)
 
     reg_covar_bad = -1
     gmm = GaussianMixture(reg_covar=reg_covar_bad)
-    assert_raise_message(ValueError,
-                         "Invalid value for 'reg_covar': %.5f "
-                         "regularization on covariance must be "
-                         "non-negative" % reg_covar_bad, gmm.fit, X)
+    msg = (
+        f"Invalid value for 'reg_covar': {reg_covar_bad:.5f} "
+        "regularization on covariance must be non-negative"
+    )
+    with pytest.raises(ValueError, match=msg):
+        gmm.fit(X)
 
     max_iter_bad = 0
     gmm = GaussianMixture(max_iter=max_iter_bad)
-    assert_raise_message(ValueError,
-                         "Invalid value for 'max_iter': %d "
-                         "Estimation requires at least one iteration"
-                         % max_iter_bad, gmm.fit, X)
+    msg = (
+        f"Invalid value for 'max_iter': {max_iter_bad} "
+        "Estimation requires at least one iteration"
+    )
+    with pytest.raises(ValueError, match=msg):
+        gmm.fit(X)
 
     n_init_bad = 0
     gmm = GaussianMixture(n_init=n_init_bad)
-    assert_raise_message(ValueError,
-                         "Invalid value for 'n_init': %d "
-                         "Estimation requires at least one run"
-                         % n_init_bad, gmm.fit, X)
+    msg = (
+        f"Invalid value for 'n_init': {n_init_bad} "
+        "Estimation requires at least one run"
+    )
+    with pytest.raises(ValueError, match=msg):
+        gmm.fit(X)
 
     init_params_bad = 'bad_method'
     gmm = GaussianMixture(init_params=init_params_bad)
-    assert_raise_message(ValueError,
-                         "Unimplemented initialization method '%s'"
-                         % init_params_bad,
-                         gmm.fit, X)
+    msg = (
+        f"Unimplemented initialization method '{init_params_bad}'"
+    )
+    with pytest.raises(ValueError, match=msg):
+        gmm.fit(X)
 
     # test good parameters
     n_components, tol, n_init, max_iter, reg_covar = 2, 1e-4, 3, 30, 1e-1
@@ -184,31 +194,34 @@ def test_check_weights():
     # Check bad shape
     weights_bad_shape = rng.rand(n_components, 1)
     g.weights_init = weights_bad_shape
-    assert_raise_message(ValueError,
-                         "The parameter 'weights' should have the shape of "
-                         "(%d,), but got %s" %
-                         (n_components, str(weights_bad_shape.shape)),
-                         g.fit, X)
+    msg = re.escape(
+        "The parameter 'weights' should have the shape of "
+        f"({n_components},), but got {str(weights_bad_shape.shape)}"
+    )
+    with pytest.raises(ValueError, match=msg):
+        g.fit(X)
 
     # Check bad range
     weights_bad_range = rng.rand(n_components) + 1
     g.weights_init = weights_bad_range
-    assert_raise_message(ValueError,
-                         "The parameter 'weights' should be in the range "
-                         "[0, 1], but got max value %.5f, min value %.5f"
-                         % (np.min(weights_bad_range),
-                            np.max(weights_bad_range)),
-                         g.fit, X)
+    msg = re.escape(
+        "The parameter 'weights' should be in the range [0, 1], but got"
+        f" max value {np.min(weights_bad_range):.5f}, "
+        f"min value {np.max(weights_bad_range):.5f}"
+    )
+    with pytest.raises(ValueError, match=msg):
+        g.fit(X)
 
     # Check bad normalization
     weights_bad_norm = rng.rand(n_components)
     weights_bad_norm = weights_bad_norm / (weights_bad_norm.sum() + 1)
     g.weights_init = weights_bad_norm
-    assert_raise_message(ValueError,
-                         "The parameter 'weights' should be normalized, "
-                         "but got sum(weights) = %.5f"
-                         % np.sum(weights_bad_norm),
-                         g.fit, X)
+    msg = re.escape(
+        "The parameter 'weights' should be normalized, "
+        f"but got sum(weights) = {np.sum(weights_bad_norm):.5f}"
+    )
+    with pytest.raises(ValueError, match=msg):
+        g.fit(X)
 
     # Check good weights matrix
     weights = rand_data.weights
@@ -229,9 +242,9 @@ def test_check_means():
     # Check means bad shape
     means_bad_shape = rng.rand(n_components + 1, n_features)
     g.means_init = means_bad_shape
-    assert_raise_message(ValueError,
-                         "The parameter 'means' should have the shape of ",
-                         g.fit, X)
+    msg = "The parameter 'means' should have the shape of "
+    with pytest.raises(ValueError, match=msg):
+        g.fit(X)
 
     # Check good means matrix
     means = rand_data.means
@@ -278,17 +291,21 @@ def test_check_precisions():
 
         # Check precisions with bad shapes
         g.precisions_init = precisions_bad_shape[covar_type]
-        assert_raise_message(ValueError,
-                             "The parameter '%s precision' should have "
-                             "the shape of" % covar_type,
-                             g.fit, X)
+        msg = (
+            f"The parameter '{covar_type} precision' should have "
+            "the shape of"
+        )
+        with pytest.raises(ValueError, match=msg):
+            g.fit(X)
 
         # Check not positive precisions
         g.precisions_init = precisions_not_positive[covar_type]
-        assert_raise_message(ValueError,
-                             "'%s precision' should be %s"
-                             % (covar_type, not_positive_errors[covar_type]),
-                             g.fit, X)
+        msg = (
+            f"'{covar_type} precision' should be "
+            f"{not_positive_errors[covar_type]}"
+        )
+        with pytest.raises(ValueError, match=msg):
+            g.fit(X)
 
         # Check the correct init of precisions_init
         g.precisions_init = rand_data.precisions[covar_type]
@@ -532,10 +549,12 @@ def test_gaussian_mixture_predict_predict_proba():
                             covariance_type=covar_type)
 
         # Check a warning message arrive if we don't do fit
-        assert_raise_message(NotFittedError,
-                             "This GaussianMixture instance is not fitted "
-                             "yet. Call 'fit' with appropriate arguments "
-                             "before using this estimator.", g.predict, X)
+        msg = (
+            "This GaussianMixture instance is not fitted yet. Call 'fit' "
+            "with appropriate arguments before using this estimator."
+        )
+        with pytest.raises(NotFittedError, match=msg):
+            g.predict(X)
 
         g.fit(X)
         Y_pred = g.predict(X)
@@ -660,12 +679,13 @@ def test_gaussian_mixture_fit_convergence_warning():
         g = GaussianMixture(n_components=n_components, n_init=1,
                             max_iter=max_iter, reg_covar=0, random_state=rng,
                             covariance_type=covar_type)
-        assert_warns_message(ConvergenceWarning,
-                             'Initialization %d did not converge. '
-                             'Try different init parameters, '
-                             'or increase max_iter, tol '
-                             'or check for degenerate data.'
-                             % max_iter, g.fit, X)
+        msg = (
+            f"Initialization {max_iter} did not converge. Try different init "
+            "parameters, or increase max_iter, tol or check for degenerate"
+            " data."
+        )
+        with pytest.warns(ConvergenceWarning, match=msg):
+            g.fit(X)
 
 
 def test_multiple_init():
@@ -831,10 +851,12 @@ def test_score():
     gmm1 = GaussianMixture(n_components=n_components, n_init=1,
                            max_iter=1, reg_covar=0, random_state=rng,
                            covariance_type=covar_type)
-    assert_raise_message(NotFittedError,
-                         "This GaussianMixture instance is not fitted "
-                         "yet. Call 'fit' with appropriate arguments "
-                         "before using this estimator.", gmm1.score, X)
+    msg = (
+        "This GaussianMixture instance is not fitted yet. Call 'fit' with "
+        "appropriate arguments before using this estimator."
+    )
+    with pytest.raises(NotFittedError, match=msg):
+        gmm1.score(X)
 
     # Check score value
     with warnings.catch_warnings():
@@ -861,10 +883,12 @@ def test_score_samples():
     # Check the error message if we don't call fit
     gmm = GaussianMixture(n_components=n_components, n_init=1, reg_covar=0,
                           random_state=rng, covariance_type=covar_type)
-    assert_raise_message(NotFittedError,
-                         "This GaussianMixture instance is not fitted "
-                         "yet. Call 'fit' with appropriate arguments "
-                         "before using this estimator.", gmm.score_samples, X)
+    msg = (
+        "This GaussianMixture instance is not fitted yet. Call 'fit' with "
+        "appropriate arguments before using this estimator."
+    )
+    with pytest.raises(NotFittedError, match=msg):
+        gmm.score_samples(X)
 
     gmm_score_samples = gmm.fit(X).score_samples(X)
     assert gmm_score_samples.shape[0] == rand_data.n_samples
@@ -914,13 +938,14 @@ def test_regularisation():
 
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", RuntimeWarning)
-            assert_raise_message(ValueError,
-                                 "Fitting the mixture model failed because "
-                                 "some components have ill-defined empirical "
-                                 "covariance (for instance caused by "
-                                 "singleton or collapsed samples). Try to "
-                                 "decrease the number of components, or "
-                                 "increase reg_covar.", gmm.fit, X)
+            msg = re.escape(
+                "Fitting the mixture model failed because some components have"
+                " ill-defined empirical covariance (for instance caused by "
+                "singleton or collapsed samples). Try to decrease the number "
+                "of components, or increase reg_covar."
+            )
+            with pytest.raises(ValueError, match=msg):
+                gmm.fit(X)
 
             gmm.set_params(reg_covar=1e-6).fit(X)
 
@@ -958,12 +983,14 @@ def test_sample():
         gmm = GaussianMixture(n_components=n_components,
                               covariance_type=covar_type, random_state=rng)
         # To sample we need that GaussianMixture is fitted
-        assert_raise_message(NotFittedError, "This GaussianMixture instance "
-                             "is not fitted", gmm.sample, 0)
+        msg = "This GaussianMixture instance is not fitted"
+        with pytest.raises(NotFittedError, match=msg):
+            gmm.sample(0)
         gmm.fit(X)
 
-        assert_raise_message(ValueError, "Invalid value for 'n_samples",
-                             gmm.sample, 0)
+        msg = "Invalid value for 'n_samples'"
+        with pytest.raises(ValueError, match=msg):
+            gmm.sample(0)
 
         # Just to make sure the class samples correctly
         n_samples = 20000
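
A note on the conversion pattern used throughout this test file: `pytest.raises(match=...)`
treats the expected message as a regular expression, which is why `re.escape` appears wherever
the message contains literal parentheses or brackets. A minimal, standalone sketch with a
hypothetical error message (not taken from the patch):

    import re
    import pytest

    def fail():
        raise ValueError("should have the shape of (2,), but got (3, 1)")

    # Unescaped, the parentheses would be parsed as regex groups and the search
    # would fail; re.escape makes the whole message match literally.
    with pytest.raises(ValueError,
                       match=re.escape("shape of (2,), but got (3, 1)")):
        fail()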

From 8156c1082886bd23c7e6486a7d654412df1d9325 Mon Sep 17 00:00:00 2001
From: Dmitry Kobak 
Date: Mon, 26 Apr 2021 21:29:59 +0200
Subject: [PATCH 363/478] ENH Improve initialization and learning rate in t-SNE
 (#19491)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Tom Dupré la Tour 
---
 doc/modules/manifold.rst                      |  15 +-
 doc/whats_new/v1.0.rst                        |   6 +
 sklearn/manifold/_t_sne.py                    |  73 +++++++--
 sklearn/manifold/tests/test_t_sne.py          | 149 ++++++++++++++++--
 .../tests/test_neighbors_pipeline.py          |   3 +
 sklearn/tests/test_docstring_parameters.py    |   5 +
 sklearn/utils/estimator_checks.py             |   1 +
 7 files changed, 226 insertions(+), 26 deletions(-)

diff --git a/doc/modules/manifold.rst b/doc/modules/manifold.rst
index 8de2a73477c87..72e8c7485df44 100644
--- a/doc/modules/manifold.rst
+++ b/doc/modules/manifold.rst
@@ -555,7 +555,10 @@ between natural clusters in the data. If the factor is too high, the KL
 divergence could increase during this phase. Usually it does not have to be
 tuned. A critical parameter is the learning rate. If it is too low gradient
 descent will get stuck in a bad local minimum. If it is too high the KL
-divergence will increase during optimization. More tips can be found in
+divergence will increase during optimization. A heuristic suggested in
+Belkina et al. (2019) is to set the learning rate to the sample size
+divided by the early exaggeration factor. We implement this heuristic
+via the `learning_rate='auto'` option. More tips can be found in
 Laurens van der Maaten's FAQ (see references). The last parameter, angle,
 is a tradeoff between performance and accuracy. Larger angles imply that we
 can approximate larger regions by a single point, leading to better speed
@@ -614,9 +617,15 @@ the internal structure of the data.
     `_
     van der Maaten, L.J.P.
 
-  * `"Accelerating t-SNE using Tree-Based Algorithms."
+  * `"Accelerating t-SNE using Tree-Based Algorithms"
     `_
-    L.J.P. van der Maaten.  Journal of Machine Learning Research 15(Oct):3221-3245, 2014.
+    van der Maaten, L.J.P.; Journal of Machine Learning Research 15(Oct):3221-3245, 2014.
+    
+  * `"Automated optimized parameters for T-distributed stochastic neighbor
+    embedding improve visualization and analysis of large datasets"
+    `_
+    Belkina, A.C., Ciccolella, C.O., Anno, R., Halpert, R., Spidlen, J.,
+    Snyder-Cappione, J.E., Nature Communications 10, 5415 (2019). 
 
 Tips on practical use
 =====================
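
For concreteness, here is a minimal sketch of the `learning_rate='auto'` heuristic described
above, mirroring the formula this patch adds to `TSNE._fit` (a sketch only, not the patched
implementation itself):

    def auto_learning_rate(n_samples, early_exaggeration=12.0):
        # Belkina et al. (2019) suggest sample size / early exaggeration; the
        # extra division by 4 converts from the bhtsne/FIt-SNE/openTSNE
        # convention to scikit-learn's definition of learning_rate, and, as in
        # the patch, the result is floored at 50.
        return max(n_samples / early_exaggeration / 4, 50.0)

    print(auto_learning_rate(1_000))    # 50.0 -- the floor applies
    print(auto_learning_rate(100_000))  # ~2083.3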
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 3b3884e68e185..0a13d22860d07 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -279,6 +279,12 @@ Changelog
   during affinity matrix computation for :class:`manifold.TSNE`.
   :pr:`19472` by :user:`Dmitry Kobak `.
 
+- |Enhancement| Implement `'auto'` heuristic for the `learning_rate` in
+  :class:`manifold.TSNE`. It will become default in 1.2. The default
+  initialization will change to `pca` in 1.2. PCA initialization will
+  be scaled to have standard deviation 1e-4 in 1.2.
+  :pr:`19491` by :user:`Dmitry Kobak `.
+
 :mod:`sklearn.metrics`
 ......................
 
diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py
index b6072a6e198c4..682fdc095d3bf 100644
--- a/sklearn/manifold/_t_sne.py
+++ b/sklearn/manifold/_t_sne.py
@@ -517,13 +517,19 @@ class TSNE(BaseEstimator):
         optimization, the early exaggeration factor or the learning rate
         might be too high.
 
-    learning_rate : float, default=200.0
+    learning_rate : float or 'auto', default=200.0
         The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If
         the learning rate is too high, the data may look like a 'ball' with any
         point approximately equidistant from its nearest neighbours. If the
         learning rate is too low, most points may look compressed in a dense
         cloud with few outliers. If the cost function gets stuck in a bad local
         minimum increasing the learning rate may help.
+        Note that many other t-SNE implementations (bhtsne, FIt-SNE, openTSNE,
+        etc.) use a definition of learning_rate that is 4 times smaller than
+        ours. So our learning_rate=200 corresponds to learning_rate=800 in
+        those other implementations. The 'auto' option sets the learning_rate
+        to `max(N / early_exaggeration / 4, 50)` where N is the sample size,
+        following [4] and [5]. This will become default in 1.2.
 
     n_iter : int, default=1000
         Maximum number of iterations for the optimization. Should be at
@@ -559,7 +565,8 @@ class TSNE(BaseEstimator):
         Initialization of embedding. Possible options are 'random', 'pca',
         and a numpy array of shape (n_samples, n_components).
         PCA initialization cannot be used with precomputed distances and is
-        usually more globally stable than random initialization.
+        usually more globally stable than random initialization. `init='pca'`
+        will become default in 1.2.
 
     verbose : int, default=0
         Verbosity level.
@@ -631,7 +638,8 @@ class TSNE(BaseEstimator):
     >>> import numpy as np
     >>> from sklearn.manifold import TSNE
     >>> X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
-    >>> X_embedded = TSNE(n_components=2).fit_transform(X)
+    >>> X_embedded = TSNE(n_components=2, learning_rate='auto',
+    ...                   init='random').fit_transform(X)
     >>> X_embedded.shape
     (4, 2)
 
@@ -647,6 +655,14 @@ class TSNE(BaseEstimator):
     [3] L.J.P. van der Maaten. Accelerating t-SNE using Tree-Based Algorithms.
         Journal of Machine Learning Research 15(Oct):3221-3245, 2014.
         https://lvdmaaten.github.io/publications/papers/JMLR_2014.pdf
+
+    [4] Belkina, A. C., Ciccolella, C. O., Anno, R., Halpert, R., Spidlen, J.,
+        & Snyder-Cappione, J. E. (2019). Automated optimized parameters for
+        T-distributed stochastic neighbor embedding improve visualization
+        and analysis of large datasets. Nature Communications, 10(1), 1-12.
+
+    [5] Kobak, D., & Berens, P. (2019). The art of using t-SNE for single-cell
+        transcriptomics. Nature Communications, 10(1), 1-14.
     """
     # Control the number of exploration iterations with early_exaggeration on
     _EXPLORATION_N_ITER = 250
@@ -656,9 +672,9 @@ class TSNE(BaseEstimator):
 
     @_deprecate_positional_args
     def __init__(self, n_components=2, *, perplexity=30.0,
-                 early_exaggeration=12.0, learning_rate=200.0, n_iter=1000,
+                 early_exaggeration=12.0, learning_rate="warn", n_iter=1000,
                  n_iter_without_progress=300, min_grad_norm=1e-7,
-                 metric="euclidean", init="random", verbose=0,
+                 metric="euclidean", init="warn", verbose=0,
                  random_state=None, method='barnes_hut', angle=0.5,
                  n_jobs=None, square_distances='legacy'):
         self.n_components = n_components
@@ -681,12 +697,39 @@ def __init__(self, n_components=2, *, perplexity=30.0,
     def _fit(self, X, skip_num_points=0):
         """Private function to fit the model using X as training data."""
 
+        if isinstance(self.init, str) and self.init == 'warn':
+            # See issue #18018
+            warnings.warn("The default initialization in TSNE will change "
+                          "from 'random' to 'pca' in 1.2.", FutureWarning)
+            self._init = 'random'
+        else:
+            self._init = self.init
+        if self.learning_rate == 'warn':
+            # See issue #18018
+            warnings.warn("The default learning rate in TSNE will change "
+                          "from 200.0 to 'auto' in 1.2.", FutureWarning)
+            self._learning_rate = 200.0
+        else:
+            self._learning_rate = self.learning_rate
+
+        if isinstance(self._init, str) and self._init == 'pca' and issparse(X):
+            raise TypeError("PCA initialization is currently not suported "
+                            "with the sparse input matrix. Use "
+                            "init=\"random\" instead.")
         if self.method not in ['barnes_hut', 'exact']:
             raise ValueError("'method' must be 'barnes_hut' or 'exact'")
         if self.angle < 0.0 or self.angle > 1.0:
             raise ValueError("'angle' must be between 0.0 - 1.0")
         if self.square_distances not in [True, 'legacy']:
             raise ValueError("'square_distances' must be True or 'legacy'.")
+        if self._learning_rate == 'auto':
+            # See issue #18018
+            self._learning_rate = X.shape[0] / self.early_exaggeration / 4
+            self._learning_rate = np.maximum(self._learning_rate, 50)
+        else:
+            if not (self._learning_rate > 0):
+                raise ValueError("'learning_rate' must be a positive number "
+                                 "or 'auto'.")
         if self.metric != "euclidean" and self.square_distances is not True:
             warnings.warn(
                 "'square_distances' has been introduced in 0.24 to help phase "
@@ -706,7 +749,7 @@ def _fit(self, X, skip_num_points=0):
             X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'],
                                     dtype=[np.float32, np.float64])
         if self.metric == "precomputed":
-            if isinstance(self.init, str) and self.init == 'pca':
+            if isinstance(self._init, str) and self._init == 'pca':
                 raise ValueError("The parameter init=\"pca\" cannot be "
                                  "used with metric=\"precomputed\".")
             if X.shape[0] != X.shape[1]:
@@ -817,13 +860,21 @@ def _fit(self, X, skip_num_points=0):
             P = _joint_probabilities_nn(distances_nn, self.perplexity,
                                         self.verbose)
 
-        if isinstance(self.init, np.ndarray):
-            X_embedded = self.init
-        elif self.init == 'pca':
+        if isinstance(self._init, np.ndarray):
+            X_embedded = self._init
+        elif self._init == 'pca':
             pca = PCA(n_components=self.n_components, svd_solver='randomized',
                       random_state=random_state)
             X_embedded = pca.fit_transform(X).astype(np.float32, copy=False)
-        elif self.init == 'random':
+            # TODO: Update in 1.2
+            # PCA is rescaled so that PC1 has standard deviation 1e-4 which is
+            # the default value for random initialization. See issue #18018.
+            warnings.warn("The PCA initialization in TSNE will change to "
+                          "have the standard deviation of PC1 equal to 1e-4 "
+                          "in 1.2. This will ensure better convergence.",
+                          FutureWarning)
+            # X_embedded = X_embedded / np.std(X_embedded[:, 0]) * 1e-4
+        elif self._init == 'random':
             # The embedding is initialized with iid samples from Gaussians with
             # standard deviation 1e-4.
             X_embedded = 1e-4 * random_state.randn(
@@ -857,7 +908,7 @@ def _tsne(self, P, degrees_of_freedom, n_samples, X_embedded,
             "it": 0,
             "n_iter_check": self._N_ITER_CHECK,
             "min_grad_norm": self.min_grad_norm,
-            "learning_rate": self.learning_rate,
+            "learning_rate": self._learning_rate,
             "verbose": self.verbose,
             "kwargs": dict(skip_num_points=skip_num_points),
             "args": [P, degrees_of_freedom, n_samples, self.n_components],
diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py
index bd0cc3df339bf..7f0840fb7b82f 100644
--- a/sklearn/manifold/tests/test_t_sne.py
+++ b/sklearn/manifold/tests/test_t_sne.py
@@ -247,6 +247,8 @@ def test_trustworthiness():
     assert_almost_equal(trustworthiness(X, X_embedded, n_neighbors=1), 0.2)
 
 
+# TODO: Remove filterwarning in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 @pytest.mark.parametrize("method", ['exact', 'barnes_hut'])
 @pytest.mark.parametrize("init", ('random', 'pca'))
 def test_preserve_trustworthiness_approximately(method, init):
@@ -261,6 +263,8 @@ def test_preserve_trustworthiness_approximately(method, init):
     assert t > 0.85
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_optimization_minimizes_kl_divergence():
     """t-SNE should give a lower KL divergence with more iterations."""
     random_state = check_random_state(0)
@@ -275,6 +279,8 @@ def test_optimization_minimizes_kl_divergence():
     assert kl_divergences[2] <= kl_divergences[1]
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 @pytest.mark.parametrize('method', ['exact', 'barnes_hut'])
 def test_fit_csr_matrix(method):
     # X can be a sparse matrix.
@@ -289,6 +295,8 @@ def test_fit_csr_matrix(method):
                     1.0, rtol=1.1e-1)
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_preserve_trustworthiness_approximately_with_precomputed_distances():
     # Nearest neighbors should be preserved approximately.
     random_state = check_random_state(0)
@@ -298,7 +306,7 @@ def test_preserve_trustworthiness_approximately_with_precomputed_distances():
         tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0,
                     early_exaggeration=2.0, metric="precomputed",
                     random_state=i, verbose=0, n_iter=500,
-                    square_distances=True)
+                    square_distances=True, init='random')
         X_embedded = tsne.fit_transform(D)
         t = trustworthiness(D, X_embedded, n_neighbors=1, metric="precomputed")
         assert t > .95
@@ -314,6 +322,8 @@ def test_trustworthiness_not_euclidean_metric():
                             metric='precomputed'))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_early_exaggeration_too_small():
     # Early exaggeration factor must be >= 1.
     tsne = TSNE(early_exaggeration=0.99)
@@ -321,6 +331,8 @@ def test_early_exaggeration_too_small():
         tsne.fit_transform(np.array([[0.0], [0.0]]))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_too_few_iterations():
     # Number of gradient descent iterations must be at least 200.
     tsne = TSNE(n_iter=199)
@@ -328,6 +340,8 @@ def test_too_few_iterations():
         tsne.fit_transform(np.array([[0.0], [0.0]]))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 @pytest.mark.parametrize('method, retype', [
     ('exact', np.asarray),
     ('barnes_hut', np.asarray),
@@ -339,27 +353,35 @@ def test_too_few_iterations():
 ])
 def test_bad_precomputed_distances(method, D, retype, message_regex):
     tsne = TSNE(metric="precomputed", method=method,
-                square_distances=True)
+                square_distances=True, init='random', random_state=42)
     with pytest.raises(ValueError, match=message_regex):
         tsne.fit_transform(retype(D))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_exact_no_precomputed_sparse():
-    tsne = TSNE(metric='precomputed', method='exact', square_distances=True)
+    tsne = TSNE(metric='precomputed', method='exact', square_distances=True,
+                init='random', random_state=42)
     with pytest.raises(TypeError, match='sparse'):
         tsne.fit_transform(sp.csr_matrix([[0, 5], [5, 0]]))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_high_perplexity_precomputed_sparse_distances():
     # Perplexity should be less than 50
     dist = np.array([[1., 0., 0.], [0., 1., 0.], [1., 0., 0.]])
     bad_dist = sp.csr_matrix(dist)
-    tsne = TSNE(metric="precomputed", square_distances=True)
+    tsne = TSNE(metric="precomputed", square_distances=True,
+                init='random', random_state=42)
     msg = "3 neighbors per samples are required, but some samples have only 1"
     with pytest.raises(ValueError, match=msg):
         tsne.fit_transform(bad_dist)
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 @ignore_warnings(category=EfficiencyWarning)
 def test_sparse_precomputed_distance():
     """Make sure that TSNE works identically for sparse and dense matrix"""
@@ -372,7 +394,8 @@ def test_sparse_precomputed_distance():
     assert sp.issparse(D_sparse)
     assert_almost_equal(D_sparse.A, D)
 
-    tsne = TSNE(metric="precomputed", random_state=0, square_distances=True)
+    tsne = TSNE(metric="precomputed", random_state=0, square_distances=True,
+                init='random')
     Xt_dense = tsne.fit_transform(D)
 
     for fmt in ['csr', 'lil']:
@@ -380,6 +403,8 @@ def test_sparse_precomputed_distance():
         assert_almost_equal(Xt_dense, Xt_sparse)
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_non_positive_computed_distances():
     # Computed distance matrices must be positive.
     def metric(x, y):
@@ -392,6 +417,8 @@ def metric(x, y):
         tsne.fit_transform(X)
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_init_not_available():
     # 'init' must be 'pca', 'random', or numpy array.
     tsne = TSNE(init="not available")
@@ -400,6 +427,8 @@ def test_init_not_available():
         tsne.fit_transform(np.array([[0.0], [1.0]]))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_init_ndarray():
     # Initialize TSNE with ndarray and test fit
     tsne = TSNE(init=np.zeros((100, 2)))
@@ -411,10 +440,12 @@ def test_init_ndarray_precomputed():
     # Initialize TSNE with ndarray and metric 'precomputed'
     # Make sure no FutureWarning is thrown from _fit
     tsne = TSNE(init=np.zeros((100, 2)), metric="precomputed",
-                square_distances=True)
+                square_distances=True, learning_rate=50.0)
     tsne.fit(np.zeros((100, 100)))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_distance_not_available():
     # 'metric' must be valid.
     tsne = TSNE(metric="not available", method='exact', square_distances=True)
@@ -427,6 +458,8 @@ def test_distance_not_available():
         tsne.fit_transform(np.array([[0.0], [1.0]]))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_method_not_available():
     # 'method' must be 'barnes_hut' or 'exact'
     tsne = TSNE(method='not available')
@@ -434,6 +467,8 @@ def test_method_not_available():
         tsne.fit_transform(np.array([[0.0], [1.0]]))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_square_distances_not_available():
     # square_distances must be True or 'legacy'.
     tsne = TSNE(square_distances="not_available")
@@ -441,6 +476,8 @@ def test_square_distances_not_available():
         tsne.fit_transform(np.array([[0.0], [1.0]]))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_angle_out_of_range_checks():
     # check the angle parameter range
     for angle in [-1, -1e-6, 1 + 1e-6, 2]:
@@ -450,8 +487,10 @@ def test_angle_out_of_range_checks():
             tsne.fit_transform(np.array([[0.0], [1.0]]))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_pca_initialization_not_compatible_with_precomputed_kernel():
-    # Precomputed distance matrices must be square matrices.
+    # Precomputed distance matrices cannot use PCA initialization.
     tsne = TSNE(metric="precomputed", init="pca", square_distances=True)
     with pytest.raises(ValueError, match="The parameter init=\"pca\" cannot"
                                          " be used with"
@@ -459,6 +498,15 @@ def test_pca_initialization_not_compatible_with_precomputed_kernel():
         tsne.fit_transform(np.array([[0.0], [1.0]]))
 
 
+def test_pca_initialization_not_compatible_with_sparse_input():
+    # Sparse input matrices cannot use PCA initialization.
+    tsne = TSNE(init="pca", learning_rate=100.0)
+    with pytest.raises(TypeError, match="PCA initialization.*"):
+        tsne.fit_transform(sp.csr_matrix([[0, 5], [5, 0]]))
+
+
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_n_components_range():
     # barnes_hut method should only be used with n_components <= 3
     tsne = TSNE(n_components=4, method="barnes_hut")
@@ -466,6 +514,8 @@ def test_n_components_range():
         tsne.fit_transform(np.array([[0.0], [1.0]]))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_early_exaggeration_used():
     # check that the ``early_exaggeration`` parameter has an effect
     random_state = check_random_state(0)
@@ -585,6 +635,8 @@ def _run_answer_test(pos_input, pos_output, neighbors, grad_output,
     assert_array_almost_equal(grad_bh, grad_output, decimal=4)
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_verbose():
     # Verbose options write to stdout.
     random_state = check_random_state(0)
@@ -607,6 +659,8 @@ def test_verbose():
     assert("early exaggeration" in out)
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_chebyshev_metric():
     # t-SNE should allow metrics that cannot be squared (issue #3526).
     random_state = check_random_state(0)
@@ -615,6 +669,8 @@ def test_chebyshev_metric():
     tsne.fit_transform(X)
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_reduction_to_one_component():
     # t-SNE should allow reduction to one component (issue #4154).
     random_state = check_random_state(0)
@@ -624,6 +680,8 @@ def test_reduction_to_one_component():
     assert(np.all(np.isfinite(X_embedded)))
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 @pytest.mark.parametrize('method', ['barnes_hut', 'exact'])
 @pytest.mark.parametrize('dt', [np.float32, np.float64])
 def test_64bit(method, dt):
@@ -642,6 +700,8 @@ def test_64bit(method, dt):
     assert effective_type == np.float32
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 @pytest.mark.parametrize('method', ['barnes_hut', 'exact'])
 def test_kl_divergence_not_nan(method):
     # Ensure kl_divergence_ is computed at last iteration
@@ -713,6 +773,8 @@ def test_n_iter_without_progress():
                 "last -1 episodes. Finished." in out)
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_min_grad_norm():
     # Make sure that the parameter min_grad_norm is used correctly
     random_state = check_random_state(0)
@@ -756,6 +818,8 @@ def test_min_grad_norm():
     assert n_smaller_gradient_norms <= 1
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_accessible_kl_divergence():
     # Ensures that the accessible kl_divergence matches the computed value
     random_state = check_random_state(0)
@@ -784,6 +848,8 @@ def test_accessible_kl_divergence():
     assert_almost_equal(tsne.kl_divergence_, float(error), decimal=5)
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 @pytest.mark.parametrize('method', ['barnes_hut', 'exact'])
 def test_uniform_grid(method):
     """Make sure that TSNE can approximately recover a uniform 2D grid
@@ -885,6 +951,8 @@ def test_gradient_bh_multithread_match_sequential():
         assert_allclose(grad_multithread, grad_multithread)
 
 
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_tsne_with_different_distance_metrics():
     """Make sure that TSNE works for different distance metrics"""
     random_state = check_random_state(0)
@@ -896,10 +964,11 @@ def test_tsne_with_different_distance_metrics():
     for metric, dist_func in zip(metrics, dist_funcs):
         X_transformed_tsne = TSNE(
             metric=metric, n_components=n_components_embedding,
-            random_state=0, n_iter=300, square_distances=True).fit_transform(X)
+            random_state=0, n_iter=300, square_distances=True,
+            init='random').fit_transform(X)
         X_transformed_tsne_precomputed = TSNE(
             metric='precomputed', n_components=n_components_embedding,
-            random_state=0, n_iter=300,
+            random_state=0, n_iter=300, init='random',
             square_distances=True).fit_transform(dist_func(X))
         assert_array_equal(X_transformed_tsne, X_transformed_tsne_precomputed)
 
@@ -926,11 +995,11 @@ def test_tsne_different_square_distances(method, metric, square_distances):
     X_transformed_tsne = TSNE(
         metric=metric, n_components=n_components_embedding,
         square_distances=square_distances, method=method,
-        random_state=0).fit_transform(X)
+        random_state=0, init='random').fit_transform(X)
     X_transformed_tsne_precomputed = TSNE(
         metric='precomputed', n_components=n_components_embedding,
         square_distances=square_distances, method=method,
-        random_state=0).fit_transform(X_precomputed)
+        random_state=0, init='random').fit_transform(X_precomputed)
 
     assert_allclose(X_transformed_tsne, X_transformed_tsne_precomputed)
 
@@ -943,7 +1012,8 @@ def test_tsne_square_distances_futurewarning(metric, square_distances):
     random_state = check_random_state(0)
 
     X = random_state.randn(5, 2)
-    tsne = TSNE(metric=metric, square_distances=square_distances)
+    tsne = TSNE(metric=metric, square_distances=square_distances,
+                learning_rate=200.0, init="random")
 
     if metric != 'euclidean' and square_distances is not True:
         with pytest.warns(FutureWarning, match="'square_distances'.*"):
@@ -954,6 +1024,61 @@ def test_tsne_square_distances_futurewarning(metric, square_distances):
         assert not record
 
 
+# TODO: Remove in 1.2
+@pytest.mark.parametrize('init', [None, 'random', 'pca'])
+def test_tsne_init_futurewarning(init):
+    """Make sure that a FutureWarning is only raised when the
+    init is not specified or is 'pca'."""
+    random_state = check_random_state(0)
+
+    X = random_state.randn(5, 2)
+    kwargs = dict(learning_rate=200.0, init=init)
+    tsne = TSNE(**{k: v for k, v in kwargs.items() if v is not None})
+
+    if init is None:
+        with pytest.warns(FutureWarning, match="The default initialization.*"):
+            tsne.fit_transform(X)
+    elif init == 'pca':
+        with pytest.warns(FutureWarning, match="The PCA initialization.*"):
+            tsne.fit_transform(X)
+    else:
+        with pytest.warns(None) as record:
+            tsne.fit_transform(X)
+        assert not record
+
+
+# TODO: Remove in 1.2
+@pytest.mark.parametrize('learning_rate', [None, 200.0])
+def test_tsne_learning_rate_futurewarning(learning_rate):
+    """Make sure that a FutureWarning is only raised when the learning rate
+    is not specified"""
+    random_state = check_random_state(0)
+
+    X = random_state.randn(5, 2)
+    kwargs = dict(learning_rate=learning_rate, init='random')
+    tsne = TSNE(**{k: v for k, v in kwargs.items() if v is not None})
+
+    if learning_rate is None:
+        with pytest.warns(FutureWarning, match="The default learning rate.*"):
+            tsne.fit_transform(X)
+    else:
+        with pytest.warns(None) as record:
+            tsne.fit_transform(X)
+        assert not record
+
+
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
+def test_tsne_negative_learning_rate():
+    """Make sure that negative learning rate results in a ValueError"""
+    random_state = check_random_state(0)
+    X = random_state.randn(5, 2)
+    with pytest.raises(ValueError, match="'learning_rate' must be.*"):
+        TSNE(learning_rate=-50.0).fit_transform(X)
+
+
+# TODO: Remove filterwarnings in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 @pytest.mark.parametrize('method', ['exact', 'barnes_hut'])
 def test_tsne_n_jobs(method):
     """Make sure that the n_jobs parameter doesn't impact the output"""
diff --git a/sklearn/neighbors/tests/test_neighbors_pipeline.py b/sklearn/neighbors/tests/test_neighbors_pipeline.py
index f8f9472bdac48..5b5f294d2d243 100644
--- a/sklearn/neighbors/tests/test_neighbors_pipeline.py
+++ b/sklearn/neighbors/tests/test_neighbors_pipeline.py
@@ -6,6 +6,7 @@
 """
 
 import numpy as np
+import pytest
 
 from sklearn.utils._testing import assert_array_almost_equal
 from sklearn.cluster.tests.common import generate_clustered_data
@@ -111,6 +112,8 @@ def test_isomap():
     assert_array_almost_equal(Xt_chain, Xt_compact)
 
 
+# TODO: Remove filterwarning in 1.2
+@pytest.mark.filterwarnings("ignore:.*TSNE will change.*:FutureWarning")
 def test_tsne():
     # Test chaining KNeighborsTransformer and TSNE
     n_iter = 250
diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py
index ee2fe055a4b43..719df2f4a0f77 100644
--- a/sklearn/tests/test_docstring_parameters.py
+++ b/sklearn/tests/test_docstring_parameters.py
@@ -253,6 +253,11 @@ def test_fit_docstring_attributes(name, Estimator):
     if Estimator.__name__ == 'NMF':
         est.init = 'nndsvda'
 
+    # FIXME: TO BE REMOVED for 1.2 (avoid FutureWarning)
+    if Estimator.__name__ == 'TSNE':
+        est.learning_rate = 200.0
+        est.init = 'random'
+
     X, y = make_classification(n_samples=20, n_features=3,
                                n_redundant=0, n_classes=2,
                                random_state=2)
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 71f5b3b42de42..f0c0383a7bfe8 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -3116,6 +3116,7 @@ def check_requires_y_none(name, estimator_orig):
             warnings.warn(warning_msg, FutureWarning)
 
 
+@ignore_warnings(category=FutureWarning)
 def check_n_features_in_after_fitting(name, estimator_orig):
     # Make sure that n_features_in are checked after fitting
     tags = _safe_tags(estimator_orig)
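
As a usage sketch for the deprecation cycle introduced in this patch (assuming the patched
estimator): passing `learning_rate` and `init` explicitly opts in to the new heuristic and
avoids the FutureWarnings raised for the changing defaults.

    import numpy as np
    from sklearn.manifold import TSNE

    X = np.random.RandomState(0).randn(100, 5)

    # learning_rate='auto' uses max(n_samples / early_exaggeration / 4, 50);
    # explicit init/learning_rate silence the FutureWarnings added here.
    X_embedded = TSNE(n_components=2, learning_rate="auto", init="random",
                      random_state=0).fit_transform(X)
    print(X_embedded.shape)  # (100, 2)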

From e4bb9fa86b0df873ad750b6d59090843d9d23d50 Mon Sep 17 00:00:00 2001
From: Ray Bell 
Date: Mon, 26 Apr 2021 16:04:08 -0400
Subject: [PATCH 364/478] DOC: add import in binary tree class examples
 (#19991)

Co-authored-by: Ray Bell 
---
 sklearn/neighbors/_binary_tree.pxi | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi
index de85ec49166ec..3adfa1b31006a 100755
--- a/sklearn/neighbors/_binary_tree.pxi
+++ b/sklearn/neighbors/_binary_tree.pxi
@@ -246,6 +246,7 @@ Examples
 Query for k-nearest neighbors
 
     >>> import numpy as np
+    >>> from sklearn.neighbors import {BinaryTree}
     >>> rng = np.random.RandomState(0)
     >>> X = rng.random_sample((10, 3))  # 10 points in 3 dimensions
     >>> tree = {BinaryTree}(X, leaf_size=2)              # doctest: +SKIP
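
For reference, a concrete, runnable version of the templated example above, using `KDTree`
(one of the classes generated from this `.pxi`):

    import numpy as np
    from sklearn.neighbors import KDTree

    rng = np.random.RandomState(0)
    X = rng.random_sample((10, 3))      # 10 points in 3 dimensions
    tree = KDTree(X, leaf_size=2)
    dist, ind = tree.query(X[:1], k=3)  # 3 nearest neighbours of the first point
    print(ind)                          # neighbour indices (includes the query point)
    print(dist)                         # corresponding distances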

From eecde00c7a706546271ff40d7d492b5f27046d2b Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre 
Date: Tue, 27 Apr 2021 11:38:53 +0200
Subject: [PATCH 365/478] FIX support multiple str/single category with dense
 DictVectorizer (#19982)

---
 doc/whats_new/v0.24.rst                       |  7 +++
 .../feature_extraction/_dict_vectorizer.py    | 21 +--------
 .../tests/test_dict_vectorizer.py             | 45 +++++++++++++++++++
 3 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index 79f6ecb15c3d0..bfcd134bdd2bd 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -42,6 +42,13 @@ Changelog
   with `sample_weight` parameter and `least_absolute_deviation` loss function.
   :pr:`19407` by :user:`Vadim Ushtanit `.
 
+:mod:`sklearn.feature_extraction`
+.................................
+
+- |Fix| Fixed a bug to support multiple strings for a category when
+  `sparse=False` in :class:`feature_extraction.DictVectorizer`.
+  :pr:`19982` by :user:`Guillaume Lemaitre `.
+
 :mod:`sklearn.gaussian_process`
 ...............................
 
diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py
index e0516407c205a..44b50dc45a103 100644
--- a/sklearn/feature_extraction/_dict_vectorizer.py
+++ b/sklearn/feature_extraction/_dict_vectorizer.py
@@ -347,26 +347,7 @@ def transform(self, X):
         Xa : {array, sparse matrix}
             Feature vectors; always 2-d.
         """
-        if self.sparse:
-            return self._transform(X, fitting=False)
-
-        else:
-            dtype = self.dtype
-            vocab = self.vocabulary_
-            X = _tosequence(X)
-            Xa = np.zeros((len(X), len(vocab)), dtype=dtype)
-
-            for i, x in enumerate(X):
-                for f, v in x.items():
-                    if isinstance(v, str):
-                        f = "%s%s%s" % (f, self.separator, v)
-                        v = 1
-                    try:
-                        Xa[i, vocab[f]] = dtype(v)
-                    except KeyError:
-                        pass
-
-            return Xa
+        return self._transform(X, fitting=False)
 
     def get_feature_names(self):
         """Returns a list of feature names, ordered by their indices.
diff --git a/sklearn/feature_extraction/tests/test_dict_vectorizer.py b/sklearn/feature_extraction/tests/test_dict_vectorizer.py
index 519201b580598..9984bdc5aa3da 100644
--- a/sklearn/feature_extraction/tests/test_dict_vectorizer.py
+++ b/sklearn/feature_extraction/tests/test_dict_vectorizer.py
@@ -6,6 +6,7 @@
 import numpy as np
 import scipy.sparse as sp
 from numpy.testing import assert_array_equal
+from numpy.testing import assert_allclose
 
 import pytest
 
@@ -165,3 +166,47 @@ def test_n_features_in():
     d = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
     dv.fit(d)
     assert not hasattr(dv, 'n_features_in_')
+
+
+def test_dictvectorizer_dense_sparse_equivalence():
+    """Check the equivalence between between sparse and dense DictVectorizer.
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/19978
+    """
+    movie_entry_fit = [
+        {"category": ["thriller", "drama"], "year": 2003},
+        {"category": ["animation", "family"], "year": 2011},
+        {"year": 1974},
+    ]
+    movie_entry_transform = [{"category": ["thriller"], "unseen_feature": "3"}]
+    dense_vectorizer = DictVectorizer(sparse=False)
+    sparse_vectorizer = DictVectorizer(sparse=True)
+
+    dense_vector_fit = dense_vectorizer.fit_transform(movie_entry_fit)
+    sparse_vector_fit = sparse_vectorizer.fit_transform(movie_entry_fit)
+
+    assert not sp.issparse(dense_vector_fit)
+    assert sp.issparse(sparse_vector_fit)
+
+    assert_allclose(dense_vector_fit, sparse_vector_fit.toarray())
+
+    dense_vector_transform = dense_vectorizer.transform(movie_entry_transform)
+    sparse_vector_transform = sparse_vectorizer.transform(
+        movie_entry_transform
+    )
+
+    assert not sp.issparse(dense_vector_transform)
+    assert sp.issparse(sparse_vector_transform)
+
+    assert_allclose(dense_vector_transform, sparse_vector_transform.toarray())
+
+    dense_inverse_transform = dense_vectorizer.inverse_transform(
+        dense_vector_transform
+    )
+    sparse_inverse_transform = sparse_vectorizer.inverse_transform(
+        sparse_vector_transform
+    )
+
+    expected_inverse = [{"category=thriller": 1.0}]
+    assert dense_inverse_transform == expected_inverse
+    assert sparse_inverse_transform == expected_inverse
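
A condensed sketch of the behaviour this fix restores, mirroring the non-regression test
above: with `sparse=False`, `transform` now delegates to the shared `_transform` and agrees
with the sparse output for list-of-string categories.

    from sklearn.feature_extraction import DictVectorizer

    fit_entries = [{"category": ["thriller", "drama"], "year": 2003},
                   {"category": ["animation", "family"], "year": 2011}]
    new_entry = [{"category": ["thriller"], "unseen_feature": "3"}]

    dense_vec = DictVectorizer(sparse=False)
    sparse_vec = DictVectorizer(sparse=True)
    dense_vec.fit_transform(fit_entries)
    sparse_vec.fit_transform(fit_entries)

    # With the fix, the dense transform matches the sparse code path.
    assert (dense_vec.transform(new_entry)
            == sparse_vec.transform(new_entry).toarray()).all()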

From 9694d5a4b517420f9a2953c67b8c06100b256efd Mon Sep 17 00:00:00 2001
From: Fatos Morina 
Date: Tue, 27 Apr 2021 12:37:56 +0200
Subject: [PATCH 366/478] Remove the unused import of csc_matrix (#19989)

---
 sklearn/tree/_tree.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx
index f4484ab1a3314..afd6aa8d6cf51 100644
--- a/sklearn/tree/_tree.pyx
+++ b/sklearn/tree/_tree.pyx
@@ -29,7 +29,6 @@ cimport numpy as np
 np.import_array()
 
 from scipy.sparse import issparse
-from scipy.sparse import csc_matrix
 from scipy.sparse import csr_matrix
 
 from ._utils cimport Stack

From 674e7dff128700de7b1e9588b18a78ad6f38f12a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?=
 <34657725+jeremiedbb@users.noreply.github.com>
Date: Tue, 27 Apr 2021 12:47:28 +0200
Subject: [PATCH 367/478] DOC retroactive changed model entry (#19992)

---
 doc/whats_new/v0.24.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index bfcd134bdd2bd..72a96aa74f470 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -209,6 +209,9 @@ random sampling procedures.
 
 - |Fix| :class:`linear_model.Perceptron` when `penalty='elasticnet'`.
 
+- |Fix| Change in the random sampling procedures for the center initialization
+  of :class:`cluster.KMeans`.
+
 Details are listed in the changelog below.
 
 (While we are trying to better inform users by providing this information, we

From bf0886bae0ccbc8c5d285b6e2affe7e40474f970 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sylvain=20Mari=C3=A9?=
 
Date: Tue, 27 Apr 2021 16:06:10 +0200
Subject: [PATCH 368/478] [MRG after #12145] Add "Randomized SVD" solver option
 to KernelPCA for faster partial decompositions, like in PCA (#12069)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sylvain MARIE 
Co-authored-by: Thomas J Fan 
Co-authored-by: Nicolas Hug 
Co-authored-by: Joel Nothman 
Co-authored-by: Olivier Grisel 
Co-authored-by: Olivier Grisel 
Co-authored-by: Tom Dupré la Tour 
---
 ...kernel_pca_solvers_time_vs_n_components.py | 148 ++++++++++
 ...ch_kernel_pca_solvers_time_vs_n_samples.py | 153 ++++++++++
 doc/modules/decomposition.rst                 | 122 ++++++--
 doc/whats_new/v1.0.rst                        |  13 +-
 sklearn/decomposition/_kernel_pca.py          |  86 +++++-
 .../decomposition/tests/test_kernel_pca.py    | 276 +++++++++++++++---
 sklearn/utils/extmath.py                      | 172 ++++++++++-
 sklearn/utils/tests/test_extmath.py           | 129 +++++++-
 8 files changed, 1017 insertions(+), 82 deletions(-)
 create mode 100644 benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py
 create mode 100644 benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py

diff --git a/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py b/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py
new file mode 100644
index 0000000000000..d871967ad1327
--- /dev/null
+++ b/benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py
@@ -0,0 +1,148 @@
+"""
+=============================================================
+Kernel PCA Solvers comparison benchmark: time vs n_components
+=============================================================
+
+This benchmark shows that the approximate solvers provided in Kernel PCA can
+help significantly improve its execution speed when an approximate solution
+(small `n_components`) is acceptable. In many real-world datasets, a few
+hundred principal components are indeed sufficient to capture the
+underlying distribution.
+
+Description:
+------------
+A fixed number of training (default: 2000) and test (default: 1000) samples
+with 2 features is generated using the `make_circles` helper method.
+
+KernelPCA models are trained on the training set with an increasing number of
+principal components, between 1 and `max_n_compo` (default: 1999), with
+`n_compo_grid_size` positions (default: 10). For each value of `n_components`
+to try, KernelPCA models are trained for the various possible `eigen_solver`
+values. The execution times are displayed in a plot at the end of the
+experiment.
+
+What you can observe:
+---------------------
+When the number of requested principal components is small, the dense solver
+takes more time to complete, while the randomized method returns similar
+results with shorter execution times.
+
+Going further:
+--------------
+You can adjust `max_n_compo` and `n_compo_grid_size` if you wish to explore a
+different range of values for `n_components`.
+
+You can also set `arpack_all=True` to activate the arpack solver for large
+numbers of components (this takes more time).
+"""
+# Authors: Sylvain MARIE, Schneider Electric
+
+import time
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+from numpy.testing import assert_array_almost_equal
+from sklearn.decomposition import KernelPCA
+from sklearn.datasets import make_circles
+
+
+print(__doc__)
+
+
+# 1- Design the Experiment
+# ------------------------
+n_train, n_test = 2000, 1000            # the sample sizes to use
+max_n_compo = 1999                      # max n_components to try
+n_compo_grid_size = 10                  # nb of positions in the grid to try
+# generate the grid
+n_compo_range = [np.round(np.exp((x / (n_compo_grid_size - 1))
+                                 * np.log(max_n_compo)))
+                 for x in range(0, n_compo_grid_size)]
+
+n_iter = 3          # the number of times each experiment will be repeated
+arpack_all = False  # set to True if you wish to run arpack for all n_compo
+
+
+# 2- Generate random data
+# -----------------------
+n_features = 2
+X, y = make_circles(n_samples=(n_train + n_test), factor=.3, noise=.05,
+                    random_state=0)
+X_train, X_test = X[:n_train, :], X[n_train:, :]
+
+
+# 3- Benchmark
+# ------------
+# init
+ref_time = np.empty((len(n_compo_range), n_iter)) * np.nan
+a_time = np.empty((len(n_compo_range), n_iter)) * np.nan
+r_time = np.empty((len(n_compo_range), n_iter)) * np.nan
+# loop
+for j, n_components in enumerate(n_compo_range):
+
+    n_components = int(n_components)
+    print("Performing kPCA with n_components = %i" % n_components)
+
+    # A- reference (dense)
+    print("  - dense solver")
+    for i in range(n_iter):
+        start_time = time.perf_counter()
+        ref_pred = KernelPCA(n_components, eigen_solver="dense") \
+            .fit(X_train).transform(X_test)
+        ref_time[j, i] = time.perf_counter() - start_time
+
+    # B- arpack (for small number of components only, too slow otherwise)
+    if arpack_all or n_components < 100:
+        print("  - arpack solver")
+        for i in range(n_iter):
+            start_time = time.perf_counter()
+            a_pred = KernelPCA(n_components, eigen_solver="arpack") \
+                .fit(X_train).transform(X_test)
+            a_time[j, i] = time.perf_counter() - start_time
+            # check that the result is still correct despite the approx
+            assert_array_almost_equal(np.abs(a_pred), np.abs(ref_pred))
+
+    # C- randomized
+    print("  - randomized solver")
+    for i in range(n_iter):
+        start_time = time.perf_counter()
+        r_pred = KernelPCA(n_components, eigen_solver="randomized") \
+            .fit(X_train).transform(X_test)
+        r_time[j, i] = time.perf_counter() - start_time
+        # check that the result is still correct despite the approximation
+        assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred))
+
+# Compute statistics for the 3 methods
+avg_ref_time = ref_time.mean(axis=1)
+std_ref_time = ref_time.std(axis=1)
+avg_a_time = a_time.mean(axis=1)
+std_a_time = a_time.std(axis=1)
+avg_r_time = r_time.mean(axis=1)
+std_r_time = r_time.std(axis=1)
+
+
+# 4- Plots
+# --------
+fig, ax = plt.subplots(figsize=(12, 8))
+
+# Display 1 plot with error bars per method
+ax.errorbar(n_compo_range, avg_ref_time, yerr=std_ref_time,
+            marker='x', linestyle='', color='r', label='full')
+ax.errorbar(n_compo_range, avg_a_time, yerr=std_a_time, marker='x',
+            linestyle='', color='g', label='arpack')
+ax.errorbar(n_compo_range, avg_r_time, yerr=std_r_time, marker='x',
+            linestyle='', color='b', label='randomized')
+ax.legend(loc='upper left')
+
+# customize axes
+ax.set_xscale('log')
+ax.set_xlim(1, max(n_compo_range) * 1.1)
+ax.set_ylabel("Execution time (s)")
+ax.set_xlabel("n_components")
+
+ax.set_title("kPCA Execution time comparison on %i samples with %i "
+             "features, according to the choice of `eigen_solver`"
+             "" % (n_train, n_features))
+
+plt.show()
diff --git a/benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py b/benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py
new file mode 100644
index 0000000000000..d238802a68d64
--- /dev/null
+++ b/benchmarks/bench_kernel_pca_solvers_time_vs_n_samples.py
@@ -0,0 +1,153 @@
+"""
+==========================================================
+Kernel PCA Solvers comparison benchmark: time vs n_samples
+==========================================================
+
+This benchmark shows that the approximate solvers provided in Kernel PCA can
+help significantly improve its execution speed when an approximate solution
+(small `n_components`) is acceptable. In many real-world datasets the number of
+samples is very large, but a few hundred principal components are
+sufficient to capture the underlying distribution.
+
+Description:
+------------
+An increasing number of examples is used to train a KernelPCA, between
+`min_n_samples` (default: 101) and `max_n_samples` (default: 4000) with
+`n_samples_grid_size` positions (default: 4). Samples have 2 features, and are
+generated using `make_circles`. For each training sample size, KernelPCA models
+are trained for the various possible `eigen_solver` values. All of them are
+trained to obtain `n_components` principal components (default: 100). The
+execution times are displayed in a plot at the end of the experiment.
+
+What you can observe:
+---------------------
+When the number of samples provided gets large, the dense solver takes a lot
+of time to complete, while the randomized method returns similar results in
+much shorter execution times.
+
+Going further:
+--------------
+You can increase `max_n_samples` and `n_samples_grid_size` if you wish to
+explore a wider range of values for `n_samples`.
+
+You can also set `include_arpack=True` to add this other solver in the
+experiments (much slower).
+
+Finally you can have a look at the second example of this series, "Kernel PCA
+Solvers comparison benchmark: time vs n_components", where this time the number
+of examples is fixed, and the desired number of components varies.
+"""
+# Author: Sylvain MARIE, Schneider Electric
+
+import time
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+from numpy.testing import assert_array_almost_equal
+from sklearn.decomposition import KernelPCA
+from sklearn.datasets import make_circles
+
+
+print(__doc__)
+
+
+# 1- Design the Experiment
+# ------------------------
+min_n_samples, max_n_samples = 101, 4000  # min and max n_samples to try
+n_samples_grid_size = 4                   # nb of positions in the grid to try
+# generate the grid
+n_samples_range = [min_n_samples + np.floor((x / (n_samples_grid_size - 1))
+                                            * (max_n_samples - min_n_samples))
+                   for x in range(0, n_samples_grid_size)]
+
+n_components = 100      # the number of principal components we want to use
+n_iter = 3              # the number of times each experiment will be repeated
+include_arpack = False  # set this to True to include arpack solver (slower)
+
+
+# 2- Generate random data
+# -----------------------
+n_features = 2
+X, y = make_circles(n_samples=max_n_samples, factor=.3, noise=.05,
+                    random_state=0)
+
+
+# 3- Benchmark
+# ------------
+# init
+ref_time = np.empty((len(n_samples_range), n_iter)) * np.nan
+a_time = np.empty((len(n_samples_range), n_iter)) * np.nan
+r_time = np.empty((len(n_samples_range), n_iter)) * np.nan
+
+# loop
+for j, n_samples in enumerate(n_samples_range):
+
+    n_samples = int(n_samples)
+    print("Performing kPCA with n_samples = %i" % n_samples)
+
+    X_train = X[:n_samples, :]
+    X_test = X_train
+
+    # A- reference (dense)
+    print("  - dense")
+    for i in range(n_iter):
+        start_time = time.perf_counter()
+        ref_pred = KernelPCA(n_components, eigen_solver="dense") \
+            .fit(X_train).transform(X_test)
+        ref_time[j, i] = time.perf_counter() - start_time
+
+    # B- arpack
+    if include_arpack:
+        print("  - arpack")
+        for i in range(n_iter):
+            start_time = time.perf_counter()
+            a_pred = KernelPCA(n_components, eigen_solver="arpack") \
+                .fit(X_train).transform(X_test)
+            a_time[j, i] = time.perf_counter() - start_time
+            # check that the result is still correct despite the approx
+            assert_array_almost_equal(np.abs(a_pred), np.abs(ref_pred))
+
+    # C- randomized
+    print("  - randomized")
+    for i in range(n_iter):
+        start_time = time.perf_counter()
+        r_pred = KernelPCA(n_components, eigen_solver="randomized") \
+            .fit(X_train).transform(X_test)
+        r_time[j, i] = time.perf_counter() - start_time
+        # check that the result is still correct despite the approximation
+        assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred))
+
+# Compute statistics for the 3 methods
+avg_ref_time = ref_time.mean(axis=1)
+std_ref_time = ref_time.std(axis=1)
+avg_a_time = a_time.mean(axis=1)
+std_a_time = a_time.std(axis=1)
+avg_r_time = r_time.mean(axis=1)
+std_r_time = r_time.std(axis=1)
+
+
+# 4- Plots
+# --------
+fig, ax = plt.subplots(figsize=(12, 8))
+
+# Display 1 plot with error bars per method
+ax.errorbar(n_samples_range, avg_ref_time, yerr=std_ref_time,
+            marker='x', linestyle='', color='r', label='full')
+if include_arpack:
+    ax.errorbar(n_samples_range, avg_a_time, yerr=std_a_time, marker='x',
+                linestyle='', color='g', label='arpack')
+ax.errorbar(n_samples_range, avg_r_time, yerr=std_r_time, marker='x',
+            linestyle='', color='b', label='randomized')
+ax.legend(loc='upper left')
+
+# customize axes
+ax.set_xlim(min(n_samples_range) * 0.9, max(n_samples_range) * 1.1)
+ax.set_ylabel("Execution time (s)")
+ax.set_xlabel("n_samples")
+
+ax.set_title("Execution time comparison of kPCA with %i components on samples "
+             "with %i features, according to the choice of `eigen_solver`"
+             "" % (n_components, n_features))
+
+plt.show()
diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst
index e971d784c63d6..fd51f60d8bfc6 100644
--- a/doc/modules/decomposition.rst
+++ b/doc/modules/decomposition.rst
@@ -166,32 +166,16 @@ Note: the implementation of ``inverse_transform`` in :class:`PCA` with
 
 .. topic:: References:
 
-    * `"Finding structure with randomness: Stochastic algorithms for
+    * Algorithm 4.3 in
+      `"Finding structure with randomness: Stochastic algorithms for
       constructing approximate matrix decompositions"
       `_
       Halko, et al., 2009
 
-
-.. _kernel_PCA:
-
-Kernel PCA
-----------
-
-:class:`KernelPCA` is an extension of PCA which achieves non-linear
-dimensionality reduction through the use of kernels (see :ref:`metrics`). It
-has many applications including denoising, compression and structured
-prediction (kernel dependency estimation). :class:`KernelPCA` supports both
-``transform`` and ``inverse_transform``.
-
-.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_kernel_pca_001.png
-    :target: ../auto_examples/decomposition/plot_kernel_pca.html
-    :align: center
-    :scale: 75%
-
-.. topic:: Examples:
-
-    * :ref:`sphx_glr_auto_examples_decomposition_plot_kernel_pca.py`
-
+    * `"An implementation of a randomized algorithm for principal component
+      analysis"
+      `_
+      A. Szlam et al. 2014
 
 .. _SparsePCA:
 
@@ -278,6 +262,100 @@ factorization, while larger values shrink many coefficients to zero.
      R. Jenatton, G. Obozinski, F. Bach, 2009
 
 
+.. _kernel_PCA:
+
+Kernel Principal Component Analysis (kPCA)
+==========================================
+
+Exact Kernel PCA
+----------------
+
+:class:`KernelPCA` is an extension of PCA which achieves non-linear
+dimensionality reduction through the use of kernels (see :ref:`metrics`). It
+has many applications including denoising, compression and structured
+prediction (kernel dependency estimation). :class:`KernelPCA` supports both
+``transform`` and ``inverse_transform``.
+
+.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_kernel_pca_001.png
+    :target: ../auto_examples/decomposition/plot_kernel_pca.html
+    :align: center
+    :scale: 75%
+
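+A minimal usage sketch (the toy circles data and the parameter values below
+are illustrative assumptions, not taken from the linked example)::
+
+    >>> from sklearn.datasets import make_circles
+    >>> from sklearn.decomposition import KernelPCA
+    >>> X, _ = make_circles(n_samples=100, factor=.3, noise=.05,
+    ...                     random_state=0)
+    >>> kpca = KernelPCA(n_components=2, kernel="rbf", gamma=10,
+    ...                  fit_inverse_transform=True)
+    >>> X_kpca = kpca.fit_transform(X)
+    >>> X_back = kpca.inverse_transform(X_kpca)
+    >>> X_kpca.shape, X_back.shape
+    ((100, 2), (100, 2))
+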
+.. topic:: Examples:
+
+    * :ref:`sphx_glr_auto_examples_decomposition_plot_kernel_pca.py`
+
+.. topic:: References:
+
+    * Kernel PCA was introduced in "Kernel principal component analysis"
+      Bernhard Schoelkopf, Alexander J. Smola, and Klaus-Robert Mueller. 1999.
+      In Advances in kernel methods, MIT Press, Cambridge, MA, USA 327-352.
+
+
+.. _kPCA_Solvers:
+
+Choice of solver for Kernel PCA
+-------------------------------
+
+While in :class:`PCA` the number of components is bounded by the number of
+features, in :class:`KernelPCA` the number of components is bounded by the
+number of samples. Many real-world datasets have a large number of samples! In
+these cases finding *all* the components with a full kPCA is a waste of
+computation time, as data is mostly described by the first few components
+(e.g. ``n_components<=100``). In other words, the centered Gram matrix that
+is eigendecomposed in the Kernel PCA fitting process has an effective rank that
+is much smaller than its size. This is a situation where approximate
+eigensolvers can provide speedup with very low precision loss.
+
+The optional parameter ``eigen_solver='randomized'`` can be used to
+*significantly* reduce the computation time when the number of requested
+``n_components`` is small compared with the number of samples. It relies on
+randomized decomposition methods to find an approximate solution in a shorter
+time.
+
+The time complexity of the randomized :class:`KernelPCA` is
+:math:`O(n_{\mathrm{samples}}^2 \cdot n_{\mathrm{components}})`
+instead of :math:`O(n_{\mathrm{samples}}^3)` for the exact method
+implemented with ``eigen_solver='dense'``.
+
+The memory footprint of randomized :class:`KernelPCA` is also proportional to
+:math:`2 \cdot n_{\mathrm{samples}} \cdot n_{\mathrm{components}}` instead of
+:math:`n_{\mathrm{samples}}^2` for the exact method.
+
+Note: this technique is the same as in :ref:`RandomizedPCA`.
+
+In addition to the above two solvers, ``eigen_solver='arpack'`` can be used as
+an alternate way to get an approximate decomposition. In practice, this method
+only provides reasonable execution times when the number of components to find
+is extremely small. It is enabled by default when the desired number of
+components is less than 10 (strict) and the number of samples is more than 200
+(strict). See :class:`KernelPCA` for details.
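+
+A minimal sketch of requesting the randomized solver (the random 1000-by-10
+data matrix below is only an illustrative assumption)::
+
+    >>> import numpy as np
+    >>> from sklearn.decomposition import KernelPCA
+    >>> rng = np.random.RandomState(0)
+    >>> X = rng.random_sample((1000, 10))
+    >>> kpca = KernelPCA(n_components=20, kernel="rbf",
+    ...                  eigen_solver="randomized", random_state=0)
+    >>> X_reduced = kpca.fit_transform(X)
+    >>> X_reduced.shape
+    (1000, 20)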
+
+.. topic:: References:
+
+    * *dense* solver:
+      `scipy.linalg.eigh documentation
+      `_
+
+    * *randomized* solver:
+
+        - Algorithm 4.3 in
+          `"Finding structure with randomness: Stochastic algorithms for
+          constructing approximate matrix decompositions"
+          `_
+          Halko, et al., 2009
+
+        - `"An implementation of a randomized algorithm for principal component
+          analysis"
+          `_
+          A. Szlam et al. 2014
+
+    * *arpack* solver:
+      `scipy.sparse.linalg.eigsh documentation
+      `_
+      R. B. Lehoucq, D. C. Sorensen, and C. Yang, 1998
+
+
 .. _LSA:
 
 Truncated singular value decomposition and latent semantic analysis
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 0a13d22860d07..0cd1d6a89d158 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -159,14 +159,17 @@ Changelog
 - |Fix| Fixes incorrect multiple data-conversion warnings when clustering
   boolean data. :pr:`19046` by :user:`Surya Prakash `.
 
-:mod:`sklearn.decomposition`
-............................
-
 - |Fix| Fixed :func:`dict_learning`, used by :class:`DictionaryLearning`, to
   ensure determinism of the output. Achieved by flipping signs of the SVD
   output which is used to initialize the code.
   :pr:`18433` by :user:`Bruno Charron `.
 
+- |Enhancement| added a new approximate solver (randomized SVD, available with
+  `eigen_solver='randomized'`) to :class:`decomposition.KernelPCA`. This
+  significantly accelerates computation when the number of samples is much
+  larger than the desired number of components.
+  :pr:`12069` by :user:`Sylvain Marié `.
+
 - |Fix| Fixed a bug in :class:`MiniBatchDictionaryLearning`,
   :class:`MiniBatchSparsePCA` and :func:`dict_learning_online` where the
   update of the dictionary was incorrect. :pr:`19198` by
@@ -395,8 +398,8 @@ Changelog
   supporting sparse matrix and raise the appropriate error message.
   :pr:`19879` by :user:`Guillaume Lemaitre `.
 
-- |Efficiency| Changed ``algorithm`` argument for :class:`cluster.KMeans` in 
-  :class:`preprocessing.KBinsDiscretizer` from ``auto`` to ``full``. 
+- |Efficiency| Changed ``algorithm`` argument for :class:`cluster.KMeans` in
+  :class:`preprocessing.KBinsDiscretizer` from ``auto`` to ``full``.
   :pr:`19934` by :user:`Gleb Levitskiy `.
 
 :mod:`sklearn.tree`
diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py
index 415ee034c1769..8663193a8383e 100644
--- a/sklearn/decomposition/_kernel_pca.py
+++ b/sklearn/decomposition/_kernel_pca.py
@@ -1,6 +1,7 @@
 """Kernel Principal Components Analysis."""
 
 # Author: Mathieu Blondel 
+#         Sylvain Marie 
 # License: BSD 3 clause
 
 import numpy as np
@@ -8,7 +9,7 @@
 from scipy.sparse.linalg import eigsh
 
 from ..utils._arpack import _init_arpack_v0
-from ..utils.extmath import svd_flip
+from ..utils.extmath import svd_flip, _randomized_eigsh
 from ..utils.validation import check_is_fitted, _check_psd_eigenvalues
 from ..utils.deprecation import deprecated
 from ..exceptions import NotFittedError
@@ -24,6 +25,12 @@ class KernelPCA(TransformerMixin, BaseEstimator):
     Non-linear dimensionality reduction through the use of kernels (see
     :ref:`metrics`).
 
+    It uses the `scipy.linalg.eigh` LAPACK implementation of the full SVD or
+    the `scipy.sparse.linalg.eigsh` ARPACK implementation of the truncated SVD,
+    depending on the shape of the input data and the number of components to
+    extract. It can also use a randomized truncated SVD by the method of
+    Halko et al. 2009, see `eigen_solver`.
+
     Read more in the :ref:`User Guide `.
 
     Parameters
@@ -59,10 +66,37 @@ class KernelPCA(TransformerMixin, BaseEstimator):
         Learn the inverse transform for non-precomputed kernels.
         (i.e. learn to find the pre-image of a point)
 
-    eigen_solver : {'auto', 'dense', 'arpack'}, default='auto'
-        Select eigensolver to use. If n_components is much less than
-        the number of training samples, arpack may be more efficient
-        than the dense eigensolver.
+    eigen_solver : {'auto', 'dense', 'arpack', 'randomized'}, \
+        default='auto'
+        Select eigensolver to use. If `n_components` is much
+        less than the number of training samples, randomized (or arpack to a
+        smaller extent) may be more efficient than the dense eigensolver.
+        Randomized SVD is performed according to the method of Halko et al.
+
+        auto :
+            the solver is selected by a default policy based on n_samples
+            (the number of training samples) and `n_components`:
+            if the number of components to extract is less than 10 (strict) and
+            the number of samples is more than 200 (strict), the 'arpack'
+            method is enabled. Otherwise the exact full eigenvalue
+            decomposition is computed and optionally truncated afterwards
+            ('dense' method).
+        dense :
+            run exact full eigenvalue decomposition calling the standard
+            LAPACK solver via `scipy.linalg.eigh`, and select the components
+            by postprocessing
+        arpack :
+            run SVD truncated to n_components calling ARPACK solver using
+            `scipy.sparse.linalg.eigsh`. It requires strictly
+            0 < n_components < n_samples
+        randomized :
+            run randomized SVD by the method of Halko et al. The current
+            implementation selects eigenvalues based on their module; therefore
+            using this method can lead to unexpected results if the kernel is
+            not positive semi-definite.
+
+        .. versionchanged:: 1.0
+           `'randomized'` was added.
 
     tol : float, default=0
         Convergence tolerance for arpack.
@@ -72,6 +106,13 @@ class KernelPCA(TransformerMixin, BaseEstimator):
         Maximum number of iterations for arpack.
         If None, optimal value will be chosen by arpack.
 
+    iterated_power : int >= 0, or 'auto', default='auto'
+        Number of iterations for the power method computed when
+        `eigen_solver='randomized'`. When 'auto', it is set to 7 when
+        `n_components < 0.1 * min(X.shape)`, otherwise it is set to 4.
+
+        .. versionadded:: 1.0
+
     remove_zero_eig : bool, default=False
         If True, then all components with zero eigenvalues are removed, so
         that the number of components in the output may be < n_components
@@ -80,8 +121,8 @@ class KernelPCA(TransformerMixin, BaseEstimator):
         with zero eigenvalues are removed regardless.
 
     random_state : int, RandomState instance or None, default=None
-        Used when ``eigen_solver`` == 'arpack'. Pass an int for reproducible
-        results across multiple function calls.
+        Used when ``eigen_solver`` == 'arpack' or 'randomized'. Pass an int
+        for reproducible results across multiple function calls.
         See :term:`Glossary `.
 
         .. versionadded:: 0.18
@@ -141,12 +182,22 @@ class KernelPCA(TransformerMixin, BaseEstimator):
         and Klaus-Robert Mueller. 1999. Kernel principal
         component analysis. In Advances in kernel methods,
         MIT Press, Cambridge, MA, USA 327-352.
+
+    For eigen_solver == 'arpack', refer to `scipy.sparse.linalg.eigsh`.
+
+    For eigen_solver == 'randomized', see:
+        Finding structure with randomness: Stochastic algorithms
+        for constructing approximate matrix decompositions Halko, et al., 2009
+        (arXiv:0909.4061)
+        A randomized algorithm for the decomposition of matrices
+        Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert
     """
     @_deprecate_positional_args
     def __init__(self, n_components=None, *, kernel="linear",
                  gamma=None, degree=3, coef0=1, kernel_params=None,
                  alpha=1.0, fit_inverse_transform=False, eigen_solver='auto',
-                 tol=0, max_iter=None, remove_zero_eig=False,
+                 tol=0, max_iter=None, iterated_power='auto',
+                 remove_zero_eig=False,
                  random_state=None, copy_X=True, n_jobs=None):
         if fit_inverse_transform and kernel == 'precomputed':
             raise ValueError(
@@ -160,9 +211,10 @@ def __init__(self, n_components=None, *, kernel="linear",
         self.alpha = alpha
         self.fit_inverse_transform = fit_inverse_transform
         self.eigen_solver = eigen_solver
-        self.remove_zero_eig = remove_zero_eig
         self.tol = tol
         self.max_iter = max_iter
+        self.iterated_power = iterated_power
+        self.remove_zero_eig = remove_zero_eig
         self.random_state = random_state
         self.n_jobs = n_jobs
         self.copy_X = copy_X
@@ -191,9 +243,14 @@ def _fit_transform(self, K):
         # center kernel
         K = self._centerer.fit_transform(K)
 
+        # adjust n_components according to user inputs
         if self.n_components is None:
-            n_components = K.shape[0]
+            n_components = K.shape[0]  # use all dimensions
         else:
+            if self.n_components < 1:
+                raise ValueError(
+                    f"`n_components` should be >= 1, got: {self.n_components}"
+                )
             n_components = min(K.shape[0], self.n_components)
 
         # compute eigenvectors
@@ -206,6 +263,7 @@ def _fit_transform(self, K):
             eigen_solver = self.eigen_solver
 
         if eigen_solver == 'dense':
+            # Note: eigvals specifies the indices of smallest/largest to return
             self.lambdas_, self.alphas_ = linalg.eigh(
                 K, eigvals=(K.shape[0] - n_components, K.shape[0] - 1))
         elif eigen_solver == 'arpack':
@@ -215,6 +273,14 @@ def _fit_transform(self, K):
                                                 tol=self.tol,
                                                 maxiter=self.max_iter,
                                                 v0=v0)
+        elif eigen_solver == 'randomized':
+            self.lambdas_, self.alphas_ = _randomized_eigsh(
+                K, n_components=n_components, n_iter=self.iterated_power,
+                random_state=self.random_state, selection='module'
+            )
+        else:
+            raise ValueError("Unsupported value for `eigen_solver`: %r"
+                             % eigen_solver)
 
         # make sure that the eigenvalues are ok and fix numerical issues
         self.lambdas_ = _check_psd_eigenvalues(self.lambdas_,
diff --git a/sklearn/decomposition/tests/test_kernel_pca.py b/sklearn/decomposition/tests/test_kernel_pca.py
index adf68f1db1a6c..5c8d052a7aa14 100644
--- a/sklearn/decomposition/tests/test_kernel_pca.py
+++ b/sklearn/decomposition/tests/test_kernel_pca.py
@@ -3,11 +3,13 @@
 import pytest
 
 from sklearn.utils._testing import (assert_array_almost_equal,
-                                   assert_allclose)
+                                    assert_array_equal,
+                                    assert_allclose)
 
 from sklearn.decomposition import PCA, KernelPCA
 from sklearn.datasets import make_circles
 from sklearn.datasets import make_blobs
+from sklearn.exceptions import NotFittedError
 from sklearn.linear_model import Perceptron
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler
@@ -17,6 +19,12 @@
 
 
 def test_kernel_pca():
+    """Nominal test for all solvers and all known kernels + a custom one
+
+    It tests
+     - that fit_transform is equivalent to fit+transform
+     - that the shapes of transforms and inverse transforms are correct
+    """
     rng = np.random.RandomState(0)
     X_fit = rng.random_sample((5, 4))
     X_pred = rng.random_sample((2, 4))
@@ -26,7 +34,7 @@ def histogram(x, y, **kwargs):
         assert kwargs == {}    # no kernel_params that we didn't ask for
         return np.minimum(x, y).sum()
 
-    for eigen_solver in ("auto", "dense", "arpack"):
+    for eigen_solver in ("auto", "dense", "arpack", "randomized"):
         for kernel in ("linear", "rbf", "poly", histogram):
             # histogram kernel produces singular matrix inside linalg.solve
             # XXX use a least-squares approximation?
@@ -55,12 +63,31 @@ def histogram(x, y, **kwargs):
                 assert X_pred2.shape == X_pred.shape
 
 
+def test_kernel_pca_invalid_solver():
+    """Check that kPCA raises an error if the solver parameter is invalid
+
+    """
+    with pytest.raises(ValueError):
+        KernelPCA(eigen_solver="unknown").fit(np.random.randn(10, 10))
+
+
 def test_kernel_pca_invalid_parameters():
+    """Check that kPCA raises an error if the parameters are invalid
+
+    Tests fitting inverse transform with a precomputed kernel raises a
+    ValueError.
+    """
     with pytest.raises(ValueError):
         KernelPCA(10, fit_inverse_transform=True, kernel='precomputed')
 
 
 def test_kernel_pca_consistent_transform():
+    """Check robustness to mutations in the original training array
+
+    Test that after fitting a kPCA model, it stays independent of any
+    mutation of the values of the original data object by relying on an
+    internal copy.
+    """
     # X_fit_ needs to retain the old, unmodified copy of X
     state = np.random.RandomState(0)
     X = state.rand(10, 10)
@@ -74,6 +101,10 @@ def test_kernel_pca_consistent_transform():
 
 
 def test_kernel_pca_deterministic_output():
+    """Test that Kernel PCA produces deterministic output
+
+    Tests that the same inputs and random state produce the same output.
+    """
     rng = np.random.RandomState(0)
     X = rng.rand(10, 10)
     eigen_solver = ('arpack', 'dense')
@@ -89,15 +120,20 @@ def test_kernel_pca_deterministic_output():
 
 
 def test_kernel_pca_sparse():
+    """Test that kPCA works on a sparse data input.
+
+    Same test as ``test_kernel_pca`` except for ``inverse_transform``, which
+    is not implemented for sparse matrices.
+    """
     rng = np.random.RandomState(0)
     X_fit = sp.csr_matrix(rng.random_sample((5, 4)))
     X_pred = sp.csr_matrix(rng.random_sample((2, 4)))
 
-    for eigen_solver in ("auto", "arpack"):
+    for eigen_solver in ("auto", "arpack", "randomized"):
         for kernel in ("linear", "rbf", "poly"):
             # transform fit data
             kpca = KernelPCA(4, kernel=kernel, eigen_solver=eigen_solver,
-                             fit_inverse_transform=False)
+                             fit_inverse_transform=False, random_state=0)
             X_fit_transformed = kpca.fit_transform(X_fit)
             X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
             assert_array_almost_equal(np.abs(X_fit_transformed),
@@ -108,31 +144,47 @@ def test_kernel_pca_sparse():
             assert (X_pred_transformed.shape[1] ==
                          X_fit_transformed.shape[1])
 
-            # inverse transform
-            # X_pred2 = kpca.inverse_transform(X_pred_transformed)
-            # assert X_pred2.shape == X_pred.shape)
+            # inverse transform: not available for sparse matrices
+            # XXX: should we raise another exception type here? For instance:
+            # NotImplementedError.
+            with pytest.raises(NotFittedError):
+                kpca.inverse_transform(X_pred_transformed)
 
 
-def test_kernel_pca_linear_kernel():
+@pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"])
+@pytest.mark.parametrize("n_features", [4, 10])
+def test_kernel_pca_linear_kernel(solver, n_features):
+    """Test that kPCA with linear kernel is equivalent to PCA for all solvers.
+
+    KernelPCA with linear kernel should produce the same output as PCA.
+    """
     rng = np.random.RandomState(0)
-    X_fit = rng.random_sample((5, 4))
-    X_pred = rng.random_sample((2, 4))
+    X_fit = rng.random_sample((5, n_features))
+    X_pred = rng.random_sample((2, n_features))
 
     # for a linear kernel, kernel PCA should find the same projection as PCA
     # modulo the sign (direction)
     # fit only the first four components: fifth is near zero eigenvalue, so
     # can be trimmed due to roundoff error
+    n_comps = 3 if solver == "arpack" else 4
     assert_array_almost_equal(
-        np.abs(KernelPCA(4).fit(X_fit).transform(X_pred)),
-        np.abs(PCA(4).fit(X_fit).transform(X_pred)))
+        np.abs(KernelPCA(n_comps, eigen_solver=solver).fit(X_fit)
+               .transform(X_pred)),
+        np.abs(PCA(n_comps, svd_solver=solver if solver != "dense" else "full")
+               .fit(X_fit).transform(X_pred)))
 
 
 def test_kernel_pca_n_components():
+    """Test that `n_components` is correctly taken into account for projections
+
+    For all solvers this tests that the output has the correct shape depending
+    on the selected number of components.
+    """
     rng = np.random.RandomState(0)
     X_fit = rng.random_sample((5, 4))
     X_pred = rng.random_sample((2, 4))
 
-    for eigen_solver in ("dense", "arpack"):
+    for eigen_solver in ("dense", "arpack", "randomized"):
         for c in [1, 2, 4]:
             kpca = KernelPCA(n_components=c, eigen_solver=eigen_solver)
             shape = kpca.fit(X_fit).transform(X_pred).shape
@@ -141,6 +193,11 @@ def test_kernel_pca_n_components():
 
 
 def test_remove_zero_eig():
+    """Check that the ``remove_zero_eig`` parameter works correctly.
+
+    Tests that the null-space (Zero) eigenvalues are removed when
+    remove_zero_eig=True, whereas they are not by default.
+    """
     X = np.array([[1 - 1e-30, 1], [1, 1], [1, 1 - 1e-20]])
 
     # n_components=None (default) => remove_zero_eig is True
@@ -158,9 +215,11 @@ def test_remove_zero_eig():
 
 
 def test_leave_zero_eig():
-    """This test checks that fit().transform() returns the same result as
+    """Non-regression test for issue #12141 (PR #12143)
+
+    This test checks that fit().transform() returns the same result as
     fit_transform() in case of non-removed zero eigenvalue.
-    Non-regression test for issue #12141 (PR #12143)"""
+    """
     X_fit = np.array([[1, 1], [0, 0]])
 
     # Assert that even with all np warnings on, there is no div by zero warning
@@ -184,23 +243,29 @@ def test_leave_zero_eig():
 
 
 def test_kernel_pca_precomputed():
+    """Test that kPCA works with a precomputed kernel, for all solvers
+
+    """
     rng = np.random.RandomState(0)
     X_fit = rng.random_sample((5, 4))
     X_pred = rng.random_sample((2, 4))
 
-    for eigen_solver in ("dense", "arpack"):
-        X_kpca = KernelPCA(4, eigen_solver=eigen_solver).\
-            fit(X_fit).transform(X_pred)
+    for eigen_solver in ("dense", "arpack", "randomized"):
+        X_kpca = KernelPCA(
+            4, eigen_solver=eigen_solver, random_state=0
+        ).fit(X_fit).transform(X_pred)
+
         X_kpca2 = KernelPCA(
-            4, eigen_solver=eigen_solver, kernel='precomputed').fit(
-                np.dot(X_fit, X_fit.T)).transform(np.dot(X_pred, X_fit.T))
+            4, eigen_solver=eigen_solver, kernel='precomputed', random_state=0
+        ).fit(np.dot(X_fit, X_fit.T)).transform(np.dot(X_pred, X_fit.T))
 
         X_kpca_train = KernelPCA(
-            4, eigen_solver=eigen_solver,
-            kernel='precomputed').fit_transform(np.dot(X_fit, X_fit.T))
+            4, eigen_solver=eigen_solver, kernel='precomputed', random_state=0
+        ).fit_transform(np.dot(X_fit, X_fit.T))
+
         X_kpca_train2 = KernelPCA(
-            4, eigen_solver=eigen_solver, kernel='precomputed').fit(
-                np.dot(X_fit, X_fit.T)).transform(np.dot(X_fit, X_fit.T))
+            4, eigen_solver=eigen_solver, kernel='precomputed', random_state=0
+        ).fit(np.dot(X_fit, X_fit.T)).transform(np.dot(X_fit, X_fit.T))
 
         assert_array_almost_equal(np.abs(X_kpca),
                                   np.abs(X_kpca2))
@@ -209,7 +274,42 @@ def test_kernel_pca_precomputed():
                                   np.abs(X_kpca_train2))
 
 
+@pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"])
+def test_kernel_pca_precomputed_non_symmetric(solver):
+    """Check that the kernel centerer works.
+
+    Tests that a non-symmetric precomputed kernel is actually accepted
+    because the kernel centerer does its job correctly.
+    """
+
+    # a non-symmetric Gram matrix
+    K = [
+        [1, 2],
+        [3, 40]
+    ]
+    kpca = KernelPCA(kernel="precomputed", eigen_solver=solver,
+                     n_components=1, random_state=0)
+    kpca.fit(K)  # no error
+
+    # same test with centered kernel
+    Kc = [
+        [9, -9],
+        [-9, 9]
+    ]
+    kpca_c = KernelPCA(kernel="precomputed", eigen_solver=solver,
+                       n_components=1, random_state=0)
+    kpca_c.fit(Kc)
+
+    # comparison between the non-centered and centered versions
+    assert_array_equal(kpca.alphas_, kpca_c.alphas_)
+    assert_array_equal(kpca.lambdas_, kpca_c.lambdas_)
+
+
 def test_kernel_pca_invalid_kernel():
+    """Tests that using an invalid kernel name raises a ValueError
+
+    An invalid kernel name should raise a ValueError at fit time.
+    """
     rng = np.random.RandomState(0)
     X_fit = rng.random_sample((2, 4))
     kpca = KernelPCA(kernel="tototiti")
@@ -218,8 +318,11 @@ def test_kernel_pca_invalid_kernel():
 
 
 def test_gridsearch_pipeline():
-    # Test if we can do a grid-search to find parameters to separate
-    # circles with a perceptron model.
+    """Check that kPCA works as expected in a grid search pipeline
+
+    Test if we can do a grid-search to find parameters to separate
+    circles with a perceptron model.
+    """
     X, y = make_circles(n_samples=400, factor=.3, noise=.05,
                         random_state=0)
     kpca = KernelPCA(kernel="rbf", n_components=2)
@@ -232,8 +335,11 @@ def test_gridsearch_pipeline():
 
 
 def test_gridsearch_pipeline_precomputed():
-    # Test if we can do a grid-search to find parameters to separate
-    # circles with a perceptron model using a precomputed kernel.
+    """Check that kPCA works as expected in a grid search pipeline (2)
+
+    Test if we can do a grid-search to find parameters to separate
+    circles with a perceptron model. This test uses a precomputed kernel.
+    """
     X, y = make_circles(n_samples=400, factor=.3, noise=.05,
                         random_state=0)
     kpca = KernelPCA(kernel="precomputed", n_components=2)
@@ -247,7 +353,12 @@ def test_gridsearch_pipeline_precomputed():
 
 
 def test_nested_circles():
-    # Test the linear separability of the first 2D KPCA transform
+    """Check that kPCA projects in a space where nested circles are separable
+
+    Tests that 2D nested circles become separable with a perceptron when
+    projected in the first 2 kPCA using an RBF kernel, while raw samples
+    are not directly separable in the original space.
+    """
     X, y = make_circles(n_samples=400, factor=.3, noise=.05,
                         random_state=0)
 
@@ -270,8 +381,10 @@ def test_nested_circles():
 
 
 def test_kernel_conditioning():
-    """ Test that ``_check_psd_eigenvalues`` is correctly called
-    Non-regression test for issue #12140 (PR #12145)"""
+    """Check that ``_check_psd_eigenvalues`` is correctly called in kPCA
+
+    Non-regression test for issue #12140 (PR #12145).
+    """
 
     # create a pathological X leading to small non-zero eigenvalue
     X = [[5, 1],
@@ -286,11 +399,93 @@ def test_kernel_conditioning():
     assert np.all(kpca.lambdas_ == _check_psd_eigenvalues(kpca.lambdas_))
 
 
+@pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"])
+def test_precomputed_kernel_not_psd(solver):
+    """Check how KernelPCA works with non-PSD kernels depending on n_components
+
+    Tests for all methods what happens with a non PSD gram matrix (this
+    can happen in an isomap scenario, or with custom kernel functions, or
+    maybe with ill-posed datasets).
+
+    When ``n_components`` is large enough to capture a negative eigenvalue, an
+    error should be raised. Otherwise, KernelPCA should run without error
+    since the negative eigenvalues are not selected.
+    """
+
+    # a non PSD kernel with large eigenvalues, already centered
+    # it was captured from an isomap call and multiplied by 100 for compactness
+    K = [
+        [4.48, -1., 8.07, 2.33, 2.33, 2.33, -5.76, -12.78],
+        [-1., -6.48, 4.5, -1.24, -1.24, -1.24, -0.81, 7.49],
+        [8.07, 4.5, 15.48, 2.09, 2.09, 2.09, -11.1, -23.23],
+        [2.33, -1.24, 2.09, 4., -3.65, -3.65, 1.02, -0.9],
+        [2.33, -1.24, 2.09, -3.65, 4., -3.65, 1.02, -0.9],
+        [2.33, -1.24, 2.09, -3.65, -3.65, 4., 1.02, -0.9],
+        [-5.76, -0.81, -11.1, 1.02, 1.02, 1.02, 4.86, 9.75],
+        [-12.78, 7.49, -23.23, -0.9, -0.9, -0.9, 9.75, 21.46]
+    ]
+    # this gram matrix has 5 positive eigenvalues and 3 negative ones
+    # [ 52.72,   7.65,   7.65,   5.02,   0.  ,  -0.  ,  -6.13, -15.11]
+
+    # 1. ask for enough components to get a significant negative one
+    kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, n_components=7)
+    # make sure that the appropriate error is raised
+    with pytest.raises(ValueError,
+                       match="There are significant negative eigenvalues"):
+        kpca.fit(K)
+
+    # 2. ask for a small enough n_components to get only positive ones
+    kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, n_components=2)
+    if solver == 'randomized':
+        # the randomized method is still inconsistent with the others on this
+        # since it selects the eigenvalues based on the 2 largest moduli
+        # (absolute values), not on the 2 largest values.
+        #
+        # At least we can ensure that we return an error instead of returning
+        # the wrong eigenvalues
+        with pytest.raises(ValueError,
+                           match="There are significant negative eigenvalues"):
+            kpca.fit(K)
+    else:
+        # general case: make sure that it works
+        kpca.fit(K)
+
+
+@pytest.mark.parametrize("n_components", [4, 10, 20])
+def test_kernel_pca_solvers_equivalence(n_components):
+    """Check that the 'dense', 'arpack' and 'randomized' solvers give
+    similar results.
+    """
+
+    # Generate random data
+    n_train, n_test = 2000, 100
+    X, _ = make_circles(n_samples=(n_train + n_test), factor=.3, noise=.05,
+                        random_state=0)
+    X_fit, X_pred = X[:n_train, :], X[n_train:, :]
+
+    # reference (full)
+    ref_pred = KernelPCA(n_components, eigen_solver="dense", random_state=0
+                         ).fit(X_fit).transform(X_pred)
+
+    # arpack
+    a_pred = KernelPCA(n_components, eigen_solver="arpack", random_state=0
+                       ).fit(X_fit).transform(X_pred)
+    # check that the result is still correct despite the approx
+    assert_array_almost_equal(np.abs(a_pred), np.abs(ref_pred))
+
+    # randomized
+    r_pred = KernelPCA(n_components, eigen_solver="randomized", random_state=0
+                       ).fit(X_fit).transform(X_pred)
+    # check that the result is still correct despite the approximation
+    assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred))
+
+
 def test_kernel_pca_inverse_transform_reconstruction():
-    # Test if the reconstruction is a good approximation.
-    # Note that in general it is not possible to get an arbitrarily good
-    # reconstruction because of kernel centering that does not
-    # preserve all the information of the original data.
+    """Test if the reconstruction is a good approximation.
+
+    Note that in general it is not possible to get an arbitrarily good
+    reconstruction because of kernel centering that does not
+    preserve all the information of the original data.
+    """
     X, *_ = make_blobs(n_samples=100, n_features=4, random_state=0)
 
     kpca = KernelPCA(
@@ -302,8 +497,11 @@ def test_kernel_pca_inverse_transform_reconstruction():
 
 
 def test_32_64_decomposition_shape():
-    """ Test that the decomposition is similar for 32 and 64 bits data """
-    # see https://github.com/scikit-learn/scikit-learn/issues/18146
+    """Test that the decomposition is similar for 32 and 64 bits data
+
+    Non regression test for
+    https://github.com/scikit-learn/scikit-learn/issues/18146
+    """
     X, y = make_blobs(
         n_samples=30,
         centers=[[0, 0, 0], [1, 1, 1]],
@@ -321,6 +519,10 @@ def test_32_64_decomposition_shape():
 
 # TODO: Remove in 1.1
 def test_kernel_pcc_pairwise_is_deprecated():
+    """Check that `_pairwise` is correctly marked with deprecation warning
+
+    Tests that a `FutureWarning` is issued when `_pairwise` is accessed.
+    """
     kp = KernelPCA(kernel='precomputed')
     msg = r"Attribute _pairwise was deprecated in version 0\.24"
     with pytest.warns(FutureWarning, match=msg):
diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py
index add8c5883a751..c72c54bd1aa4d 100644
--- a/sklearn/utils/extmath.py
+++ b/sklearn/utils/extmath.py
@@ -249,6 +249,9 @@ def randomized_svd(M, n_components, *, n_oversamples=10, n_iter='auto',
                    flip_sign=True, random_state='warn'):
     """Computes a truncated randomized SVD.
 
+    This method solves the fixed-rank approximation problem described in the
+    Halko et al paper (problem (1.5), p5).
+
     Parameters
     ----------
     M : {ndarray, sparse matrix}
@@ -262,13 +265,23 @@ def randomized_svd(M, n_components, *, n_oversamples=10, n_iter='auto',
         to ensure proper conditioning. The total number of random vectors
         used to find the range of M is n_components + n_oversamples. Smaller
         number can improve speed but can negatively impact the quality of
-        approximation of singular vectors and singular values.
+        approximation of singular vectors and singular values. Users might wish
+        to increase this parameter up to `2*k - n_components` where k is the
+        effective rank, for large matrices, noisy problems, matrices with
+        slowly decaying spectra, or to improve precision. See Halko
+        et al. (pages 5, 23 and 26).
 
     n_iter : int or 'auto', default='auto'
         Number of power iterations. It can be used to deal with very noisy
         problems. When 'auto', it is set to 4, unless `n_components` is small
-        (< .1 * min(X.shape)) `n_iter` in which case is set to 7.
-        This improves precision with few components.
+        (< .1 * min(X.shape)) in which case `n_iter` is set to 7.
+        This improves precision with few components. Note that in general
+        users should rather increase `n_oversamples` before increasing `n_iter`
+        as the principle of the randomized method is to avoid these more
+        costly power iteration steps. When `n_components` is equal to or
+        greater than the effective matrix rank and the spectrum does not
+        present a slow decay, `n_iter=0` or `1` should even work fine in
+        theory (see the Halko et al. paper, page 9).
 
         .. versionchanged:: 0.18
 
@@ -316,12 +329,15 @@ def randomized_svd(M, n_components, *, n_oversamples=10, n_iter='auto',
     computations. It is particularly fast on large matrices on which
     you wish to extract only a small number of components. In order to
     obtain further speed up, `n_iter` can be set <=2 (at the cost of
-    loss of precision).
+    loss of precision). To increase the precision it is recommended to
+    increase `n_oversamples`, up to `2*k-n_components` where k is the
+    effective rank. Usually, `n_components` is chosen to be greater than k
+    so increasing `n_oversamples` up to `n_components` should be enough.
 
     References
     ----------
     * Finding structure with randomness: Stochastic algorithms for constructing
-      approximate matrix decompositions
+      approximate matrix decompositions (Algorithm 4.3)
       Halko, et al., 2009 https://arxiv.org/abs/0909.4061
 
     * A randomized algorithm for the decomposition of matrices
@@ -393,6 +409,152 @@ def randomized_svd(M, n_components, *, n_oversamples=10, n_iter='auto',
         return U[:, :n_components], s[:n_components], Vt[:n_components, :]
 
 
+@_deprecate_positional_args
+def _randomized_eigsh(M, n_components, *, n_oversamples=10, n_iter='auto',
+                      power_iteration_normalizer='auto',
+                      selection='module', random_state=None):
+    """Computes a truncated eigendecomposition using randomized methods
+
+    This method solves the fixed-rank approximation problem described in the
+    Halko et al paper.
+
+    The choice of which components to select can be tuned with the `selection`
+    parameter.
+
+    .. versionadded:: 1.0
+
+    Parameters
+    ----------
+    M : ndarray or sparse matrix
+        Matrix to decompose; it should be real symmetric square or complex
+        Hermitian.
+
+    n_components : int
+        Number of eigenvalues and vectors to extract.
+
+    n_oversamples : int, default=10
+        Additional number of random vectors to sample the range of M so as
+        to ensure proper conditioning. The total number of random vectors
+        used to find the range of M is n_components + n_oversamples. Smaller
+        number can improve speed but can negatively impact the quality of
+        approximation of eigenvectors and eigenvalues. Users might wish
+        to increase this parameter up to `2*k - n_components` where k is the
+        effective rank, for large matrices, noisy problems, matrices with
+        slowly decaying spectra, or to improve precision. See Halko
+        et al. (pages 5, 23 and 26).
+
+    n_iter : int or 'auto', default='auto'
+        Number of power iterations. It can be used to deal with very noisy
+        problems. When 'auto', it is set to 4, unless `n_components` is small
+        (< .1 * min(X.shape)) in which case `n_iter` is set to 7.
+        This improves precision with few components. Note that in general
+        users should rather increase `n_oversamples` before increasing `n_iter`
+        as the principle of the randomized method is to avoid these more
+        costly power iteration steps. When `n_components` is equal to or
+        greater than the effective matrix rank and the spectrum does not
+        present a slow decay, `n_iter=0` or `1` should even work fine in
+        theory (see the Halko et al. paper, page 9).
+
+    power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'
+        Whether the power iterations are normalized with step-by-step
+        QR factorization (the slowest but most accurate), 'none'
+        (the fastest but numerically unstable when `n_iter` is large, e.g.
+        typically 5 or larger), or 'LU' factorization (numerically stable
+        but can lose slightly in accuracy). The 'auto' mode applies no
+        normalization if `n_iter` <= 2 and switches to LU otherwise.
+
+    selection : {'value', 'module'}, default='module'
+        Strategy used to select the n components. When `selection` is `'value'`
+        (not yet implemented, will become the default when implemented), the
+        components corresponding to the n largest eigenvalues are returned.
+        When `selection` is `'module'`, the components corresponding to the n
+        eigenvalues with the largest moduli (absolute values) are returned.
+
+    random_state : int, RandomState instance, default=None
+        The seed of the pseudo random number generator to use when shuffling
+        the data, i.e. getting the random vectors to initialize the algorithm.
+        Pass an int for reproducible results across multiple function calls.
+        See :term:`Glossary `.
+
+    Notes
+    -----
+    This algorithm finds a (usually very good) approximate truncated
+    eigendecomposition using randomized methods to speed up the computations.
+
+    This method is particularly fast on large matrices on which
+    you wish to extract only a small number of components. In order to
+    obtain further speed up, `n_iter` can be set <=2 (at the cost of
+    loss of precision). To increase the precision it is recommended to
+    increase `n_oversamples`, up to `2*k-n_components` where k is the
+    effective rank. Usually, `n_components` is chosen to be greater than k
+    so increasing `n_oversamples` up to `n_components` should be enough.
+
+    Strategy 'value': not implemented yet.
+    Algorithms 5.3, 5.4 and 5.5 in the Halko et al paper should provide good
+    candidates for a future implementation.
+
+    Strategy 'module':
+    The principle is that for diagonalizable matrices, the singular values and
+    eigenvalues are related: if t is an eigenvalue of A, then :math:`|t|` is a
+    singular value of A. This method relies on a randomized SVD to find the n
+    singular components corresponding to the n singular values with the
+    largest moduli, and then uses the signs of the singular vectors to recover
+    the true sign of t: if the signs of the left and right singular vectors
+    differ, the corresponding eigenvalue is negative.
+
+    Returns
+    -------
+    eigvals : 1D array of shape (n_components,) containing the `n_components`
+        eigenvalues selected (see ``selection`` parameter).
+    eigvecs : 2D array of shape (M.shape[0], n_components) containing the
+        `n_components` eigenvectors corresponding to the `eigvals`, in the
+        corresponding order. Note that this follows the `scipy.linalg.eigh`
+        convention.
+
+    See Also
+    --------
+    :func:`randomized_svd`
+
+    References
+    ----------
+    * Finding structure with randomness: Stochastic algorithms for constructing
+      approximate matrix decompositions (Algorithm 4.3 for strategy 'module')
+      Halko, et al., 2009 https://arxiv.org/abs/0909.4061
+
+    """
+    if selection == 'value':  # pragma: no cover
+        # to do : an algorithm can be found in the Halko et al reference
+        raise NotImplementedError()
+
+    elif selection == 'module':
+        # Note: no need for deterministic U and Vt (flip_sign=True),
+        # as we only use the dot product UVt afterwards
+        U, S, Vt = randomized_svd(
+            M, n_components=n_components, n_oversamples=n_oversamples,
+            n_iter=n_iter,
+            power_iteration_normalizer=power_iteration_normalizer,
+            flip_sign=False, random_state=random_state)
+
+        eigvecs = U[:, :n_components]
+        eigvals = S[:n_components]
+
+        # Conversion of Singular values into Eigenvalues:
+        # For any eigenvalue t, the corresponding singular value is |t|.
+        # So if there is a negative eigenvalue t, the corresponding singular
+        # value will be -t, and the left (U) and right (V) singular vectors
+        # will have opposite signs.
+        # Fastest way: see 
+        diag_VtU = np.einsum('ji,ij->j',
+                             Vt[:n_components, :], U[:, :n_components])
+        signs = np.sign(diag_VtU)
+        eigvals = eigvals * signs
+
+    else:  # pragma: no cover
+        raise ValueError("Invalid `selection`: %r" % selection)
+
+    return eigvals, eigvecs
+
+
 @_deprecate_positional_args
 def weighted_mode(a, w, *, axis=0):
     """Returns an array of the weighted modal (most common) value in a.
diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py
index 8e53d94d911f0..1a77d08b12388 100644
--- a/sklearn/utils/tests/test_extmath.py
+++ b/sklearn/utils/tests/test_extmath.py
@@ -8,11 +8,12 @@
 from scipy import sparse
 from scipy import linalg
 from scipy import stats
+from scipy.sparse.linalg import eigsh
 from scipy.special import expit
 
 import pytest
 from sklearn.utils import gen_batches
-
+from sklearn.utils._arpack import _init_arpack_v0
 from sklearn.utils._testing import assert_almost_equal
 from sklearn.utils._testing import assert_allclose
 from sklearn.utils._testing import assert_allclose_dense_sparse
@@ -23,7 +24,7 @@
 from sklearn.utils._testing import skip_if_32bit
 
 from sklearn.utils.extmath import density, _safe_accumulator_op
-from sklearn.utils.extmath import randomized_svd
+from sklearn.utils.extmath import randomized_svd, _randomized_eigsh
 from sklearn.utils.extmath import row_norms
 from sklearn.utils.extmath import weighted_mode
 from sklearn.utils.extmath import cartesian
@@ -34,7 +35,7 @@
 from sklearn.utils.extmath import softmax
 from sklearn.utils.extmath import stable_cumsum
 from sklearn.utils.extmath import safe_sparse_dot
-from sklearn.datasets import make_low_rank_matrix
+from sklearn.datasets import make_low_rank_matrix, make_sparse_spd_matrix
 
 
 def test_density():
@@ -161,6 +162,128 @@ def test_randomized_svd_low_rank_all_dtypes(dtype):
     check_randomized_svd_low_rank(dtype)
 
 
+@pytest.mark.parametrize('dtype',
+                         (np.int32, np.int64, np.float32, np.float64))
+def test_randomized_eigsh(dtype):
+    """Test that `_randomized_eigsh` returns the appropriate components"""
+
+    rng = np.random.RandomState(42)
+    X = np.diag(np.array([1., -2., 0., 3.], dtype=dtype))
+    # random rotation that preserves the eigenvalues of X
+    rand_rot = np.linalg.qr(rng.normal(size=X.shape))[0]
+    X = rand_rot @ X @ rand_rot.T
+
+    # with 'module' selection method, the negative eigenvalue shows up
+    eigvals, eigvecs = _randomized_eigsh(X, n_components=2, selection='module')
+    # eigenvalues
+    assert eigvals.shape == (2,)
+    assert_array_almost_equal(eigvals, [3., -2.])  # negative eigenvalue here
+    # eigenvectors
+    assert eigvecs.shape == (4, 2)
+
+    # the 'value' selection method is not implemented yet and should raise
+    with pytest.raises(NotImplementedError):
+        _randomized_eigsh(X, n_components=2, selection='value')
+
+
+@pytest.mark.parametrize('k', (10, 50, 100, 199, 200))
+def test_randomized_eigsh_compared_to_others(k):
+    """Check that `_randomized_eigsh` is similar to other `eigsh`
+
+    Tests that for a random PSD matrix, `_randomized_eigsh` provides results
+    comparable to LAPACK (scipy.linalg.eigh) and ARPACK
+    (scipy.sparse.linalg.eigsh).
+
+    Note: some versions of ARPACK do not support k=n_features.
+    """
+
+    # make a random PSD matrix
+    n_features = 200
+    X = make_sparse_spd_matrix(n_features, random_state=0)
+
+    # compare two versions of randomized
+    # rough and fast
+    eigvals, eigvecs = _randomized_eigsh(X, n_components=k, selection='module',
+                                         n_iter=25, random_state=0)
+    # more accurate but slow (TODO find realistic settings here)
+    eigvals_qr, eigvecs_qr = _randomized_eigsh(
+        X, n_components=k, n_iter=25, n_oversamples=20, random_state=0,
+        power_iteration_normalizer="QR", selection='module'
+    )
+
+    # with LAPACK
+    eigvals_lapack, eigvecs_lapack = linalg.eigh(X, eigvals=(n_features - k,
+                                                             n_features - 1))
+    indices = eigvals_lapack.argsort()[::-1]
+    eigvals_lapack = eigvals_lapack[indices]
+    eigvecs_lapack = eigvecs_lapack[:, indices]
+
+    # -- eigenvalues comparison
+    assert eigvals_lapack.shape == (k,)
+    # comparison precision
+    assert_array_almost_equal(eigvals, eigvals_lapack, decimal=6)
+    assert_array_almost_equal(eigvals_qr, eigvals_lapack, decimal=6)
+
+    # -- eigenvectors comparison
+    assert eigvecs_lapack.shape == (n_features, k)
+    # flip eigenvectors' sign to enforce deterministic output
+    dummy_vecs = np.zeros_like(eigvecs).T
+    eigvecs, _ = svd_flip(eigvecs, dummy_vecs)
+    eigvecs_qr, _ = svd_flip(eigvecs_qr, dummy_vecs)
+    eigvecs_lapack, _ = svd_flip(eigvecs_lapack, dummy_vecs)
+    assert_array_almost_equal(eigvecs, eigvecs_lapack, decimal=4)
+    assert_array_almost_equal(eigvecs_qr, eigvecs_lapack, decimal=6)
+
+    # comparison ARPACK ~ LAPACK (some ARPACK versions do not support k=n)
+    if k < n_features:
+        v0 = _init_arpack_v0(n_features, random_state=0)
+        # "LA" largest algebraic <=> selection="value" in randomized_eigsh
+        eigvals_arpack, eigvecs_arpack = eigsh(X, k, which="LA", tol=0,
+                                               maxiter=None, v0=v0)
+        indices = eigvals_arpack.argsort()[::-1]
+        # eigenvalues
+        eigvals_arpack = eigvals_arpack[indices]
+        assert_array_almost_equal(eigvals_lapack, eigvals_arpack, decimal=10)
+        # eigenvectors
+        eigvecs_arpack = eigvecs_arpack[:, indices]
+        eigvecs_arpack, _ = svd_flip(eigvecs_arpack, dummy_vecs)
+        assert_array_almost_equal(eigvecs_arpack, eigvecs_lapack, decimal=8)
+
+
+@pytest.mark.parametrize("n,rank", [
+    (10, 7),
+    (100, 10),
+    (100, 80),
+    (500, 10),
+    (500, 250),
+    (500, 400),
+])
+def test_randomized_eigsh_reconst_low_rank(n, rank):
+    """Check that randomized_eigsh can reconstruct a low-rank PSD matrix
+
+    Tests that the decomposition provided by `_randomized_eigsh` leads to
+    orthonormal eigenvectors, and that a low rank PSD matrix can be effectively
+    reconstructed with good accuracy using it.
+    """
+    assert rank < n
+
+    # create a low rank PSD
+    rng = np.random.RandomState(69)
+    X = rng.randn(n, rank)
+    A = X @ X.T
+
+    # approximate A with the "right" number of components
+    S, V = _randomized_eigsh(A, n_components=rank, random_state=rng)
+    # orthonormality checks
+    assert_array_almost_equal(np.linalg.norm(V, axis=0), np.ones(S.shape))
+    assert_array_almost_equal(V.T @ V, np.diag(np.ones(S.shape)))
+    # reconstruction
+    A_reconstruct = V @ np.diag(S) @ V.T
+
+    # test that the approximation is good
+    assert_array_almost_equal(A_reconstruct, A, decimal=6)
+
+
 @pytest.mark.parametrize('dtype',
                          (np.float32, np.float64))
 def test_row_norms(dtype):

From 6a562d31e4488b067bff89fbce21382076823fa3 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre 
Date: Tue, 27 Apr 2021 17:45:20 +0200
Subject: [PATCH 369/478] ENH improve KernelCenterer documentation and tests
 (#19901)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Julien Jerphanion 
Co-authored-by: Sylvain Marié 
---
 doc/modules/preprocessing.rst            | 63 ++++++++++++++++++--
 sklearn/preprocessing/_data.py           | 36 +++++++++---
 sklearn/preprocessing/tests/test_data.py | 74 ++++++++++++++++++++++++
 3 files changed, 162 insertions(+), 11 deletions(-)

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index cdde7479b1a4f..0afd79b754608 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -240,10 +240,65 @@ Centering kernel matrices
 -------------------------
 
 If you have a kernel matrix of a kernel :math:`K` that computes a dot product
-in a feature space defined by function :math:`\phi`,
-a :class:`KernelCenterer` can transform the kernel matrix
-so that it contains inner products in the feature space
-defined by :math:`\phi` followed by removal of the mean in that space.
+in a feature space (possibly implicitly) defined by a function
+:math:`\phi(\cdot)`, a :class:`KernelCenterer` can transform the kernel matrix
+so that it contains inner products in the feature space defined by :math:`\phi`
+followed by the removal of the mean in that space. In other words,
+:class:`KernelCenterer` computes the centered Gram matrix associated to a
+positive semidefinite kernel :math:`K`.
+
+**Mathematical formulation**
+
+We can have a look at the mathematical formulation now that we have the
+intuition. Let :math:`K` be a kernel matrix of shape `(n_samples, n_samples)`
+computed from :math:`X`, a data matrix of shape `(n_samples, n_features)`,
+during the `fit` step. :math:`K` is defined by
+
+.. math::
+  K(X, X) = \phi(X) . \phi(X)^{T}
+
+:math:`\phi(X)` is a function mapping of :math:`X` to a Hilbert space. A
+centered kernel :math:`\tilde{K}` is defined as:
+
+.. math::
+  \tilde{K}(X, X) = \tilde{\phi}(X) . \tilde{\phi}(X)^{T}
+
+where :math:`\tilde{\phi}(X)` results from centering :math:`\phi(X)` in the
+Hilbert space.
+
+Thus, one could compute :math:`\tilde{K}` by mapping :math:`X` using the
+function :math:`\phi(\cdot)` and centering the data in this new space. However,
+kernels are often used because they allow some algebraic calculations that
+avoid explicitly computing this mapping using :math:`\phi(\cdot)`. Indeed, one
+can implicitly center as shown in Appendix B in [Scholkopf1998]_:
+
+.. math::
+  \tilde{K} = K - 1_{\text{n}_{samples}} K - K 1_{\text{n}_{samples}} + 1_{\text{n}_{samples}} K 1_{\text{n}_{samples}}
+
+:math:`1_{\text{n}_{samples}}` is a matrix of `(n_samples, n_samples)` where
+all entries are equal to :math:`\frac{1}{\text{n}_{samples}}`. In the
+`transform` step, the kernel becomes :math:`K_{test}(X, Y)` defined as:
+
+.. math::
+  K_{test}(X, Y) = \phi(Y) . \phi(X)^{T}
+
+:math:`Y` is the test dataset of shape `(n_samples_test, n_features)` and thus
+:math:`K_{test}` is of shape `(n_samples_test, n_samples)`. In this case,
+centering :math:`K_{test}` is done as:
+
+.. math::
+  \tilde{K}_{test}(X, Y) = K_{test} - 1'_{\text{n}_{samples}} K - K_{test} 1_{\text{n}_{samples}} + 1'_{\text{n}_{samples}} K 1_{\text{n}_{samples}}
+
+:math:`1'_{\text{n}_{samples}}` is a matrix of shape
+`(n_samples_test, n_samples)` where all entries are equal to
+:math:`\frac{1}{\text{n}_{samples}}`.
+
+.. topic:: References
+
+  .. [Scholkopf1998] B. Schölkopf, A. Smola, and K.R. Müller,
+    `"Nonlinear component analysis as a kernel eigenvalue problem."
+    `_
+    Neural computation 10.5 (1998): 1299-1319.
 
 .. _preprocessing_transformer:
 
diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py
index 80cb132174328..befd3e61b96fc 100644
--- a/sklearn/preprocessing/_data.py
+++ b/sklearn/preprocessing/_data.py
@@ -1951,24 +1951,46 @@ def _more_tags(self):
 
 
 class KernelCenterer(TransformerMixin, BaseEstimator):
-    """Center a kernel matrix.
+    r"""Center an arbitrary kernel matrix :math:`K`.
 
-    Let K(x, z) be a kernel defined by phi(x)^T phi(z), where phi is a
-    function mapping x to a Hilbert space. KernelCenterer centers (i.e.,
-    normalize to have zero mean) the data without explicitly computing phi(x).
-    It is equivalent to centering phi(x) with
-    sklearn.preprocessing.StandardScaler(with_std=False).
+    Let us define a kernel :math:`K` such that:
+
+    .. math::
+        K(X, Y) = \phi(X) . \phi(Y)^{T}
+
+    :math:`\phi(X)` is a function mapping of rows of :math:`X` to a
+    Hilbert space and :math:`K` is of shape `(n_samples, n_samples)`.
+
+    This class allows one to compute :math:`\tilde{K}(X, Y)` such that:
+
+    .. math::
+        \tilde{K}(X, Y) = \tilde{\phi}(X) . \tilde{\phi}(Y)^{T}
+
+    :math:`\tilde{\phi}(X)` is the centered mapped data in the Hilbert
+    space.
+
+    `KernelCenterer` centers the features without explicitly computing the
+    mapping :math:`\phi(\cdot)`. Working with centered kernels is sometimes
+    expected when dealing with algebraic computations such as the
+    eigendecomposition in :class:`~sklearn.decomposition.KernelPCA`, for instance.
 
     Read more in the :ref:`User Guide `.
 
     Attributes
     ----------
-    K_fit_rows_ : array of shape (n_samples,)
+    K_fit_rows_ : ndarray of shape (n_samples,)
         Average of each column of kernel matrix.
 
     K_fit_all_ : float
         Average of kernel matrix.
 
+    References
+    ----------
+    .. [1] `Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller.
+       "Nonlinear component analysis as a kernel eigenvalue problem."
+       Neural computation 10.5 (1998): 1299-1319.
+       `_
+
     Examples
     --------
     >>> from sklearn.preprocessing import KernelCenterer
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 45d967d5f39a2..2cc51a4208675 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -2167,6 +2167,80 @@ def test_center_kernel():
     K_pred_centered2 = centerer.transform(K_pred)
     assert_array_almost_equal(K_pred_centered, K_pred_centered2)
 
+    # check the results coherence with the method proposed in:
+    # B. Schölkopf, A. Smola, and K.R. Müller,
+    # "Nonlinear component analysis as a kernel eigenvalue problem"
+    # equation (B.3)
+
+    # K_centered3 = (I - 1_M) K (I - 1_M)
+    #             =  K - 1_M K - K 1_M + 1_M K 1_M
+    ones_M = np.ones_like(K_fit) / K_fit.shape[0]
+    K_fit_centered3 = (
+        K_fit - ones_M @ K_fit - K_fit @ ones_M + ones_M @ K_fit @ ones_M
+    )
+    assert_allclose(K_fit_centered, K_fit_centered3)
+
+    # K_test_centered3 = (K_test - 1'_M K)(I - 1_M)
+    #                  = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M
+    ones_prime_M = np.ones_like(K_pred) / K_fit.shape[0]
+    K_pred_centered3 = (
+        K_pred - ones_prime_M @ K_fit - K_pred @ ones_M +
+        ones_prime_M @ K_fit @ ones_M
+    )
+    assert_allclose(K_pred_centered, K_pred_centered3)
+
+
+def test_kernelcenterer_non_linear_kernel():
+    """Check kernel centering for non-linear kernel."""
+    rng = np.random.RandomState(0)
+    X, X_test = rng.randn(100, 50), rng.randn(20, 50)
+
+    def phi(X):
+        """Our mapping function phi."""
+        return np.vstack([
+            np.clip(X, a_min=0, a_max=None),
+            -np.clip(X, a_min=None, a_max=0),
+        ])
+
+    phi_X = phi(X)
+    phi_X_test = phi(X_test)
+
+    # centered the projection
+    scaler = StandardScaler(with_std=False)
+    phi_X_center = scaler.fit_transform(phi_X)
+    phi_X_test_center = scaler.transform(phi_X_test)
+
+    # create the different kernel
+    K = phi_X @ phi_X.T
+    K_test = phi_X_test @ phi_X.T
+    K_center = phi_X_center @ phi_X_center.T
+    K_test_center = phi_X_test_center @ phi_X_center.T
+
+    kernel_centerer = KernelCenterer()
+    kernel_centerer.fit(K)
+
+    assert_allclose(kernel_centerer.transform(K), K_center)
+    assert_allclose(kernel_centerer.transform(K_test), K_test_center)
+
+    # check the results coherence with the method proposed in:
+    # B. Schölkopf, A. Smola, and K.R. Müller,
+    # "Nonlinear component analysis as a kernel eigenvalue problem"
+    # equation (B.3)
+
+    # K_centered = (I - 1_M) K (I - 1_M)
+    #            =  K - 1_M K - K 1_M + 1_M K 1_M
+    ones_M = np.ones_like(K) / K.shape[0]
+    K_centered = K - ones_M @ K - K @ ones_M + ones_M @ K @ ones_M
+    assert_allclose(kernel_centerer.transform(K), K_centered)
+
+    # K_test_centered = (K_test - 1'_M K)(I - 1_M)
+    #                 = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M
+    ones_prime_M = np.ones_like(K_test) / K.shape[0]
+    K_test_centered = (
+        K_test - ones_prime_M @ K - K_test @ ones_M + ones_prime_M @ K @ ones_M
+    )
+    assert_allclose(kernel_centerer.transform(K_test), K_test_centered)
+
 
 def test_cv_pipeline_precomputed():
     # Cross-validate a regression on four coplanar points with the same

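The centering identity documented and tested above (equation (B.3) in [Scholkopf1998]) can be checked in a few lines using only public APIs; a minimal sketch, with a plain linear kernel standing in for an arbitrary kernel::

    import numpy as np
    from sklearn.preprocessing import KernelCenterer

    rng = np.random.RandomState(0)
    Phi, Phi_test = rng.randn(20, 5), rng.randn(8, 5)
    K = Phi @ Phi.T            # kernel between training points
    K_test = Phi_test @ Phi.T  # kernel between test and training points

    centerer = KernelCenterer().fit(K)

    # K_tilde = K - 1_M K - K 1_M + 1_M K 1_M
    ones_M = np.full_like(K, 1.0 / K.shape[0])
    K_tilde = K - ones_M @ K - K @ ones_M + ones_M @ K @ ones_M
    assert np.allclose(centerer.transform(K), K_tilde)

    # K_test_tilde = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M
    ones_prime_M = np.full((K_test.shape[0], K.shape[0]), 1.0 / K.shape[0])
    K_test_tilde = (K_test - ones_prime_M @ K - K_test @ ones_M
                    + ones_prime_M @ K @ ones_M)
    assert np.allclose(centerer.transform(K_test), K_test_tilde)
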
From 7b1c9afcaf34e622de76d1f5d5e929e5aaffc514 Mon Sep 17 00:00:00 2001
From: iwhalvic 
Date: Tue, 27 Apr 2021 11:23:08 -0500
Subject: [PATCH 370/478] FIX Use cho_solve when return_std=True for
 GaussianProcessRegressor (#19939)

---
 doc/whats_new/v0.24.rst                    |  7 +++
 sklearn/gaussian_process/_gpr.py           | 24 ++++-----
 sklearn/gaussian_process/tests/test_gpr.py | 57 +++++++++++++---------
 3 files changed, 49 insertions(+), 39 deletions(-)

diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst
index 72a96aa74f470..f54e20e5154bc 100644
--- a/doc/whats_new/v0.24.rst
+++ b/doc/whats_new/v0.24.rst
@@ -52,6 +52,13 @@ Changelog
 :mod:`sklearn.gaussian_process`
 ...............................
 
+- |Fix| Avoid explicitly forming the inverse covariance matrix in
+  :class:`gaussian_process.GaussianProcessRegressor` when set to output the
+  standard deviation. With certain covariance matrices this inverse is
+  numerically unstable to compute explicitly. Using a Cholesky solve instead
+  mitigates this issue.
+  :pr:`19939` by :user:`Ian Halvic `.
+
 - |Fix| Avoid division by zero when scaling constant target in
   :class:`gaussian_process.GaussianProcessRegressor`. It was due to a std. dev.
   equal to 0. Now, such case is detected and the std. dev. is affected to 1
diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py
index 8f9575ffe42df..9b1d0ae409526 100644
--- a/sklearn/gaussian_process/_gpr.py
+++ b/sklearn/gaussian_process/_gpr.py
@@ -8,7 +8,7 @@
 from operator import itemgetter
 
 import numpy as np
-from scipy.linalg import cholesky, cho_solve, solve_triangular
+from scipy.linalg import cholesky, cho_solve
 import scipy.optimize
 
 from ..base import BaseEstimator, RegressorMixin, clone
@@ -270,8 +270,6 @@ def obj_func(theta, eval_gradient=True):
         K[np.diag_indices_from(K)] += self.alpha
         try:
             self.L_ = cholesky(K, lower=True)  # Line 2
-            # self.L_ changed, self._K_inv needs to be recomputed
-            self._K_inv = None
         except np.linalg.LinAlgError as exc:
             exc.args = ("The kernel, %s, is not returning a "
                         "positive definite matrix. Try gradually "
@@ -345,31 +343,27 @@ def predict(self, X, return_std=False, return_cov=False):
         else:  # Predict based on GP posterior
             K_trans = self.kernel_(X, self.X_train_)
             y_mean = K_trans.dot(self.alpha_)  # Line 4 (y_mean = f_star)
-
             # undo normalisation
             y_mean = self._y_train_std * y_mean + self._y_train_mean
 
             if return_cov:
-                v = cho_solve((self.L_, True), K_trans.T)  # Line 5
-                y_cov = self.kernel_(X) - K_trans.dot(v)  # Line 6
+                # Solve K @ V = K_trans.T
+                V = cho_solve((self.L_, True), K_trans.T)  # Line 5
+                y_cov = self.kernel_(X) - K_trans.dot(V)  # Line 6
 
                 # undo normalisation
                 y_cov = y_cov * self._y_train_std**2
 
                 return y_mean, y_cov
             elif return_std:
-                # cache result of K_inv computation
-                if self._K_inv is None:
-                    # compute inverse K_inv of K based on its Cholesky
-                    # decomposition L and its inverse L_inv
-                    L_inv = solve_triangular(self.L_.T,
-                                             np.eye(self.L_.shape[0]))
-                    self._K_inv = L_inv.dot(L_inv.T)
+                # Solve K @ V = K_trans.T
+                V = cho_solve((self.L_, True), K_trans.T)  # Line 5
 
                 # Compute variance of predictive distribution
+                # Use einsum to avoid explicitly forming the large matrix
+                # K_trans @ V just to extract its diagonal afterward.
                 y_var = self.kernel_.diag(X)
-                y_var -= np.einsum("ij,ij->i",
-                                   np.dot(K_trans, self._K_inv), K_trans)
+                y_var -= np.einsum("ij,ji->i", K_trans, V)
 
                 # Check if any of the variances is negative because of
                 # numerical issues. If yes: set the variance to 0.
diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py
index 440e421cb95cc..66e3c96a8f029 100644
--- a/sklearn/gaussian_process/tests/test_gpr.py
+++ b/sklearn/gaussian_process/tests/test_gpr.py
@@ -20,10 +20,12 @@
 from sklearn.gaussian_process.tests._mini_sequence_kernel import MiniSeqKernel
 from sklearn.exceptions import ConvergenceWarning
 
-from sklearn.utils._testing \
-    import (assert_array_less,
-            assert_almost_equal, assert_array_almost_equal,
-            assert_array_equal, assert_allclose)
+from sklearn.utils._testing import (
+    assert_array_less,
+    assert_almost_equal,
+    assert_array_almost_equal,
+    assert_allclose
+)
 
 
 def f(x):
@@ -185,7 +187,8 @@ def test_no_optimizer():
 
 
 @pytest.mark.parametrize('kernel', kernels)
-def test_predict_cov_vs_std(kernel):
+@pytest.mark.parametrize("target", [y, np.ones(X.shape[0], dtype=np.float64)])
+def test_predict_cov_vs_std(kernel, target):
     if sys.maxsize <= 2 ** 32 and sys.version_info[:2] == (3, 6):
         pytest.xfail("This test may fail on 32bit Py3.6")
 
@@ -455,25 +458,6 @@ def test_no_fit_default_predict():
     assert_array_almost_equal(y_cov1, y_cov2)
 
 
-@pytest.mark.parametrize('kernel', kernels)
-def test_K_inv_reset(kernel):
-    y2 = f(X2).ravel()
-
-    # Test that self._K_inv is reset after a new fit
-    gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
-    assert hasattr(gpr, '_K_inv')
-    assert gpr._K_inv is None
-    gpr.predict(X, return_std=True)
-    assert gpr._K_inv is not None
-    gpr.fit(X2, y2)
-    assert gpr._K_inv is None
-    gpr.predict(X2, return_std=True)
-    gpr2 = GaussianProcessRegressor(kernel=kernel).fit(X2, y2)
-    gpr2.predict(X2, return_std=True)
-    # the value of K_inv should be independent of the first fit
-    assert_array_equal(gpr._K_inv, gpr2._K_inv)
-
-
 def test_warning_bounds():
     kernel = RBF(length_scale_bounds=[1e-5, 1e-3])
     gpr = GaussianProcessRegressor(kernel=kernel)
@@ -569,3 +553,28 @@ def test_constant_target(kernel):
     assert_allclose(y_pred, y_constant)
     # set atol because we compare to zero
     assert_allclose(np.diag(y_cov), 0., atol=1e-9)
+
+
+def test_gpr_consistency_std_cov_non_invertible_kernel():
+    """Check the consistency between the returned std. dev. and the covariance.
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/19936
+    Inconsistencies were observed when the kernel cannot be inverted (or its
+    inversion is not numerically stable).
+    """
+    kernel = (C(8.98576054e+05, (1e-12, 1e12)) *
+              RBF([5.91326520e+02, 1.32584051e+03], (1e-12, 1e12)) +
+              WhiteKernel(noise_level=1e-5))
+    gpr = GaussianProcessRegressor(kernel=kernel, alpha=0, optimizer=None)
+    X_train = np.array([[0., 0.], [1.54919334, -0.77459667], [-1.54919334, 0.],
+                        [0., -1.54919334], [0.77459667, 0.77459667],
+                        [-0.77459667, 1.54919334]])
+    y_train = np.array([[-2.14882017e-10], [-4.66975823e+00], [4.01823986e+00],
+                        [-1.30303674e+00], [-1.35760156e+00],
+                        [3.31215668e+00]])
+    gpr.fit(X_train, y_train)
+    X_test = np.array([[-1.93649167, -1.93649167], [1.93649167, -1.93649167],
+                       [-1.93649167, 1.93649167], [1.93649167, 1.93649167]])
+    pred1, std = gpr.predict(X_test, return_std=True)
+    pred2, cov = gpr.predict(X_test, return_cov=True)
+    assert_allclose(std, np.sqrt(np.diagonal(cov)), rtol=1e-5)

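The numerical identity behind this change is that `cho_solve((L, True), K_trans.T)` computes `K^{-1} @ K_trans.T` directly from the Cholesky factor, so the predictive variance can be obtained without ever materialising `K^{-1}`. A small self-contained sketch comparing the two routes, written outside the estimator so no GaussianProcessRegressor internals are assumed::

    import numpy as np
    from scipy.linalg import cholesky, cho_solve, solve_triangular

    rng = np.random.RandomState(0)
    X = rng.randn(30, 6)
    K = X @ X.T + 1e-6 * np.eye(30)   # well-conditioned training kernel
    K_trans = rng.randn(10, 30)       # kernel between test and train points

    L = cholesky(K, lower=True)

    # Previous route: build K^{-1} explicitly from the Cholesky factor.
    L_inv = solve_triangular(L.T, np.eye(L.shape[0]))
    K_inv = L_inv @ L_inv.T
    diag_old = np.einsum("ij,ij->i", K_trans @ K_inv, K_trans)

    # New route: a Cholesky solve, no explicit inverse.
    V = cho_solve((L, True), K_trans.T)
    diag_new = np.einsum("ij,ji->i", K_trans, V)

    assert np.allclose(diag_old, diag_new)
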
From 23032e72882647f7a54c4c8e10440567dfb53e80 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" 
Date: Tue, 27 Apr 2021 12:52:39 -0400
Subject: [PATCH 371/478] ENH Makes global configuration thread local (#18736)

---
 doc/whats_new/v1.0.rst       |  5 ++++
 sklearn/_config.py           | 29 +++++++++++++------
 sklearn/tests/test_config.py | 55 ++++++++++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+), 8 deletions(-)

diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 0cd1d6a89d158..6e3c063a45dcb 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -76,6 +76,11 @@ Changelog
   - For :class:`tree.ExtraTreeRegressor`, `criterion="mse"` is deprecated,
     use `"squared_error"` instead which is now the default.
 
+:mod:`sklearn.base`
+...................
+
+- |Fix| :func:`config_context` is now threadsafe. :pr:`18736` by `Thomas Fan`_.
+
 :mod:`sklearn.calibration`
 ..........................
 
diff --git a/sklearn/_config.py b/sklearn/_config.py
index feb5e86287c38..e81d50849db05 100644
--- a/sklearn/_config.py
+++ b/sklearn/_config.py
@@ -2,6 +2,7 @@
 """
 import os
 from contextlib import contextmanager as contextmanager
+import threading
 
 _global_config = {
     'assume_finite': bool(os.environ.get('SKLEARN_ASSUME_FINITE', False)),
@@ -9,6 +10,15 @@
     'print_changed_only': True,
     'display': 'text',
 }
+_threadlocal = threading.local()
+
+
+def _get_threadlocal_config():
+    """Get a threadlocal **mutable** configuration. If the configuration
+    does not exist, copy the default global configuration."""
+    if not hasattr(_threadlocal, 'global_config'):
+        _threadlocal.global_config = _global_config.copy()
+    return _threadlocal.global_config
 
 
 def get_config():
@@ -24,7 +34,9 @@ def get_config():
     config_context : Context manager for global scikit-learn configuration.
     set_config : Set global scikit-learn configuration.
     """
-    return _global_config.copy()
+    # Return a copy of the threadlocal configuration so that users will
+    # not be able to modify the configuration with the returned dict.
+    return _get_threadlocal_config().copy()
 
 
 def set_config(assume_finite=None, working_memory=None,
@@ -72,14 +84,16 @@ def set_config(assume_finite=None, working_memory=None,
     config_context : Context manager for global scikit-learn configuration.
     get_config : Retrieve current values of the global configuration.
     """
+    local_config = _get_threadlocal_config()
+
     if assume_finite is not None:
-        _global_config['assume_finite'] = assume_finite
+        local_config['assume_finite'] = assume_finite
     if working_memory is not None:
-        _global_config['working_memory'] = working_memory
+        local_config['working_memory'] = working_memory
     if print_changed_only is not None:
-        _global_config['print_changed_only'] = print_changed_only
+        local_config['print_changed_only'] = print_changed_only
     if display is not None:
-        _global_config['display'] = display
+        local_config['display'] = display
 
 
 @contextmanager
@@ -120,8 +134,7 @@ def config_context(**new_config):
     Notes
     -----
     All settings, not just those presently modified, will be returned to
-    their previous values when the context manager is exited. This is not
-    thread-safe.
+    their previous values when the context manager is exited.
 
     Examples
     --------
@@ -141,7 +154,7 @@ def config_context(**new_config):
     set_config : Set global scikit-learn configuration.
     get_config : Retrieve current values of the global configuration.
     """
-    old_config = get_config().copy()
+    old_config = get_config()
     set_config(**new_config)
 
     try:
diff --git a/sklearn/tests/test_config.py b/sklearn/tests/test_config.py
index 22ec862ef24a3..6d458088a37a8 100644
--- a/sklearn/tests/test_config.py
+++ b/sklearn/tests/test_config.py
@@ -1,5 +1,13 @@
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+from joblib import Parallel
+import joblib
 import pytest
+
 from sklearn import get_config, set_config, config_context
+from sklearn.utils.fixes import delayed
+from sklearn.utils.fixes import parse_version
 
 
 def test_config_context():
@@ -76,3 +84,50 @@ def test_set_config():
     # No unknown arguments
     with pytest.raises(TypeError):
         set_config(do_something_else=True)
+
+
+def set_assume_finite(assume_finite, sleep_duration):
+    """Return the value of assume_finite after waiting `sleep_duration`."""
+    with config_context(assume_finite=assume_finite):
+        time.sleep(sleep_duration)
+        return get_config()['assume_finite']
+
+
+@pytest.mark.parametrize("backend",
+                         ["loky", "multiprocessing", "threading"])
+def test_config_threadsafe_joblib(backend):
+    """Test that the global config is threadsafe with all joblib backends.
+    Two jobs are spawned and each sets assume_finite to a different value.
+    When the job with a duration of 0.1s completes, the assume_finite value
+    should be the same as the value passed to the function. In other words,
+    it is not influenced by the other job setting assume_finite to True.
+    """
+
+    if (parse_version(joblib.__version__) < parse_version('0.12')
+            and backend == 'loky'):
+        pytest.skip('loky backend does not exist in joblib <0.12')  # noqa
+
+    assume_finites = [False, True]
+    sleep_durations = [0.1, 0.2]
+
+    items = Parallel(backend=backend, n_jobs=2)(
+        delayed(set_assume_finite)(assume_finite, sleep_dur)
+        for assume_finite, sleep_dur
+        in zip(assume_finites, sleep_durations))
+
+    assert items == [False, True]
+
+
+def test_config_threadsafe():
+    """Uses threads directly to test that the global config does not change
+    between threads. Same test as `test_config_threadsafe_joblib` but with
+    `ThreadPoolExecutor`."""
+
+    assume_finites = [False, True]
+    sleep_durations = [0.1, 0.2]
+
+    with ThreadPoolExecutor(max_workers=2) as e:
+        items = [output for output in
+                 e.map(set_assume_finite, assume_finites, sleep_durations)]
+
+    assert items == [False, True]

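Because the configuration now lives in `threading.local()`, a thread that has never called `set_config` or `config_context` starts from a copy of the default global configuration, and changes made in one thread are invisible to the others. A condensed version of the new test, using only public APIs::

    from concurrent.futures import ThreadPoolExecutor

    from sklearn import config_context, get_config

    def read_assume_finite(value):
        # Each worker toggles the flag in its own context; with a
        # thread-local configuration the other worker's value never leaks in.
        with config_context(assume_finite=value):
            return get_config()["assume_finite"]

    with ThreadPoolExecutor(max_workers=2) as executor:
        results = list(executor.map(read_assume_finite, [False, True]))

    print(results)  # expected: [False, True]
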
From a45c0c99a38cffca6724cb8fd38b12edd4fb6b35 Mon Sep 17 00:00:00 2001
From: Maria Telenczuk 
Date: Tue, 27 Apr 2021 19:22:16 +0200
Subject: [PATCH 372/478] DEP Deprecates 'normalize' in _bayes.py (#17746)

---
 doc/whats_new/v1.0.rst                    |  2 ++
 sklearn/linear_model/_bayes.py            | 32 ++++++++++++++++++-----
 sklearn/linear_model/tests/test_bayes.py  |  2 ++
 sklearn/linear_model/tests/test_common.py |  5 +++-
 4 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 6e3c063a45dcb..977d83890e0c0 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -269,6 +269,8 @@ Changelog
   Ridge, RidgeClassifier, RidgeCV or RidgeClassifierCV were deprecated in:
   :pr:`17772` by :user:`Maria Telenczuk ` and
   :user:`Alexandre Gramfort `.
+  BayesianRidge and ARDRegression were deprecated in:
+  :pr:`17746` by :user:`Maria Telenczuk `.
 
 - |Fix|: `sample_weight` are now fully taken into account in linear models
   when `normalize=True` for both feature centering and feature
diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py
index 634417e2b0efa..2eae8b5c13cee 100644
--- a/sklearn/linear_model/_bayes.py
+++ b/sklearn/linear_model/_bayes.py
@@ -11,6 +11,7 @@
 
 from ._base import LinearModel, _rescale_data
 from ..base import RegressorMixin
+from ._base import _deprecate_normalize
 from ..utils.extmath import fast_logdet
 from scipy.linalg import pinvh
 from ..utils.validation import _check_sample_weight
@@ -84,6 +85,10 @@ class BayesianRidge(RegressorMixin, LinearModel):
         :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``
         on an estimator with ``normalize=False``.
 
+        .. deprecated:: 1.0
+            ``normalize`` was deprecated in version 1.0 and will be removed in
+            1.2.
+
     copy_X : bool, default=True
         If True, X will be copied; else, it may be overwritten.
 
@@ -158,7 +163,7 @@ class BayesianRidge(RegressorMixin, LinearModel):
     def __init__(self, *, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6,
                  lambda_1=1.e-6, lambda_2=1.e-6, alpha_init=None,
                  lambda_init=None, compute_score=False, fit_intercept=True,
-                 normalize=False, copy_X=True, verbose=False):
+                 normalize='deprecated', copy_X=True, verbose=False):
         self.n_iter = n_iter
         self.tol = tol
         self.alpha_1 = alpha_1
@@ -193,6 +198,10 @@ def fit(self, X, y, sample_weight=None):
         -------
         self : returns an instance of self.
         """
+        self._normalize = _deprecate_normalize(
+            self.normalize, default=False,
+            estimator_name=self.__class__.__name__
+        )
 
         if self.n_iter < 1:
             raise ValueError('n_iter should be greater than or equal to 1.'
@@ -205,7 +214,7 @@ def fit(self, X, y, sample_weight=None):
                                                  dtype=X.dtype)
 
         X, y, X_offset_, y_offset_, X_scale_ = self._preprocess_data(
-            X, y, self.fit_intercept, self.normalize, self.copy_X,
+            X, y, self.fit_intercept, self._normalize, self.copy_X,
             sample_weight=sample_weight)
 
         if sample_weight is not None:
@@ -325,7 +334,7 @@ def predict(self, X, return_std=False):
         if return_std is False:
             return y_mean
         else:
-            if self.normalize:
+            if self._normalize:
                 X = (X - self.X_offset_) / self.X_scale_
             sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
             y_std = np.sqrt(sigmas_squared_data + (1. / self.alpha_))
@@ -445,6 +454,10 @@ class ARDRegression(RegressorMixin, LinearModel):
         :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``
         on an estimator with ``normalize=False``.
 
+        .. deprecated:: 1.0
+            ``normalize`` was deprecated in version 1.0 and will be removed in
+            1.2.
+
     copy_X : bool, default=True
         If True, X will be copied; else, it may be overwritten.
 
@@ -510,8 +523,8 @@ class ARDRegression(RegressorMixin, LinearModel):
     @_deprecate_positional_args
     def __init__(self, *, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6,
                  lambda_1=1.e-6, lambda_2=1.e-6, compute_score=False,
-                 threshold_lambda=1.e+4, fit_intercept=True, normalize=False,
-                 copy_X=True, verbose=False):
+                 threshold_lambda=1.e+4, fit_intercept=True,
+                 normalize='deprecated', copy_X=True, verbose=False):
         self.n_iter = n_iter
         self.tol = tol
         self.fit_intercept = fit_intercept
@@ -543,6 +556,11 @@ def fit(self, X, y):
         -------
         self : returns an instance of self.
         """
+        self._normalize = _deprecate_normalize(
+            self.normalize, default=False,
+            estimator_name=self.__class__.__name__
+        )
+
         X, y = self._validate_data(X, y, dtype=np.float64, y_numeric=True,
                                    ensure_min_samples=2)
 
@@ -550,7 +568,7 @@ def fit(self, X, y):
         coef_ = np.zeros(n_features)
 
         X, y, X_offset_, y_offset_, X_scale_ = self._preprocess_data(
-            X, y, self.fit_intercept, self.normalize, self.copy_X)
+            X, y, self.fit_intercept, self._normalize, self.copy_X)
 
         self.X_offset_ = X_offset_
         self.X_scale_ = X_scale_
@@ -686,7 +704,7 @@ def predict(self, X, return_std=False):
         if return_std is False:
             return y_mean
         else:
-            if self.normalize:
+            if self._normalize:
                 X = (X - self.X_offset_) / self.X_scale_
             X = X[:, self.lambda_ < self.threshold_lambda]
             sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
diff --git a/sklearn/linear_model/tests/test_bayes.py b/sklearn/linear_model/tests/test_bayes.py
index 529702ff752ac..a22a0243cdcb7 100644
--- a/sklearn/linear_model/tests/test_bayes.py
+++ b/sklearn/linear_model/tests/test_bayes.py
@@ -274,6 +274,8 @@ def test_update_sigma(seed):
     np.testing.assert_allclose(sigma, sigma_woodbury)
 
 
+# FIXME: 'normalize' to be removed in 1.2 in LinearRegression
+@pytest.mark.filterwarnings("ignore:'normalize' was deprecated")
 def test_ard_regression_predict_normalize_true():
     """Check that we can predict with `normalize=True` and `return_std=True`.
     Non-regression test for:
diff --git a/sklearn/linear_model/tests/test_common.py b/sklearn/linear_model/tests/test_common.py
index 96a996d18dac7..f255384be4167 100644
--- a/sklearn/linear_model/tests/test_common.py
+++ b/sklearn/linear_model/tests/test_common.py
@@ -12,6 +12,8 @@
 from sklearn.linear_model import RidgeCV
 from sklearn.linear_model import RidgeClassifier
 from sklearn.linear_model import RidgeClassifierCV
+from sklearn.linear_model import BayesianRidge
+from sklearn.linear_model import ARDRegression
 
 from sklearn.utils import check_random_state
 
@@ -24,7 +26,8 @@
 )
 @pytest.mark.parametrize(
     "estimator",
-    [LinearRegression, Ridge, RidgeCV, RidgeClassifier, RidgeClassifierCV]
+    [LinearRegression, Ridge, RidgeCV, RidgeClassifier, RidgeClassifierCV,
+     BayesianRidge, ARDRegression]
 )
 # FIXME remove test in 1.2
 def test_linear_model_normalize_deprecation_message(

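For users of these estimators, the usual migration once ``normalize`` is removed is to move the scaling into a preprocessing step. A hedged sketch of that pattern follows; note that ``StandardScaler`` standardises by the standard deviation, which is not numerically identical to the old ``normalize=True`` scaling by the column norm::

    from sklearn.datasets import make_regression
    from sklearn.linear_model import BayesianRidge
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    X, y = make_regression(n_samples=100, n_features=5, random_state=0)

    # Scale in a pipeline step instead of passing normalize=True to the model.
    model = make_pipeline(StandardScaler(), BayesianRidge())
    model.fit(X, y)
    print(model.predict(X[:3]))
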
From a9cc0ed86fca1480acbd8aaf211f062ee2abd5b7 Mon Sep 17 00:00:00 2001
From: Maria Telenczuk 
Date: Wed, 28 Apr 2021 15:37:41 +0200
Subject: [PATCH 373/478] DOC correct the orders of the x labels (#19997)

---
 examples/impute/plot_missing_values.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py
index f5d75f68c3d09..3ea5c61427ff0 100644
--- a/examples/impute/plot_missing_values.py
+++ b/examples/impute/plot_missing_values.py
@@ -121,11 +121,7 @@ def get_scores_for_imputer(imputer, X_missing, y_missing):
     return impute_scores
 
 
-x_labels = ['Full data',
-            'Zero imputation',
-            'Mean Imputation',
-            'KNN Imputation',
-            'Iterative Imputation']
+x_labels = []
 
 mses_california = np.zeros(5)
 stds_california = np.zeros(5)
@@ -149,6 +145,7 @@ def get_full_score(X_full, y_full):
 mses_california[0], stds_california[0] = get_full_score(X_california,
                                                         y_california)
 mses_diabetes[0], stds_diabetes[0] = get_full_score(X_diabetes, y_diabetes)
+x_labels.append('Full data')
 
 
 # %%
@@ -172,6 +169,7 @@ def get_impute_zero_score(X_missing, y_missing):
     X_miss_california, y_miss_california)
 mses_diabetes[1], stds_diabetes[1] = get_impute_zero_score(X_miss_diabetes,
                                                            y_miss_diabetes)
+x_labels.append('Zero imputation')
 
 
 # %%
@@ -191,6 +189,7 @@ def get_impute_knn_score(X_missing, y_missing):
     X_miss_california, y_miss_california)
 mses_diabetes[2], stds_diabetes[2] = get_impute_knn_score(X_miss_diabetes,
                                                           y_miss_diabetes)
+x_labels.append('KNN Imputation')
 
 
 # %%
@@ -209,6 +208,7 @@ def get_impute_mean(X_missing, y_missing):
                                                          y_miss_california)
 mses_diabetes[3], stds_diabetes[3] = get_impute_mean(X_miss_diabetes,
                                                      y_miss_diabetes)
+x_labels.append('Mean Imputation')
 
 
 # %%
@@ -237,6 +237,7 @@ def get_impute_iterative(X_missing, y_missing):
     X_miss_california, y_miss_california)
 mses_diabetes[4], stds_diabetes[4] = get_impute_iterative(X_miss_diabetes,
                                                           y_miss_diabetes)
+x_labels.append('Iterative Imputation')
 
 mses_diabetes = mses_diabetes * -1
 mses_california = mses_california * -1

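The underlying bug was that the up-front ``x_labels`` list was ordered differently from the order in which the scores were filled in. Appending the label at the same place where the corresponding score is computed makes such a mismatch impossible; a minimal sketch of the pattern, where ``score_funcs`` and its values are hypothetical placeholders::

    labels, scores = [], []
    score_funcs = {
        "Full data": lambda: 0.81,        # hypothetical scores
        "Zero imputation": lambda: 0.55,
        "KNN Imputation": lambda: 0.63,
    }
    for label, func in score_funcs.items():
        scores.append(func())
        labels.append(label)  # label recorded alongside its score
    assert len(labels) == len(scores)
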
From 9c3b402f0082cfc17da3ab9430a203ecc2ac4dfc Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" 
Date: Thu, 29 Apr 2021 15:46:56 -0400
Subject: [PATCH 374/478] ENH Makes ColumnTransformer more flexible by only
 checking for non-dropped columns (#19263)

---
 doc/modules/compose.rst                       |  14 ++
 doc/whats_new/v1.0.rst                        |   5 +
 sklearn/compose/_column_transformer.py        | 106 ++++++++----
 .../compose/tests/test_column_transformer.py  | 151 +++++++++---------
 4 files changed, 164 insertions(+), 112 deletions(-)

diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst
index 6e827304c38cd..a9195ba9ab022 100644
--- a/doc/modules/compose.rst
+++ b/doc/modules/compose.rst
@@ -527,6 +527,20 @@ above example would be::
                                   ('countvectorizer', CountVectorizer(),
                                    'title')])
 
+If :class:`~sklearn.compose.ColumnTransformer` is fitted with a dataframe
+and the dataframe only has string column names, then transforming a dataframe
+will use the column names to select the columns::
+
+
+  >>> ct = ColumnTransformer(
+  ...          [("scale", StandardScaler(), ["expert_rating"])]).fit(X)
+  >>> X_new = pd.DataFrame({"expert_rating": [5, 6, 1],
+  ...                       "ignored_new_col": [1.2, 0.3, -0.1]})
+  >>> ct.transform(X_new)
+  array([[ 0.9...],
+         [ 2.1...],
+         [-3.9...]])
+
 .. _visualizing_composite_estimators:
 
 Visualizing Composite Estimators
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 977d83890e0c0..d26c5dd0c347d 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -131,6 +131,11 @@ Changelog
   of each transformer in `output_indices_`. :pr:`18393` by
   :user:`Luca Bittarello `.
 
+- |Enhancement| :class:`compose.ColumnTransformer` now allows DataFrame input to
+  have its columns appear in a changed order in `transform`. Further, columns that
+  are dropped will not be required in transform, and additional columns will be
+  ignored if `remainder='drop'`. :pr:`19263` by `Thomas Fan`_
+
 - |FIX| :meth:`compose.ColumnTransformer.get_feature_names` supports
   non-string feature names returned by any of its transformers.
   :pr:`18459` by :user:`Albert Villanova del Moral ` and
diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
index 2f2da882652c0..441fc95a106f1 100644
--- a/sklearn/compose/_column_transformer.py
+++ b/sklearn/compose/_column_transformer.py
@@ -244,7 +244,8 @@ def set_params(self, **kwargs):
         self._set_params('_transformers', **kwargs)
         return self
 
-    def _iter(self, fitted=False, replace_strings=False):
+    def _iter(self, fitted=False, replace_strings=False,
+              column_as_strings=False):
         """
         Generate (name, trans, column, weight) tuples.
 
@@ -262,11 +263,11 @@ def _iter(self, fitted=False, replace_strings=False):
                 in zip(self.transformers, self._columns)
             ]
             # add transformer tuple for remainder
-            if self._remainder[2] is not None:
+            if self._remainder[2]:
                 transformers = chain(transformers, [self._remainder])
         get_weight = (self.transformer_weights or {}).get
 
-        for name, trans, column in transformers:
+        for name, trans, columns in transformers:
             if replace_strings:
                 # replace 'passthrough' with identity transformer and
                 # skip in case of 'drop'
@@ -276,10 +277,21 @@ def _iter(self, fitted=False, replace_strings=False):
                     )
                 elif trans == 'drop':
                     continue
-                elif _is_empty_column_selection(column):
+                elif _is_empty_column_selection(columns):
                     continue
 
-            yield (name, trans, column, get_weight(name))
+            if column_as_strings and self._only_str_columns:
+                # Convert all columns to using their string labels
+                columns_is_scalar = np.isscalar(columns)
+
+                indices = self._transformer_to_input_indices[name]
+                columns = self._feature_names_in[indices]
+
+                if columns_is_scalar:
+                    # selection is done with one dimension
+                    columns = columns[0]
+
+            yield (name, trans, columns, get_weight(name))
 
     def _validate_transformers(self):
         if not self.transformers:
@@ -305,12 +317,17 @@ def _validate_column_callables(self, X):
         """
         Converts callable column specifications.
         """
-        columns = []
-        for _, _, column in self.transformers:
-            if callable(column):
-                column = column(X)
-            columns.append(column)
-        self._columns = columns
+        all_columns = []
+        transformer_to_input_indices = {}
+        for name, _, columns in self.transformers:
+            if callable(columns):
+                columns = columns(X)
+            all_columns.append(columns)
+            transformer_to_input_indices[name] = _get_column_indices(X,
+                                                                     columns)
+
+        self._columns = all_columns
+        self._transformer_to_input_indices = transformer_to_input_indices
 
     def _validate_remainder(self, X):
         """
@@ -328,12 +345,10 @@ def _validate_remainder(self, X):
                 self.remainder)
 
         self._n_features = X.shape[1]
-        cols = []
-        for columns in self._columns:
-            cols.extend(_get_column_indices(X, columns))
-
-        remaining_idx = sorted(set(range(self._n_features)) - set(cols))
-        self._remainder = ('remainder', self.remainder, remaining_idx or None)
+        cols = set(chain(*self._transformer_to_input_indices.values()))
+        remaining = sorted(set(range(self._n_features)) - cols)
+        self._remainder = ('remainder', self.remainder, remaining)
+        self._transformer_to_input_indices['remainder'] = remaining
 
     @property
     def named_transformers_(self):
@@ -443,7 +458,8 @@ def _log_message(self, name, idx, total):
             return None
         return '(%d of %d) Processing %s' % (idx, total, name)
 
-    def _fit_transform(self, X, y, func, fitted=False):
+    def _fit_transform(self, X, y, func, fitted=False,
+                       column_as_strings=False):
         """
         Private function to fit and/or transform on demand.
 
@@ -452,7 +468,9 @@ def _fit_transform(self, X, y, func, fitted=False):
         ``fitted=True`` ensures the fitted transformers are used.
         """
         transformers = list(
-            self._iter(fitted=fitted, replace_strings=True))
+            self._iter(
+                fitted=fitted, replace_strings=True,
+                column_as_strings=column_as_strings))
         try:
             return Parallel(n_jobs=self.n_jobs)(
                 delayed(func)(
@@ -518,6 +536,8 @@ def fit_transform(self, X, y=None):
         # TODO: this should be `feature_names_in_` when we start having it
         if hasattr(X, "columns"):
             self._feature_names_in = np.asarray(X.columns)
+            self._only_str_columns = all(isinstance(col, str)
+                                         for col in self._feature_names_in)
         else:
             self._feature_names_in = None
         X = _check_X(X)
@@ -572,20 +592,34 @@ def transform(self, X):
         """
         check_is_fitted(self)
         X = _check_X(X)
-        if hasattr(X, "columns"):
-            X_feature_names = np.asarray(X.columns)
+
+        fit_dataframe_and_transform_dataframe = (
+            self._feature_names_in is not None and hasattr(X, "columns"))
+
+        if fit_dataframe_and_transform_dataframe:
+            named_transformers = self.named_transformers_
+            # check that all names seen in fit are in transform, unless
+            # they were dropped
+            non_dropped_indices = [
+                ind for name, ind in self._transformer_to_input_indices.items()
+                if name in named_transformers and
+                isinstance(named_transformers[name], str) and
+                named_transformers[name] != 'drop']
+
+            all_indices = set(chain(*non_dropped_indices))
+            all_names = set(self._feature_names_in[ind] for ind in all_indices)
+
+            diff = all_names - set(X.columns)
+            if diff:
+                raise ValueError(f"columns are missing: {diff}")
         else:
-            X_feature_names = None
-
-        self._check_n_features(X, reset=False)
-        if (self._feature_names_in is not None and
-            X_feature_names is not None and
-                np.any(self._feature_names_in != X_feature_names)):
-            raise RuntimeError(
-                "Given feature/column names do not match the ones for the "
-                "data given during fit."
-            )
-        Xs = self._fit_transform(X, None, _transform_one, fitted=True)
+            # ndarray was used for fitting or transforming, thus we only
+            # check that n_features_in_ is consistent
+            self._check_n_features(X, reset=False)
+
+        Xs = self._fit_transform(
+            X, None, _transform_one, fitted=True,
+            column_as_strings=fit_dataframe_and_transform_dataframe)
         self._validate_output(Xs)
 
         if not Xs:
@@ -629,10 +663,12 @@ def _sk_visual_block_(self):
             transformers = self.transformers
         elif hasattr(self, "_remainder"):
             remainder_columns = self._remainder[2]
-            if self._feature_names_in is not None:
+            if (self._feature_names_in is not None and
+                    remainder_columns and
+                    not all(isinstance(col, str)
+                            for col in remainder_columns)):
                 remainder_columns = (
-                    self._feature_names_in[remainder_columns].tolist()
-                )
+                    self._feature_names_in[remainder_columns].tolist())
             transformers = chain(self.transformers,
                                  [('remainder', self.remainder,
                                    remainder_columns)])
diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py
index 549292ab51445..9278d67296ec5 100644
--- a/sklearn/compose/tests/test_column_transformer.py
+++ b/sklearn/compose/tests/test_column_transformer.py
@@ -4,7 +4,6 @@
 import re
 import pickle
 
-import warnings
 import numpy as np
 from scipy import sparse
 import pytest
@@ -1260,82 +1259,6 @@ def test_column_transformer_negative_column_indexes():
     assert_array_equal(tf_1.fit_transform(X), tf_2.fit_transform(X))
 
 
-@pytest.mark.parametrize("explicit_colname", ['first', 'second'])
-def test_column_transformer_reordered_column_names_remainder(explicit_colname):
-    """Regression test for issue #14223: 'Named col indexing fails with
-       ColumnTransformer remainder on changing DataFrame column ordering'
-
-       Should raise error on changed order combined with remainder.
-       Should allow for added columns in `transform` input DataFrame
-       as long as all preceding columns match.
-    """
-    pd = pytest.importorskip('pandas')
-
-    X_fit_array = np.array([[0, 1, 2], [2, 4, 6]]).T
-    X_fit_df = pd.DataFrame(X_fit_array, columns=['first', 'second'])
-
-    X_trans_array = np.array([[2, 4, 6], [0, 1, 2]]).T
-    X_trans_df = pd.DataFrame(X_trans_array, columns=['second', 'first'])
-
-    tf = ColumnTransformer([('bycol', Trans(), explicit_colname)],
-                           remainder=Trans())
-
-    tf.fit(X_fit_df)
-    err_msg = ("Given feature/column names do not match the ones for the "
-               "data given during fit.")
-    with pytest.raises(RuntimeError, match=err_msg):
-        tf.transform(X_trans_df)
-
-    # ValueError for added columns
-    X_extended_df = X_fit_df.copy()
-    X_extended_df['third'] = [3, 6, 9]
-    err_msg = ("X has 3 features, but ColumnTransformer is expecting 2 "
-               "features as input.")
-    with pytest.raises(ValueError, match=err_msg):
-        tf.transform(X_extended_df)
-
-    # No 'columns' AttributeError when transform input is a numpy array
-    X_array = X_fit_array.copy()
-    err_msg = 'Specifying the columns'
-    with pytest.raises(ValueError, match=err_msg):
-        tf.transform(X_array)
-
-
-def test_feature_name_validation():
-    """Tests if the proper warning/error is raised if the columns do not match
-    during fit and transform."""
-    pd = pytest.importorskip("pandas")
-
-    X = np.ones(shape=(3, 2))
-    X_extra = np.ones(shape=(3, 3))
-    df = pd.DataFrame(X, columns=['a', 'b'])
-    df_extra = pd.DataFrame(X_extra, columns=['a', 'b', 'c'])
-
-    tf = ColumnTransformer([('bycol', Trans(), ['a', 'b'])])
-    tf.fit(df)
-
-    msg = ("X has 3 features, but ColumnTransformer is expecting 2 features "
-           "as input.")
-    with pytest.raises(ValueError, match=msg):
-        tf.transform(df_extra)
-
-    tf = ColumnTransformer([('bycol', Trans(), [0])])
-    tf.fit(df)
-
-    with pytest.raises(ValueError, match=msg):
-        tf.transform(X_extra)
-
-    with warnings.catch_warnings(record=True) as warns:
-        tf.transform(X)
-    assert not warns
-
-    tf = ColumnTransformer([('bycol', Trans(), ['a'])],
-                           remainder=Trans())
-    tf.fit(df)
-    with pytest.raises(ValueError, match=msg):
-        tf.transform(df_extra)
-
-
 @pytest.mark.parametrize("array_type", [np.asarray, sparse.csr_matrix])
 def test_column_transformer_mask_indexing(array_type):
     # Regression test for #14510
@@ -1516,6 +1439,80 @@ def test_sk_visual_block_remainder_fitted_numpy(remainder):
     assert visual_block.estimators == (scaler, remainder)
 
 
+@pytest.mark.parametrize("explicit_colname", ['first', 'second', 0, 1])
+@pytest.mark.parametrize("remainder", [Trans(), 'passthrough', 'drop'])
+def test_column_transformer_reordered_column_names_remainder(explicit_colname,
+                                                             remainder):
+    """Test the interaction between remainder and column transformer"""
+    pd = pytest.importorskip('pandas')
+
+    X_fit_array = np.array([[0, 1, 2], [2, 4, 6]]).T
+    X_fit_df = pd.DataFrame(X_fit_array, columns=['first', 'second'])
+
+    X_trans_array = np.array([[2, 4, 6], [0, 1, 2]]).T
+    X_trans_df = pd.DataFrame(X_trans_array, columns=['second', 'first'])
+
+    tf = ColumnTransformer([('bycol', Trans(), explicit_colname)],
+                           remainder=remainder)
+
+    tf.fit(X_fit_df)
+    X_fit_trans = tf.transform(X_fit_df)
+
+    # Changing the order still works
+    X_trans = tf.transform(X_trans_df)
+    assert_allclose(X_trans, X_fit_trans)
+
+    # extra columns are ignored
+    X_extended_df = X_fit_df.copy()
+    X_extended_df['third'] = [3, 6, 9]
+    X_trans = tf.transform(X_extended_df)
+    assert_allclose(X_trans, X_fit_trans)
+
+    if isinstance(explicit_colname, str):
+        # Raise error if columns are specified by names but input only allows
+        # to specify by position, e.g. numpy array instead of a pandas df.
+        X_array = X_fit_array.copy()
+        err_msg = 'Specifying the columns'
+        with pytest.raises(ValueError, match=err_msg):
+            tf.transform(X_array)
+
+
+def test_feature_name_validation_missing_columns_drop_passthough():
+    """Test the interaction between {'drop', 'passthrough'} and
+    missing column names."""
+    pd = pytest.importorskip("pandas")
+
+    X = np.ones(shape=(3, 4))
+    df = pd.DataFrame(X, columns=['a', 'b', 'c', 'd'])
+
+    df_dropped = df.drop('c', axis=1)
+
+    # with remainder='passthrough', all columns seen during `fit` must be
+    # present
+    tf = ColumnTransformer([('bycol', Trans(), [1])], remainder='passthrough')
+    tf.fit(df)
+    msg = r"columns are missing: {'c'}"
+    with pytest.raises(ValueError, match=msg):
+        tf.transform(df_dropped)
+
+    # with remainder='drop', it is allowed to have column 'c' missing
+    tf = ColumnTransformer([('bycol', Trans(), [1])],
+                           remainder='drop')
+    tf.fit(df)
+
+    df_dropped_trans = tf.transform(df_dropped)
+    df_fit_trans = tf.transform(df)
+    assert_allclose(df_dropped_trans, df_fit_trans)
+
+    # bycol drops 'c', thus it is allowed for 'c' to be missing
+    tf = ColumnTransformer([('bycol', 'drop', ['c'])],
+                           remainder='passthrough')
+    tf.fit(df)
+    df_dropped_trans = tf.transform(df_dropped)
+    df_fit_trans = tf.transform(df)
+    assert_allclose(df_dropped_trans, df_fit_trans)
+
+
 @pytest.mark.parametrize("selector", [[], [False, False]])
 def test_get_feature_names_empty_selection(selector):
     """Test that get_feature_names is only called for transformers that

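The behaviour pinned down by these tests can be summarised with a short usage sketch: after fitting on a dataframe with string column names, `transform` selects columns by name, so reordered or additional columns are handled as long as the columns actually used during `fit` are present::

    import pandas as pd
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler

    X_fit = pd.DataFrame({"a": [0.0, 1.0, 2.0], "b": [2.0, 4.0, 6.0]})
    ct = ColumnTransformer([("scale", StandardScaler(), ["a"])],
                           remainder="drop").fit(X_fit)

    # Reordered plus an extra column: accepted, because only "a" is needed
    # and remainder="drop" ignores everything else.
    X_new = pd.DataFrame({"extra": [9.0, 9.0, 9.0],
                          "b": [2.0, 4.0, 6.0],
                          "a": [0.0, 1.0, 2.0]})
    print(ct.transform(X_new))
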
From 36a9257b9ae797ff150034e6054a3081d2941ae5 Mon Sep 17 00:00:00 2001
From: Helder Geovane Gomes de Lima 
Date: Thu, 29 Apr 2021 20:33:33 -0300
Subject: [PATCH 375/478] DOC Fix typo on comment about t-SNE (#20009)

---
 doc/modules/manifold.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/modules/manifold.rst b/doc/modules/manifold.rst
index 72e8c7485df44..e6e8e842fa7fc 100644
--- a/doc/modules/manifold.rst
+++ b/doc/modules/manifold.rst
@@ -602,7 +602,7 @@ be well separated by non linear methods that focus on the local structure (e.g.
 an SVM with a Gaussian RBF kernel). However, failing to visualize well
 separated homogeneously labeled groups with t-SNE in 2D does not necessarily
 imply that the data cannot be correctly classified by a supervised model. It
-might be the case that 2 dimensions are not low enough to accurately represents
+might be the case that 2 dimensions are not high enough to accurately represent
 the internal structure of the data.
 
 

From 4023a0f94bde429456f45b983c84c5f35475480f Mon Sep 17 00:00:00 2001
From: Haoyin Xu 
Date: Thu, 29 Apr 2021 19:38:53 -0400
Subject: [PATCH 376/478] CLN Fix _add_node parameter name (#20008)

---
 sklearn/tree/_tree.pxd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd
index 8957f0342892a..a6be4ece56970 100644
--- a/sklearn/tree/_tree.pxd
+++ b/sklearn/tree/_tree.pxd
@@ -58,7 +58,7 @@ cdef class Tree:
     cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf,
                           SIZE_t feature, double threshold, double impurity,
                           SIZE_t n_node_samples,
-                          double weighted_n_samples) nogil except -1
+                          double weighted_n_node_samples) nogil except -1
     cdef int _resize(self, SIZE_t capacity) nogil except -1
     cdef int _resize_c(self, SIZE_t capacity=*) nogil except -1
 

From 86bda0ae8472687e19bc071c4cbb957a21738650 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre 
Date: Mon, 3 May 2021 09:25:54 +0200
Subject: [PATCH 377/478] DOC add bug fixes releases in News web section

---
 doc/templates/index.html | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/doc/templates/index.html b/doc/templates/index.html
index c098fc05948af..ccc6ef51a08e0 100644
--- a/doc/templates/index.html
+++ b/doc/templates/index.html
@@ -155,6 +155,10 @@ News
           • On-going development: What's new (Changelog)
+          • April 2021. scikit-learn 0.24.2 is available for download (Changelog).
+          • January 2021. scikit-learn 0.24.1 is available for download (Changelog).
           • December 2020. scikit-learn 0.24.0 is available for download (Changelog).
          • August 2020. scikit-learn 0.23.2 is available for download (Changelog).

From a9ce392f3a58da5caf5ac9bd287205e220082fc5 Mon Sep 17 00:00:00 2001
From: Chiara Marmo 
Date: Mon, 3 May 2021 16:05:25 +0200
Subject: [PATCH 378/478] DOC Add figure tag properties in css (#20028)

---
 doc/themes/scikit-learn-modern/static/css/theme.css | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css
index ed7a86a20fa3b..4d2b78c6a7322 100644
--- a/doc/themes/scikit-learn-modern/static/css/theme.css
+++ b/doc/themes/scikit-learn-modern/static/css/theme.css
@@ -849,7 +849,8 @@ div.body dd > p {
   hyphens: none;
 }
 
-img.align-center, .figure.align-center, object.align-center {
+img.align-center, figure.align-center,
+.figure.align-center, object.align-center {
   display: block;
   margin-left: auto;
   margin-right: auto;
@@ -857,7 +858,8 @@ img.align-center, .figure.align-center, object.align-center {
   text-align: center;
 }
 
-img.align-right, .figure.align-right, object.align-right {
+img.align-right, figure.align-right,
+.figure.align-right, object.align-right {
   clear: right;
   float: right;
   margin-left: 1em;

From 4803a0adfe0be1f4de788a77471a6b34df71b43f Mon Sep 17 00:00:00 2001
From: Chiara Marmo 
Date: Wed, 5 May 2021 19:43:44 +0200
Subject: [PATCH 379/478] DOC Add a note about the involvement of the
 contributor in maintenance. (#20044)

Co-authored-by: Julien Jerphanion 
Co-authored-by: Nicolas Hug 
Co-authored-by: Nicolas Hug 
---
 build_tools/circle/build_doc.sh | 2 +-
 doc/developers/contributing.rst | 9 ++++++---
 doc/developers/tips.rst         | 2 +-
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh
index 37afb1841d368..c447db180697c 100755
--- a/build_tools/circle/build_doc.sh
+++ b/build_tools/circle/build_doc.sh
@@ -1,4 +1,4 @@
-#!/usr/bin/env bash
+q#!/usr/bin/env bash
 
 set -x
 set -e
diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst
index 7f3aeb9537413..2a7ec7afe48a4 100644
--- a/doc/developers/contributing.rst
+++ b/doc/developers/contributing.rst
@@ -455,9 +455,12 @@ complies with the following rules before marking a PR as ``[MRG]``. The
     functionality is useful in practice and, if possible, compare it to
     other methods available in scikit-learn.
 
-10. New features often need to be illustrated with narrative documentation in
-    the user guide, with small code snippets. If relevant, please also add
-    references in the literature, with PDF links when possible.
+10. New features have some maintenance overhead. We expect PR authors
+    to take part in the maintenance for the code they submit, at least
+    initially. New features need to be illustrated with narrative
+    documentation in the user guide, with small code snippets.
+    If relevant, please also add references in the literature, with PDF links
+    when possible.
 
 11. The user guide should also include expected time and space complexity of
     the algorithm and scalability, e.g. "this algorithm can scale to a
diff --git a/doc/developers/tips.rst b/doc/developers/tips.rst
index 8cf5bd5b5d094..36e2cd4a58779 100644
--- a/doc/developers/tips.rst
+++ b/doc/developers/tips.rst
@@ -190,7 +190,7 @@ PR-NEW: Fix #
 PR-NEW or Issue: Maintenance cost
     ::
 
-        Every feature we include has a [maintenance cost](http://scikit-learn.org/dev/faq.html#why-are-you-so-selective-on-what-algorithms-you-include-in-scikit-learn). Our maintainers are mostly volunteers. For a new feature to be included, we need evidence that it is often useful and, ideally, [well-established](http://scikit-learn.org/dev/faq.html#what-are-the-inclusion-criteria-for-new-algorithms) in the literature or in practice. That doesn't stop you implementing it for yourself and publishing it in a separate repository, or even [scikit-learn-contrib](https://scikit-learn-contrib.github.io).
+        Every feature we include has a [maintenance cost](http://scikit-learn.org/dev/faq.html#why-are-you-so-selective-on-what-algorithms-you-include-in-scikit-learn). Our maintainers are mostly volunteers. For a new feature to be included, we need evidence that it is often useful and, ideally, [well-established](http://scikit-learn.org/dev/faq.html#what-are-the-inclusion-criteria-for-new-algorithms) in the literature or in practice. Also, we expect PR authors to take part in the maintenance for the code they submit, at least initially. That doesn't stop you implementing it for yourself and publishing it in a separate repository, or even [scikit-learn-contrib](https://scikit-learn-contrib.github.io).
 
 PR-WIP: What's needed before merge?
     ::

From 0eb9ad73c53c8f3cc0ea03d33312035853bee29b Mon Sep 17 00:00:00 2001
From: Nicolas Hug 
Date: Wed, 5 May 2021 20:59:59 +0100
Subject: [PATCH 380/478] MNT fix bad shebang in build_doc.sh (#20050)

---
 build_tools/circle/build_doc.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh
index c447db180697c..37afb1841d368 100755
--- a/build_tools/circle/build_doc.sh
+++ b/build_tools/circle/build_doc.sh
@@ -1,4 +1,4 @@
-q#!/usr/bin/env bash
+#!/usr/bin/env bash
 
 set -x
 set -e

From de1262c35e2aa4ee062d050281ee576ce9e35c94 Mon Sep 17 00:00:00 2001
From: "Thomas J. Fan" 
Date: Wed, 5 May 2021 21:38:17 -0400
Subject: [PATCH 381/478] CLN Remove **kwargs in Neighbors estiamtors (#20013)

---
 sklearn/neighbors/_classification.py      |  7 +++----
 sklearn/neighbors/_regression.py          | 10 ++++------
 sklearn/neighbors/tests/test_neighbors.py |  4 +---
 3 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py
index 71b869977f6aa..83078e9f77ba9 100644
--- a/sklearn/neighbors/_classification.py
+++ b/sklearn/neighbors/_classification.py
@@ -147,14 +147,13 @@ class KNeighborsClassifier(KNeighborsMixin,
     @_deprecate_positional_args
     def __init__(self, n_neighbors=5, *,
                  weights='uniform', algorithm='auto', leaf_size=30,
-                 p=2, metric='minkowski', metric_params=None, n_jobs=None,
-                 **kwargs):
+                 p=2, metric='minkowski', metric_params=None, n_jobs=None):
         super().__init__(
             n_neighbors=n_neighbors,
             algorithm=algorithm,
             leaf_size=leaf_size, metric=metric, p=p,
             metric_params=metric_params,
-            n_jobs=n_jobs, **kwargs)
+            n_jobs=n_jobs)
         self.weights = _check_weights(weights)
 
     def fit(self, X, y):
@@ -415,7 +414,7 @@ def __init__(self, radius=1.0, *, weights='uniform',
             algorithm=algorithm,
             leaf_size=leaf_size,
             metric=metric, p=p, metric_params=metric_params,
-            n_jobs=n_jobs, **kwargs)
+            n_jobs=n_jobs)
         self.weights = _check_weights(weights)
         self.outlier_label = outlier_label
 
diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py
index d3878cd54aa06..62d6cf33575e4 100644
--- a/sklearn/neighbors/_regression.py
+++ b/sklearn/neighbors/_regression.py
@@ -146,13 +146,12 @@ class KNeighborsRegressor(KNeighborsMixin,
     @_deprecate_positional_args
     def __init__(self, n_neighbors=5, *, weights='uniform',
                  algorithm='auto', leaf_size=30,
-                 p=2, metric='minkowski', metric_params=None, n_jobs=None,
-                 **kwargs):
+                 p=2, metric='minkowski', metric_params=None, n_jobs=None):
         super().__init__(
             n_neighbors=n_neighbors,
             algorithm=algorithm,
             leaf_size=leaf_size, metric=metric, p=p,
-            metric_params=metric_params, n_jobs=n_jobs, **kwargs)
+            metric_params=metric_params, n_jobs=n_jobs)
         self.weights = _check_weights(weights)
 
     def _more_tags(self):
@@ -346,14 +345,13 @@ class RadiusNeighborsRegressor(RadiusNeighborsMixin,
     @_deprecate_positional_args
     def __init__(self, radius=1.0, *, weights='uniform',
                  algorithm='auto', leaf_size=30,
-                 p=2, metric='minkowski', metric_params=None, n_jobs=None,
-                 **kwargs):
+                 p=2, metric='minkowski', metric_params=None, n_jobs=None):
         super().__init__(
             radius=radius,
             algorithm=algorithm,
             leaf_size=leaf_size,
             p=p, metric=metric, metric_params=metric_params,
-            n_jobs=n_jobs, **kwargs)
+            n_jobs=n_jobs)
         self.weights = _check_weights(weights)
 
     def fit(self, X, y):
diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index 8ce52119faa02..5df7a6419b0b5 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -978,7 +978,6 @@ def test_RadiusNeighborsRegressor_multioutput_with_uniform_weight():
 def test_RadiusNeighborsRegressor_multioutput(n_samples=40,
                                               n_features=5,
                                               n_test_pts=10,
-                                              n_neighbors=3,
                                               random_state=0):
     # Test k-neighbors in multi-output regression with various weight
     rng = np.random.RandomState(random_state)
@@ -991,8 +990,7 @@ def test_RadiusNeighborsRegressor_multioutput(n_samples=40,
     weights = ['uniform', 'distance', _weight_func]
 
     for algorithm, weights in product(ALGORITHMS, weights):
-        rnn = neighbors.RadiusNeighborsRegressor(n_neighbors=n_neighbors,
-                                                 weights=weights,
+        rnn = neighbors.RadiusNeighborsRegressor(weights=weights,
                                                  algorithm=algorithm)
         rnn.fit(X, y)
         epsilon = 1E-5 * (2 * rng.rand(1, n_features) - 1)

From 99754cd4f6525b41d489fd541453fbd0c2e02fd5 Mon Sep 17 00:00:00 2001
From: Haoyin Xu 
Date: Mon, 10 May 2021 08:06:35 -0400
Subject: [PATCH 382/478] DOC Add note for videos & improve syncing instructions (#20022)

Co-authored-by: Guillaume Lemaitre 
---
 doc/developers/contributing.rst | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst
index 2a7ec7afe48a4..0284ad179fc19 100644
--- a/doc/developers/contributing.rst
+++ b/doc/developers/contributing.rst
@@ -218,6 +218,13 @@ latest up-to-date workflow.
 
 `Transcript
 `__
 
+.. note::
+  In January 2021, the default branch name changed from ``master`` to ``main``
+  for the scikit-learn GitHub repository to use more inclusive terms.
+  These videos were created prior to the renaming of the branch.
+  For contributors who are viewing these videos to set up their
+  working environment and submitting a PR, ``master`` should be replaced to ``main``.
+
 How to contribute
 -----------------
 
@@ -274,12 +281,14 @@ You should now have a working installation of scikit-learn, and your git
 repository properly configured. The next steps now describe the process of
 modifying code and submitting a PR:
 
-7. Synchronize your main branch with the upstream main branch:
+7. Synchronize your ``main`` branch with the ``upstream/main`` branch,
+   more details on `GitHub Docs `_:
 
    .. prompt:: bash $
 
        git checkout main
-       git pull upstream main
+       git fetch upstream
+       git merge upstream/main
 
 8. Create a feature branch to hold your development changes:
 
@@ -416,12 +425,12 @@ complies with the following rules before marking a PR as ``[MRG]``. The
    verify the correct behavior of the fix or feature. In this manner, further
    modifications on the code base are granted to be consistent with the
    desired behavior. In the case of bug fixes, at the time of the PR, the
-   non-regression tests should fail for the code base in the main branch
+   non-regression tests should fail for the code base in the ``main`` branch
   and pass for the PR code.
 
 5. **Make sure that your PR does not add PEP8 violations**. To check the
    code that you changed, you can run the following command (see
-   :ref:`above ` to set up the upstream remote):
+   :ref:`above ` to set up the ``upstream`` remote):
 
    .. prompt:: bash $
 
@@ -934,7 +943,7 @@ scikit-learn. Make sure it is up to date:
 
 In the benchmark suite, the benchmarks are organized following the same
 structure as scikit-learn. For example, you can compare the performance of a
-specific estimator between upstream/main and the branch you are working on:
+specific estimator between ``upstream/main`` and the branch you are working on:
 
 ..
prompt:: bash $ From 6cc3afbe0221df9a0b51eedc714a8b46c21f5c50 Mon Sep 17 00:00:00 2001 From: Chuliang Xiao Date: Mon, 10 May 2021 09:57:19 -0400 Subject: [PATCH 383/478] CLN Replace self.steps[-1][-1] with self.steps[-1][1] (#20063) --- sklearn/pipeline.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 1c9a62d02b7d0..024bfe4f1dd38 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -416,7 +416,7 @@ def predict(self, X, **predict_params): Xt = X for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt) - return self.steps[-1][-1].predict(Xt, **predict_params) + return self.steps[-1][1].predict(Xt, **predict_params) @if_delegate_has_method(delegate='_final_estimator') def fit_predict(self, X, y=None, **fit_params): @@ -451,7 +451,7 @@ def fit_predict(self, X, y=None, **fit_params): fit_params_last_step = fit_params_steps[self.steps[-1][0]] with _print_elapsed_time('Pipeline', self._log_message(len(self.steps) - 1)): - y_pred = self.steps[-1][-1].fit_predict(Xt, y, + y_pred = self.steps[-1][1].fit_predict(Xt, y, **fit_params_last_step) return y_pred @@ -476,7 +476,7 @@ def predict_proba(self, X, **predict_proba_params): Xt = X for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt) - return self.steps[-1][-1].predict_proba(Xt, **predict_proba_params) + return self.steps[-1][1].predict_proba(Xt, **predict_proba_params) @if_delegate_has_method(delegate='_final_estimator') def decision_function(self, X): @@ -495,7 +495,7 @@ def decision_function(self, X): Xt = X for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt) - return self.steps[-1][-1].decision_function(Xt) + return self.steps[-1][1].decision_function(Xt) @if_delegate_has_method(delegate='_final_estimator') def score_samples(self, X): @@ -514,7 +514,7 @@ def score_samples(self, X): Xt = X for _, _, transformer in self._iter(with_final=False): Xt = transformer.transform(Xt) - return self.steps[-1][-1].score_samples(Xt) + return self.steps[-1][1].score_samples(Xt) @if_delegate_has_method(delegate='_final_estimator') def predict_log_proba(self, X, **predict_log_proba_params): @@ -537,7 +537,7 @@ def predict_log_proba(self, X, **predict_log_proba_params): Xt = X for _, name, transform in self._iter(with_final=False): Xt = transform.transform(Xt) - return self.steps[-1][-1].predict_log_proba( + return self.steps[-1][1].predict_log_proba( Xt, **predict_log_proba_params ) @@ -629,11 +629,11 @@ def score(self, X, y=None, sample_weight=None): score_params = {} if sample_weight is not None: score_params['sample_weight'] = sample_weight - return self.steps[-1][-1].score(Xt, y, **score_params) + return self.steps[-1][1].score(Xt, y, **score_params) @property def classes_(self): - return self.steps[-1][-1].classes_ + return self.steps[-1][1].classes_ def _more_tags(self): # check if first estimator expects pairwise input From ee2298213a74c0b120de5a5bd2b2f83a84a134d3 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Mon, 10 May 2021 12:11:49 -0400 Subject: [PATCH 384/478] TST Adjust atol in test_ridge_regression_check_arguments based on 32bit-ness (#20071) --- sklearn/linear_model/tests/test_ridge.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index b812788239b14..d83248cfae4af 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -5,6 +5,7 @@ import pytest +from sklearn.utils import _IS_32BIT from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_almost_equal @@ -1279,7 +1280,8 @@ def test_ridge_regression_check_arguments_validity(return_intercept, y += true_intercept X_testing = arr_type(X) - alpha, atol, tol = 1e-3, 1e-4, 1e-6 + alpha, tol = 1e-3, 1e-6 + atol = 1e-3 if _IS_32BIT else 1e-4 if solver not in ['sag', 'auto'] and return_intercept: with pytest.raises(ValueError, match="In Ridge, only 'sag' solver"): From 2bd3a4db529d707a9862d69cc1ddbcbe7a6054b8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 10 May 2021 22:10:21 +0200 Subject: [PATCH 385/478] ENH Consistent loss name for absolute error (#19733) --- doc/modules/ensemble.rst | 2 +- doc/whats_new/v1.0.rst | 29 ++++++++++++ sklearn/ensemble/_base.py | 12 ++--- sklearn/ensemble/_forest.py | 47 ++++++++++++------- sklearn/ensemble/_gb.py | 32 ++++++++----- sklearn/ensemble/_gb_losses.py | 3 +- .../gradient_boosting.py | 19 ++++++-- .../ensemble/_hist_gradient_boosting/loss.py | 2 +- .../tests/test_compare_lightgbm.py | 2 +- .../tests/test_gradient_boosting.py | 25 +++++----- .../tests/test_loss.py | 10 ++-- .../_hist_gradient_boosting/utils.pyx | 6 +-- sklearn/ensemble/tests/test_forest.py | 24 +++++++--- .../ensemble/tests/test_gradient_boosting.py | 30 +++++++----- sklearn/linear_model/_ransac.py | 24 +++++++--- sklearn/linear_model/tests/test_ransac.py | 12 +++-- sklearn/tree/_classes.py | 28 ++++++++--- sklearn/tree/tests/test_tree.py | 20 ++++---- 18 files changed, 224 insertions(+), 103 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 329215406c39c..21610228b9b37 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -944,7 +944,7 @@ controls the number of iterations of the boosting process:: 0.8965 Available losses for regression are 'squared_error', -'least_absolute_deviation', which is less sensitive to outliers, and +'absolute_error', which is less sensitive to outliers, and 'poisson', which is well suited to model counts and frequencies. For classification, 'binary_crossentropy' is used for binary classification and 'categorical_crossentropy' is used for multiclass classification. By default diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index d26c5dd0c347d..8ad8a295d72e0 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -76,6 +76,35 @@ Changelog - For :class:`tree.ExtraTreeRegressor`, `criterion="mse"` is deprecated, use `"squared_error"` instead which is now the default. +- |API| The option for using the absolute error via ``loss`` and + ``criterion`` parameters was made more consistent. The preferred way is by + setting the value to `"absolute_error"`. Old option names are still valid, + produce the same models, but are deprecated and will be removed in version + 1.2. + :pr:`19733` by :user:`Christian Lorentzen `. 
+ + - For :class:`ensemble.ExtraTreesRegressor`, `criterion="mae"` is deprecated, + use `"absolute_error"` instead. + + - For :class:`ensemble.GradientBoostingRegressor`, `loss="lad"` is deprecated, + use `"absolute_error"` instead. + + - For :class:`ensemble.RandomForestRegressor`, `criterion="mae"` is deprecated, + use `"absolute_error"` instead. + + - For :class:`ensemble.HistGradientBoostingRegressor`, + `loss="least_absolute_deviation"` is deprecated, use `"absolute_error"` + instead. + + - For :class:`linear_model.RANSACRegressor`, `loss="absolute_loss"` is + deprecated, use `"absolute_error"` instead which is now the default. + + - For :class:`tree.DecisionTreeRegressor`, `criterion="mae"` is deprecated, + use `"absolute_error"` instead. + + - For :class:`tree.ExtraTreeRegressor`, `criterion="mae"` is deprecated, + use `"absolute_error"` instead. + :mod:`sklearn.base` ................... diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 095d801de166d..c58a0c7dbe9c7 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -153,13 +153,13 @@ def _make_estimator(self, append=True, random_state=None): for p in self.estimator_params}) # TODO: Remove in v1.2 - # criterion "mse" would cause warnings in every call to + # criterion "mse" and "mae" would cause warnings in every call to # DecisionTreeRegressor.fit(..) - if ( - isinstance(estimator, (DecisionTreeRegressor, ExtraTreeRegressor)) - and getattr(estimator, "criterion", None) == "mse" - ): - estimator.set_params(criterion="squared_error") + if isinstance(estimator, (DecisionTreeRegressor, ExtraTreeRegressor)): + if getattr(estimator, "criterion", None) == "mse": + estimator.set_params(criterion="squared_error") + elif getattr(estimator, "criterion", None) == "mae": + estimator.set_params(criterion="absolute_error") if random_state is not None: _set_random_states(estimator, random_state) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 140c1c93e8eef..8eef1f3429227 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -346,16 +346,21 @@ def fit(self, X, y, sample_weight=None): # Check parameters self._validate_estimator() # TODO: Remove in v1.2 - if ( - isinstance(self, (RandomForestRegressor, ExtraTreesRegressor)) - and self.criterion == "mse" - ): - warn( - "Criterion 'mse' was deprecated in v1.0 and will be " - "removed in version 1.2. Use `criterion='squared_error'` " - "which is equivalent.", - FutureWarning - ) + if isinstance(self, (RandomForestRegressor, ExtraTreesRegressor)): + if self.criterion == "mse": + warn( + "Criterion 'mse' was deprecated in v1.0 and will be " + "removed in version 1.2. Use `criterion='squared_error'` " + "which is equivalent.", + FutureWarning + ) + elif self.criterion == "mae": + warn( + "Criterion 'mae' was deprecated in v1.0 and will be " + "removed in version 1.2. Use `criterion='absolute_error'` " + "which is equivalent.", + FutureWarning + ) if not self.bootstrap and self.oob_score: raise ValueError("Out of bag estimation only available" @@ -1321,11 +1326,12 @@ class RandomForestRegressor(ForestRegressor): The default value of ``n_estimators`` changed from 10 to 100 in 0.22. - criterion : {"squared_error", "mse", "mae"}, default="squared_error" + criterion : {"squared_error", "mse", "absolute_error", "mae"}, \ + default="squared_error" The function to measure the quality of a split. 
Supported criteria are "squared_error" for the mean squared error, which is equal to - variance reduction as feature selection criterion, and "mae" for the - mean absolute error. + variance reduction as feature selection criterion, and "absolute_error" + for the mean absolute error. .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion. @@ -1334,6 +1340,10 @@ class RandomForestRegressor(ForestRegressor): Criterion "mse" was deprecated in v1.0 and will be removed in version 1.2. Use `criterion="squared_error"` which is equivalent. + .. deprecated:: 1.0 + Criterion "mae" was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion="absolute_error"` which is equivalent. + max_depth : int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than @@ -1936,10 +1946,11 @@ class ExtraTreesRegressor(ForestRegressor): The default value of ``n_estimators`` changed from 10 to 100 in 0.22. - criterion : {"squared_error", "mse", "mae"}, default="squared_error" + criterion : {"squared_error", "mse", "absolute_error", "mae"}, \ + default="squared_error" The function to measure the quality of a split. Supported criteria - are "squared_error" and "mse" for the mean squared error, which is - equal to variance reduction as feature selection criterion, and "mae" + are "squared_error" for the mean squared error, which is equal to + variance reduction as feature selection criterion, and "absolute_error" for the mean absolute error. .. versionadded:: 0.18 @@ -1949,6 +1960,10 @@ class ExtraTreesRegressor(ForestRegressor): Criterion "mse" was deprecated in v1.0 and will be removed in version 1.2. Use `criterion="squared_error"` which is equivalent. + .. deprecated:: 1.0 + Criterion "mae" was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion="absolute_error"` which is equivalent. + max_depth : int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 4984575bce8c3..527bbcb559b5f 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -238,11 +238,17 @@ def _check_params(self): or self.loss not in _gb_losses.LOSS_FUNCTIONS): raise ValueError("Loss '{0:s}' not supported. ".format(self.loss)) + # TODO: Remove in v1.2 if self.loss == "ls": warnings.warn("The loss 'ls' was deprecated in v1.0 and " "will be removed in version 1.2. Use 'squared_error'" " which is equivalent.", FutureWarning) + elif self.loss == "lad": + warnings.warn("The loss 'lad' was deprecated in v1.0 and " + "will be removed in version 1.2. Use " + "'absolute_error' which is equivalent.", + FutureWarning) if self.loss == 'deviance': loss_class = (_gb_losses.MultinomialDeviance @@ -403,7 +409,7 @@ def fit(self, X, y, sample_weight=None, monitor=None): ------- self : object """ - if self.criterion == 'mae': + if self.criterion in ('absolute_error', 'mae'): # TODO: This should raise an error from 1.1 self._warn_mae_for_criterion() @@ -1340,19 +1346,22 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): Parameters ---------- - loss : {'squared_error', 'ls', 'lad', 'huber', 'quantile'}, \ - default='squared_error' + loss : {'squared_error', 'ls', 'absolute_error', 'lad', 'huber', \ + 'quantile'}, default='squared_error' Loss function to be optimized. 'squared_error' refers to the squared - error for regression. 
- 'lad' (least absolute deviation) is a highly robust - loss function solely based on order information of the input - variables. 'huber' is a combination of the two. 'quantile' - allows quantile regression (use `alpha` to specify the quantile). + error for regression. 'absolute_error' refers to the absolute error of + regression and is a robust loss function. 'huber' is a + combination of the two. 'quantile' allows quantile regression (use + `alpha` to specify the quantile). .. deprecated:: 1.0 The loss 'ls' was deprecated in v1.0 and will be removed in version 1.2. Use `loss='squared_error'` which is equivalent. + .. deprecated:: 1.0 + The loss 'lad' was deprecated in v1.0 and will be removed in + version 1.2. Use `loss='absolute_error'` which is equivalent. + learning_rate : float, default=0.1 Learning rate shrinks the contribution of each tree by `learning_rate`. There is a trade-off between learning_rate and n_estimators. @@ -1383,7 +1392,7 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): .. deprecated:: 0.24 `criterion='mae'` is deprecated and will be removed in version 1.1 (renaming of 0.26). The correct way of minimizing the absolute - error is to use `loss='lad'` instead. + error is to use `loss='absolute_error'` instead. .. deprecated:: 1.0 Criterion 'mse' was deprecated in v1.0 and will be removed in @@ -1644,7 +1653,8 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): """ # TODO: remove "ls" in verion 1.2 - _SUPPORTED_LOSS = ("squared_error", 'ls', 'lad', 'huber', 'quantile') + _SUPPORTED_LOSS = ("squared_error", 'ls', "absolute_error", 'lad', 'huber', + 'quantile') @_deprecate_positional_args def __init__(self, *, loss="squared_error", learning_rate=0.1, @@ -1681,7 +1691,7 @@ def _warn_mae_for_criterion(self): warnings.warn("criterion='mae' was deprecated in version 0.24 and " "will be removed in version 1.1 (renaming of 0.26). The " "correct way of minimizing the absolute error is to use " - " loss='lad' instead.", FutureWarning) + " loss='absolute_error' instead.", FutureWarning) def predict(self, X): """Predict regression target for X. diff --git a/sklearn/ensemble/_gb_losses.py b/sklearn/ensemble/_gb_losses.py index f33c7086b596b..67a3b1b364f47 100644 --- a/sklearn/ensemble/_gb_losses.py +++ b/sklearn/ensemble/_gb_losses.py @@ -856,10 +856,11 @@ def get_init_raw_predictions(self, X, estimator): return raw_predictions.reshape(-1, 1).astype(np.float64) -# TODO: Remove entry 'ls' in version 1.2. +# TODO: Remove entry 'ls' and 'lad' in version 1.2. LOSS_FUNCTIONS = { "squared_error": LeastSquaresError, 'ls': LeastSquaresError, + "absolute_error": LeastAbsoluteError, 'lad': LeastAbsoluteError, 'huber': HuberLossFunction, 'quantile': QuantileLossFunction, diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index d3b62a5df784a..6d5de978add9b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -893,8 +893,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): Parameters ---------- - loss : {'squared_error', 'least_squares', 'least_absolute_deviation', \ - 'poisson'}, default='squared_error' + loss : {'squared_error', 'least_squares', 'absolute_error', \ + 'least_absolute_deviation', 'poisson'}, default='squared_error' The loss function to use in the boosting process. 
Note that the "least squares" and "poisson" losses actually implement "half least squares loss" and "half poisson deviance" to simplify the @@ -908,6 +908,11 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): The loss 'least_squares' was deprecated in v1.0 and will be removed in version 1.2. Use `loss='squared_error'` which is equivalent. + .. deprecated:: 1.0 + The loss 'least_absolute_deviation' was deprecated in v1.0 and will + be removed in version 1.2. Use `loss='absolute_error'` which is + equivalent. + learning_rate : float, default=0.1 The learning rate, also known as *shrinkage*. This is used as a multiplicative factor for the leaves values. Use ``1`` for no @@ -1037,7 +1042,7 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): 0.92... """ - _VALID_LOSSES = ('squared_error', 'least_squares', + _VALID_LOSSES = ('squared_error', 'least_squares', 'absolute_error', 'least_absolute_deviation', 'poisson') @_deprecate_positional_args @@ -1113,6 +1118,7 @@ def _encode_y(self, y): return y def _get_loss(self, sample_weight): + # TODO: Remove in v1.2 if self.loss == "least_squares": warnings.warn( "The loss 'least_squares' was deprecated in v1.0 and will be " @@ -1120,6 +1126,13 @@ def _get_loss(self, sample_weight): "equivalent.", FutureWarning) return _LOSSES["squared_error"](sample_weight=sample_weight) + elif self.loss == "least_absolute_deviation": + warnings.warn( + "The loss 'least_absolute_deviation' was deprecated in v1.0 " + " and will be removed in version 1.2. Use 'absolute_error' " + "which is equivalent.", + FutureWarning) + return _LOSSES["absolute_error"](sample_weight=sample_weight) return _LOSSES[self.loss](sample_weight=sample_weight) diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index c336bd347e4cf..036f075bdabd8 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -420,7 +420,7 @@ def predict_proba(self, raw_predictions): _LOSSES = { 'squared_error': LeastSquares, - 'least_absolute_deviation': LeastAbsoluteDeviation, + 'absolute_error': LeastAbsoluteDeviation, 'binary_crossentropy': BinaryCrossEntropy, 'categorical_crossentropy': CategoricalCrossEntropy, 'poisson': Poisson, diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py index f34dffab2671c..ac58f39422687 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py @@ -34,7 +34,7 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples, # and max_leaf_nodes is low enough. # - To ignore discrepancies caused by small differences the binning # strategy, data is pre-binned if n_samples > 255. - # - We don't check the least_absolute_deviation loss here. This is because + # - We don't check the absolute_error loss here. This is because # LightGBM's computation of the median (used for the initial value of # raw_prediction) is a bit off (they'll e.g. return midpoints when there # is no need to.). 
Since these tests only run 1 iteration, the diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index b2322f29f85d1..213d46cf58f04 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -192,26 +192,26 @@ def test_should_stop(scores, n_iter_no_change, tol, stopping): assert gbdt._should_stop(scores) == stopping -def test_least_absolute_deviation(): +def test_absolute_error(): # For coverage only. X, y = make_regression(n_samples=500, random_state=0) - gbdt = HistGradientBoostingRegressor(loss='least_absolute_deviation', + gbdt = HistGradientBoostingRegressor(loss='absolute_error', random_state=0) gbdt.fit(X, y) assert gbdt.score(X, y) > .9 -def test_least_absolute_deviation_sample_weight(): +def test_absolute_error_sample_weight(): # non regression test for issue #19400 # make sure no error is thrown during fit of - # HistGradientBoostingRegressor with least_absolute_deviation loss function + # HistGradientBoostingRegressor with absolute_error loss function # and passing sample_weight rng = np.random.RandomState(0) n_samples = 100 X = rng.uniform(-1, 1, size=(n_samples, 2)) y = rng.uniform(-1, 1, size=n_samples) sample_weight = rng.uniform(0, 1, size=n_samples) - gbdt = HistGradientBoostingRegressor(loss='least_absolute_deviation') + gbdt = HistGradientBoostingRegressor(loss='absolute_error') gbdt.fit(X, y, sample_weight=sample_weight) @@ -650,8 +650,7 @@ def test_sample_weight_effect(problem, duplication): est_dup._raw_predict(X_dup)) -@pytest.mark.parametrize('loss_name', ('squared_error', - 'least_absolute_deviation')) +@pytest.mark.parametrize('loss_name', ('squared_error', 'absolute_error')) def test_sum_hessians_are_sample_weight(loss_name): # For losses with constant hessians, the sum_hessians field of the # histograms must be equal to the sum of the sample weight of samples at @@ -993,14 +992,18 @@ def test_uint8_predict(Est): # TODO: Remove in v1.2 -def test_loss_least_squares_deprecated(): +@pytest.mark.parametrize("old_loss, new_loss", [ + ("least_squares", "squared_error"), + ("least_absolute_deviation", "absolute_error"), +]) +def test_loss_deprecated(old_loss, new_loss): X, y = make_regression(n_samples=50, random_state=0) - est1 = HistGradientBoostingRegressor(loss="least_squares", random_state=0) + est1 = HistGradientBoostingRegressor(loss=old_loss, random_state=0) with pytest.warns(FutureWarning, - match="The loss 'least_squares' was deprecated"): + match=f"The loss '{old_loss}' was deprecated"): est1.fit(X, y) - est2 = HistGradientBoostingRegressor(loss="squared_error", random_state=0) + est2 = HistGradientBoostingRegressor(loss=new_loss, random_state=0) est2.fit(X, y) assert_allclose(est1.predict(X), est2.predict(X)) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index ce7b4acedbae5..345e72c642668 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -103,7 +103,7 @@ def fprime2(x: np.ndarray) -> np.ndarray: @pytest.mark.parametrize('loss, n_classes, prediction_dim', [ ("squared_error", 0, 1), - ('least_absolute_deviation', 0, 1), + ("absolute_error", 0, 1), ('binary_crossentropy', 2, 1), ('categorical_crossentropy', 3, 3), ('poisson', 0, 1), @@ -118,7 +118,7 @@ def 
test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): rng = np.random.RandomState(seed) n_samples = 100 - if loss in ("squared_error", 'least_absolute_deviation'): + if loss in ("squared_error", "absolute_error"): y_true = rng.normal(size=n_samples).astype(Y_DTYPE) elif loss in ('poisson'): y_true = rng.poisson(size=n_samples).astype(Y_DTYPE) @@ -172,10 +172,10 @@ def test_baseline_least_squares(): baseline_prediction) -def test_baseline_least_absolute_deviation(): +def test_baseline_absolute_error(): rng = np.random.RandomState(0) - loss = _LOSSES['least_absolute_deviation'](sample_weight=None) + loss = _LOSSES["absolute_error"](sample_weight=None) y_train = rng.normal(size=100) baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) assert baseline_prediction.shape == tuple() # scalar @@ -256,7 +256,7 @@ def test_baseline_categorical_crossentropy(): @pytest.mark.parametrize('loss, problem', [ ("squared_error", 'regression'), - ('least_absolute_deviation', 'regression'), + ("absolute_error", 'regression'), ('binary_crossentropy', 'classification'), ('categorical_crossentropy', 'classification'), ('poisson', 'poisson_regression'), diff --git a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx index d1168acf94835..3b323b3e298b8 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/utils.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/utils.pyx @@ -43,7 +43,7 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): lightgbm_loss_mapping = { 'squared_error': 'regression_l2', - 'least_absolute_deviation': 'regression_l1', + 'absolute_error': 'regression_l1', 'binary_crossentropy': 'binary', 'categorical_crossentropy': 'multiclass' } @@ -76,7 +76,7 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): # XGB xgboost_loss_mapping = { 'squared_error': 'reg:linear', - 'least_absolute_deviation': 'LEAST_ABSOLUTE_DEV_NOT_SUPPORTED', + 'absolute_error': 'LEAST_ABSOLUTE_DEV_NOT_SUPPORTED', 'binary_crossentropy': 'reg:logistic', 'categorical_crossentropy': 'multi:softmax' } @@ -101,7 +101,7 @@ def get_equivalent_estimator(estimator, lib='lightgbm'): catboost_loss_mapping = { 'squared_error': 'RMSE', # catboost does not support MAE when leaf_estimation_method is Newton - 'least_absolute_deviation': 'LEAST_ASBOLUTE_DEV_NOT_SUPPORTED', + 'absolute_error': 'LEAST_ASBOLUTE_DEV_NOT_SUPPORTED', 'binary_crossentropy': 'Logloss', 'categorical_crossentropy': 'MultiClass' } diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index b6c1fea0e2f29..c74a1ca0c603e 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -176,7 +176,9 @@ def check_regression_criterion(name, criterion): @pytest.mark.parametrize('name', FOREST_REGRESSORS) -@pytest.mark.parametrize('criterion', ("squared_error", "mae", "friedman_mse")) +@pytest.mark.parametrize('criterion', ( + "squared_error", "absolute_error", "friedman_mse" +)) def test_regression(name, criterion): check_regression_criterion(name, criterion) @@ -261,10 +263,14 @@ def check_importances(name, criterion, dtype, tolerance): itertools.chain(product(FOREST_CLASSIFIERS, ["gini", "entropy"]), product(FOREST_REGRESSORS, - ["squared_error", "friedman_mse", "mae"]))) + [ + "squared_error", + "friedman_mse", + "absolute_error" + ]))) def test_importances(dtype, name, criterion): tolerance = 0.01 - if name in FOREST_REGRESSORS and criterion == "mae": + if name in FOREST_REGRESSORS and criterion == 
"absolute_error": tolerance = 0.05 check_importances(name, criterion, dtype, tolerance) @@ -1498,14 +1504,18 @@ def test_n_features_deprecation(Estimator): # TODO: Remove in v1.2 -def test_mse_deprecated(): - est1 = RandomForestRegressor(criterion="mse", random_state=0) +@pytest.mark.parametrize("old_criterion, new_criterion", [ + ("mse", "squared_error"), + ("mae", "absolute_error"), +]) +def test_criterion_deprecated(old_criterion, new_criterion): + est1 = RandomForestRegressor(criterion=old_criterion, random_state=0) with pytest.warns(FutureWarning, - match="Criterion 'mse' was deprecated"): + match=f"Criterion '{old_criterion}' was deprecated"): est1.fit(X, y) - est2 = RandomForestRegressor(criterion="squared_error", random_state=0) + est2 = RandomForestRegressor(criterion=new_criterion, random_state=0) est2.fit(X, y) assert_allclose(est1.predict(X), est2.predict(X)) diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 166d6bdfc5c11..30c0cdc0cc8fd 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -133,7 +133,7 @@ def test_gbdt_loss_alpha_error(params, err_msg): @pytest.mark.parametrize( "GradientBoosting, loss", [(GradientBoostingClassifier, "ls"), - (GradientBoostingClassifier, "lad"), + (GradientBoostingClassifier, "absolute_error"), (GradientBoostingClassifier, "quantile"), (GradientBoostingClassifier, "huber"), (GradientBoostingRegressor, "deviance"), @@ -171,7 +171,7 @@ def test_classification_synthetic(loss): assert error_rate < 0.08 -@pytest.mark.parametrize('loss', ('squared_error', 'lad', 'huber')) +@pytest.mark.parametrize('loss', ('squared_error', 'absolute_error', 'huber')) @pytest.mark.parametrize('subsample', (1.0, 0.5)) def test_regression_dataset(loss, subsample): # Check consistency on regression dataset with least squares @@ -508,7 +508,7 @@ def test_degenerate_targets(): def test_quantile_loss(): - # Check if quantile loss with alpha=0.5 equals lad. + # Check if quantile loss with alpha=0.5 equals absolute_error. 
clf_quantile = GradientBoostingRegressor(n_estimators=100, loss='quantile', max_depth=4, alpha=0.5, random_state=7) @@ -516,12 +516,12 @@ def test_quantile_loss(): clf_quantile.fit(X_reg, y_reg) y_quantile = clf_quantile.predict(X_reg) - clf_lad = GradientBoostingRegressor(n_estimators=100, loss='lad', - max_depth=4, random_state=7) + clf_ae = GradientBoostingRegressor(n_estimators=100, loss='absolute_error', + max_depth=4, random_state=7) - clf_lad.fit(X_reg, y_reg) - y_lad = clf_lad.predict(X_reg) - assert_array_almost_equal(y_quantile, y_lad, decimal=4) + clf_ae.fit(X_reg, y_reg) + y_ae = clf_ae.predict(X_reg) + assert_array_almost_equal(y_quantile, y_ae, decimal=4) def test_symbol_labels(): @@ -1067,7 +1067,7 @@ def test_non_uniform_weights_toy_edge_case_reg(): y = [0, 0, 1, 0] # ignore the first 2 training samples by setting their weight to 0 sample_weight = [0, 0, 1, 1] - for loss in ('huber', 'squared_error', 'lad', 'quantile'): + for loss in ('huber', 'squared_error', 'absolute_error', 'quantile'): gb = GradientBoostingRegressor(learning_rate=1.0, n_estimators=2, loss=loss) gb.fit(X, y, sample_weight=sample_weight) @@ -1390,13 +1390,17 @@ def test_criterion_mse_deprecated(Estimator): # TODO: Remove in v1.2 -def test_loss_ls_deprecated(): - est1 = GradientBoostingRegressor(loss="ls", random_state=0) +@pytest.mark.parametrize("old_loss, new_loss", [ + ("ls", "squared_error"), + ("lad", "absolute_error"), +]) +def test_loss_deprecated(old_loss, new_loss): + est1 = GradientBoostingRegressor(loss=old_loss, random_state=0) with pytest.warns(FutureWarning, - match="The loss 'ls' was deprecated"): + match=f"The loss '{old_loss}' was deprecated"): est1.fit(X, y) - est2 = GradientBoostingRegressor(loss="squared_error", random_state=0) + est2 = GradientBoostingRegressor(loss=new_loss, random_state=0) est2.fit(X, y) assert_allclose(est1.predict(X), est2.predict(X)) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 2fc8143f432c8..3cde1f1235ec8 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -137,9 +137,9 @@ class RANSACRegressor(MetaEstimatorMixin, RegressorMixin, as 0.99 (the default) and e is the current fraction of inliers w.r.t. the total number of samples. - loss : string, callable, default='absolute_loss' - String inputs, 'absolute_loss' and 'squared_error' are supported which - find the absolute loss and squared error per sample respectively. + loss : string, callable, default='absolute_error' + String inputs, 'absolute_error' and 'squared_error' are supported which + find the absolute error and squared error per sample respectively. If ``loss`` is a callable, then it should be a function that takes two arrays as inputs, the true and predicted value and returns a 1-D @@ -155,6 +155,10 @@ class RANSACRegressor(MetaEstimatorMixin, RegressorMixin, The loss 'squared_loss' was deprecated in v1.0 and will be removed in version 1.2. Use `loss='squared_error'` which is equivalent. + .. deprecated:: 1.0 + The loss 'absolute_loss' was deprecated in v1.0 and will be removed + in version 1.2. Use `loss='absolute_error'` which is equivalent. + random_state : int, RandomState instance, default=None The generator used to initialize the centers. Pass an int for reproducible output across multiple function calls. 
@@ -212,7 +216,7 @@ def __init__(self, base_estimator=None, *, min_samples=None, residual_threshold=None, is_data_valid=None, is_model_valid=None, max_trials=100, max_skips=np.inf, stop_n_inliers=np.inf, stop_score=np.inf, - stop_probability=0.99, loss='absolute_loss', + stop_probability=0.99, loss='absolute_error', random_state=None): self.base_estimator = base_estimator @@ -293,7 +297,15 @@ def fit(self, X, y, sample_weight=None): else: residual_threshold = self.residual_threshold - if self.loss == "absolute_loss": + # TODO: Remove absolute_loss in v1.2. + if self.loss in ("absolute_error", "absolute_loss"): + if self.loss == "absolute_loss": + warnings.warn( + "The loss 'absolute_loss' was deprecated in v1.0 and will " + "be removed in version 1.2. Use `loss='absolute_error'` " + "which is equivalent.", + FutureWarning + ) if y.ndim == 1: loss_function = lambda y_true, y_pred: np.abs(y_true - y_pred) else: @@ -319,7 +331,7 @@ def fit(self, X, y, sample_weight=None): else: raise ValueError( - "loss should be 'absolute_loss', 'squared_error' or a " + "loss should be 'absolute_error', 'squared_error' or a " "callable. Got %s. " % self.loss) random_state = check_random_state(self.random_state) diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index 857696bf387d5..071a67efcf28f 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -539,13 +539,17 @@ def test_ransac_final_model_fit_sample_weight(): # TODO: Remove in v1.2 -def test_loss_squared_loss_deprecated(): - est1 = RANSACRegressor(loss="squared_loss", random_state=0) +@pytest.mark.parametrize("old_loss, new_loss", [ + ("absolute_loss", "squared_error"), + ("squared_loss", "absolute_error"), +]) +def test_loss_deprecated(old_loss, new_loss): + est1 = RANSACRegressor(loss=old_loss, random_state=0) with pytest.warns(FutureWarning, - match="The loss 'squared_loss' was deprecated"): + match=f"The loss '{old_loss}' was deprecated"): est1.fit(X, y) - est2 = RANSACRegressor(loss="squared_error", random_state=0) + est2 = RANSACRegressor(loss=new_loss, random_state=0) est2.fit(X, y) assert_allclose(est1.predict(X), est2.predict(X)) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 420292881f7db..de5aebfa8a6e3 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -62,10 +62,11 @@ CRITERIA_CLF = {"gini": _criterion.Gini, "entropy": _criterion.Entropy} -# TODO: Remove "mse" in version 1.2. +# TODO: Remove "mse" and "mae" in version 1.2. CRITERIA_REG = {"squared_error": _criterion.MSE, "mse": _criterion.MSE, "friedman_mse": _criterion.FriedmanMSE, + "absolute_error": _criterion.MAE, "mae": _criterion.MAE, "poisson": _criterion.Poisson} @@ -360,6 +361,13 @@ def fit(self, X, y, sample_weight=None, check_input=True, "which is equivalent.", FutureWarning ) + elif self.criterion == "mae": + warnings.warn( + "Criterion 'mae' was deprecated in v1.0 and will be " + "removed in version 1.2. 
Use `criterion='absolute_error'` " + "which is equivalent.", + FutureWarning + ) else: # Make a deepcopy in case the criterion has mutable attributes that # might be shared and modified concurrently during parallel fitting @@ -1001,16 +1009,16 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): Parameters ---------- - criterion : {"squared_error", "mse", "friedman_mse", "mae", "poisson"}, \ - default="squared_error" + criterion : {"squared_error", "mse", "friedman_mse", "absolute_error", \ + "mae", "poisson"}, default="squared_error" The function to measure the quality of a split. Supported criteria are "squared_error" for the mean squared error, which is equal to variance reduction as feature selection criterion and minimizes the L2 loss using the mean of each terminal node, "friedman_mse", which uses mean squared error with Friedman's improvement score for potential - splits, "mae" for the mean absolute error, which minimizes the L1 loss - using the median of each terminal node, and "poisson" which uses - reduction in Poisson deviance to find splits. + splits, "absolute_error" for the mean absolute error, which minimizes + the L1 loss using the median of each terminal node, and "poisson" which + uses reduction in Poisson deviance to find splits. .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion. @@ -1022,6 +1030,10 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): Criterion "mse" was deprecated in v1.0 and will be removed in version 1.2. Use `criterion="squared_error"` which is equivalent. + .. deprecated:: 1.0 + Criterion "mae" was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion="absolute_error"` which is equivalent. + splitter : {"best", "random"}, default="best" The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose @@ -1577,6 +1589,10 @@ class ExtraTreeRegressor(DecisionTreeRegressor): Criterion "mse" was deprecated in v1.0 and will be removed in version 1.2. Use `criterion="squared_error"` which is equivalent. + .. deprecated:: 1.0 + Criterion "mae" was deprecated in v1.0 and will be removed in + version 1.2. Use `criterion="absolute_error"` which is equivalent. + splitter : {"random", "best"}, default="random" The strategy used to choose the split at each node. 
Supported strategies are "best" to choose the best split and "random" to choose diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 2a1da1e2bfce0..a6e30a9941756 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -51,7 +51,7 @@ from sklearn.utils import compute_sample_weight CLF_CRITERIONS = ("gini", "entropy") -REG_CRITERIONS = ("squared_error", "mae", "friedman_mse", "poisson") +REG_CRITERIONS = ("squared_error", "absolute_error", "friedman_mse", "poisson") CLF_TREES = { "DecisionTreeClassifier": DecisionTreeClassifier, @@ -294,7 +294,7 @@ def test_diabetes_overfit(name, Tree, criterion): @pytest.mark.parametrize( "criterion, max_depth, metric, max_loss", [("squared_error", 15, mean_squared_error, 60), - ("mae", 20, mean_squared_error, 60), + ("absolute_error", 20, mean_squared_error, 60), ("friedman_mse", 15, mean_squared_error, 60), ("poisson", 15, mean_poisson_deviance, 30)] ) @@ -1772,7 +1772,7 @@ def test_mae(): = 0.75 ------ """ - dt_mae = DecisionTreeRegressor(random_state=0, criterion="mae", + dt_mae = DecisionTreeRegressor(random_state=0, criterion="absolute_error", max_leaf_nodes=2) # Test MAE where sample weights are non-uniform (as illustrated above): @@ -2121,12 +2121,16 @@ def test_X_idx_sorted_deprecated(TreeEstimator): # TODO: Remove in v1.2 @pytest.mark.parametrize("Tree", REG_TREES.values()) -def test_mse_deprecated(Tree): - tree = Tree(criterion="mse") +@pytest.mark.parametrize("old_criterion, new_criterion", [ + ("mse", "squared_error"), + ("mae", "absolute_error"), +]) +def test_criterion_deprecated(Tree, old_criterion, new_criterion): + tree = Tree(criterion=old_criterion) with pytest.warns(FutureWarning, - match="Criterion 'mse' was deprecated"): + match=f"Criterion '{old_criterion}' was deprecated"): tree.fit(X, y) - tree_sqer = Tree(criterion="squared_error").fit(X, y) - assert_allclose(tree.predict(X), tree_sqer.predict(X)) + tree_new = Tree(criterion=new_criterion).fit(X, y) + assert_allclose(tree.predict(X), tree_new.predict(X)) From 5e85941bd66e651c17ce8d37a24e52fd0b00af28 Mon Sep 17 00:00:00 2001 From: kmatt10 Date: Tue, 11 May 2021 16:10:41 +0800 Subject: [PATCH 386/478] DOC Added return value info to SimpleImputer.transform docstring (#20005) Co-authored-by: Thomas J. Fan Co-authored-by: Guillaume Lemaitre --- sklearn/impute/_base.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 46f2301a1879d..e345fe44f0895 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -426,6 +426,12 @@ def transform(self, X): ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data to complete. + + Returns + ------- + X_imputed : {ndarray, sparse matrix} of shape \ + (n_samples, n_features_out) + `X` with imputed values. 
""" check_is_fitted(self) From 6d67937b3ce28fd3fc966d3d417df56c08c98502 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Tue, 11 May 2021 11:08:13 +0200 Subject: [PATCH 387/478] cln deprecations fixes (#19323) Co-authored-by: Olivier Grisel --- sklearn/utils/fixes.py | 11 ----------- sklearn/utils/tests/test_fixes.py | 6 ------ 2 files changed, 17 deletions(-) diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index a5a455ee7b9a1..13ecba4afc472 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -18,10 +18,7 @@ import scipy import scipy.stats from scipy.sparse.linalg import lsqr as sparse_lsqr # noqa -from numpy.ma import MaskedArray as _MaskedArray # TODO: remove in 1.0 from .._config import config_context, get_config - -from .deprecation import deprecated from ..externals._packaging.version import parse as parse_version @@ -151,14 +148,6 @@ class loguniform(scipy.stats.reciprocal): """ -@deprecated( - 'MaskedArray is deprecated in version 0.23 and will be removed in version ' - '1.0 (renaming of 0.25). Use numpy.ma.MaskedArray instead.' -) -class MaskedArray(_MaskedArray): - pass # TODO: remove in 1.0 - - def _take_along_axis(arr, indices, axis): """Implements a simplified version of np.take_along_axis if numpy version < 1.15""" diff --git a/sklearn/utils/tests/test_fixes.py b/sklearn/utils/tests/test_fixes.py index 03e11f5bc1a08..bcd57379fcff6 100644 --- a/sklearn/utils/tests/test_fixes.py +++ b/sklearn/utils/tests/test_fixes.py @@ -14,7 +14,6 @@ from sklearn.utils.fixes import _joblib_parallel_args from sklearn.utils.fixes import _object_dtype_isnan from sklearn.utils.fixes import loguniform -from sklearn.utils.fixes import MaskedArray from sklearn.utils.fixes import linspace, parse_version, np_version @@ -87,11 +86,6 @@ def test_loguniform(low, high, base): ) -def test_masked_array_deprecated(): # TODO: remove in 1.0 - with pytest.warns(FutureWarning, match='is deprecated'): - MaskedArray() - - def test_linspace(): """Test that linespace works like np.linespace as of numpy version 1.16.""" start, stop = 0, 10 From 8635580e5cec7afeab4d8d647a705d66ac5c4adc Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 11 May 2021 15:46:10 +0200 Subject: [PATCH 388/478] DOC Add communication guidelines. (#20048) * Add communication guidelines. * Address comments. Fix sphinx warning. * Some clarification. * Update doc/developers/contributing.rst Co-authored-by: Nicolas Hug * Address comments. * Address comments. * Update doc/developers/contributing.rst Co-authored-by: Nicolas Hug Co-authored-by: Nicolas Hug --- doc/developers/contributing.rst | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 0284ad179fc19..c808a806b3076 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -1262,6 +1262,38 @@ from high-level questions to a more detailed check-list. :ref:`saved_replies` includes some frequent comments that reviewers may make. +.. _communication: + +Communication Guidelines +------------------------ + +Reviewing open pull requests (PRs) helps move the project forward. It is a +great way to get familiar with the codebase and should motivate the +contributor to keep involved in the project. [1]_ + +- Every PR, good or bad, is an act of generosity. 
Opening with a positive + comment will help the author feel rewarded, and your subsequent remarks may + be heard more clearly. You may feel good also. +- Begin if possible with the large issues, so the author knows they’ve been + understood. Resist the temptation to immediately go line by line, or to open + with small pervasive issues. +- Do not let perfect be the enemy of the good. If you find yourself making + many small suggestions that are a matter of subjective taste rather than + somewhat objective, the following approaches are suggested: + + - refrain from submitting these; + - prefix them as "Nit" so that the contributor knows it's OK not to address; + - follow up in a subsequent PR, out of courtesy, you may want to let the + original contributor know. + +- Do not rush, take the time to make your comments clear and justify your + suggestions. +- You are the face of the project. Bad days occur to everyone, in that + occasion you deserve a break: try to take your time and stay offline. + +.. [1] Adapted from the numpy `communication guidelines + `_. + Reading the existing code base ============================== From 1991531116a2cde1d44a7390a598cfff2957d010 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Tue, 11 May 2021 22:38:11 +0200 Subject: [PATCH 389/478] MNT Clean deprecations for 1.0 | SGD (#19320) Co-authored-by: Olivier Grisel --- sklearn/linear_model/_stochastic_gradient.py | 49 ------------------- .../tests/test_passive_aggressive.py | 14 ------ sklearn/linear_model/tests/test_sgd.py | 13 ----- 3 files changed, 76 deletions(-) diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 44ecf564ffcc5..92b02155246df 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -37,7 +37,6 @@ from ._sgd_fast import EpsilonInsensitive from ._sgd_fast import SquaredEpsilonInsensitive from ..utils.fixes import _joblib_parallel_args -from ..utils import deprecated LEARNING_RATE_TYPES = {"constant": 1, "optimal": 2, "invscaling": 3, "adaptive": 4, "pa1": 5, "pa2": 6} @@ -309,39 +308,6 @@ def _make_validation_score_cb(self, validation_mask, X, y, sample_weight, self, X[validation_mask], y[validation_mask], sample_weight[validation_mask], classes=classes) - # mypy error: Decorated property not supported - @deprecated("Attribute standard_coef_ was deprecated " # type: ignore - "in version 0.23 and will be removed in 1.0 " - "(renaming of 0.25).") - @property - def standard_coef_(self): - return self._standard_coef - - # mypy error: Decorated property not supported - @deprecated( # type: ignore - "Attribute standard_intercept_ was deprecated " - "in version 0.23 and will be removed in 1.0 (renaming of 0.25)." - ) - @property - def standard_intercept_(self): - return self._standard_intercept - - # mypy error: Decorated property not supported - @deprecated("Attribute average_coef_ was deprecated " # type: ignore - "in version 0.23 and will be removed in 1.0 " - "(renaming of 0.25).") - @property - def average_coef_(self): - return self._average_coef - - # mypy error: Decorated property not supported - @deprecated("Attribute average_intercept_ was deprecated " # type: ignore - "in version 0.23 and will be removed in 1.0 " - "(renaming of 0.25).") - @property - def average_intercept_(self): - return self._average_intercept - def _prepare_fit_binary(est, y, i): """Initialization for fit_binary. 
@@ -1570,21 +1536,6 @@ class SGDRegressor(BaseSGDRegressor): intercept_ : ndarray of shape (1,) The intercept term. - average_coef_ : ndarray of shape (n_features,) - Averaged weights assigned to the features. Only available - if ``average=True``. - - .. deprecated:: 0.23 - Attribute ``average_coef_`` was deprecated - in version 0.23 and will be removed in 1.0 (renaming of 0.25). - - average_intercept_ : ndarray of shape (1,) - The averaged intercept term. Only available if ``average=True``. - - .. deprecated:: 0.23 - Attribute ``average_intercept_`` was deprecated - in version 0.23 and will be removed in 1.0 (renaming of 0.25). - n_iter_ : int The actual number of iterations before reaching the stopping criterion. diff --git a/sklearn/linear_model/tests/test_passive_aggressive.py b/sklearn/linear_model/tests/test_passive_aggressive.py index d0d099eeacc8d..251e4408464e2 100644 --- a/sklearn/linear_model/tests/test_passive_aggressive.py +++ b/sklearn/linear_model/tests/test_passive_aggressive.py @@ -270,17 +270,3 @@ def test_regressor_undefined_methods(): for meth in ("transform",): with pytest.raises(AttributeError): getattr(reg, meth) - - -# TODO: remove in 1.0 -@pytest.mark.parametrize('klass', [PassiveAggressiveClassifier, - PassiveAggressiveRegressor]) -def test_passive_aggressive_deprecated_attr(klass): - est = klass(average=True) - est.fit(X, y) - - msg = "Attribute {} was deprecated" - for att in ['average_coef_', 'average_intercept_', - 'standard_coef_', 'standard_intercept_']: - with pytest.warns(FutureWarning, match=msg.format(att)): - getattr(est, att) diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 8465631828613..1fcf99997a031 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -299,19 +299,6 @@ def test_plain_has_no_average_attr(klass): assert not hasattr(clf, '_standard_coef') -# TODO: remove in 1.0 -@pytest.mark.parametrize('klass', [SGDClassifier, SGDRegressor]) -def test_sgd_deprecated_attr(klass): - est = klass(average=True, eta0=.01) - est.fit(X, Y) - - msg = "Attribute {} was deprecated" - for att in ['average_coef_', 'average_intercept_', - 'standard_coef_', 'standard_intercept_']: - with pytest.warns(FutureWarning, match=msg.format(att)): - getattr(est, att) - - @pytest.mark.parametrize('klass', [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor, SGDOneClassSVM, SparseSGDOneClassSVM]) From 32b60fb9769f7e43915e51e66a94994024a72764 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Tue, 11 May 2021 22:40:50 +0200 Subject: [PATCH 390/478] MNT Clean deprecations for 1.0 | plot_tree (#19324) --- sklearn/tree/_export.py | 37 +++++++++---------------------- sklearn/tree/tests/test_export.py | 11 --------- 2 files changed, 10 insertions(+), 38 deletions(-) diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index affe1b68cfe9a..17680db2b855d 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -25,8 +25,6 @@ from ._reingold_tilford import buchheim, Tree from . import DecisionTreeClassifier -import warnings - def _color_brew(n): """Generate n colors with equally spaced hues. 
@@ -80,10 +78,9 @@ def __repr__(self): @_deprecate_positional_args def plot_tree(decision_tree, *, max_depth=None, feature_names=None, - class_names=None, label='all', filled=False, - impurity=True, node_ids=False, - proportion=False, rotate='deprecated', rounded=False, - precision=3, ax=None, fontsize=None): + class_names=None, label='all', filled=False, impurity=True, + node_ids=False, proportion=False, rounded=False, precision=3, + ax=None, fontsize=None): """Plot a decision tree. The sample counts that are shown are weighted with any sample_weights that @@ -135,14 +132,6 @@ def plot_tree(decision_tree, *, max_depth=None, feature_names=None, When set to ``True``, change the display of 'values' and/or 'samples' to be proportions and percentages respectively. - rotate : bool, default=False - This parameter has no effect on the matplotlib tree visualisation and - it is kept here for backward compatibility. - - .. deprecated:: 0.23 - ``rotate`` is deprecated in 0.23 and will be removed in 1.0 - (renaming of 0.25). - rounded : bool, default=False When set to ``True``, draw node boxes with rounded corners and use Helvetica fonts instead of Times-Roman. @@ -180,16 +169,10 @@ def plot_tree(decision_tree, *, max_depth=None, feature_names=None, check_is_fitted(decision_tree) - if rotate != 'deprecated': - warnings.warn(("'rotate' has no effect and is deprecated in 0.23. " - "It will be removed in 1.0 (renaming of 0.25)."), - FutureWarning) - exporter = _MPLTreeExporter( max_depth=max_depth, feature_names=feature_names, - class_names=class_names, label=label, filled=filled, - impurity=impurity, node_ids=node_ids, - proportion=proportion, rotate=rotate, rounded=rounded, + class_names=class_names, label=label, filled=filled, impurity=impurity, + node_ids=node_ids, proportion=proportion, rounded=rounded, precision=precision, fontsize=fontsize) return exporter.export(decision_tree, ax=ax) @@ -198,7 +181,7 @@ class _BaseTreeExporter: def __init__(self, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, impurity=True, node_ids=False, - proportion=False, rotate=False, rounded=False, + proportion=False, rounded=False, precision=3, fontsize=None): self.max_depth = max_depth self.feature_names = feature_names @@ -208,7 +191,6 @@ def __init__(self, max_depth=None, feature_names=None, self.impurity = impurity self.node_ids = node_ids self.proportion = proportion - self.rotate = rotate self.rounded = rounded self.precision = precision self.fontsize = fontsize @@ -380,11 +362,12 @@ def __init__(self, out_file=SENTINEL, max_depth=None, max_depth=max_depth, feature_names=feature_names, class_names=class_names, label=label, filled=filled, impurity=impurity, node_ids=node_ids, proportion=proportion, - rotate=rotate, rounded=rounded, precision=precision) + rounded=rounded, precision=precision) self.leaves_parallel = leaves_parallel self.out_file = out_file self.special_characters = special_characters self.fontname = fontname + self.rotate = rotate # PostScript compatibility for special characters if special_characters: @@ -531,14 +514,14 @@ class _MPLTreeExporter(_BaseTreeExporter): def __init__(self, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, impurity=True, node_ids=False, - proportion=False, rotate=False, rounded=False, + proportion=False, rounded=False, precision=3, fontsize=None): super().__init__( max_depth=max_depth, feature_names=feature_names, class_names=class_names, label=label, filled=filled, impurity=impurity, node_ids=node_ids, 
proportion=proportion, - rotate=rotate, rounded=rounded, precision=precision) + rounded=rounded, precision=precision) self.fontsize = fontsize # validate diff --git a/sklearn/tree/tests/test_export.py b/sklearn/tree/tests/test_export.py index 7b94fbb527dc9..d12daeaa657be 100644 --- a/sklearn/tree/tests/test_export.py +++ b/sklearn/tree/tests/test_export.py @@ -463,17 +463,6 @@ def test_plot_tree_gini(pyplot): assert nodes[2].get_text() == "gini = 0.0\nsamples = 3\nvalue = [0, 3]" -# FIXME: to be removed in 1.0 -def test_plot_tree_rotate_deprecation(pyplot): - tree = DecisionTreeClassifier() - tree.fit(X, y) - # test that a warning is raised when rotate is used. - match = (r"'rotate' has no effect and is deprecated in 0.23. " - r"It will be removed in 1.0 \(renaming of 0.25\).") - with pytest.warns(FutureWarning, match=match): - plot_tree(tree, rotate=True) - - def test_not_fitted_tree(pyplot): # Testing if not fitted tree throws the correct error From 0f85e6b32fd230320ca79926dc278d036a4b853a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Tue, 11 May 2021 22:41:36 +0200 Subject: [PATCH 391/478] MNT Clean deprecations for 1.0 | Search (#19321) Co-authored-by: Olivier Grisel --- sklearn/model_selection/__init__.py | 2 - sklearn/model_selection/_search.py | 74 +------------------- sklearn/model_selection/tests/test_search.py | 52 +------------- 3 files changed, 3 insertions(+), 125 deletions(-) diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py index f79db2a5acc17..2dfb295f5c14c 100644 --- a/sklearn/model_selection/__init__.py +++ b/sklearn/model_selection/__init__.py @@ -30,7 +30,6 @@ from ._search import RandomizedSearchCV from ._search import ParameterGrid from ._search import ParameterSampler -from ._search import fit_grid_point if typing.TYPE_CHECKING: # Avoid errors in type checkers (e.g. mypy) for experimental estimators. @@ -64,7 +63,6 @@ 'cross_val_predict', 'cross_val_score', 'cross_validate', - 'fit_grid_point', 'learning_curve', 'permutation_test_score', 'train_test_split', diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 6e837a2f97b24..d4444ce09dcb5 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -44,7 +44,7 @@ from ..metrics import check_scoring from ..utils import deprecated -__all__ = ['GridSearchCV', 'ParameterGrid', 'fit_grid_point', +__all__ = ['GridSearchCV', 'ParameterGrid', 'ParameterSampler', 'RandomizedSearchCV'] @@ -314,78 +314,6 @@ def __len__(self): return self.n_iter -# FIXME Remove fit_grid_point in 1.0 -@deprecated( - "fit_grid_point is deprecated in version 0.23 " - "and will be removed in version 1.0 (renaming of 0.25)" -) -def fit_grid_point(X, y, estimator, parameters, train, test, scorer, - verbose, error_score=np.nan, **fit_params): - """Run fit on one set of parameters. - - Parameters - ---------- - X : array-like, sparse matrix or list - Input data. - - y : array-like or None - Targets for input data. - - estimator : estimator object - A object of that type is instantiated for each grid point. - This is assumed to implement the scikit-learn estimator interface. - Either estimator needs to provide a ``score`` function, - or ``scoring`` must be passed. - - parameters : dict - Parameters to be set on estimator for this grid point. - - train : ndarray, dtype int or bool - Boolean mask or indices for training set. 
- - test : ndarray, dtype int or bool - Boolean mask or indices for test set. - - scorer : callable or None - The scorer callable object / function must have its signature as - ``scorer(estimator, X, y)``. - - If ``None`` the estimator's score method is used. - - verbose : int - Verbosity level. - - **fit_params : kwargs - Additional parameter passed to the fit function of the estimator. - - error_score : 'raise' or numeric, default=np.nan - Value to assign to the score if an error occurs in estimator fitting. - If set to 'raise', the error is raised. If a numeric value is given, - FitFailedWarning is raised. This parameter does not affect the refit - step, which will always raise the error. - - Returns - ------- - score : float - Score of this parameter setting on given test split. - - parameters : dict - The parameters that have been evaluated. - - n_samples_test : int - Number of test samples in this split. - """ - # NOTE we are not using the return value as the scorer by itself should be - # validated before. We use check_scoring only to reject multimetric scorer - check_scoring(estimator, scorer) - results = _fit_and_score(estimator, X, y, scorer, train, - test, verbose, parameters, - fit_params=fit_params, - return_n_test_samples=True, - error_score=error_score) - return results["test_scores"], parameters, results["n_test_samples"] - - def _check_param_grid(param_grid): if hasattr(param_grid, 'items'): param_grid = [param_grid] diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index b74e250e94192..2576d5f24006d 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -28,13 +28,12 @@ from scipy.stats import bernoulli, expon, uniform from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.base import clone, is_classifier +from sklearn.base import is_classifier from sklearn.exceptions import NotFittedError from sklearn.datasets import make_classification from sklearn.datasets import make_blobs from sklearn.datasets import make_multilabel_classification -from sklearn.model_selection import fit_grid_point from sklearn.model_selection import train_test_split from sklearn.model_selection import KFold from sklearn.model_selection import StratifiedKFold @@ -129,6 +128,7 @@ def score(self): def assert_grid_iter_equals_getitem(grid): assert list(grid) == [grid[i] for i in range(len(grid))] + @pytest.mark.parametrize("klass", [ParameterGrid, partial(ParameterSampler, n_iter=10)]) @pytest.mark.parametrize( @@ -1271,54 +1271,6 @@ def test_grid_search_correct_score_results(): assert_almost_equal(correct_score, cv_scores[i]) -# FIXME remove test_fit_grid_point as the function will be removed on 1.0 -@ignore_warnings(category=FutureWarning) -def test_fit_grid_point(): - X, y = make_classification(random_state=0) - cv = StratifiedKFold() - svc = LinearSVC(random_state=0) - scorer = make_scorer(accuracy_score) - - for params in ({'C': 0.1}, {'C': 0.01}, {'C': 0.001}): - for train, test in cv.split(X, y): - this_scores, this_params, n_test_samples = fit_grid_point( - X, y, clone(svc), params, train, test, - scorer, verbose=False) - - est = clone(svc).set_params(**params) - est.fit(X[train], y[train]) - expected_score = scorer(est, X[test], y[test]) - - # Test the return values of fit_grid_point - assert_almost_equal(this_scores, expected_score) - assert params == this_params - assert n_test_samples == test.size - - # Should raise an error upon multimetric scorer - error_msg = 
("For evaluating multiple scores, use " - "sklearn.model_selection.cross_validate instead.") - with pytest.raises(ValueError, match=error_msg): - fit_grid_point( - X, y, svc, params, train, test, {'score': scorer}, - verbose=True - ) - - -# FIXME remove test_fit_grid_point_deprecated as -# fit_grid_point will be removed on 1.0 -def test_fit_grid_point_deprecated(): - X, y = make_classification(random_state=0) - svc = LinearSVC(random_state=0) - scorer = make_scorer(accuracy_score) - msg = ("fit_grid_point is deprecated in version 0.23 " - "and will be removed in version 1.0") - params = {'C': 0.1} - train, test = next(StratifiedKFold().split(X, y)) - - with pytest.warns(FutureWarning, match=msg): - fit_grid_point(X, y, svc, params, train, test, scorer, verbose=False) - - def test_pickle(): # Test that a fit search can be pickled clf = MockClassifier() From 0012d845f579ff62493a5cc9492e77c204f195d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Tue, 11 May 2021 22:42:23 +0200 Subject: [PATCH 392/478] MNT Clean deprecations for 1.0 | AffinityPropagation (#19318) --- sklearn/cluster/_affinity_propagation.py | 18 ++++-------------- .../cluster/tests/test_affinity_propagation.py | 8 -------- sklearn/tests/test_docstring_parameters.py | 4 ---- 3 files changed, 4 insertions(+), 26 deletions(-) diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index cb9230cd2382f..93b98d8aff7ee 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -35,7 +35,7 @@ def all_equal_similarities(): @_deprecate_positional_args def affinity_propagation(S, *, preference=None, convergence_iter=15, max_iter=200, damping=0.5, copy=True, verbose=False, - return_n_iter=False, random_state='warn'): + return_n_iter=False, random_state=None): """Perform Affinity Propagation Clustering of data. Read more in the :ref:`User Guide `. @@ -75,7 +75,7 @@ def affinity_propagation(S, *, preference=None, convergence_iter=15, return_n_iter : bool, default=False Whether or not to return the number of iterations. - random_state : int, RandomState instance or None, default=0 + random_state : int, RandomState instance or None, default=None Pseudo-random number generator to control the starting state. Use an int for reproducible results across function calls. See the :term:`Glossary `. @@ -144,16 +144,6 @@ def affinity_propagation(S, *, preference=None, convergence_iter=15, if return_n_iter else (np.array([0]), np.array([0] * n_samples))) - if random_state == 'warn': - warnings.warn( - "'random_state' has been introduced in 0.23. It will be set to " - "None starting from 1.0 (renaming of 0.25) which means that " - "results will differ at every function call. Set 'random_state' " - "to None to silence this warning, or to 0 to keep the behavior of " - "versions <0.23.", - FutureWarning - ) - random_state = 0 random_state = check_random_state(random_state) # Place preference on the diagonal of S @@ -295,7 +285,7 @@ class AffinityPropagation(ClusterMixin, BaseEstimator): verbose : bool, default=False Whether to be verbose. - random_state : int, RandomState instance or None, default=0 + random_state : int, RandomState instance or None, default=None Pseudo-random number generator to control the starting state. Use an int for reproducible results across function calls. See the :term:`Glossary `. 
@@ -365,7 +355,7 @@ class AffinityPropagation(ClusterMixin, BaseEstimator): @_deprecate_positional_args def __init__(self, *, damping=.5, max_iter=200, convergence_iter=15, copy=True, preference=None, affinity='euclidean', - verbose=False, random_state='warn'): + verbose=False, random_state=None): self.damping = damping self.max_iter = max_iter diff --git a/sklearn/cluster/tests/test_affinity_propagation.py b/sklearn/cluster/tests/test_affinity_propagation.py index 51b4fd425349e..ae2806bf38e59 100644 --- a/sklearn/cluster/tests/test_affinity_propagation.py +++ b/sklearn/cluster/tests/test_affinity_propagation.py @@ -209,14 +209,6 @@ def test_affinity_propagation_random_state(): assert np.mean((centers0 - centers76) ** 2) > 1 -# FIXME: to be removed in 1.0 -def test_affinity_propagation_random_state_warning(): - # test that a warning is raised when random_state is not defined. - X = np.array([[0, 0], [1, 1], [-2, -2]]) - match = "'random_state' has been introduced in 0.23." - with pytest.warns(FutureWarning, match=match): - AffinityPropagation().fit(X) - @pytest.mark.parametrize('centers', [csr_matrix(np.zeros((1, 10))), np.zeros((1, 10))]) def test_affinity_propagation_convergence_warning_dense_sparse(centers): diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 719df2f4a0f77..099c27341927e 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -245,10 +245,6 @@ def test_fit_docstring_attributes(name, Estimator): if 'PLS' in Estimator.__name__ or 'CCA' in Estimator.__name__: est.n_components = 1 # default = 2 is invalid for single target. - # FIXME: TO BE REMOVED for 1.0 (avoid FutureWarning) - if Estimator.__name__ == 'AffinityPropagation': - est.random_state = 63 - # FIXME: TO BE REMOVED for 1.1 (avoid FutureWarning) if Estimator.__name__ == 'NMF': est.init = 'nndsvda' From 3bcbf85b3a133a6b27c272bf8566e8c26660903e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Tue, 11 May 2021 22:42:53 +0200 Subject: [PATCH 393/478] MNT Clean deprecations for 1.0 | SVM (#19322) --- sklearn/svm/_classes.py | 33 --------------------------------- sklearn/svm/tests/test_svm.py | 15 --------------- 2 files changed, 48 deletions(-) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 674fa294dcf3c..b151f5267da50 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -7,7 +7,6 @@ from ..utils.validation import _num_samples from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import check_classification_targets -from ..utils.deprecation import deprecated class LinearSVC(LinearClassifierMixin, @@ -1045,22 +1044,6 @@ def __init__(self, *, kernel='rbf', degree=3, gamma='scale', shrinking=shrinking, probability=False, cache_size=cache_size, class_weight=None, max_iter=max_iter, random_state=None) - # mypy error: Decorated property not supported - @deprecated( # type: ignore - "The probA_ attribute is deprecated in version 0.23 and will be " - "removed in version 1.0 (renaming of 0.25).") - @property - def probA_(self): - return self._probA - - # mypy error: Decorated property not supported - @deprecated( # type: ignore - "The probB_ attribute is deprecated in version 0.23 and will be " - "removed in version 1.0 (renaming of 0.25).") - @property - def probB_(self): - return self._probB - def _more_tags(self): return { '_xfail_checks': { @@ -1435,22 +1418,6 @@ def 
predict(self, X): y = super().predict(X) return np.asarray(y, dtype=np.intp) - # mypy error: Decorated property not supported - @deprecated( # type: ignore - "The probA_ attribute is deprecated in version 0.23 and will be " - "removed in version 1.0.") - @property - def probA_(self): - return self._probA - - # mypy error: Decorated property not supported - @deprecated( # type: ignore - "The probB_ attribute is deprecated in version 0.23 and will be " - "removed in version 1.0.") - @property - def probB_(self): - return self._probB - def _more_tags(self): return { '_xfail_checks': { diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index f1e2cea4be2dc..3fe57ad1b8375 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -1252,21 +1252,6 @@ def test_n_support_oneclass_svr(): assert reg.n_support_ == 4 -# TODO: Remove in 1.0 when probA_ and probB_ are deprecated -@pytest.mark.parametrize("SVMClass, data", [ - (svm.OneClassSVM, (X, )), - (svm.SVR, (X, Y)) -]) -@pytest.mark.parametrize("deprecated_prob", ["probA_", "probB_"]) -def test_svm_probA_proB_deprecated(SVMClass, data, deprecated_prob): - clf = SVMClass().fit(*data) - - msg = ("The {} attribute is deprecated in version 0.23 and will be " - "removed in version 1.0").format(deprecated_prob) - with pytest.warns(FutureWarning, match=msg): - getattr(clf, deprecated_prob) - - @pytest.mark.parametrize("Estimator", [svm.SVC, svm.SVR]) def test_custom_kernel_not_array_input(Estimator): """Test using a custom kernel that is not fed with array-like for floats""" From 847fc6a27431d96eaef926773608168e8edb9e12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Wed, 12 May 2021 00:00:03 +0200 Subject: [PATCH 394/478] MNT Clean deprecations for 1.0 | KMeans (#19317) Co-authored-by: Olivier Grisel --- sklearn/cluster/_bicluster.py | 43 ++----------- sklearn/cluster/_kmeans.py | 86 +++---------------------- sklearn/cluster/tests/test_bicluster.py | 13 ---- sklearn/cluster/tests/test_k_means.py | 26 -------- 4 files changed, 13 insertions(+), 155 deletions(-) diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index 3bde33399a8e0..2b5184fae40ae 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -3,7 +3,6 @@ # License: BSD 3 clause from abc import ABCMeta, abstractmethod -import warnings import numpy as np @@ -89,14 +88,13 @@ class BaseSpectral(BiclusterMixin, BaseEstimator, metaclass=ABCMeta): @abstractmethod def __init__(self, n_clusters=3, svd_method="randomized", n_svd_vecs=None, mini_batch=False, init="k-means++", - n_init=10, n_jobs='deprecated', random_state=None): + n_init=10, random_state=None): self.n_clusters = n_clusters self.svd_method = svd_method self.n_svd_vecs = n_svd_vecs self.mini_batch = mini_batch self.init = init self.n_init = n_init - self.n_jobs = n_jobs self.random_state = random_state def _check_parameters(self): @@ -116,10 +114,6 @@ def fit(self, X, y=None): y : Ignored """ - if self.n_jobs != 'deprecated': - warnings.warn("'n_jobs' was deprecated in version 0.23 and will be" - " removed in 1.0 (renaming of 0.25).", FutureWarning) - X = self._validate_data(X, accept_sparse='csr', dtype=np.float64) self._check_parameters() self._fit(X) @@ -171,8 +165,7 @@ def _k_means(self, data, n_clusters): random_state=self.random_state) else: model = KMeans(n_clusters, init=self.init, - n_init=self.n_init, n_jobs=self.n_jobs, - 
random_state=self.random_state) + n_init=self.n_init, random_state=self.random_state) model.fit(data) centroid = model.cluster_centers_ labels = model.labels_ @@ -242,19 +235,6 @@ class SpectralCoclustering(BaseSpectral): chosen and the algorithm runs once. Otherwise, the algorithm is run for each initialization and the best solution chosen. - n_jobs : int, default=None - The number of jobs to use for the computation. This works by breaking - down the pairwise matrix into n_jobs even slices and computing them in - parallel. - - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. - - .. deprecated:: 0.23 - ``n_jobs`` was deprecated in version 0.23 and will be removed in - 1.0 (renaming of 0.25). - random_state : int, RandomState instance, default=None Used for randomizing the singular value decomposition and the k-means initialization. Use an int to make the randomness deterministic. @@ -300,14 +280,13 @@ class SpectralCoclustering(BaseSpectral): @_deprecate_positional_args def __init__(self, n_clusters=3, *, svd_method='randomized', n_svd_vecs=None, mini_batch=False, init='k-means++', - n_init=10, n_jobs='deprecated', random_state=None): + n_init=10, random_state=None): super().__init__(n_clusters, svd_method, n_svd_vecs, mini_batch, init, n_init, - n_jobs, random_state) def _fit(self, X): @@ -394,19 +373,6 @@ class SpectralBiclustering(BaseSpectral): chosen and the algorithm runs once. Otherwise, the algorithm is run for each initialization and the best solution chosen. - n_jobs : int, default=None - The number of jobs to use for the computation. This works by breaking - down the pairwise matrix into n_jobs even slices and computing them in - parallel. - - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. - - .. deprecated:: 0.23 - ``n_jobs`` was deprecated in version 0.23 and will be removed in - 1.0 (renaming of 0.25). - random_state : int, RandomState instance, default=None Used for randomizing the singular value decomposition and the k-means initialization. Use an int to make the randomness deterministic. @@ -453,14 +419,13 @@ class SpectralBiclustering(BaseSpectral): def __init__(self, n_clusters=3, *, method='bistochastic', n_components=6, n_best=3, svd_method='randomized', n_svd_vecs=None, mini_batch=False, init='k-means++', - n_init=10, n_jobs='deprecated', random_state=None): + n_init=10, random_state=None): super().__init__(n_clusters, svd_method, n_svd_vecs, mini_batch, init, n_init, - n_jobs, random_state) self.method = method self.n_components = n_components diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 44c2837a8802a..ccb472b7f94dc 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -254,9 +254,9 @@ def _tolerance(X, tol): @_deprecate_positional_args def k_means(X, n_clusters, *, sample_weight=None, init='k-means++', - precompute_distances='deprecated', n_init=10, max_iter=300, - verbose=False, tol=1e-4, random_state=None, copy_x=True, - n_jobs='deprecated', algorithm="auto", return_n_iter=False): + n_init=10, max_iter=300, verbose=False, tol=1e-4, + random_state=None, copy_x=True, algorithm="auto", + return_n_iter=False): """K-means clustering algorithm. Read more in the :ref:`User Guide `. 
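Since the removed n_jobs parameter only controlled the OpenMP thread pool used by the main Cython assignment loop, the replacement is to limit that pool from the outside, for example with the threadpoolctl package or the OMP_NUM_THREADS environment variable. A hedged sketch of the equivalent of the old n_jobs=1 behaviour on random toy data:

    import numpy as np
    from threadpoolctl import threadpool_limits

    from sklearn.cluster import KMeans

    X = np.random.RandomState(0).rand(100, 2)

    # KMeans no longer accepts n_jobs; the sample-to-center assignment loop
    # always uses the OpenMP thread pool. Capping the pool reproduces the
    # old single-threaded (n_jobs=1) behaviour.
    with threadpool_limits(limits=1):
        km = KMeans(n_clusters=2, n_init=1, random_state=0).fit(X)

    print(km.cluster_centers_)
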
@@ -293,21 +293,6 @@ def k_means(X, n_clusters, *, sample_weight=None, init='k-means++', If a callable is passed, it should take arguments X, n_clusters and a random state and return an initialization. - precompute_distances : {'auto', True, False} - Precompute distances (faster but takes more memory). - - 'auto' : do not precompute distances if n_samples * n_clusters > 12 - million. This corresponds to about 100MB overhead per job using - double precision. - - True : always precompute distances - - False : never precompute distances - - .. deprecated:: 0.23 - 'precompute_distances' was deprecated in version 0.23 and will be - removed in 1.0 (renaming of 0.25). It has no effect. - n_init : int, default=10 Number of time the k-means algorithm will be run with different centroid seeds. The final results will be the best output of @@ -339,17 +324,6 @@ def k_means(X, n_clusters, *, sample_weight=None, init='k-means++', copy_x is False. If the original data is sparse, but not in CSR format, a copy will be made even if copy_x is False. - n_jobs : int, default=None - The number of OpenMP threads to use for the computation. Parallelism is - sample-wise on the main cython loop which assigns each sample to its - closest center. - - ``None`` or ``-1`` means using all processors. - - .. deprecated:: 0.23 - ``n_jobs`` was deprecated in version 0.23 and will be removed in - 1.0 (renaming of 0.25). - algorithm : {"auto", "full", "elkan"}, default="auto" K-means algorithm to use. The classical EM-style algorithm is "full". The "elkan" variation is more efficient on data with well-defined @@ -382,8 +356,7 @@ def k_means(X, n_clusters, *, sample_weight=None, init='k-means++', """ est = KMeans( n_clusters=n_clusters, init=init, n_init=n_init, max_iter=max_iter, - verbose=verbose, precompute_distances=precompute_distances, tol=tol, - random_state=random_state, copy_x=copy_x, n_jobs=n_jobs, + verbose=verbose, tol=tol, random_state=random_state, copy_x=copy_x, algorithm=algorithm ).fit(X, sample_weight=sample_weight) if return_n_iter: @@ -747,21 +720,6 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): in the cluster centers of two consecutive iterations to declare convergence. - precompute_distances : {'auto', True, False}, default='auto' - Precompute distances (faster but takes more memory). - - 'auto' : do not precompute distances if n_samples * n_clusters > 12 - million. This corresponds to about 100MB overhead per job using - double precision. - - True : always precompute distances. - - False : never precompute distances. - - .. deprecated:: 0.23 - 'precompute_distances' was deprecated in version 0.22 and will be - removed in 1.0 (renaming of 0.25). It has no effect. - verbose : int, default=0 Verbosity mode. @@ -780,17 +738,6 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): copy_x is False. If the original data is sparse, but not in CSR format, a copy will be made even if copy_x is False. - n_jobs : int, default=None - The number of OpenMP threads to use for the computation. Parallelism is - sample-wise on the main cython loop which assigns each sample to its - closest center. - - ``None`` or ``-1`` means using all processors. - - .. deprecated:: 0.23 - ``n_jobs`` was deprecated in version 0.23 and will be removed in - 1.0 (renaming of 0.25). - algorithm : {"auto", "full", "elkan"}, default="auto" K-means algorithm to use. The classical EM-style algorithm is "full". 
The "elkan" variation is more efficient on data with well-defined @@ -868,38 +815,20 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): """ @_deprecate_positional_args def __init__(self, n_clusters=8, *, init='k-means++', n_init=10, - max_iter=300, tol=1e-4, precompute_distances='deprecated', - verbose=0, random_state=None, copy_x=True, - n_jobs='deprecated', algorithm='auto'): + max_iter=300, tol=1e-4, verbose=0, random_state=None, + copy_x=True, algorithm='auto'): self.n_clusters = n_clusters self.init = init self.max_iter = max_iter self.tol = tol - self.precompute_distances = precompute_distances self.n_init = n_init self.verbose = verbose self.random_state = random_state self.copy_x = copy_x - self.n_jobs = n_jobs self.algorithm = algorithm def _check_params(self, X): - # precompute_distances - if self.precompute_distances != 'deprecated': - warnings.warn("'precompute_distances' was deprecated in version " - "0.23 and will be removed in 1.0 (renaming of 0.25)" - ". It has no effect", FutureWarning) - - # n_jobs - if self.n_jobs != 'deprecated': - warnings.warn("'n_jobs' was deprecated in version 0.23 and will be" - " removed in 1.0 (renaming of 0.25).", FutureWarning) - self._n_threads = self.n_jobs - else: - self._n_threads = None - self._n_threads = _openmp_effective_n_threads(self._n_threads) - # n_init if self.n_init <= 0: raise ValueError( @@ -1088,6 +1017,7 @@ def fit(self, X, y=None, sample_weight=None): self._check_params(X) random_state = check_random_state(self.random_state) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + self._n_threads = _openmp_effective_n_threads() # Validate init array init = self.init @@ -1757,6 +1687,7 @@ def fit(self, X, y=None, sample_weight=None): self._check_params(X) random_state = check_random_state(self.random_state) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + self._n_threads = _openmp_effective_n_threads() n_samples, n_features = X.shape # Validate init array @@ -1906,6 +1837,7 @@ def partial_fit(self, X, y=None, sample_weight=None): if not has_centers: # this instance has not been fitted yet (fit or partial_fit) self._check_params(X) + self._n_threads = _openmp_effective_n_threads() # Validate init array init = self.init diff --git a/sklearn/cluster/tests/test_bicluster.py b/sklearn/cluster/tests/test_bicluster.py index 97ca3db0201b6..93e9a00c7bce8 100644 --- a/sklearn/cluster/tests/test_bicluster.py +++ b/sklearn/cluster/tests/test_bicluster.py @@ -262,16 +262,3 @@ def test_n_features_in_(est): assert not hasattr(est, 'n_features_in_') est.fit(X) assert est.n_features_in_ == 3 - - -@pytest.mark.parametrize("klass", [SpectralBiclustering, SpectralCoclustering]) -@pytest.mark.parametrize("n_jobs", [None, 1]) -def test_n_jobs_deprecated(klass, n_jobs): - # FIXME: remove in 1.0 - depr_msg = ("'n_jobs' was deprecated in version 0.23 and will be removed " - "in 1.0") - S, _, _ = make_biclusters((30, 30), 3, noise=0.5, random_state=0) - est = klass(random_state=0, n_jobs=n_jobs) - - with pytest.warns(FutureWarning, match=depr_msg): - est.fit(S) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 248b2e1ddd498..8ba7f45691b70 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -884,32 +884,6 @@ def test_result_equal_in_diff_n_threads(Estimator): assert_array_equal(result_1, result_2) -@pytest.mark.parametrize("precompute_distances", ["auto", False, True]) -def 
test_precompute_distance_deprecated(precompute_distances): - # FIXME: remove in 1.0 - depr_msg = ("'precompute_distances' was deprecated in version 0.23 and " - "will be removed in 1.0") - X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) - kmeans = KMeans(n_clusters=2, n_init=1, init='random', random_state=0, - precompute_distances=precompute_distances) - - with pytest.warns(FutureWarning, match=depr_msg): - kmeans.fit(X) - - -@pytest.mark.parametrize("n_jobs", [None, 1]) -def test_n_jobs_deprecated(n_jobs): - # FIXME: remove in 1.0 - depr_msg = ("'n_jobs' was deprecated in version 0.23 and will be removed " - "in 1.0") - X, _ = make_blobs(n_samples=10, n_features=2, centers=2, random_state=0) - kmeans = KMeans(n_clusters=2, n_init=1, init='random', random_state=0, - n_jobs=n_jobs) - - with pytest.warns(FutureWarning, match=depr_msg): - kmeans.fit(X) - - @pytest.mark.parametrize("attr", ["counts_", "init_size_", "random_state_"]) def test_minibatch_kmeans_deprecated_attributes(attr): # check that we raise a deprecation warning when accessing `init_size_` From bfb5b39d06d29b965238411a07d462ab69d7b38c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Fauchereau?= Date: Thu, 13 May 2021 13:10:54 +0000 Subject: [PATCH 395/478] DOC typo in doc/developers/tips.rst (#20088) --- doc/developers/tips.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/developers/tips.rst b/doc/developers/tips.rst index 36e2cd4a58779..7bef6580c1a6e 100644 --- a/doc/developers/tips.rst +++ b/doc/developers/tips.rst @@ -229,7 +229,7 @@ Debugging memory errors in Cython with valgrind While python/numpy's built-in memory management is relatively robust, it can lead to performance penalties for some routines. For this reason, much of -the high-performance code in scikit-learn in written in cython. This +the high-performance code in scikit-learn is written in cython. This performance gain comes with a tradeoff, however: it is very easy for memory bugs to crop up in cython code, especially in situations where that code relies heavily on pointer arithmetic. From 48ab1bf71aea9b7036108179e00e0b2e1c3fcf7e Mon Sep 17 00:00:00 2001 From: ZeyuSun <38712452+ZeyuSun@users.noreply.github.com> Date: Thu, 13 May 2021 11:43:56 -0400 Subject: [PATCH 396/478] DOC correct behavior of needs_threshold in make_score (#20079) Co-authored-by: Thomas J. Fan --- sklearn/metrics/_scorer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 8a814242cb6f1..39c4523f9bde6 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -611,7 +611,8 @@ def make_scorer(score_func, *, greater_is_better=True, needs_proba=False, output of :term:`predict_proba` (For binary `y_true`, the score function is supposed to accept probability of the positive class). If `needs_threshold=True`, the score function is supposed to accept the - output of :term:`decision_function`. + output of :term:`decision_function` or :term:`predict_proba` when + :term:`decision_function` is not present. """ sign = 1 if greater_is_better else -1 if needs_proba and needs_threshold: From f6e6ad2d9e9172c55c778392b27b69c6af87bd98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=20Dupr=C3=A9=20la=20Tour?= Date: Fri, 14 May 2021 08:30:27 -0700 Subject: [PATCH 397/478] MNT clean futurewarning for 1.0 | _deprecate_positional_args (#20002) Co-authored-by: Olivier Grisel Co-authored-by: Thomas J. 
Fan --- azure-pipelines.yml | 3 +- doc/modules/learning_curve.rst | 6 ++-- doc/modules/model_evaluation.rst | 2 +- doc/whats_new/v1.0.rst | 12 +++++++ examples/manifold/plot_compare_methods.py | 6 ++-- examples/manifold/plot_manifold_sphere.py | 10 +++--- sklearn/base.py | 2 -- sklearn/calibration.py | 3 -- sklearn/cluster/_affinity_propagation.py | 4 +-- sklearn/cluster/_agglomerative.py | 5 +-- sklearn/cluster/_bicluster.py | 4 +-- sklearn/cluster/_birch.py | 3 +- sklearn/cluster/_dbscan.py | 4 +-- sklearn/cluster/_kmeans.py | 4 --- sklearn/cluster/_mean_shift.py | 5 +-- sklearn/cluster/_optics.py | 4 --- sklearn/cluster/_spectral.py | 4 --- sklearn/cluster/tests/test_hierarchical.py | 3 +- sklearn/compose/_column_transformer.py | 3 -- sklearn/compose/_target.py | 2 -- sklearn/covariance/_elliptic_envelope.py | 2 -- sklearn/covariance/_empirical_covariance.py | 3 -- sklearn/covariance/_graph_lasso.py | 4 --- sklearn/covariance/_robust_covariance.py | 2 -- sklearn/covariance/_shrunk_covariance.py | 5 --- sklearn/cross_decomposition/_pls.py | 5 --- sklearn/datasets/_base.py | 9 ------ sklearn/datasets/_california_housing.py | 2 -- sklearn/datasets/_covtype.py | 2 -- sklearn/datasets/_kddcup99.py | 2 -- sklearn/datasets/_lfw.py | 3 -- sklearn/datasets/_olivetti_faces.py | 2 -- sklearn/datasets/_openml.py | 2 -- sklearn/datasets/_rcv1.py | 2 -- sklearn/datasets/_samples_generator.py | 21 ------------- sklearn/datasets/_species_distributions.py | 2 -- sklearn/datasets/_svmlight_format_io.py | 4 --- sklearn/datasets/_twenty_newsgroups.py | 3 -- sklearn/decomposition/_dict_learning.py | 8 +---- sklearn/decomposition/_factor_analysis.py | 3 +- sklearn/decomposition/_fastica.py | 3 -- sklearn/decomposition/_incremental_pca.py | 2 -- sklearn/decomposition/_kernel_pca.py | 2 -- sklearn/decomposition/_lda.py | 2 -- sklearn/decomposition/_nmf.py | 3 -- sklearn/decomposition/_pca.py | 2 -- sklearn/decomposition/_sparse_pca.py | 3 -- sklearn/decomposition/_truncated_svd.py | 2 -- sklearn/discriminant_analysis.py | 2 -- sklearn/dummy.py | 3 -- sklearn/ensemble/_bagging.py | 4 +-- sklearn/ensemble/_forest.py | 6 ---- sklearn/ensemble/_gb.py | 3 -- .../gradient_boosting.py | 5 +-- sklearn/ensemble/_iforest.py | 2 -- sklearn/ensemble/_stacking.py | 3 -- sklearn/ensemble/_voting.py | 3 -- sklearn/ensemble/_weight_boosting.py | 3 -- .../feature_extraction/_dict_vectorizer.py | 2 -- sklearn/feature_extraction/_hash.py | 2 -- sklearn/feature_extraction/image.py | 5 --- sklearn/feature_extraction/text.py | 5 --- sklearn/feature_selection/_from_model.py | 2 -- sklearn/feature_selection/_mutual_info.py | 3 -- sklearn/feature_selection/_rfe.py | 3 -- .../_univariate_selection.py | 18 +++-------- sklearn/gaussian_process/_gpc.py | 3 -- sklearn/gaussian_process/_gpr.py | 2 -- sklearn/impute/_base.py | 3 -- sklearn/impute/_knn.py | 2 -- sklearn/inspection/_partial_dependence.py | 2 -- sklearn/inspection/_permutation_importance.py | 2 -- .../inspection/_plot/partial_dependence.py | 2 -- sklearn/isotonic.py | 4 +-- sklearn/kernel_approximation.py | 6 +--- sklearn/kernel_ridge.py | 2 -- sklearn/linear_model/_base.py | 2 -- sklearn/linear_model/_bayes.py | 3 -- sklearn/linear_model/_coordinate_descent.py | 11 ------- sklearn/linear_model/_huber.py | 2 -- sklearn/linear_model/_least_angle.py | 8 ----- sklearn/linear_model/_logistic.py | 3 -- sklearn/linear_model/_omp.py | 5 --- sklearn/linear_model/_passive_aggressive.py | 3 -- sklearn/linear_model/_perceptron.py | 2 -- sklearn/linear_model/_ransac.py | 2 -- 
sklearn/linear_model/_ridge.py | 8 ----- sklearn/linear_model/_sag.py | 2 -- sklearn/linear_model/_stochastic_gradient.py | 6 ---- sklearn/linear_model/_theil_sen.py | 2 -- sklearn/linear_model/tests/test_omp.py | 2 +- sklearn/manifold/_isomap.py | 2 -- sklearn/manifold/_locally_linear.py | 3 -- sklearn/manifold/_mds.py | 3 -- sklearn/manifold/_spectral_embedding.py | 3 -- sklearn/manifold/_t_sne.py | 3 -- sklearn/metrics/_classification.py | 19 ------------ sklearn/metrics/_plot/confusion_matrix.py | 4 --- .../metrics/_plot/precision_recall_curve.py | 4 --- sklearn/metrics/_plot/roc_curve.py | 4 --- sklearn/metrics/_ranking.py | 10 ------ sklearn/metrics/_regression.py | 10 ------ sklearn/metrics/_scorer.py | 3 -- sklearn/metrics/cluster/_bicluster.py | 2 -- sklearn/metrics/cluster/_supervised.py | 8 ----- sklearn/metrics/cluster/_unsupervised.py | 3 -- sklearn/metrics/cluster/tests/test_common.py | 5 +-- .../metrics/cluster/tests/test_supervised.py | 31 +++++++++++-------- sklearn/metrics/pairwise.py | 11 ------- sklearn/metrics/tests/test_classification.py | 6 ++-- sklearn/mixture/_bayesian_mixture.py | 2 -- sklearn/mixture/_gaussian_mixture.py | 2 -- sklearn/model_selection/_search.py | 6 ---- sklearn/model_selection/_split.py | 13 -------- sklearn/model_selection/_validation.py | 7 ----- sklearn/model_selection/tests/test_split.py | 6 ++-- sklearn/multiclass.py | 4 --- sklearn/multioutput.py | 6 +--- sklearn/naive_bayes.py | 6 ---- sklearn/neighbors/_classification.py | 3 -- sklearn/neighbors/_graph.py | 6 +--- sklearn/neighbors/_kde.py | 2 -- sklearn/neighbors/_lof.py | 2 -- sklearn/neighbors/_nca.py | 2 -- sklearn/neighbors/_nearest_centroid.py | 2 -- sklearn/neighbors/_regression.py | 3 -- sklearn/neighbors/_unsupervised.py | 2 -- .../neural_network/_multilayer_perceptron.py | 4 +-- sklearn/neural_network/_rbm.py | 3 +- sklearn/pipeline.py | 3 -- sklearn/preprocessing/_data.py | 18 +---------- sklearn/preprocessing/_discretization.py | 2 -- sklearn/preprocessing/_encoders.py | 3 -- .../preprocessing/_function_transformer.py | 2 -- sklearn/preprocessing/_label.py | 4 --- sklearn/preprocessing/_polynomial.py | 4 +-- sklearn/random_projection.py | 4 --- sklearn/semi_supervised/_label_propagation.py | 4 --- sklearn/svm/_bounds.py | 2 -- sklearn/svm/_classes.py | 8 ----- sklearn/tree/_classes.py | 6 ---- sklearn/tree/_export.py | 4 --- sklearn/tree/tests/test_tree.py | 2 +- sklearn/utils/__init__.py | 7 +---- sklearn/utils/class_weight.py | 4 --- sklearn/utils/extmath.py | 6 ---- sklearn/utils/graph.py | 2 -- sklearn/utils/sparsefuncs.py | 2 -- sklearn/utils/validation.py | 11 ++----- 149 files changed, 84 insertions(+), 586 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 3cd2b5bb4cd9f..412de99f5e57d 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -44,7 +44,8 @@ jobs: inputs: versionSpec: '3.9' - bash: | - pip install flake8 mypy==0.782 + # Include pytest compatibility with mypy + pip install pytest flake8 mypy==0.782 displayName: Install linters - bash: | ./build_tools/circle/linting.sh diff --git a/doc/modules/learning_curve.rst b/doc/modules/learning_curve.rst index 4fb90df937e15..249571aa2320a 100644 --- a/doc/modules/learning_curve.rst +++ b/doc/modules/learning_curve.rst @@ -79,9 +79,9 @@ The function :func:`validation_curve` can help in this case:: >>> np.random.shuffle(indices) >>> X, y = X[indices], y[indices] - >>> train_scores, valid_scores = validation_curve(Ridge(), X, y, "alpha", - ... np.logspace(-7, 3, 3), - ... 
cv=5) + >>> train_scores, valid_scores = validation_curve( + ... Ridge(), X, y, param_name="alpha", param_range=np.logspace(-7, 3, 3), + ... cv=5) >>> train_scores array([[0.93..., 0.94..., 0.92..., 0.91..., 0.92...], [0.93..., 0.94..., 0.92..., 0.91..., 0.92...], diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index c807af982e277..bc781efc35d58 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1095,7 +1095,7 @@ with a svm classifier in a multiclass problem:: LinearSVC() >>> pred_decision = est.decision_function([[-1], [2], [3]]) >>> y_true = [0, 2, 3] - >>> hinge_loss(y_true, pred_decision, labels) + >>> hinge_loss(y_true, pred_decision, labels=labels) 0.56... .. _log_loss: diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 8ad8a295d72e0..f94e7001fdc97 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -12,6 +12,18 @@ Version 1.0.0 .. include:: changelog_legend.inc +Enforcing keyword-only arguments +-------------------------------- + +In an effort to promote clear and non-ambiguous use of the library, most +constructor and function parameters must now be passed as keyword arguments +(i.e. using the `param=value` syntax) instead of positional. If a keyword-only +parameter is used as positional, a `TypeError` is now raised. +:issue:`15005` :pr:`20002` by `Joel Nothman`_, `Adrin Jalali`_, `Thomas Fan`_, +`Nicolas Hug`_, and `Tom Dupre la Tour`_. See `SLEP009 +`_ +for more details. + Put the changes in their relevant module. Changed models diff --git a/examples/manifold/plot_compare_methods.py b/examples/manifold/plot_compare_methods.py index ed01e8ac19b89..c78ecc234186a 100644 --- a/examples/manifold/plot_compare_methods.py +++ b/examples/manifold/plot_compare_methods.py @@ -53,14 +53,16 @@ # Set-up manifold methods LLE = partial(manifold.LocallyLinearEmbedding, - n_neighbors, n_components, eigen_solver='auto') + n_neighbors=n_neighbors, n_components=n_components, + eigen_solver='auto') methods = OrderedDict() methods['LLE'] = LLE(method='standard') methods['LTSA'] = LLE(method='ltsa') methods['Hessian LLE'] = LLE(method='hessian') methods['Modified LLE'] = LLE(method='modified') -methods['Isomap'] = manifold.Isomap(n_neighbors, n_components) +methods['Isomap'] = manifold.Isomap(n_neighbors=n_neighbors, + n_components=n_components) methods['MDS'] = manifold.MDS(n_components, max_iter=100, n_init=1) methods['SE'] = manifold.SpectralEmbedding(n_components=n_components, n_neighbors=n_neighbors) diff --git a/examples/manifold/plot_manifold_sphere.py b/examples/manifold/plot_manifold_sphere.py index 2b6566c4ecd92..fbc125fb8773f 100644 --- a/examples/manifold/plot_manifold_sphere.py +++ b/examples/manifold/plot_manifold_sphere.py @@ -78,9 +78,9 @@ for i, method in enumerate(methods): t0 = time() - trans_data = manifold\ - .LocallyLinearEmbedding(n_neighbors, 2, - method=method).fit_transform(sphere_data).T + trans_data = manifold.LocallyLinearEmbedding( + n_neighbors=n_neighbors, n_components=2, + method=method).fit_transform(sphere_data).T t1 = time() print("%s: %.2g sec" % (methods[i], t1 - t0)) @@ -93,8 +93,8 @@ # Perform Isomap Manifold learning. 
t0 = time() -trans_data = manifold.Isomap(n_neighbors, n_components=2)\ - .fit_transform(sphere_data).T +trans_data = manifold.Isomap(n_neighbors=n_neighbors, + n_components=2).fit_transform(sphere_data).T t1 = time() print("%s: %.2g sec" % ('ISO', t1 - t0)) diff --git a/sklearn/base.py b/sklearn/base.py index ec264b0cf5edc..e8b51df634a1f 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -23,10 +23,8 @@ from .utils.validation import check_array from .utils.validation import _num_features from .utils._estimator_html_repr import estimator_html_repr -from .utils.validation import _deprecate_positional_args -@_deprecate_positional_args def clone(estimator, *, safe=True): """Constructs a new unfitted estimator with the same parameters. diff --git a/sklearn/calibration.py b/sklearn/calibration.py index c6289d1df2936..084f3bf242e3c 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -36,7 +36,6 @@ from .isotonic import IsotonicRegression from .svm import LinearSVC from .model_selection import check_cv, cross_val_predict -from .utils.validation import _deprecate_positional_args class CalibratedClassifierCV(ClassifierMixin, @@ -215,7 +214,6 @@ class CalibratedClassifierCV(ClassifierMixin, .. [4] Predicting Good Probabilities with Supervised Learning, A. Niculescu-Mizil & R. Caruana, ICML 2005 """ - @_deprecate_positional_args def __init__(self, base_estimator=None, *, method='sigmoid', cv=None, n_jobs=None, ensemble=True): self.base_estimator = base_estimator @@ -788,7 +786,6 @@ def predict(self, T): return expit(-(self.a_ * T + self.b_)) -@_deprecate_positional_args def calibration_curve(y_true, y_prob, *, normalize=False, n_bins=5, strategy='uniform'): """Compute true and predicted probabilities for a calibration curve. diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 93b98d8aff7ee..ccae0b7538b58 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -12,7 +12,7 @@ from ..base import BaseEstimator, ClusterMixin from ..utils import as_float_array, check_random_state from ..utils.deprecation import deprecated -from ..utils.validation import check_is_fitted, _deprecate_positional_args +from ..utils.validation import check_is_fitted from ..metrics import euclidean_distances from ..metrics import pairwise_distances_argmin from .._config import config_context @@ -32,7 +32,6 @@ def all_equal_similarities(): return all_equal_preferences() and all_equal_similarities() -@_deprecate_positional_args def affinity_propagation(S, *, preference=None, convergence_iter=15, max_iter=200, damping=0.5, copy=True, verbose=False, return_n_iter=False, random_state=None): @@ -352,7 +351,6 @@ class AffinityPropagation(ClusterMixin, BaseEstimator): array([[1, 2], [4, 2]]) """ - @_deprecate_positional_args def __init__(self, *, damping=.5, max_iter=200, convergence_iter=15, copy=True, preference=None, affinity='euclidean', verbose=False, random_state=None): diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index ee0a117824dd8..4b0089b707233 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -21,7 +21,7 @@ from ..utils import check_array from ..utils._fast_dict import IntFloatDict from ..utils.fixes import _astype_copy_false -from ..utils.validation import _deprecate_positional_args, check_memory +from ..utils.validation import check_memory # mypy error: Module 'sklearn.cluster' has no attribute '_hierarchical_fast' from . 
import _hierarchical_fast as _hierarchical # type: ignore from ._feature_agglomeration import AgglomerationTransform @@ -134,7 +134,6 @@ def _single_linkage_tree(connectivity, n_samples, n_nodes, n_clusters, ############################################################################### # Hierarchical tree building functions -@_deprecate_positional_args def ward_tree(X, *, connectivity=None, n_clusters=None, return_distance=False): """Ward clustering based on a Feature matrix. @@ -800,7 +799,6 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): array([1, 1, 1, 0, 0, 0]) """ - @_deprecate_positional_args def __init__(self, n_clusters=2, *, affinity="euclidean", memory=None, connectivity=None, compute_full_tree='auto', @@ -1068,7 +1066,6 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): >>> X_reduced.shape (1797, 32) """ - @_deprecate_positional_args def __init__(self, n_clusters=2, *, affinity="euclidean", memory=None, connectivity=None, compute_full_tree='auto', diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index 2b5184fae40ae..c8ff1bb036662 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -17,7 +17,7 @@ from ..utils.extmath import (make_nonnegative, randomized_svd, safe_sparse_dot) -from ..utils.validation import assert_all_finite, _deprecate_positional_args +from ..utils.validation import assert_all_finite __all__ = ['SpectralCoclustering', @@ -277,7 +277,6 @@ class SpectralCoclustering(BaseSpectral): `__. """ - @_deprecate_positional_args def __init__(self, n_clusters=3, *, svd_method='randomized', n_svd_vecs=None, mini_batch=False, init='k-means++', n_init=10, random_state=None): @@ -415,7 +414,6 @@ class SpectralBiclustering(BaseSpectral): `__. """ - @_deprecate_positional_args def __init__(self, n_clusters=3, *, method='bistochastic', n_components=6, n_best=3, svd_method='randomized', n_svd_vecs=None, mini_batch=False, init='k-means++', diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 0587fe075a952..da1bf894f03f8 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -14,7 +14,7 @@ from ..base import TransformerMixin, ClusterMixin, BaseEstimator from ..utils.extmath import row_norms from ..utils import deprecated -from ..utils.validation import check_is_fitted, _deprecate_positional_args +from ..utils.validation import check_is_fitted from ..exceptions import ConvergenceWarning from . 
import AgglomerativeClustering from .._config import config_context @@ -440,7 +440,6 @@ class Birch(ClusterMixin, TransformerMixin, BaseEstimator): >>> brc.predict(X) array([0, 0, 0, 1, 1, 1]) """ - @_deprecate_positional_args def __init__(self, *, threshold=0.5, branching_factor=50, n_clusters=3, compute_labels=True, copy=True): self.threshold = threshold diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index a841a9b7c213c..bbc3470256e90 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -14,13 +14,12 @@ from scipy import sparse from ..base import BaseEstimator, ClusterMixin -from ..utils.validation import _check_sample_weight, _deprecate_positional_args +from ..utils.validation import _check_sample_weight from ..neighbors import NearestNeighbors from ._dbscan_inner import dbscan_inner -@_deprecate_positional_args def dbscan(X, eps=0.5, *, min_samples=5, metric='minkowski', metric_params=None, algorithm='auto', leaf_size=30, p=2, sample_weight=None, n_jobs=None): @@ -269,7 +268,6 @@ class DBSCAN(ClusterMixin, BaseEstimator): DBSCAN revisited, revisited: why and how you should (still) use DBSCAN. ACM Transactions on Database Systems (TODS), 42(3), 19. """ - @_deprecate_positional_args def __init__(self, eps=0.5, *, min_samples=5, metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None): diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index ccb472b7f94dc..8b24be6ace987 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -24,7 +24,6 @@ from ..utils.extmath import row_norms, stable_cumsum from ..utils.sparsefuncs_fast import assign_rows_csr from ..utils.sparsefuncs import mean_variance_axis -from ..utils.validation import _deprecate_positional_args from ..utils import check_array from ..utils import check_random_state from ..utils import deprecated @@ -252,7 +251,6 @@ def _tolerance(X, tol): return np.mean(variances) * tol -@_deprecate_positional_args def k_means(X, n_clusters, *, sample_weight=None, init='k-means++', n_init=10, max_iter=300, verbose=False, tol=1e-4, random_state=None, copy_x=True, algorithm="auto", @@ -813,7 +811,6 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): array([[10., 2.], [ 1., 2.]]) """ - @_deprecate_positional_args def __init__(self, n_clusters=8, *, init='k-means++', n_init=10, max_iter=300, tol=1e-4, verbose=0, random_state=None, copy_x=True, algorithm='auto'): @@ -1508,7 +1505,6 @@ class MiniBatchKMeans(KMeans): >>> kmeans.predict([[0, 0], [4, 4]]) array([0, 1], dtype=int32) """ - @_deprecate_positional_args def __init__(self, n_clusters=8, *, init='k-means++', max_iter=100, batch_size=1024, verbose=0, compute_labels=True, random_state=None, tol=0.0, max_no_improvement=10, diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index fa62d2c8d9fe7..f48ef46e8dbef 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -19,7 +19,7 @@ from joblib import Parallel from collections import defaultdict -from ..utils.validation import check_is_fitted, _deprecate_positional_args +from ..utils.validation import check_is_fitted from ..utils.fixes import delayed from ..utils import check_random_state, gen_batches, check_array from ..base import BaseEstimator, ClusterMixin @@ -28,7 +28,6 @@ from .._config import config_context -@_deprecate_positional_args def estimate_bandwidth(X, *, quantile=0.3, n_samples=None, random_state=0, n_jobs=None): """Estimate the bandwidth to use with the 
mean-shift algorithm. @@ -109,7 +108,6 @@ def _mean_shift_single_seed(my_mean, X, nbrs, max_iter): return tuple(my_mean), len(points_within), completed_iterations -@_deprecate_positional_args def mean_shift(X, *, bandwidth=None, seeds=None, bin_seeding=False, min_bin_freq=1, cluster_all=True, max_iter=300, n_jobs=None): @@ -352,7 +350,6 @@ class MeanShift(ClusterMixin, BaseEstimator): Machine Intelligence. 2002. pp. 603-619. """ - @_deprecate_positional_args def __init__(self, *, bandwidth=None, seeds=None, bin_seeding=False, min_bin_freq=1, cluster_all=True, n_jobs=None, max_iter=300): self.bandwidth = bandwidth diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index d0b94f43454b3..af0e8531aa7b8 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -17,7 +17,6 @@ from ..exceptions import DataConversionWarning from ..metrics.pairwise import PAIRWISE_BOOLEAN_FUNCTIONS from ..utils import gen_batches, get_chunk_n_rows -from ..utils.validation import _deprecate_positional_args from ..neighbors import NearestNeighbors from ..base import BaseEstimator, ClusterMixin from ..metrics import pairwise_distances @@ -204,7 +203,6 @@ class OPTICS(ClusterMixin, BaseEstimator): >>> clustering.labels_ array([0, 0, 0, 1, 1, 1]) """ - @_deprecate_positional_args def __init__(self, *, min_samples=5, max_eps=np.inf, metric='minkowski', p=2, metric_params=None, cluster_method='xi', eps=None, xi=0.05, predecessor_correction=True, min_cluster_size=None, @@ -348,7 +346,6 @@ def _compute_core_distances_(X, neighbors, min_samples, working_memory): return core_distances -@_deprecate_positional_args def compute_optics_graph(X, *, min_samples, max_eps, metric, p, metric_params, algorithm, leaf_size, n_jobs): """Computes the OPTICS reachability graph. @@ -552,7 +549,6 @@ def _set_reach_dist(core_distances_, reachability_, predecessor_, predecessor_[unproc[improved]] = point_index -@_deprecate_positional_args def cluster_optics_dbscan(*, reachability, core_distances, ordering, eps): """Performs DBSCAN extraction for an arbitrary epsilon. diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index e9a5d7a7b4302..a1371b925595d 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -11,7 +11,6 @@ from ..base import BaseEstimator, ClusterMixin from ..utils import check_random_state, as_float_array -from ..utils.validation import _deprecate_positional_args from ..utils.deprecation import deprecated from ..metrics.pairwise import pairwise_kernels from ..neighbors import kneighbors_graph, NearestNeighbors @@ -19,7 +18,6 @@ from ._kmeans import k_means -@_deprecate_positional_args def discretize(vectors, *, copy=True, max_svd_restarts=30, n_iter_max=20, random_state=None): """Search for a partition matrix (clustering) which is closest to the @@ -158,7 +156,6 @@ def discretize(vectors, *, copy=True, max_svd_restarts=30, n_iter_max=20, return labels -@_deprecate_positional_args def spectral_clustering(affinity, *, n_clusters=8, n_components=None, eigen_solver=None, random_state=None, n_init=10, eigen_tol=0.0, assign_labels='kmeans', @@ -455,7 +452,6 @@ class SpectralClustering(ClusterMixin, BaseEstimator): Stella X. 
Yu, Jianbo Shi https://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf """ - @_deprecate_positional_args def __init__(self, n_clusters=8, *, eigen_solver=None, n_components=None, random_state=None, n_init=10, gamma=1., affinity='rbf', n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans', diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 513dbf8e9218e..bd70b2c1aac54 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -670,7 +670,8 @@ def test_n_components(): connectivity = np.eye(5) for linkage_func in _TREE_BUILDERS.values(): - assert ignore_warnings(linkage_func)(X, connectivity)[1] == 5 + assert ignore_warnings(linkage_func)( + X, connectivity=connectivity)[1] == 5 def test_agg_n_clusters(): diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 441fc95a106f1..6c15b81be98c2 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -21,7 +21,6 @@ from ..utils import _get_column_indices from ..utils.metaestimators import _BaseComposition from ..utils.validation import check_array, check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed @@ -179,7 +178,6 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): """ _required_parameters = ['transformers'] - @_deprecate_positional_args def __init__(self, transformers, *, remainder='drop', @@ -867,7 +865,6 @@ class make_column_selector: [-0.30151134, 0. , 1. , 0. ], [ 0.90453403, 0. , 0. , 1. ]]) """ - @_deprecate_positional_args def __init__(self, pattern=None, *, dtype_include=None, dtype_exclude=None): self.pattern = pattern diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index 1a80046c66376..12fe13ee848b9 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -10,7 +10,6 @@ from ..utils.validation import check_is_fitted from ..utils import check_array, _safe_indexing from ..preprocessing import FunctionTransformer -from ..utils.validation import _deprecate_positional_args from ..exceptions import NotFittedError __all__ = ['TransformedTargetRegressor'] @@ -109,7 +108,6 @@ class TransformedTargetRegressor(RegressorMixin, BaseEstimator): `. """ - @_deprecate_positional_args def __init__(self, regressor=None, *, transformer=None, func=None, inverse_func=None, check_inverse=True): self.regressor = regressor diff --git a/sklearn/covariance/_elliptic_envelope.py b/sklearn/covariance/_elliptic_envelope.py index e599f0435f48c..ad7904dc7831a 100644 --- a/sklearn/covariance/_elliptic_envelope.py +++ b/sklearn/covariance/_elliptic_envelope.py @@ -5,7 +5,6 @@ import numpy as np from . 
import MinCovDet from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..metrics import accuracy_score from ..base import OutlierMixin @@ -120,7 +119,6 @@ class EllipticEnvelope(OutlierMixin, MinCovDet): minimum covariance determinant estimator" Technometrics 41(3), 212 (1999) """ - @_deprecate_positional_args def __init__(self, *, store_precision=True, assume_centered=False, support_fraction=None, contamination=0.1, random_state=None): diff --git a/sklearn/covariance/_empirical_covariance.py b/sklearn/covariance/_empirical_covariance.py index fb1797e50f96a..02bddd0f50330 100644 --- a/sklearn/covariance/_empirical_covariance.py +++ b/sklearn/covariance/_empirical_covariance.py @@ -19,7 +19,6 @@ from ..utils import check_array from ..utils.extmath import fast_logdet from ..metrics.pairwise import pairwise_distances -from ..utils.validation import _deprecate_positional_args def log_likelihood(emp_cov, precision): @@ -49,7 +48,6 @@ def log_likelihood(emp_cov, precision): return log_likelihood_ -@_deprecate_positional_args def empirical_covariance(X, *, assume_centered=False): """Computes the Maximum likelihood covariance estimator @@ -146,7 +144,6 @@ class EmpiricalCovariance(BaseEstimator): array([0.0622..., 0.0193...]) """ - @_deprecate_positional_args def __init__(self, *, store_precision=True, assume_centered=False): self.store_precision = store_precision self.assume_centered = assume_centered diff --git a/sklearn/covariance/_graph_lasso.py b/sklearn/covariance/_graph_lasso.py index 57167b81fe9e0..091d4f82e7e3e 100644 --- a/sklearn/covariance/_graph_lasso.py +++ b/sklearn/covariance/_graph_lasso.py @@ -19,7 +19,6 @@ from ..exceptions import ConvergenceWarning from ..utils.validation import check_random_state -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed # mypy error: Module 'sklearn.linear_model' has no attribute '_cd_fast' from ..linear_model import _cd_fast as cd_fast # type: ignore @@ -77,7 +76,6 @@ def alpha_max(emp_cov): # The g-lasso algorithm -@_deprecate_positional_args def graphical_lasso(emp_cov, alpha, *, cov_init=None, mode='cd', tol=1e-4, enet_tol=1e-4, max_iter=100, verbose=False, return_costs=False, eps=np.finfo(np.float64).eps, @@ -366,7 +364,6 @@ class GraphicalLasso(EmpiricalCovariance): -------- graphical_lasso, GraphicalLassoCV """ - @_deprecate_positional_args def __init__(self, alpha=.01, *, mode='cd', tol=1e-4, enet_tol=1e-4, max_iter=100, verbose=False, assume_centered=False): super().__init__(assume_centered=assume_centered) @@ -675,7 +672,6 @@ class GraphicalLassoCV(GraphicalLasso): values of alpha then come out as missing values, but the optimum may be close to these missing values. """ - @_deprecate_positional_args def __init__(self, *, alphas=4, n_refinements=4, cv=None, tol=1e-4, enet_tol=1e-4, max_iter=100, mode='cd', n_jobs=None, verbose=False, assume_centered=False): diff --git a/sklearn/covariance/_robust_covariance.py b/sklearn/covariance/_robust_covariance.py index d4331b591e43f..337ba23f19059 100644 --- a/sklearn/covariance/_robust_covariance.py +++ b/sklearn/covariance/_robust_covariance.py @@ -17,7 +17,6 @@ from . 
import empirical_covariance, EmpiricalCovariance from ..utils.extmath import fast_logdet from ..utils import check_random_state, check_array -from ..utils.validation import _deprecate_positional_args # Minimum Covariance Determinant @@ -615,7 +614,6 @@ class MinCovDet(EmpiricalCovariance): """ _nonrobust_covariance = staticmethod(empirical_covariance) - @_deprecate_positional_args def __init__(self, *, store_precision=True, assume_centered=False, support_fraction=None, random_state=None): self.store_precision = store_precision diff --git a/sklearn/covariance/_shrunk_covariance.py b/sklearn/covariance/_shrunk_covariance.py index 72b13681200ff..5fe590b33a1db 100644 --- a/sklearn/covariance/_shrunk_covariance.py +++ b/sklearn/covariance/_shrunk_covariance.py @@ -18,7 +18,6 @@ from . import empirical_covariance, EmpiricalCovariance from ..utils import check_array -from ..utils.validation import _deprecate_positional_args # ShrunkCovariance estimator @@ -118,7 +117,6 @@ class ShrunkCovariance(EmpiricalCovariance): where mu = trace(cov) / n_features """ - @_deprecate_positional_args def __init__(self, *, store_precision=True, assume_centered=False, shrinkage=0.1): super().__init__(store_precision=store_precision, @@ -253,7 +251,6 @@ def ledoit_wolf_shrinkage(X, assume_centered=False, block_size=1000): return shrinkage -@_deprecate_positional_args def ledoit_wolf(X, *, assume_centered=False, block_size=1000): """Estimates the shrunk Ledoit-Wolf covariance matrix. @@ -391,7 +388,6 @@ class LedoitWolf(EmpiricalCovariance): Ledoit and Wolf, Journal of Multivariate Analysis, Volume 88, Issue 2, February 2004, pages 365-411. """ - @_deprecate_positional_args def __init__(self, *, store_precision=True, assume_centered=False, block_size=1000): super().__init__(store_precision=store_precision, @@ -431,7 +427,6 @@ def fit(self, X, y=None): # OAS estimator -@_deprecate_positional_args def oas(X, *, assume_centered=False): """Estimate covariance with the Oracle Approximating Shrinkage algorithm. diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index 2f6e63d556388..11e5d7bb8c335 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -16,7 +16,6 @@ from ..utils import check_array, check_consistent_length from ..utils.extmath import svd_flip from ..utils.validation import check_is_fitted, FLOAT_DTYPES -from ..utils.validation import _deprecate_positional_args from ..exceptions import ConvergenceWarning from ..utils.deprecation import deprecated @@ -588,7 +587,6 @@ class PLSRegression(_PLS): # - "plspm " with function plsreg2(X, Y) # - "pls" with function oscorespls.fit(X, Y) - @_deprecate_positional_args def __init__(self, n_components=2, *, scale=True, max_iter=500, tol=1e-06, copy=True): super().__init__( @@ -705,7 +703,6 @@ class PLSCanonical(_PLS): # exactly implement the Wold algorithm since it does not normalize # y_weights to one. 
- @_deprecate_positional_args def __init__(self, n_components=2, *, scale=True, algorithm="nipals", max_iter=500, tol=1e-06, copy=True): super().__init__( @@ -807,7 +804,6 @@ class CCA(_PLS): PLSSVD """ - @_deprecate_positional_args def __init__(self, n_components=2, *, scale=True, max_iter=500, tol=1e-06, copy=True): super().__init__(n_components=n_components, scale=scale, @@ -893,7 +889,6 @@ class PLSSVD(TransformerMixin, BaseEstimator): PLSCanonical CCA """ - @_deprecate_positional_args def __init__(self, n_components=2, *, scale=True, copy=True): self.n_components = n_components self.scale = scale diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index 948b4f7cba61e..da64faac54a36 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -17,7 +17,6 @@ from ..utils import Bunch from ..utils import check_random_state from ..utils import check_pandas_support -from ..utils.validation import _deprecate_positional_args import numpy as np @@ -88,7 +87,6 @@ def _convert_data_dataframe(caller_name, data, target, return combined_df, X, y -@_deprecate_positional_args def load_files(container_path, *, description=None, categories=None, load_content=True, shuffle=True, encoding=None, decode_error='strict', random_state=0): @@ -276,7 +274,6 @@ def load_data(module_path, data_file_name): return data, target, target_names -@_deprecate_positional_args def load_wine(*, return_X_y=False, as_frame=False): """Load and return the wine dataset (classification). @@ -391,7 +388,6 @@ def load_wine(*, return_X_y=False, as_frame=False): feature_names=feature_names) -@_deprecate_positional_args def load_iris(*, return_X_y=False, as_frame=False): """Load and return the iris dataset (classification). @@ -506,7 +502,6 @@ def load_iris(*, return_X_y=False, as_frame=False): filename=iris_csv_filename) -@_deprecate_positional_args def load_breast_cancer(*, return_X_y=False, as_frame=False): """Load and return the breast cancer wisconsin dataset (classification). @@ -631,7 +626,6 @@ def load_breast_cancer(*, return_X_y=False, as_frame=False): filename=csv_filename) -@_deprecate_positional_args def load_digits(*, n_class=10, return_X_y=False, as_frame=False): """Load and return the digits dataset (classification). @@ -755,7 +749,6 @@ def load_digits(*, n_class=10, return_X_y=False, as_frame=False): DESCR=descr) -@_deprecate_positional_args def load_diabetes(*, return_X_y=False, as_frame=False): """Load and return the diabetes dataset (regression). @@ -854,7 +847,6 @@ def load_diabetes(*, return_X_y=False, as_frame=False): target_filename=target_filename) -@_deprecate_positional_args def load_linnerud(*, return_X_y=False, as_frame=False): """Load and return the physical excercise linnerud dataset. @@ -958,7 +950,6 @@ def load_linnerud(*, return_X_y=False, as_frame=False): target_filename=target_filename) -@_deprecate_positional_args def load_boston(*, return_X_y=False): """Load and return the boston house-prices dataset (regression). 
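Each hunk in this part of the series drops the `_deprecate_positional_args` import and decorator while keeping the bare `*` already present in the signature, so keyword-only enforcement now comes from Python itself rather than from the shim (which, prior to this change, appears to have accepted positional values and issued a FutureWarning). A minimal sketch of the resulting call-site behavior, assuming a build that includes these changes (the exact TypeError wording varies by Python version):

    from sklearn.cluster import DBSCAN

    # `eps` precedes the bare `*`, so it may still be passed positionally;
    # everything after `*` (min_samples, metric, ...) must be a keyword.
    DBSCAN(0.3, min_samples=10)        # accepted

    try:
        DBSCAN(0.3, 10)                # min_samples is keyword-only
    except TypeError as exc:
        print(f"rejected: {exc}")
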
diff --git a/sklearn/datasets/_california_housing.py b/sklearn/datasets/_california_housing.py index a25f8d63eceef..dd0b4ff25014b 100644 --- a/sklearn/datasets/_california_housing.py +++ b/sklearn/datasets/_california_housing.py @@ -36,7 +36,6 @@ from ._base import _pkl_filepath from ._base import RemoteFileMetadata from ..utils import Bunch -from ..utils.validation import _deprecate_positional_args # The original data can be found at: @@ -50,7 +49,6 @@ logger = logging.getLogger(__name__) -@_deprecate_positional_args def fetch_california_housing(*, data_home=None, download_if_missing=True, return_X_y=False, as_frame=False): """Load the California housing dataset (regression). diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index 305f465369604..85d0c0732e15f 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -29,7 +29,6 @@ from ..utils import Bunch from ._base import _pkl_filepath from ..utils import check_random_state -from ..utils.validation import _deprecate_positional_args # The original data can be found in: @@ -59,7 +58,6 @@ TARGET_NAMES = ["Cover_Type"] -@_deprecate_positional_args def fetch_covtype(*, data_home=None, download_if_missing=True, random_state=None, shuffle=False, return_X_y=False, as_frame=False): diff --git a/sklearn/datasets/_kddcup99.py b/sklearn/datasets/_kddcup99.py index 26fb14197a211..f7bf454cc420e 100644 --- a/sklearn/datasets/_kddcup99.py +++ b/sklearn/datasets/_kddcup99.py @@ -24,7 +24,6 @@ from ..utils import Bunch from ..utils import check_random_state from ..utils import shuffle as shuffle_method -from ..utils.validation import _deprecate_positional_args # The original data can be found at: @@ -46,7 +45,6 @@ logger = logging.getLogger(__name__) -@_deprecate_positional_args def fetch_kddcup99(*, subset=None, data_home=None, shuffle=False, random_state=None, percent10=True, download_if_missing=True, return_X_y=False, diff --git a/sklearn/datasets/_lfw.py b/sklearn/datasets/_lfw.py index dd56e532afdc3..73e5ac66bb4d4 100644 --- a/sklearn/datasets/_lfw.py +++ b/sklearn/datasets/_lfw.py @@ -19,7 +19,6 @@ from ._base import get_data_home, _fetch_remote, RemoteFileMetadata from ..utils import Bunch -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import parse_version logger = logging.getLogger(__name__) @@ -216,7 +215,6 @@ def _fetch_lfw_people(data_folder_path, slice_=None, color=False, resize=None, return faces, target, target_names -@_deprecate_positional_args def fetch_lfw_people(*, data_home=None, funneled=True, resize=0.5, min_faces_per_person=0, color=False, slice_=(slice(70, 195), slice(78, 172)), @@ -387,7 +385,6 @@ def _fetch_lfw_pairs(index_file_path, data_folder_path, slice_=None, return pairs, target, np.array(['Different persons', 'Same person']) -@_deprecate_positional_args def fetch_lfw_pairs(*, subset='train', data_home=None, funneled=True, resize=0.5, color=False, slice_=(slice(70, 195), slice(78, 172)), diff --git a/sklearn/datasets/_olivetti_faces.py b/sklearn/datasets/_olivetti_faces.py index 76388a4a92a42..53609439bba90 100644 --- a/sklearn/datasets/_olivetti_faces.py +++ b/sklearn/datasets/_olivetti_faces.py @@ -25,7 +25,6 @@ from ._base import RemoteFileMetadata from ._base import _pkl_filepath from ..utils import check_random_state, Bunch -from ..utils.validation import _deprecate_positional_args # The original data can be found at: # https://cs.nyu.edu/~roweis/data/olivettifaces.mat @@ -36,7 +35,6 @@ 'd5fca46a4b8906c18e454d41af987794')) 
-@_deprecate_positional_args def fetch_olivetti_faces(*, data_home=None, shuffle=False, random_state=0, download_if_missing=True, return_X_y=False): """Load the Olivetti faces data-set from AT&T (classification). diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index b589c9faa5213..ec3c3a9ae961d 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -27,7 +27,6 @@ from ..utils import get_chunk_n_rows from ..utils import _chunk_generator from ..utils import check_pandas_support # noqa -from ..utils.validation import _deprecate_positional_args __all__ = ['fetch_openml'] @@ -690,7 +689,6 @@ def _valid_data_column_names(features_list, target_columns): return valid_data_column_names -@_deprecate_positional_args def fetch_openml( name: Optional[str] = None, *, diff --git a/sklearn/datasets/_rcv1.py b/sklearn/datasets/_rcv1.py index 2d3a809848e83..4d1bd8e9ba44f 100644 --- a/sklearn/datasets/_rcv1.py +++ b/sklearn/datasets/_rcv1.py @@ -25,7 +25,6 @@ from ._svmlight_format_io import load_svmlight_files from ..utils import shuffle as shuffle_ from ..utils import Bunch -from ..utils.validation import _deprecate_positional_args # The original vectorized data can be found at: @@ -76,7 +75,6 @@ logger = logging.getLogger(__name__) -@_deprecate_positional_args def fetch_rcv1(*, data_home=None, subset='all', download_if_missing=True, random_state=None, shuffle=False, return_X_y=False): """Load the RCV1 multilabel dataset (classification). diff --git a/sklearn/datasets/_samples_generator.py b/sklearn/datasets/_samples_generator.py index 66d1baaaa9cb2..3a9e1812cb1e7 100644 --- a/sklearn/datasets/_samples_generator.py +++ b/sklearn/datasets/_samples_generator.py @@ -18,7 +18,6 @@ from ..utils import check_array, check_random_state from ..utils import shuffle as util_shuffle from ..utils.random import sample_without_replacement -from ..utils.validation import _deprecate_positional_args def _generate_hypercube(samples, dimensions, rng): @@ -34,7 +33,6 @@ def _generate_hypercube(samples, dimensions, rng): return out -@_deprecate_positional_args def make_classification(n_samples=100, n_features=20, *, n_informative=2, n_redundant=2, n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, @@ -264,7 +262,6 @@ def make_classification(n_samples=100, n_features=20, *, n_informative=2, return X, y -@_deprecate_positional_args def make_multilabel_classification(n_samples=100, n_features=20, *, n_classes=5, n_labels=2, length=50, allow_unlabeled=True, @@ -427,7 +424,6 @@ def sample_example(): return X, Y -@_deprecate_positional_args def make_hastie_10_2(n_samples=12000, *, random_state=None): """Generates data for binary classification used in Hastie et al. 2009, Example 10.2. @@ -476,7 +472,6 @@ def make_hastie_10_2(n_samples=12000, *, random_state=None): return X, y -@_deprecate_positional_args def make_regression(n_samples=100, n_features=100, *, n_informative=10, n_targets=1, bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0, shuffle=True, coef=False, @@ -600,7 +595,6 @@ def make_regression(n_samples=100, n_features=100, *, n_informative=10, return X, y -@_deprecate_positional_args def make_circles(n_samples=100, *, shuffle=True, noise=None, random_state=None, factor=.8): """Make a large circle containing a smaller circle in 2d. 
@@ -680,7 +674,6 @@ def make_circles(n_samples=100, *, shuffle=True, noise=None, random_state=None, return X, y -@_deprecate_positional_args def make_moons(n_samples=100, *, shuffle=True, noise=None, random_state=None): """Make two interleaving half circles. @@ -747,7 +740,6 @@ def make_moons(n_samples=100, *, shuffle=True, noise=None, random_state=None): return X, y -@_deprecate_positional_args def make_blobs(n_samples=100, n_features=2, *, centers=None, cluster_std=1.0, center_box=(-10.0, 10.0), shuffle=True, random_state=None, return_centers=False): @@ -906,7 +898,6 @@ def make_blobs(n_samples=100, n_features=2, *, centers=None, cluster_std=1.0, return X, y -@_deprecate_positional_args def make_friedman1(n_samples=100, n_features=10, *, noise=0.0, random_state=None): """Generate the "Friedman #1" regression problem. @@ -970,7 +961,6 @@ def make_friedman1(n_samples=100, n_features=10, *, noise=0.0, return X, y -@_deprecate_positional_args def make_friedman2(n_samples=100, *, noise=0.0, random_state=None): """Generate the "Friedman #2" regression problem. @@ -1036,7 +1026,6 @@ def make_friedman2(n_samples=100, *, noise=0.0, random_state=None): return X, y -@_deprecate_positional_args def make_friedman3(n_samples=100, *, noise=0.0, random_state=None): """Generate the "Friedman #3" regression problem. @@ -1101,7 +1090,6 @@ def make_friedman3(n_samples=100, *, noise=0.0, random_state=None): return X, y -@_deprecate_positional_args def make_low_rank_matrix(n_samples=100, n_features=100, *, effective_rank=10, tail_strength=0.5, random_state=None): """Generate a mostly low rank matrix with bell-shaped singular values. @@ -1173,7 +1161,6 @@ def make_low_rank_matrix(n_samples=100, n_features=100, *, effective_rank=10, return np.dot(np.dot(u, s), v.T) -@_deprecate_positional_args def make_sparse_coded_signal(n_samples, *, n_components, n_features, n_nonzero_coefs, random_state=None): """Generate a signal as a sparse combination of dictionary elements. @@ -1236,7 +1223,6 @@ def make_sparse_coded_signal(n_samples, *, n_components, n_features, return map(np.squeeze, (Y, D, X)) -@_deprecate_positional_args def make_sparse_uncorrelated(n_samples=100, n_features=10, *, random_state=None): """Generate a random regression problem with sparse uncorrelated design. @@ -1289,7 +1275,6 @@ def make_sparse_uncorrelated(n_samples=100, n_features=10, *, return X, y -@_deprecate_positional_args def make_spd_matrix(n_dim, *, random_state=None): """Generate a random symmetric, positive-definite matrix. @@ -1323,7 +1308,6 @@ def make_spd_matrix(n_dim, *, random_state=None): return X -@_deprecate_positional_args def make_sparse_spd_matrix(dim=1, *, alpha=0.95, norm_diag=False, smallest_coef=.1, largest_coef=.9, random_state=None): @@ -1398,7 +1382,6 @@ def make_sparse_spd_matrix(dim=1, *, alpha=0.95, norm_diag=False, return prec -@_deprecate_positional_args def make_swiss_roll(n_samples=100, *, noise=0.0, random_state=None): """Generate a swiss roll dataset. @@ -1451,7 +1434,6 @@ def make_swiss_roll(n_samples=100, *, noise=0.0, random_state=None): return X, t -@_deprecate_positional_args def make_s_curve(n_samples=100, *, noise=0.0, random_state=None): """Generate an S curve dataset. 
@@ -1494,7 +1476,6 @@ def make_s_curve(n_samples=100, *, noise=0.0, random_state=None): return X, t -@_deprecate_positional_args def make_gaussian_quantiles(*, mean=None, cov=1., n_samples=100, n_features=2, n_classes=3, shuffle=True, random_state=None): @@ -1590,7 +1571,6 @@ def _shuffle(data, random_state=None): return result, row_idx, col_idx -@_deprecate_positional_args def make_biclusters(shape, n_clusters, *, noise=0.0, minval=10, maxval=100, shuffle=True, random_state=None): """Generate an array with constant block diagonal structure for @@ -1682,7 +1662,6 @@ def make_biclusters(shape, n_clusters, *, noise=0.0, minval=10, return result, rows, cols -@_deprecate_positional_args def make_checkerboard(shape, n_clusters, *, noise=0.0, minval=10, maxval=100, shuffle=True, random_state=None): """Generate an array with block checkerboard structure for diff --git a/sklearn/datasets/_species_distributions.py b/sklearn/datasets/_species_distributions.py index bc3fa3bcc7a04..039883ca4b06a 100644 --- a/sklearn/datasets/_species_distributions.py +++ b/sklearn/datasets/_species_distributions.py @@ -50,7 +50,6 @@ from ._base import _fetch_remote from ._base import RemoteFileMetadata from ..utils import Bunch -from ..utils.validation import _deprecate_positional_args from ._base import _pkl_filepath # The original data can be found at: @@ -138,7 +137,6 @@ def construct_grids(batch): return (xgrid, ygrid) -@_deprecate_positional_args def fetch_species_distributions(*, data_home=None, download_if_missing=True): """Loader for species distribution dataset from Phillips et. al. (2006) diff --git a/sklearn/datasets/_svmlight_format_io.py b/sklearn/datasets/_svmlight_format_io.py index 8997624da0755..4a1d1eb02e6da 100644 --- a/sklearn/datasets/_svmlight_format_io.py +++ b/sklearn/datasets/_svmlight_format_io.py @@ -25,7 +25,6 @@ from .. import __version__ from ..utils import check_array, IS_PYPY -from ..utils.validation import _deprecate_positional_args if not IS_PYPY: from ._svmlight_format_fast import _load_svmlight_file @@ -38,7 +37,6 @@ def _load_svmlight_file(*args, **kwargs): 'for the status updates).') -@_deprecate_positional_args def load_svmlight_file(f, *, n_features=None, dtype=np.float64, multilabel=False, zero_based="auto", query_id=False, offset=0, length=-1): @@ -202,7 +200,6 @@ def _open_and_load(f, dtype, multilabel, zero_based, query_id, return data, indices, indptr, labels, query -@_deprecate_positional_args def load_svmlight_files(files, *, n_features=None, dtype=np.float64, multilabel=False, zero_based="auto", query_id=False, offset=0, length=-1): @@ -387,7 +384,6 @@ def _dump_svmlight(X, y, f, multilabel, one_based, comment, query_id): f.write((line_pattern % feat).encode('ascii')) -@_deprecate_positional_args def dump_svmlight_file(X, y, f, *, zero_based=True, comment=None, query_id=None, multilabel=False): diff --git a/sklearn/datasets/_twenty_newsgroups.py b/sklearn/datasets/_twenty_newsgroups.py index cb181d2108403..c41bf767d9ed5 100644 --- a/sklearn/datasets/_twenty_newsgroups.py +++ b/sklearn/datasets/_twenty_newsgroups.py @@ -46,7 +46,6 @@ from ..feature_extraction.text import CountVectorizer from .. 
import preprocessing from ..utils import check_random_state, Bunch -from ..utils.validation import _deprecate_positional_args logger = logging.getLogger(__name__) @@ -148,7 +147,6 @@ def strip_newsgroup_footer(text): return text -@_deprecate_positional_args def fetch_20newsgroups(*, data_home=None, subset='train', categories=None, shuffle=True, random_state=42, remove=(), @@ -326,7 +324,6 @@ def fetch_20newsgroups(*, data_home=None, subset='train', categories=None, return data -@_deprecate_positional_args def fetch_20newsgroups_vectorized(*, subset="train", remove=(), data_home=None, download_if_missing=True, return_X_y=False, normalize=True, as_frame=False): diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index bd8a95e37dbaf..1c48542a1c9ec 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -19,7 +19,7 @@ from ..utils import (check_array, check_random_state, gen_even_slices, gen_batches) from ..utils.extmath import randomized_svd, row_norms, svd_flip -from ..utils.validation import check_is_fitted, _deprecate_positional_args +from ..utils.validation import check_is_fitted from ..utils.fixes import delayed from ..linear_model import Lasso, orthogonal_mp_gram, LassoLars, Lars @@ -193,7 +193,6 @@ def _sparse_encode(X, dictionary, gram, cov=None, algorithm='lasso_lars', # XXX : could be moved to the linear_model module -@_deprecate_positional_args def sparse_encode(X, dictionary, *, gram=None, cov=None, algorithm='lasso_lars', n_nonzero_coefs=None, alpha=None, copy_cov=True, init=None, max_iter=1000, n_jobs=None, @@ -427,7 +426,6 @@ def _update_dict(dictionary, Y, code, A=None, B=None, verbose=False, print(f"{n_unused} unused atoms resampled.") -@_deprecate_positional_args def dict_learning(X, n_components, *, alpha, max_iter=100, tol=1e-8, method='lars', n_jobs=None, dict_init=None, code_init=None, callback=None, verbose=False, random_state=None, @@ -626,7 +624,6 @@ def dict_learning(X, n_components, *, alpha, max_iter=100, tol=1e-8, return code, dictionary, errors -@_deprecate_positional_args def dict_learning_online(X, n_components=2, *, alpha=1, n_iter=100, return_code=True, dict_init=None, callback=None, batch_size=3, verbose=False, shuffle=True, @@ -1063,7 +1060,6 @@ class SparseCoder(_BaseSparseCoding, BaseEstimator): """ _required_parameters = ["dictionary"] - @_deprecate_positional_args def __init__(self, dictionary, *, transform_algorithm='omp', transform_n_nonzero_coefs=None, transform_alpha=None, split_sign=False, n_jobs=None, positive_code=False, @@ -1299,7 +1295,6 @@ class DictionaryLearning(_BaseSparseCoding, BaseEstimator): SparsePCA MiniBatchSparsePCA """ - @_deprecate_positional_args def __init__(self, n_components=None, *, alpha=1, max_iter=1000, tol=1e-8, fit_algorithm='lars', transform_algorithm='omp', transform_n_nonzero_coefs=None, transform_alpha=None, @@ -1537,7 +1532,6 @@ class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): MiniBatchSparsePCA """ - @_deprecate_positional_args def __init__(self, n_components=None, *, alpha=1, n_iter=1000, fit_algorithm='lars', n_jobs=None, batch_size=3, shuffle=True, dict_init=None, transform_algorithm='omp', diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index 5dd9f13094a89..830e81e9268d5 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -28,7 +28,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils 
import check_random_state from ..utils.extmath import fast_logdet, randomized_svd, squared_norm -from ..utils.validation import check_is_fitted, _deprecate_positional_args +from ..utils.validation import check_is_fitted from ..exceptions import ConvergenceWarning @@ -147,7 +147,6 @@ class FactorAnalysis(TransformerMixin, BaseEstimator): FastICA: Independent component analysis, a latent variable model with non-Gaussian latent variables. """ - @_deprecate_positional_args def __init__(self, n_components=None, *, tol=1e-2, copy=True, max_iter=1000, noise_variance_init=None, svd_method='randomized', diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index a57ddada85694..6c374e6e420f8 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -20,7 +20,6 @@ from ..utils import check_array, as_float_array, check_random_state from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES -from ..utils.validation import _deprecate_positional_args __all__ = ['fastica', 'FastICA'] @@ -147,7 +146,6 @@ def _cube(x, fun_args): return x ** 3, (3 * x ** 2).mean(axis=-1) -@_deprecate_positional_args def fastica(X, n_components=None, *, algorithm="parallel", whiten=True, fun="logcosh", fun_args=None, max_iter=200, tol=1e-04, w_init=None, random_state=None, return_X_mean=False, compute_sources=True, @@ -392,7 +390,6 @@ def my_g(x): pp. 411-430* """ - @_deprecate_positional_args def __init__(self, n_components=None, *, algorithm='parallel', whiten=True, fun='logcosh', fun_args=None, max_iter=200, tol=1e-4, w_init=None, random_state=None): diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index 10a1cceadd65e..486d4a22d8cdb 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -10,7 +10,6 @@ from ._base import _BasePCA from ..utils import gen_batches from ..utils.extmath import svd_flip, _incremental_mean_and_var -from ..utils.validation import _deprecate_positional_args class IncrementalPCA(_BasePCA): @@ -164,7 +163,6 @@ class IncrementalPCA(_BasePCA): SparsePCA TruncatedSVD """ - @_deprecate_positional_args def __init__(self, n_components=None, *, whiten=False, copy=True, batch_size=None): self.n_components = n_components diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 8663193a8383e..1e1cdb1722029 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -16,7 +16,6 @@ from ..base import BaseEstimator, TransformerMixin from ..preprocessing import KernelCenterer from ..metrics.pairwise import pairwise_kernels -from ..utils.validation import _deprecate_positional_args class KernelPCA(TransformerMixin, BaseEstimator): @@ -192,7 +191,6 @@ class KernelPCA(TransformerMixin, BaseEstimator): A randomized algorithm for the decomposition of matrices Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert """ - @_deprecate_positional_args def __init__(self, n_components=None, *, kernel="linear", gamma=None, degree=3, coef0=1, kernel_params=None, alpha=1.0, fit_inverse_transform=False, eigen_solver='auto', diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index e554d299fe478..34432557814c2 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -20,7 +20,6 @@ from ..utils import check_random_state, gen_batches, gen_even_slices from ..utils.validation import check_non_negative from ..utils.validation 
import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed from ._online_lda_fast import (mean_change, _dirichlet_expectation_1d, @@ -293,7 +292,6 @@ class LatentDirichletAllocation(TransformerMixin, BaseEstimator): https://github.com/blei-lab/onlineldavb """ - @_deprecate_positional_args def __init__(self, n_components=10, *, doc_topic_prior=None, topic_word_prior=None, learning_method='batch', learning_decay=.7, learning_offset=10., max_iter=10, diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index b978f1a33d3af..c8239147eb6c4 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -20,7 +20,6 @@ from ..utils import check_random_state, check_array from ..utils.extmath import randomized_svd, safe_sparse_dot, squared_norm from ..utils.validation import check_is_fitted, check_non_negative -from ..utils.validation import _deprecate_positional_args EPSILON = np.finfo(np.float32).eps @@ -850,7 +849,6 @@ def _fit_multiplicative_update(X, W, H, beta_loss='frobenius', return W, H, n_iter -@_deprecate_positional_args def non_negative_factorization(X, W=None, H=None, n_components=None, *, init='warn', update_H=True, solver='cd', beta_loss='frobenius', tol=1e-4, @@ -1200,7 +1198,6 @@ class NMF(TransformerMixin, BaseEstimator): Fevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix factorization with the beta-divergence. Neural Computation, 23(9). """ - @_deprecate_positional_args def __init__(self, n_components=None, *, init='warn', solver='cd', beta_loss='frobenius', tol=1e-4, max_iter=200, random_state=None, alpha=0., l1_ratio=0., verbose=0, diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index eb0a73919021a..765320ccdb5a8 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -25,7 +25,6 @@ from ..utils.extmath import fast_logdet, randomized_svd, svd_flip from ..utils.extmath import stable_cumsum from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args def _assess_dimension(spectrum, rank, n_samples): @@ -328,7 +327,6 @@ class PCA(_BasePCA): >>> print(pca.singular_values_) [6.30061...] 
""" - @_deprecate_positional_args def __init__(self, n_components=None, *, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None): diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index 2348ada255fd4..7f280db3a3af6 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -6,7 +6,6 @@ from ..utils import check_random_state from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..linear_model import ridge_regression from ..base import BaseEstimator, TransformerMixin from ._dict_learning import dict_learning, dict_learning_online @@ -111,7 +110,6 @@ class SparsePCA(TransformerMixin, BaseEstimator): MiniBatchSparsePCA DictionaryLearning """ - @_deprecate_positional_args def __init__(self, n_components=None, *, alpha=1, ridge_alpha=0.01, max_iter=1000, tol=1e-8, method='lars', n_jobs=None, U_init=None, V_init=None, verbose=False, random_state=None): @@ -304,7 +302,6 @@ class MiniBatchSparsePCA(SparsePCA): SparsePCA DictionaryLearning """ - @_deprecate_positional_args def __init__(self, n_components=None, *, alpha=1, ridge_alpha=0.01, n_iter=100, callback=None, batch_size=3, verbose=False, shuffle=True, n_jobs=None, method='lars', random_state=None): diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 1ea6b15c3ebd7..74239567dee48 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -15,7 +15,6 @@ from ..utils._arpack import _init_arpack_v0 from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip from ..utils.sparsefuncs import mean_variance_axis -from ..utils.validation import _deprecate_positional_args from ..utils.validation import check_is_fitted @@ -119,7 +118,6 @@ class TruncatedSVD(TransformerMixin, BaseEstimator): class to data once, then keep the instance around to do transformations. """ - @_deprecate_positional_args def __init__(self, n_components=2, *, algorithm="randomized", n_iter=5, random_state=None, tol=0.): self.algorithm = algorithm diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 2e80f94404175..4d94b19574f53 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -22,7 +22,6 @@ from .utils.multiclass import check_classification_targets from .utils.extmath import softmax from .preprocessing import StandardScaler -from .utils.validation import _deprecate_positional_args __all__ = ['LinearDiscriminantAnalysis', 'QuadraticDiscriminantAnalysis'] @@ -749,7 +748,6 @@ class QuadraticDiscriminantAnalysis(ClassifierMixin, BaseEstimator): -------- LinearDiscriminantAnalysis : Linear Discriminant Analysis. 
""" - @_deprecate_positional_args def __init__(self, *, priors=None, reg_param=0., store_covariance=False, tol=1.0e-4): self.priors = np.asarray(priors) if priors is not None else None diff --git a/sklearn/dummy.py b/sklearn/dummy.py index ad5ab3f24731d..575b38aa7d2a8 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -17,7 +17,6 @@ from .utils.random import _random_choice_csc from .utils.stats import _weighted_percentile from .utils.multiclass import class_distribution -from .utils.validation import _deprecate_positional_args class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator): @@ -94,7 +93,6 @@ class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator): >>> dummy_clf.score(X, y) 0.75 """ - @_deprecate_positional_args def __init__(self, *, strategy="prior", random_state=None, constant=None): self.strategy = strategy @@ -444,7 +442,6 @@ class DummyRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): >>> dummy_regr.score(X, y) 0.0 """ - @_deprecate_positional_args def __init__(self, *, strategy="mean", constant=None, quantile=None): self.strategy = strategy self.constant = constant diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 1ac309f00ad69..a4be68ba5e2d6 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -22,7 +22,7 @@ from ..utils.multiclass import check_classification_targets from ..utils.random import sample_without_replacement from ..utils.validation import has_fit_parameter, check_is_fitted, \ - _check_sample_weight, _deprecate_positional_args + _check_sample_weight from ..utils.fixes import delayed @@ -593,7 +593,6 @@ class BaggingClassifier(ClassifierMixin, BaseBagging): .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine Learning and Knowledge Discovery in Databases, 346-361, 2012. """ - @_deprecate_positional_args def __init__(self, base_estimator=None, n_estimators=10, *, @@ -979,7 +978,6 @@ class BaggingRegressor(RegressorMixin, BaseBagging): .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine Learning and Knowledge Discovery in Databases, 346-361, 2012. """ - @_deprecate_positional_args def __init__(self, base_estimator=None, n_estimators=10, *, diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 8eef1f3429227..5a93acd0c0554 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -64,7 +64,6 @@ class calls the ``fit`` method of each sub-estimator on random samples from ..utils.fixes import _joblib_parallel_args from ..utils.multiclass import check_classification_targets, type_of_target from ..utils.validation import check_is_fitted, _check_sample_weight -from ..utils.validation import _deprecate_positional_args __all__ = ["RandomForestClassifier", @@ -1254,7 +1253,6 @@ class labels (multi-output problem). >>> print(clf.predict([[0, 0, 0, 0]])) [1] """ - @_deprecate_positional_args def __init__(self, n_estimators=100, *, criterion="gini", @@ -1559,7 +1557,6 @@ class RandomForestRegressor(ForestRegressor): >>> print(regr.predict([[0, 0, 0, 0]])) [-8.32987858] """ - @_deprecate_positional_args def __init__(self, n_estimators=100, *, criterion="squared_error", @@ -1876,7 +1873,6 @@ class labels (multi-output problem). >>> clf.predict([[0, 0, 0, 0]]) array([1]) """ - @_deprecate_positional_args def __init__(self, n_estimators=100, *, criterion="gini", @@ -2172,7 +2168,6 @@ class ExtraTreesRegressor(ForestRegressor): >>> reg.score(X_test, y_test) 0.2708... 
""" - @_deprecate_positional_args def __init__(self, n_estimators=100, *, criterion="squared_error", @@ -2390,7 +2385,6 @@ class RandomTreesEmbedding(BaseForest): criterion = "squared_error" max_features = 1 - @_deprecate_positional_args def __init__(self, n_estimators=100, *, max_depth=5, diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 527bbcb559b5f..54e4e510cd9b9 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -54,7 +54,6 @@ from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils.multiclass import check_classification_targets from ..exceptions import NotFittedError -from ..utils.validation import _deprecate_positional_args class VerboseReporter: @@ -1112,7 +1111,6 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): _SUPPORTED_LOSS = ('deviance', 'exponential') - @_deprecate_positional_args def __init__(self, *, loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., @@ -1656,7 +1654,6 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): _SUPPORTED_LOSS = ("squared_error", 'ls', "absolute_error", 'lad', 'huber', 'quantile') - @_deprecate_positional_args def __init__(self, *, loss="squared_error", learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 6d5de978add9b..99eb0d265b100 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -12,8 +12,7 @@ from ...utils import check_random_state, resample from ...utils.validation import (check_is_fitted, check_consistent_length, - _check_sample_weight, - _deprecate_positional_args) + _check_sample_weight) from ...utils.multiclass import check_classification_targets from ...metrics import check_scoring from ...model_selection import train_test_split @@ -1045,7 +1044,6 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): _VALID_LOSSES = ('squared_error', 'least_squares', 'absolute_error', 'least_absolute_deviation', 'poisson') - @_deprecate_positional_args def __init__(self, loss='squared_error', *, learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0., max_bins=255, @@ -1304,7 +1302,6 @@ class HistGradientBoostingClassifier(ClassifierMixin, _VALID_LOSSES = ('binary_crossentropy', 'categorical_crossentropy', 'auto') - @_deprecate_positional_args def __init__(self, loss='auto', *, learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0., max_bins=255, diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 9c3f547f23459..3d2ac0928bd3f 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -16,7 +16,6 @@ ) from ..utils.fixes import _joblib_parallel_args from ..utils.validation import check_is_fitted, _num_samples -from ..utils.validation import _deprecate_positional_args from ..base import OutlierMixin from ._bagging import BaseBagging @@ -181,7 +180,6 @@ class IsolationForest(OutlierMixin, BaseBagging): >>> clf.predict([[0.1], [0], [90]]) array([ 1, 1, -1]) """ - @_deprecate_positional_args def __init__(self, *, n_estimators=100, max_samples="auto", diff --git 
a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 3522b381389d3..db5f5c26cf746 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -32,7 +32,6 @@ from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted from ..utils.validation import column_or_1d -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed @@ -397,7 +396,6 @@ class StackingClassifier(ClassifierMixin, _BaseStacking): 0.9... """ - @_deprecate_positional_args def __init__(self, estimators, final_estimator=None, *, cv=None, stack_method='auto', n_jobs=None, passthrough=False, verbose=0): @@ -647,7 +645,6 @@ class StackingRegressor(RegressorMixin, _BaseStacking): 0.3... """ - @_deprecate_positional_args def __init__(self, estimators, final_estimator=None, *, cv=None, n_jobs=None, passthrough=False, verbose=0): super().__init__( diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index 2072d5c7c5501..2c8db5bfbc633 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -30,7 +30,6 @@ from ..utils.validation import check_is_fitted from ..utils.multiclass import check_classification_targets from ..utils.validation import column_or_1d -from ..utils.validation import _deprecate_positional_args from ..exceptions import NotFittedError from ..utils._estimator_html_repr import _VisualBlock from ..utils.fixes import delayed @@ -242,7 +241,6 @@ class VotingClassifier(ClassifierMixin, _BaseVoting): >>> print(eclf3.transform(X).shape) (6, 6) """ - @_deprecate_positional_args def __init__(self, estimators, *, voting='hard', weights=None, n_jobs=None, flatten_transform=True, verbose=False): super().__init__(estimators=estimators) @@ -451,7 +449,6 @@ class VotingRegressor(RegressorMixin, _BaseVoting): >>> print(er.fit(X, y).predict(X)) [ 3.3 5.7 11.8 19.7 28. 40.3] """ - @_deprecate_positional_args def __init__(self, estimators, *, weights=None, n_jobs=None, verbose=False): super().__init__(estimators=estimators) diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index 92c5e15731d63..1b6689b50fafc 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -41,7 +41,6 @@ from ..utils.validation import _check_sample_weight from ..utils.validation import has_fit_parameter from ..utils.validation import _num_samples -from ..utils.validation import _deprecate_positional_args __all__ = [ 'AdaBoostClassifier', @@ -400,7 +399,6 @@ class AdaBoostClassifier(ClassifierMixin, BaseWeightBoosting): >>> clf.score(X, y) 0.983... """ - @_deprecate_positional_args def __init__(self, base_estimator=None, *, n_estimators=50, @@ -964,7 +962,6 @@ class AdaBoostRegressor(RegressorMixin, BaseWeightBoosting): .. [2] H. Drucker, "Improving Regressors using Boosting Techniques", 1997. 
""" - @_deprecate_positional_args def __init__(self, base_estimator=None, *, n_estimators=50, diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index 44b50dc45a103..a34775575d93a 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -12,7 +12,6 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array, tosequence -from ..utils.validation import _deprecate_positional_args def _tosequence(X): @@ -96,7 +95,6 @@ class DictVectorizer(TransformerMixin, BaseEstimator): sklearn.preprocessing.OrdinalEncoder : Handles nominal/categorical features encoded as columns of arbitrary data types. """ - @_deprecate_positional_args def __init__(self, *, dtype=np.float64, separator="=", sparse=True, sort=True): self.dtype = dtype diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index 57f927649bd6f..9ace92c58c30a 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -7,7 +7,6 @@ import scipy.sparse as sp from ..utils import IS_PYPY -from ..utils.validation import _deprecate_positional_args from ..base import BaseEstimator, TransformerMixin if not IS_PYPY: @@ -89,7 +88,6 @@ class FeatureHasher(TransformerMixin, BaseEstimator): DictVectorizer : Vectorizes string-valued features using a hash table. sklearn.preprocessing.OneHotEncoder : Handles nominal/categorical features. """ - @_deprecate_positional_args def __init__(self, n_features=(2 ** 20), *, input_type="dict", dtype=np.float64, alternate_sign=True): self._validate_params(n_features, input_type) diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index 5cd692fd6aa4f..71b4c1b57c6e8 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -16,7 +16,6 @@ from numpy.lib.stride_tricks import as_strided from ..utils import check_array, check_random_state -from ..utils.validation import _deprecate_positional_args from ..base import BaseEstimator __all__ = ['PatchExtractor', @@ -130,7 +129,6 @@ def _to_graph(n_x, n_y, n_z, mask=None, img=None, return return_as(graph) -@_deprecate_positional_args def img_to_graph(img, *, mask=None, return_as=sparse.coo_matrix, dtype=None): """Graph of the pixel-to-pixel gradient connections @@ -167,7 +165,6 @@ def img_to_graph(img, *, mask=None, return_as=sparse.coo_matrix, dtype=None): return _to_graph(n_x, n_y, n_z, mask, img, return_as, dtype) -@_deprecate_positional_args def grid_to_graph(n_x, n_y, n_z=1, *, mask=None, return_as=sparse.coo_matrix, dtype=int): """Graph of the pixel-to-pixel connections @@ -305,7 +302,6 @@ def _extract_patches(arr, patch_shape=8, extraction_step=1): return patches -@_deprecate_positional_args def extract_patches_2d(image, patch_size, *, max_patches=None, random_state=None): """Reshape a 2D image into a collection of patches @@ -483,7 +479,6 @@ class PatchExtractor(BaseEstimator): >>> print('Patches shape: {}'.format(pe_trans.shape)) Patches shape: (545706, 2, 2) """ - @_deprecate_positional_args def __init__(self, *, patch_size=None, max_patches=None, random_state=None): self.patch_size = patch_size diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index fad0e53ed31ca..00debc059440c 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -33,7 +33,6 @@ from ..utils import _IS_32BIT from ..utils.fixes import _astype_copy_false from 
..exceptions import NotFittedError -from ..utils.validation import _deprecate_positional_args __all__ = ['HashingVectorizer', @@ -679,7 +678,6 @@ class HashingVectorizer(TransformerMixin, _VectorizerMixin, BaseEstimator): CountVectorizer, TfidfVectorizer """ - @_deprecate_positional_args def __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, @@ -1004,7 +1002,6 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): when pickling. This attribute is provided only for introspection and can be safely removed using delattr or set to None before pickling. """ - @_deprecate_positional_args def __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, @@ -1424,7 +1421,6 @@ class TfidfTransformer(TransformerMixin, BaseEstimator): Introduction to Information Retrieval. Cambridge University Press, pp. 118-120. """ - @_deprecate_positional_args def __init__(self, *, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False): self.norm = norm @@ -1733,7 +1729,6 @@ class TfidfVectorizer(CountVectorizer): >>> print(X.shape) (4, 9) """ - @_deprecate_positional_args def __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index 4889f73518fe9..d3603f13be499 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -12,7 +12,6 @@ from ..exceptions import NotFittedError from ..utils.metaestimators import if_delegate_has_method -from ..utils.validation import _deprecate_positional_args def _calculate_threshold(estimator, importances, threshold): @@ -165,7 +164,6 @@ class SelectFromModel(MetaEstimatorMixin, SelectorMixin, BaseEstimator): SequentialFeatureSelector : Sequential cross-validation based feature selection. Does not rely on importance weights. """ - @_deprecate_positional_args def __init__(self, estimator, *, threshold=None, prefit=False, norm_order=1, max_features=None, importance_getter='auto'): diff --git a/sklearn/feature_selection/_mutual_info.py b/sklearn/feature_selection/_mutual_info.py index c5205fc68f370..79f7aea029f89 100644 --- a/sklearn/feature_selection/_mutual_info.py +++ b/sklearn/feature_selection/_mutual_info.py @@ -11,7 +11,6 @@ from ..utils import check_random_state from ..utils.fixes import _astype_copy_false from ..utils.validation import check_array, check_X_y -from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import check_classification_targets @@ -288,7 +287,6 @@ def _estimate_mi(X, y, discrete_features='auto', discrete_target=False, return np.array(mi) -@_deprecate_positional_args def mutual_info_regression(X, y, *, discrete_features='auto', n_neighbors=3, copy=True, random_state=None): """Estimate mutual information for a continuous target variable. @@ -368,7 +366,6 @@ def mutual_info_regression(X, y, *, discrete_features='auto', n_neighbors=3, copy, random_state) -@_deprecate_positional_args def mutual_info_classif(X, y, *, discrete_features='auto', n_neighbors=3, copy=True, random_state=None): """Estimate mutual information for a discrete target variable. 
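The same pattern holds for the module-level helpers touched above (`mutual_info_regression`, `mutual_info_classif`, the `fetch_*` loaders, the `make_*` generators): only the parameters ahead of the `*` remain positional, and callers name everything else. A short usage sketch, illustrative rather than part of the patch:

    import numpy as np
    from sklearn.feature_selection import mutual_info_classif

    rng = np.random.RandomState(0)
    X = rng.rand(40, 3)
    y = rng.randint(0, 2, size=40)

    # X and y precede the `*`; the remaining options are passed by keyword.
    mi = mutual_info_classif(X, y, discrete_features=False, n_neighbors=3)
    print(mi.shape)   # (3,)
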
diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 16519dfba6761..d972ee7c991e9 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -15,7 +15,6 @@ from ..utils.metaestimators import _safe_split from ..utils._tags import _safe_tags from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed from ..base import BaseEstimator from ..base import MetaEstimatorMixin @@ -152,7 +151,6 @@ class RFE(SelectorMixin, MetaEstimatorMixin, BaseEstimator): for cancer classification using support vector machines", Mach. Learn., 46(1-3), 389--422, 2002. """ - @_deprecate_positional_args def __init__(self, estimator, *, n_features_to_select=None, step=1, verbose=0, importance_getter='auto'): self.estimator = estimator @@ -524,7 +522,6 @@ class RFECV(RFE): for cancer classification using support vector machines", Mach. Learn., 46(1-3), 389--422, 2002. """ - @_deprecate_positional_args def __init__(self, estimator, *, step=1, min_features_to_select=1, cv=None, scoring=None, verbose=0, n_jobs=None, importance_getter='auto'): diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index d9db03e479163..989288dbb4ec7 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -17,7 +17,6 @@ safe_mask) from ..utils.extmath import safe_sparse_dot, row_norms from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ._base import SelectorMixin @@ -296,7 +295,6 @@ def r_regression(X, y, *, center=True): return correlation_coefficient -@_deprecate_positional_args def f_regression(X, y, *, center=True): """Univariate linear regression tests returning F-statistic and p-values. @@ -485,7 +483,6 @@ class SelectPercentile(_BaseFilter): GenericUnivariateSelect : Univariate feature selector with configurable mode. """ - @_deprecate_positional_args def __init__(self, score_func=f_classif, *, percentile=10): super().__init__(score_func=score_func) self.percentile = percentile @@ -573,7 +570,6 @@ class SelectKBest(_BaseFilter): GenericUnivariateSelect : Univariate feature selector with configurable mode. """ - @_deprecate_positional_args def __init__(self, score_func=f_classif, *, k=10): super().__init__(score_func=score_func) self.k = k @@ -654,7 +650,6 @@ class SelectFpr(_BaseFilter): GenericUnivariateSelect : Univariate feature selector with configurable mode. """ - @_deprecate_positional_args def __init__(self, score_func=f_classif, *, alpha=5e-2): super().__init__(score_func=score_func) self.alpha = alpha @@ -722,7 +717,6 @@ class SelectFdr(_BaseFilter): GenericUnivariateSelect : Univariate feature selector with configurable mode. """ - @_deprecate_positional_args def __init__(self, score_func=f_classif, *, alpha=5e-2): super().__init__(score_func=score_func) self.alpha = alpha @@ -787,7 +781,6 @@ class SelectFwe(_BaseFilter): GenericUnivariateSelect : Univariate feature selector with configurable mode. """ - @_deprecate_positional_args def __init__(self, score_func=f_classif, *, alpha=5e-2): super().__init__(score_func=score_func) self.alpha = alpha @@ -857,13 +850,12 @@ class GenericUnivariateSelect(_BaseFilter): SelectFwe : Select features based on family-wise error rate. 
""" - _selection_modes = {'percentile': SelectPercentile, - 'k_best': SelectKBest, - 'fpr': SelectFpr, - 'fdr': SelectFdr, - 'fwe': SelectFwe} + _selection_modes: dict = {'percentile': SelectPercentile, + 'k_best': SelectKBest, + 'fpr': SelectFpr, + 'fdr': SelectFdr, + 'fwe': SelectFwe} - @_deprecate_positional_args def __init__(self, score_func=f_classif, *, mode='percentile', param=1e-5): super().__init__(score_func=score_func) self.mode = mode diff --git a/sklearn/gaussian_process/_gpc.py b/sklearn/gaussian_process/_gpc.py index e6fe3eb26df49..d2b418b131c2f 100644 --- a/sklearn/gaussian_process/_gpc.py +++ b/sklearn/gaussian_process/_gpc.py @@ -19,7 +19,6 @@ from ..utils.optimize import _check_optimize_result from ..preprocessing import LabelEncoder from ..multiclass import OneVsRestClassifier, OneVsOneClassifier -from ..utils.validation import _deprecate_positional_args # Values required for approximating the logistic sigmoid by @@ -145,7 +144,6 @@ def optimizer(obj_func, initial_theta, bounds): The log-marginal-likelihood of ``self.kernel_.theta`` """ - @_deprecate_positional_args def __init__(self, kernel=None, *, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=0, max_iter_predict=100, warm_start=False, copy_X_train=True, random_state=None): @@ -595,7 +593,6 @@ def optimizer(obj_func, initial_theta, bounds): .. versionadded:: 0.18 """ - @_deprecate_positional_args def __init__(self, kernel=None, *, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=0, max_iter_predict=100, warm_start=False, copy_X_train=True, random_state=None, diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index 9b1d0ae409526..ae9e5c403fcf2 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -17,7 +17,6 @@ from ..preprocessing._data import _handle_zeros_in_scale from ..utils import check_random_state from ..utils.optimize import _check_optimize_result -from ..utils.validation import _deprecate_positional_args class GaussianProcessRegressor(MultiOutputMixin, @@ -153,7 +152,6 @@ def optimizer(obj_func, initial_theta, bounds): (array([653.0..., 592.1...]), array([316.6..., 316.6...])) """ - @_deprecate_positional_args def __init__(self, kernel=None, *, alpha=1e-10, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=0, normalize_y=False, copy_X_train=True, random_state=None): diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index e345fe44f0895..85303f29c93e9 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -15,7 +15,6 @@ from ..utils.sparsefuncs import _get_median from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES -from ..utils.validation import _deprecate_positional_args from ..utils._mask import _get_mask from ..utils import is_scalar_nan @@ -211,7 +210,6 @@ class SimpleImputer(_BaseImputer): upon :meth:`transform` if strategy is not "constant". 
""" - @_deprecate_positional_args def __init__(self, *, missing_values=np.nan, strategy="mean", fill_value=None, verbose=0, copy=True, add_indicator=False): super().__init__( @@ -626,7 +624,6 @@ class MissingIndicator(TransformerMixin, BaseEstimator): [False, False]]) """ - @_deprecate_positional_args def __init__(self, *, missing_values=np.nan, features="missing-only", sparse="auto", error_on_new=True): self.missing_values = missing_values diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index c4b407fdd66e7..b9cfe0e1a60a0 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -13,7 +13,6 @@ from ..utils import is_scalar_nan from ..utils._mask import _get_mask from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args class KNNImputer(_BaseImputer): @@ -96,7 +95,6 @@ class KNNImputer(_BaseImputer): [5.5, 6. , 5. ], [8. , 8. , 7. ]]) """ - @_deprecate_positional_args def __init__(self, *, missing_values=np.nan, n_neighbors=5, weights="uniform", metric="nan_euclidean", copy=True, add_indicator=False): diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index 0736130f41524..d10cae40302a3 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -22,7 +22,6 @@ from ..utils import _get_column_indices from ..utils.validation import check_is_fitted from ..utils import Bunch -from ..utils.validation import _deprecate_positional_args from ..tree import DecisionTreeRegressor from ..ensemble import RandomForestRegressor from ..exceptions import NotFittedError @@ -203,7 +202,6 @@ def _partial_dependence_brute(est, grid, features, X, response_method): return averaged_predictions, predictions -@_deprecate_positional_args def partial_dependence(estimator, X, features, *, response_method='auto', percentiles=(0.05, 0.95), grid_resolution=100, method='auto', kind='legacy'): diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py index 2a7b6cd23147b..8dadf19434693 100644 --- a/sklearn/inspection/_permutation_importance.py +++ b/sklearn/inspection/_permutation_importance.py @@ -8,7 +8,6 @@ from ..utils import Bunch from ..utils import check_random_state from ..utils import check_array -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed @@ -80,7 +79,6 @@ def _create_importances_bunch(baseline_score, permuted_score): importances=importances) -@_deprecate_positional_args def permutation_importance(estimator, X, y, *, scoring=None, n_repeats=5, n_jobs=None, random_state=None, sample_weight=None): """Permutation importance for feature evaluation [BRE]_. diff --git a/sklearn/inspection/_plot/partial_dependence.py b/sklearn/inspection/_plot/partial_dependence.py index a2ee1886066e2..f170460cf2ab6 100644 --- a/sklearn/inspection/_plot/partial_dependence.py +++ b/sklearn/inspection/_plot/partial_dependence.py @@ -17,7 +17,6 @@ from ...utils.fixes import delayed -@_deprecate_positional_args def plot_partial_dependence( estimator, X, @@ -539,7 +538,6 @@ class PartialDependenceDisplay: partial_dependence : Compute Partial Dependence values. plot_partial_dependence : Plot Partial Dependence. 
""" - @_deprecate_positional_args def __init__( self, pd_results, diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py index b57ce23f8cc52..f4050fd2bc025 100644 --- a/sklearn/isotonic.py +++ b/sklearn/isotonic.py @@ -11,7 +11,7 @@ from .base import BaseEstimator, TransformerMixin, RegressorMixin from .utils import check_array, check_consistent_length -from .utils.validation import _check_sample_weight, _deprecate_positional_args +from .utils.validation import _check_sample_weight from ._isotonic import _inplace_contiguous_isotonic_regression, _make_unique @@ -76,7 +76,6 @@ def check_increasing(x, y): return increasing_bool -@_deprecate_positional_args def isotonic_regression(y, *, sample_weight=None, y_min=None, y_max=None, increasing=True): """Solve the isotonic regression model. @@ -216,7 +215,6 @@ class IsotonicRegression(RegressorMixin, TransformerMixin, BaseEstimator): >>> iso_reg.predict([.1, .2]) array([1.8628..., 3.7256...]) """ - @_deprecate_positional_args def __init__(self, *, y_min=None, y_max=None, increasing=True, out_of_bounds='nan'): self.y_min = y_min diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index ca02aac3e982c..e7020dea0e970 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -24,7 +24,7 @@ from .utils.extmath import safe_sparse_dot from .utils.validation import check_is_fitted from .metrics.pairwise import pairwise_kernels, KERNEL_PARAMS -from .utils.validation import check_non_negative, _deprecate_positional_args +from .utils.validation import check_non_negative class PolynomialCountSketch(BaseEstimator, TransformerMixin): @@ -253,7 +253,6 @@ class RBFSampler(TransformerMixin, BaseEstimator): Benjamin Recht. (https://people.eecs.berkeley.edu/~brecht/papers/08.rah.rec.nips.pdf) """ - @_deprecate_positional_args def __init__(self, *, gamma=1., n_components=100, random_state=None): self.gamma = gamma self.n_components = n_components @@ -369,7 +368,6 @@ class SkewedChi2Sampler(TransformerMixin, BaseEstimator): sklearn.metrics.pairwise.chi2_kernel : The exact chi squared kernel. """ - @_deprecate_positional_args def __init__(self, *, skewedness=1., n_components=100, random_state=None): self.skewedness = skewedness self.n_components = n_components @@ -500,7 +498,6 @@ class AdditiveChi2Sampler(TransformerMixin, BaseEstimator): A. Vedaldi and A. Zisserman, Pattern Analysis and Machine Intelligence, 2011 """ - @_deprecate_positional_args def __init__(self, *, sample_steps=2, sample_interval=None): self.sample_steps = sample_steps self.sample_interval = sample_interval @@ -728,7 +725,6 @@ class Nystroem(TransformerMixin, BaseEstimator): sklearn.metrics.pairwise.kernel_metrics : List of built-in kernels. 
""" - @_deprecate_positional_args def __init__(self, kernel="rbf", *, gamma=None, coef0=None, degree=None, kernel_params=None, n_components=100, random_state=None, n_jobs=None): diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index 8a27ea572b344..e562c22daed2f 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -10,7 +10,6 @@ from .metrics.pairwise import pairwise_kernels from .linear_model._ridge import _solve_cholesky_kernel from .utils.validation import check_is_fitted, _check_sample_weight -from .utils.validation import _deprecate_positional_args from .utils.deprecation import deprecated @@ -113,7 +112,6 @@ class KernelRidge(MultiOutputMixin, RegressorMixin, BaseEstimator): >>> clf.fit(X, y) KernelRidge(alpha=1.0) """ - @_deprecate_positional_args def __init__(self, alpha=1, *, kernel="linear", gamma=None, degree=3, coef0=1, kernel_params=None): self.alpha = alpha diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 808ec9f3b3bb0..09eeced4f3a09 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -31,7 +31,6 @@ from ..preprocessing._data import _is_constant_feature from ..utils import check_array from ..utils.validation import FLOAT_DTYPES -from ..utils.validation import _deprecate_positional_args from ..utils import check_random_state from ..utils.extmath import safe_sparse_dot from ..utils.extmath import _incremental_mean_and_var @@ -595,7 +594,6 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel): >>> reg.predict(np.array([[3, 5]])) array([16.]) """ - @_deprecate_positional_args def __init__(self, *, fit_intercept=True, normalize='deprecated', copy_X=True, n_jobs=None, positive=False): self.fit_intercept = fit_intercept diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py index 2eae8b5c13cee..1d25ac20aa34e 100644 --- a/sklearn/linear_model/_bayes.py +++ b/sklearn/linear_model/_bayes.py @@ -15,7 +15,6 @@ from ..utils.extmath import fast_logdet from scipy.linalg import pinvh from ..utils.validation import _check_sample_weight -from ..utils.validation import _deprecate_positional_args ############################################################################### @@ -159,7 +158,6 @@ class BayesianRidge(RegressorMixin, LinearModel): M. E. Tipping, Sparse Bayesian Learning and the Relevance Vector Machine, Journal of Machine Learning Research, Vol. 1, 2001. """ - @_deprecate_positional_args def __init__(self, *, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6, lambda_1=1.e-6, lambda_2=1.e-6, alpha_init=None, lambda_init=None, compute_score=False, fit_intercept=True, @@ -520,7 +518,6 @@ class ARDRegression(RegressorMixin, LinearModel): which ``self.lambda_ < self.threshold_lambda`` are kept and the rest are discarded. 
""" - @_deprecate_positional_args def __init__(self, *, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6, lambda_1=1.e-6, lambda_2=1.e-6, compute_score=False, threshold_lambda=1.e+4, fit_intercept=True, diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 6a23fedd9902e..1d93a6695b0e0 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -24,7 +24,6 @@ from ..utils.fixes import _astype_copy_false, _joblib_parallel_args from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils.validation import column_or_1d -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed # mypy error: Module 'sklearn.linear_model' has no attribute '_cd_fast' @@ -169,7 +168,6 @@ def _alpha_grid(X, y, Xy=None, l1_ratio=1.0, fit_intercept=True, num=n_alphas)[::-1] -@_deprecate_positional_args def lasso_path(X, y, *, eps=1e-3, n_alphas=100, alphas=None, precompute='auto', Xy=None, copy_X=True, coef_init=None, verbose=False, return_n_iter=False, positive=False, **params): @@ -314,7 +312,6 @@ def lasso_path(X, y, *, eps=1e-3, n_alphas=100, alphas=None, positive=positive, return_n_iter=return_n_iter, **params) -@_deprecate_positional_args def enet_path(X, y, *, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, precompute='auto', Xy=None, copy_X=True, coef_init=None, verbose=False, return_n_iter=False, positive=False, @@ -701,7 +698,6 @@ class ElasticNet(MultiOutputMixin, RegressorMixin, LinearModel): """ path = staticmethod(enet_path) - @_deprecate_positional_args def __init__(self, alpha=1.0, *, l1_ratio=0.5, fit_intercept=True, normalize=False, precompute=False, max_iter=1000, copy_X=True, tol=1e-4, warm_start=False, positive=False, @@ -1026,7 +1022,6 @@ class Lasso(ElasticNet): """ path = staticmethod(enet_path) - @_deprecate_positional_args def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, precompute=False, copy_X=True, max_iter=1000, tol=1e-4, warm_start=False, positive=False, @@ -1512,7 +1507,6 @@ class LassoCV(RegressorMixin, LinearModelCV): """ path = staticmethod(lasso_path) - @_deprecate_positional_args def __init__(self, *, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, normalize=False, precompute='auto', max_iter=1000, tol=1e-4, @@ -1719,7 +1713,6 @@ class ElasticNetCV(RegressorMixin, LinearModelCV): """ path = staticmethod(enet_path) - @_deprecate_positional_args def __init__(self, *, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, normalize=False, precompute='auto', max_iter=1000, tol=1e-4, cv=None, copy_X=True, @@ -1876,7 +1869,6 @@ class MultiTaskElasticNet(Lasso): To avoid unnecessary memory duplication the X and y arguments of the fit method should be directly passed as Fortran-contiguous numpy arrays. """ - @_deprecate_positional_args def __init__(self, alpha=1.0, *, l1_ratio=0.5, fit_intercept=True, normalize=False, copy_X=True, max_iter=1000, tol=1e-4, warm_start=False, random_state=None, selection='cyclic'): @@ -2077,7 +2069,6 @@ class MultiTaskLasso(MultiTaskElasticNet): To avoid unnecessary memory duplication the X and y arguments of the fit method should be directly passed as Fortran-contiguous numpy arrays. 
""" - @_deprecate_positional_args def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, copy_X=True, max_iter=1000, tol=1e-4, warm_start=False, random_state=None, selection='cyclic'): @@ -2260,7 +2251,6 @@ class MultiTaskElasticNetCV(RegressorMixin, LinearModelCV): """ path = staticmethod(enet_path) - @_deprecate_positional_args def __init__(self, *, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, normalize=False, max_iter=1000, tol=1e-4, cv=None, copy_X=True, @@ -2441,7 +2431,6 @@ class MultiTaskLassoCV(RegressorMixin, LinearModelCV): """ path = staticmethod(lasso_path) - @_deprecate_positional_args def __init__(self, *, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, normalize=False, max_iter=1000, tol=1e-4, copy_X=True, diff --git a/sklearn/linear_model/_huber.py b/sklearn/linear_model/_huber.py index 56062fa783eb8..a8ae066d9ff63 100644 --- a/sklearn/linear_model/_huber.py +++ b/sklearn/linear_model/_huber.py @@ -9,7 +9,6 @@ from ._base import LinearModel from ..utils import axis0_safe_slice from ..utils.validation import _check_sample_weight -from ..utils.validation import _deprecate_positional_args from ..utils.extmath import safe_sparse_dot from ..utils.optimize import _check_optimize_result @@ -223,7 +222,6 @@ class HuberRegressor(LinearModel, RegressorMixin, BaseEstimator): .. [2] Art B. Owen (2006), A robust hybrid of lasso and ridge regression. https://statweb.stanford.edu/~owen/reports/hhu.pdf """ - @_deprecate_positional_args def __init__(self, *, epsilon=1.35, max_iter=100, alpha=0.0001, warm_start=False, fit_intercept=True, tol=1e-05): self.epsilon = epsilon diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 55e37ff51fc6a..0932d0bd1aee3 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -24,13 +24,11 @@ from ..utils import check_random_state from ..model_selection import check_cv from ..exceptions import ConvergenceWarning -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed SOLVE_TRIANGULAR_ARGS = {'check_finite': False} -@_deprecate_positional_args def lars_path( X, y, @@ -175,7 +173,6 @@ def lars_path( return_n_iter=return_n_iter, positive=positive) -@_deprecate_positional_args def lars_path_gram( Xy, Gram, @@ -910,7 +907,6 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): method = "lar" positive = False - @_deprecate_positional_args def __init__(self, *, fit_intercept=True, verbose=False, normalize=True, precompute='auto', n_nonzero_coefs=500, eps=np.finfo(float).eps, copy_X=True, fit_path=True, @@ -1172,7 +1168,6 @@ class LassoLars(Lars): """ method = 'lasso' - @_deprecate_positional_args def __init__(self, alpha=1.0, *, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=np.finfo(float).eps, copy_X=True, fit_path=True, @@ -1434,7 +1429,6 @@ class LarsCV(Lars): method = "lar" - @_deprecate_positional_args def __init__(self, *, fit_intercept=True, verbose=False, max_iter=500, normalize=True, precompute='auto', cv=None, max_n_alphas=1000, n_jobs=None, eps=np.finfo(float).eps, @@ -1681,7 +1675,6 @@ class LassoLarsCV(LarsCV): method = 'lasso' - @_deprecate_positional_args def __init__(self, *, fit_intercept=True, verbose=False, max_iter=500, normalize=True, precompute='auto', cv=None, max_n_alphas=1000, n_jobs=None, eps=np.finfo(float).eps, @@ -1820,7 +1813,6 @@ class LassoLarsIC(LassoLars): -------- lars_path, LassoLars, LassoLarsCV """ - 
@_deprecate_positional_args def __init__(self, criterion='aic', *, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=np.finfo(float).eps, copy_X=True, positive=False): diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index be28c5806ede5..c9f1f42f1eeec 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -29,7 +29,6 @@ from ..utils.extmath import row_norms from ..utils.optimize import _newton_cg, _check_optimize_result from ..utils.validation import check_is_fitted, _check_sample_weight -from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import check_classification_targets from ..utils.fixes import _joblib_parallel_args from ..utils.fixes import delayed @@ -1254,7 +1253,6 @@ class LogisticRegression(LinearClassifierMixin, >>> clf.score(X, y) 0.97... """ - @_deprecate_positional_args def __init__(self, penalty='l2', *, dual=False, tol=1e-4, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, @@ -1745,7 +1743,6 @@ class LogisticRegressionCV(LogisticRegression, LogisticRegression """ - @_deprecate_positional_args def __init__(self, *, Cs=10, fit_intercept=True, cv=None, dual=False, penalty='l2', scoring=None, solver='lbfgs', tol=1e-4, max_iter=100, class_weight=None, n_jobs=None, verbose=0, diff --git a/sklearn/linear_model/_omp.py b/sklearn/linear_model/_omp.py index 3f995f0f34318..c362fd4d73469 100644 --- a/sklearn/linear_model/_omp.py +++ b/sklearn/linear_model/_omp.py @@ -16,7 +16,6 @@ from ._base import LinearModel, _pre_fit from ..base import RegressorMixin, MultiOutputMixin from ..utils import as_float_array, check_array -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed from ..model_selection import check_cv @@ -266,7 +265,6 @@ def _gram_omp(Gram, Xy, n_nonzero_coefs, tol_0=None, tol=None, return gamma, indices[:n_active], n_active -@_deprecate_positional_args def orthogonal_mp(X, y, *, n_nonzero_coefs=None, tol=None, precompute=False, copy_X=True, return_path=False, return_n_iter=False): @@ -410,7 +408,6 @@ def orthogonal_mp(X, y, *, n_nonzero_coefs=None, tol=None, precompute=False, return np.squeeze(coef) -@_deprecate_positional_args def orthogonal_mp_gram(Gram, Xy, *, n_nonzero_coefs=None, tol=None, norms_squared=None, copy_Gram=True, copy_Xy=True, return_path=False, @@ -628,7 +625,6 @@ class OrthogonalMatchingPursuit(MultiOutputMixin, RegressorMixin, LinearModel): sklearn.decomposition.sparse_encode OrthogonalMatchingPursuitCV """ - @_deprecate_positional_args def __init__(self, *, n_nonzero_coefs=None, tol=None, fit_intercept=True, normalize=True, precompute='auto'): self.n_nonzero_coefs = n_nonzero_coefs @@ -866,7 +862,6 @@ class OrthogonalMatchingPursuitCV(RegressorMixin, LinearModel): sklearn.decomposition.sparse_encode """ - @_deprecate_positional_args def __init__(self, *, copy=True, fit_intercept=True, normalize=True, max_iter=None, cv=None, n_jobs=None, verbose=False): self.copy = copy diff --git a/sklearn/linear_model/_passive_aggressive.py b/sklearn/linear_model/_passive_aggressive.py index f3fa17ad1325e..678061be3c691 100644 --- a/sklearn/linear_model/_passive_aggressive.py +++ b/sklearn/linear_model/_passive_aggressive.py @@ -1,7 +1,6 @@ # Authors: Rob Zinkov, Mathieu Blondel # License: BSD 3 clause -from ..utils.validation import _deprecate_positional_args from ._stochastic_gradient import BaseSGDClassifier from 
._stochastic_gradient import BaseSGDRegressor from ._stochastic_gradient import DEFAULT_EPSILON @@ -164,7 +163,6 @@ class PassiveAggressiveClassifier(BaseSGDClassifier): K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR (2006) """ - @_deprecate_positional_args def __init__(self, *, C=1.0, fit_intercept=True, max_iter=1000, tol=1e-3, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, shuffle=True, verbose=0, loss="hinge", @@ -391,7 +389,6 @@ class PassiveAggressiveRegressor(BaseSGDRegressor): K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR (2006) """ - @_deprecate_positional_args def __init__(self, *, C=1.0, fit_intercept=True, max_iter=1000, tol=1e-3, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, shuffle=True, verbose=0, diff --git a/sklearn/linear_model/_perceptron.py b/sklearn/linear_model/_perceptron.py index 9d886eb1ca065..b2bb145b904c8 100644 --- a/sklearn/linear_model/_perceptron.py +++ b/sklearn/linear_model/_perceptron.py @@ -1,7 +1,6 @@ # Author: Mathieu Blondel # License: BSD 3 clause -from ..utils.validation import _deprecate_positional_args from ._stochastic_gradient import BaseSGDClassifier @@ -154,7 +153,6 @@ class Perceptron(BaseSGDClassifier): https://en.wikipedia.org/wiki/Perceptron and references therein. """ - @_deprecate_positional_args def __init__(self, *, penalty=None, alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, eta0=1.0, diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 3cde1f1235ec8..f53785cfe0ced 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -12,7 +12,6 @@ from ..utils import check_random_state, check_consistent_length from ..utils.random import sample_without_replacement from ..utils.validation import check_is_fitted, _check_sample_weight -from ..utils.validation import _deprecate_positional_args from ._base import LinearRegression from ..utils.validation import has_fit_parameter from ..exceptions import ConvergenceWarning @@ -211,7 +210,6 @@ class RANSACRegressor(MetaEstimatorMixin, RegressorMixin, .. [2] https://www.sri.com/sites/default/files/publications/ransac-publication.pdf .. 
[3] http://www.bmva.org/bmvc/2009/Papers/Paper355/Paper355.pdf """ # noqa: E501 - @_deprecate_positional_args def __init__(self, base_estimator=None, *, min_samples=None, residual_threshold=None, is_data_valid=None, is_model_valid=None, max_trials=100, max_skips=np.inf, diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 343bc6a170c9b..433e0c4313efc 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -28,7 +28,6 @@ from ..utils import compute_sample_weight from ..utils import column_or_1d from ..utils.validation import _check_sample_weight -from ..utils.validation import _deprecate_positional_args from ..preprocessing import LabelBinarizer from ..model_selection import GridSearchCV from ..metrics import check_scoring @@ -236,7 +235,6 @@ def _get_valid_accept_sparse(is_X_sparse, solver): return ['csr', 'csc', 'coo'] -@_deprecate_positional_args def ridge_regression(X, y, alpha, *, sample_weight=None, solver='auto', max_iter=None, tol=1e-3, verbose=0, random_state=None, return_n_iter=False, return_intercept=False, @@ -521,7 +519,6 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', class _BaseRidge(LinearModel, metaclass=ABCMeta): @abstractmethod - @_deprecate_positional_args def __init__(self, alpha=1.0, *, fit_intercept=True, normalize='deprecated', copy_X=True, max_iter=None, tol=1e-3, solver="auto", random_state=None): @@ -739,7 +736,6 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): >>> clf.fit(X, y) Ridge() """ - @_deprecate_positional_args def __init__(self, alpha=1.0, *, fit_intercept=True, normalize='deprecated', copy_X=True, max_iter=None, tol=1e-3, solver="auto", random_state=None): @@ -901,7 +897,6 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): >>> clf.score(X, y) 0.9595... """ - @_deprecate_positional_args def __init__(self, alpha=1.0, *, fit_intercept=True, normalize='deprecated', copy_X=True, max_iter=None, tol=1e-3, class_weight=None, solver="auto", @@ -1127,7 +1122,6 @@ class _RidgeGCV(LinearModel): http://cbcl.mit.edu/publications/ps/MIT-CSAIL-TR-2007-025.pdf https://www.mit.edu/~9.520/spring07/Classes/rlsslides.pdf """ - @_deprecate_positional_args def __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, normalize='deprecated', scoring=None, copy_X=True, @@ -1601,7 +1595,6 @@ def fit(self, X, y, sample_weight=None): class _BaseRidgeCV(LinearModel): - @_deprecate_positional_args def __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, normalize='deprecated', scoring=None, cv=None, gcv_mode=None, store_cv_values=False, @@ -1936,7 +1929,6 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): a one-versus-all approach. Concretely, this is implemented by taking advantage of the multi-variate response support in Ridge. 
""" - @_deprecate_positional_args def __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, normalize='deprecated', scoring=None, cv=None, class_weight=None, store_cv_values=False): diff --git a/sklearn/linear_model/_sag.py b/sklearn/linear_model/_sag.py index d0bd001081d61..4d76677e83356 100644 --- a/sklearn/linear_model/_sag.py +++ b/sklearn/linear_model/_sag.py @@ -13,7 +13,6 @@ from ..exceptions import ConvergenceWarning from ..utils import check_array from ..utils.validation import _check_sample_weight -from ..utils.validation import _deprecate_positional_args from ..utils.extmath import row_norms @@ -85,7 +84,6 @@ def get_auto_step_size(max_squared_sum, alpha_scaled, loss, fit_intercept, return step -@_deprecate_positional_args def sag_solver(X, y, sample_weight=None, loss='log', alpha=1., beta=0., max_iter=1000, tol=0.001, verbose=0, random_state=None, check_input=True, max_squared_sum=None, diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 92b02155246df..78565178706a8 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -21,7 +21,6 @@ from ..utils.extmath import safe_sparse_dot from ..utils.multiclass import _check_partial_fit_first_call from ..utils.validation import check_is_fitted, _check_sample_weight -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed from ..exceptions import ConvergenceWarning from ..model_selection import StratifiedShuffleSplit, ShuffleSplit @@ -71,7 +70,6 @@ def __call__(self, coef, intercept): class BaseSGD(SparseCoefMixin, BaseEstimator, metaclass=ABCMeta): """Base class for SGD classification and regression.""" - @_deprecate_positional_args def __init__(self, loss, *, penalty='l2', alpha=0.0001, C=1.0, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=0.1, random_state=None, @@ -454,7 +452,6 @@ class BaseSGDClassifier(LinearClassifierMixin, BaseSGD, metaclass=ABCMeta): } @abstractmethod - @_deprecate_positional_args def __init__(self, loss="hinge", *, penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, n_jobs=None, @@ -962,7 +959,6 @@ class SGDClassifier(BaseSGDClassifier): >>> print(clf.predict([[-0.8, -1]])) [1] """ - @_deprecate_positional_args def __init__(self, loss="hinge", *, penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, @@ -1120,7 +1116,6 @@ class BaseSGDRegressor(RegressorMixin, BaseSGD): } @abstractmethod - @_deprecate_positional_args def __init__(self, loss="squared_error", *, penalty="l2", alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, @@ -1565,7 +1560,6 @@ class SGDRegressor(BaseSGDRegressor): Ridge, ElasticNet, Lasso, sklearn.svm.SVR """ - @_deprecate_positional_args def __init__(self, loss="squared_error", *, penalty="l2", alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, diff --git a/sklearn/linear_model/_theil_sen.py b/sklearn/linear_model/_theil_sen.py index 39f3d5c69fb00..4c75613c28a9b 100644 --- a/sklearn/linear_model/_theil_sen.py +++ b/sklearn/linear_model/_theil_sen.py @@ -20,7 +20,6 @@ from ._base import LinearModel from ..base import RegressorMixin from ..utils import check_random_state -from ..utils.validation import 
_deprecate_positional_args from ..utils.fixes import delayed from ..exceptions import ConvergenceWarning @@ -291,7 +290,6 @@ class TheilSenRegressor(RegressorMixin, LinearModel): Xin Dang, Hanxiang Peng, Xueqin Wang and Heping Zhang http://home.olemiss.edu/~xdang/papers/MTSE.pdf """ - @_deprecate_positional_args def __init__(self, *, fit_intercept=True, copy_X=True, max_subpopulation=1e4, n_subsamples=None, max_iter=300, tol=1.e-3, random_state=None, n_jobs=None, verbose=False): diff --git a/sklearn/linear_model/tests/test_omp.py b/sklearn/linear_model/tests/test_omp.py index 1d2eb6a239786..06df7fd349e8b 100644 --- a/sklearn/linear_model/tests/test_omp.py +++ b/sklearn/linear_model/tests/test_omp.py @@ -163,7 +163,7 @@ def test_identical_regressors(): "The requested precision might not have been met." ) with pytest.warns(RuntimeWarning, match=warning_message): - orthogonal_mp(newX, newy, 2) + orthogonal_mp(newX, newy, n_nonzero_coefs=2) def test_swapped_regressors(): diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index d843c3ddd8462..63be19c1c287d 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -7,7 +7,6 @@ from ..base import BaseEstimator, TransformerMixin from ..neighbors import NearestNeighbors, kneighbors_graph from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..utils.graph import graph_shortest_path from ..decomposition import KernelPCA from ..preprocessing import KernelCenterer @@ -123,7 +122,6 @@ class Isomap(TransformerMixin, BaseEstimator): .. [1] Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global geometric framework for nonlinear dimensionality reduction. Science 290 (5500) """ - @_deprecate_positional_args def __init__(self, *, n_neighbors=5, n_components=2, eigen_solver='auto', tol=0, max_iter=None, path_method='auto', neighbors_algorithm='auto', n_jobs=None, metric='minkowski', diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index 7a4e0ace9fccd..0fcd5f543c4d0 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -15,7 +15,6 @@ from ..utils.extmath import stable_cumsum from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES -from ..utils.validation import _deprecate_positional_args from ..neighbors import NearestNeighbors @@ -189,7 +188,6 @@ def null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1E-6, max_iter=100, raise ValueError("Unrecognized eigen_solver '%s'" % eigen_solver) -@_deprecate_positional_args def locally_linear_embedding( X, *, n_neighbors, n_components, reg=1e-3, eigen_solver='auto', tol=1e-6, max_iter=100, method='standard', hessian_tol=1E-4, @@ -636,7 +634,6 @@ class LocallyLinearEmbedding(TransformerMixin, dimensionality reduction via tangent space alignment. Journal of Shanghai Univ. 
8:406 (2004) """ - @_deprecate_positional_args def __init__(self, *, n_neighbors=5, n_components=2, reg=1E-3, eigen_solver='auto', tol=1E-6, max_iter=100, method='standard', hessian_tol=1E-4, modified_tol=1E-12, diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 6a144e3033e8e..d92ab67767fa3 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -14,7 +14,6 @@ from ..metrics import euclidean_distances from ..utils import check_random_state, check_array, check_symmetric from ..isotonic import IsotonicRegression -from ..utils.validation import _deprecate_positional_args from ..utils.deprecation import deprecated from ..utils.fixes import delayed @@ -132,7 +131,6 @@ def _smacof_single(dissimilarities, metric=True, n_components=2, init=None, return X, stress, it + 1 -@_deprecate_positional_args def smacof(dissimilarities, *, metric=True, n_components=2, init=None, n_init=8, n_jobs=None, max_iter=300, verbose=0, eps=1e-3, random_state=None, return_n_iter=False): @@ -372,7 +370,6 @@ class MDS(BaseEstimator): hypothesis" Kruskal, J. Psychometrika, 29, (1964) """ - @_deprecate_positional_args def __init__(self, n_components=2, *, metric=True, n_init=4, max_iter=300, verbose=0, eps=1e-3, n_jobs=None, random_state=None, dissimilarity="euclidean"): diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index 76f52946e8e87..7fd371ee5af2f 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -25,7 +25,6 @@ from ..utils.fixes import lobpcg from ..metrics.pairwise import rbf_kernel from ..neighbors import kneighbors_graph, NearestNeighbors -from ..utils.validation import _deprecate_positional_args from ..utils.deprecation import deprecated @@ -141,7 +140,6 @@ def _set_diag(laplacian, value, norm_laplacian): return laplacian -@_deprecate_positional_args def spectral_embedding(adjacency, *, n_components=8, eigen_solver=None, random_state=None, eigen_tol=0.0, norm_laplacian=True, drop_first=True): @@ -456,7 +454,6 @@ class SpectralEmbedding(BaseEstimator): Jianbo Shi, Jitendra Malik http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324 """ - @_deprecate_positional_args def __init__(self, n_components=2, *, affinity="nearest_neighbors", gamma=None, random_state=None, eigen_solver=None, n_neighbors=None, n_jobs=None): diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index 682fdc095d3bf..8e42d48f4ef07 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -20,7 +20,6 @@ from ..utils import check_random_state from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils.validation import check_non_negative -from ..utils.validation import _deprecate_positional_args from ..decomposition import PCA from ..metrics.pairwise import pairwise_distances # mypy error: Module 'sklearn.manifold' has no attribute '_utils' @@ -401,7 +400,6 @@ def _gradient_descent(objective, p0, it, n_iter, return p, error, i -@_deprecate_positional_args def trustworthiness(X, X_embedded, *, n_neighbors=5, metric='euclidean'): r"""Expresses to what extent the local structure is retained. 
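A usage sketch for ``trustworthiness`` (illustrative, not from the patch): the data arguments stay positional while ``n_neighbors`` and ``metric`` are keyword-only. Assumes scikit-learn 1.0+ and random toy data:

    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.manifold import trustworthiness

    X = np.random.RandomState(0).rand(30, 5)
    X_embedded = PCA(n_components=2).fit_transform(X)

    # Options after the ``*`` are spelled out by name.
    score = trustworthiness(X, X_embedded, n_neighbors=5, metric='euclidean')
    print(round(score, 3))  # close to 1.0 when local structure is preserved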
@@ -670,7 +668,6 @@ class TSNE(BaseEstimator): # Control the number of iterations between progress checks _N_ITER_CHECK = 50 - @_deprecate_positional_args def __init__(self, n_components=2, *, perplexity=30.0, early_exaggeration=12.0, learning_rate="warn", n_iter=1000, n_iter_without_progress=300, min_grad_norm=1e-7, diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index a68e17656a73b..434fd89f5bbd9 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -37,7 +37,6 @@ from ..utils.multiclass import unique_labels from ..utils.multiclass import type_of_target from ..utils.validation import _num_samples -from ..utils.validation import _deprecate_positional_args from ..utils.sparsefuncs import count_nonzero from ..exceptions import UndefinedMetricWarning @@ -137,7 +136,6 @@ def _weighted_sum(sample_score, sample_weight, normalize=False): return sample_score.sum() -@_deprecate_positional_args def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None): """Accuracy classification score. @@ -210,7 +208,6 @@ def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None): return _weighted_sum(score, sample_weight, normalize) -@_deprecate_positional_args def confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, normalize=None): """Compute confusion matrix to evaluate the accuracy of a classification. @@ -366,7 +363,6 @@ def confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, return cm -@_deprecate_positional_args def multilabel_confusion_matrix(y_true, y_pred, *, sample_weight=None, labels=None, samplewise=False): """Compute a confusion matrix for each class or sample. @@ -568,7 +564,6 @@ def multilabel_confusion_matrix(y_true, y_pred, *, sample_weight=None, return np.array([tn, fp, fn, tp]).T.reshape(-1, 2, 2) -@_deprecate_positional_args def cohen_kappa_score(y1, y2, *, labels=None, weights=None, sample_weight=None): r"""Cohen's kappa: a statistic that measures inter-annotator agreement. @@ -650,7 +645,6 @@ class labels [2]_. return 1 - k -@_deprecate_positional_args def jaccard_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division="warn"): """Jaccard similarity coefficient score. @@ -796,7 +790,6 @@ def jaccard_score(y_true, y_pred, *, labels=None, pos_label=1, return np.average(jaccard, weights=weights) -@_deprecate_positional_args def matthews_corrcoef(y_true, y_pred, *, sample_weight=None): """Compute the Matthews correlation coefficient (MCC). @@ -886,7 +879,6 @@ def matthews_corrcoef(y_true, y_pred, *, sample_weight=None): return mcc -@_deprecate_positional_args def zero_one_loss(y_true, y_pred, *, normalize=True, sample_weight=None): """Zero-one classification loss. @@ -957,7 +949,6 @@ def zero_one_loss(y_true, y_pred, *, normalize=True, sample_weight=None): return n_samples - score -@_deprecate_positional_args def f1_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division="warn"): """Compute the F1 score, also known as balanced F-score or F-measure. @@ -1082,7 +1073,6 @@ def f1_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', zero_division=zero_division) -@_deprecate_positional_args def fbeta_score(y_true, y_pred, *, beta, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division="warn"): """Compute the F-beta score. 
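As an illustrative aside: in ``fbeta_score`` the ``beta`` parameter sits after the bare ``*`` and has no default, so it must always be named, which is exactly the style the updated tests further below switch to. Sketch, assuming scikit-learn 1.0+:

    from sklearn.metrics import fbeta_score

    y_true = [0, 1, 1, 0, 1, 1]
    y_pred = [0, 1, 0, 0, 1, 1]

    score = fbeta_score(y_true, y_pred, beta=0.5)  # beta < 1 favours precision
    print(round(score, 3))

    # fbeta_score(y_true, y_pred, 0.5)  # TypeError: beta is keyword-only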
@@ -1310,7 +1300,6 @@ def _check_set_wise_labels(y_true, y_pred, average, labels, pos_label): return labels -@_deprecate_positional_args def precision_recall_fscore_support(y_true, y_pred, *, beta=1.0, labels=None, pos_label=1, average=None, warn_for=('precision', 'recall', @@ -1551,7 +1540,6 @@ def precision_recall_fscore_support(y_true, y_pred, *, beta=1.0, labels=None, return precision, recall, f_score, true_sum -@_deprecate_positional_args def precision_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division="warn"): @@ -1671,7 +1659,6 @@ def precision_score(y_true, y_pred, *, labels=None, pos_label=1, return p -@_deprecate_positional_args def recall_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division="warn"): """Compute the recall. @@ -1789,7 +1776,6 @@ def recall_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', return r -@_deprecate_positional_args def balanced_accuracy_score(y_true, y_pred, *, sample_weight=None, adjusted=False): """Compute the balanced accuracy. @@ -1870,7 +1856,6 @@ def balanced_accuracy_score(y_true, y_pred, *, sample_weight=None, return score -@_deprecate_positional_args def classification_report(y_true, y_pred, *, labels=None, target_names=None, sample_weight=None, digits=2, output_dict=False, zero_division="warn"): @@ -2072,7 +2057,6 @@ class 2 1.00 0.67 0.80 3 return report -@_deprecate_positional_args def hamming_loss(y_true, y_pred, *, sample_weight=None): """Compute the average Hamming loss. @@ -2164,7 +2148,6 @@ def hamming_loss(y_true, y_pred, *, sample_weight=None): raise ValueError("{0} is not supported".format(y_type)) -@_deprecate_positional_args def log_loss(y_true, y_pred, *, eps=1e-15, normalize=True, sample_weight=None, labels=None): r"""Log loss, aka logistic loss or cross-entropy loss. @@ -2293,7 +2276,6 @@ def log_loss(y_true, y_pred, *, eps=1e-15, normalize=True, sample_weight=None, return _weighted_sum(loss, sample_weight, normalize) -@_deprecate_positional_args def hinge_loss(y_true, pred_decision, *, labels=None, sample_weight=None): """Average hinge loss (non-regularized). @@ -2433,7 +2415,6 @@ def hinge_loss(y_true, pred_decision, *, labels=None, sample_weight=None): return np.average(losses, weights=sample_weight) -@_deprecate_positional_args def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): """Compute the Brier score loss. diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index 9fcecec775e6e..dd941a7e28e43 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -6,7 +6,6 @@ from ...utils import check_matplotlib_support from ...utils import deprecated from ...utils.multiclass import unique_labels -from ...utils.validation import _deprecate_positional_args from ...base import is_classifier @@ -72,12 +71,10 @@ class ConfusionMatrixDisplay: ... display_labels=clf.classes_) >>> disp.plot() # doctest: +SKIP """ - @_deprecate_positional_args def __init__(self, confusion_matrix, *, display_labels=None): self.confusion_matrix = confusion_matrix self.display_labels = display_labels - @_deprecate_positional_args def plot(self, *, include_values=True, cmap='viridis', xticks_rotation='horizontal', values_format=None, ax=None, colorbar=True): @@ -435,7 +432,6 @@ def from_predictions( "ConfusionMatrixDisplay.from_predictions or " "ConfusionMatrixDisplay.from_estimator." 
) -@_deprecate_positional_args def plot_confusion_matrix(estimator, X, y_true, *, labels=None, sample_weight=None, normalize=None, display_labels=None, include_values=True, diff --git a/sklearn/metrics/_plot/precision_recall_curve.py b/sklearn/metrics/_plot/precision_recall_curve.py index dcc20bbce25a7..f144c19e53e38 100644 --- a/sklearn/metrics/_plot/precision_recall_curve.py +++ b/sklearn/metrics/_plot/precision_recall_curve.py @@ -4,7 +4,6 @@ from .. import precision_recall_curve from ...utils import check_matplotlib_support -from ...utils.validation import _deprecate_positional_args class PrecisionRecallDisplay: @@ -71,7 +70,6 @@ class PrecisionRecallDisplay: >>> disp = PrecisionRecallDisplay(precision=precision, recall=recall) >>> disp.plot() # doctest: +SKIP """ - @_deprecate_positional_args def __init__(self, precision, recall, *, average_precision=None, estimator_name=None, pos_label=None): self.estimator_name = estimator_name @@ -80,7 +78,6 @@ def __init__(self, precision, recall, *, self.average_precision = average_precision self.pos_label = pos_label - @_deprecate_positional_args def plot(self, ax=None, *, name=None, **kwargs): """Plot visualization. @@ -140,7 +137,6 @@ def plot(self, ax=None, *, name=None, **kwargs): return self -@_deprecate_positional_args def plot_precision_recall_curve(estimator, X, y, *, sample_weight=None, response_method="auto", name=None, ax=None, pos_label=None, **kwargs): diff --git a/sklearn/metrics/_plot/roc_curve.py b/sklearn/metrics/_plot/roc_curve.py index 308ae4f4bf85d..35fde6ae031b8 100644 --- a/sklearn/metrics/_plot/roc_curve.py +++ b/sklearn/metrics/_plot/roc_curve.py @@ -4,7 +4,6 @@ from .. import roc_curve from ...utils import check_matplotlib_support -from ...utils.validation import _deprecate_positional_args class RocCurveDisplay: @@ -67,7 +66,6 @@ class RocCurveDisplay: >>> display.plot() # doctest: +SKIP >>> plt.show() # doctest: +SKIP """ - @_deprecate_positional_args def __init__(self, *, fpr, tpr, roc_auc=None, estimator_name=None, pos_label=None): self.estimator_name = estimator_name @@ -76,7 +74,6 @@ def __init__(self, *, fpr, tpr, self.roc_auc = roc_auc self.pos_label = pos_label - @_deprecate_positional_args def plot(self, ax=None, *, name=None, **kwargs): """Plot visualization @@ -132,7 +129,6 @@ def plot(self, ax=None, *, name=None, **kwargs): return self -@_deprecate_positional_args def plot_roc_curve(estimator, X, y, *, sample_weight=None, drop_intermediate=True, response_method="auto", name=None, ax=None, pos_label=None, **kwargs): diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index 8c458ac81e529..8482b9b87aedb 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -32,7 +32,6 @@ from ..utils.multiclass import type_of_target from ..utils.extmath import stable_cumsum from ..utils.sparsefuncs import count_nonzero -from ..utils.validation import _deprecate_positional_args from ..exceptions import UndefinedMetricWarning from ..preprocessing import label_binarize from ..utils._encode import _encode, _unique @@ -107,7 +106,6 @@ def auc(x, y): return area -@_deprecate_positional_args def average_precision_score(y_true, y_score, *, average="macro", pos_label=1, sample_weight=None): """Compute average precision (AP) from prediction scores. 
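Another illustrative aside (toy data, not from the patch): the ranking metrics follow the same pattern, with ``y_true`` and the scores positional and everything else keyword-only. A sketch for ``average_precision_score``, assuming scikit-learn 1.0+:

    import numpy as np
    from sklearn.metrics import average_precision_score

    y_true = np.array([0, 0, 1, 1])
    y_score = np.array([0.1, 0.4, 0.35, 0.8])

    ap = average_precision_score(y_true, y_score, pos_label=1)  # options by name
    print(round(ap, 2))  # ~0.83 for this toy example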
@@ -350,7 +348,6 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None, max_fpr=None): return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area)) -@_deprecate_positional_args def roc_auc_score(y_true, y_score, *, average="macro", sample_weight=None, max_fpr=None, multi_class="raise", labels=None): """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) @@ -737,7 +734,6 @@ def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): return fps, tps, y_score[threshold_idxs] -@_deprecate_positional_args def precision_recall_curve(y_true, probas_pred, *, pos_label=None, sample_weight=None): """Compute precision-recall pairs for different probability thresholds. @@ -832,7 +828,6 @@ def precision_recall_curve(y_true, probas_pred, *, pos_label=None, return np.r_[precision[sl], 1], np.r_[recall[sl], 0], thresholds[sl] -@_deprecate_positional_args def roc_curve(y_true, y_score, *, pos_label=None, sample_weight=None, drop_intermediate=True): """Compute Receiver operating characteristic (ROC). @@ -965,7 +960,6 @@ def roc_curve(y_true, y_score, *, pos_label=None, sample_weight=None, return fpr, tpr, thresholds -@_deprecate_positional_args def label_ranking_average_precision_score(y_true, y_score, *, sample_weight=None): """Compute ranking-based average precision. @@ -1055,7 +1049,6 @@ def label_ranking_average_precision_score(y_true, y_score, *, return out -@_deprecate_positional_args def coverage_error(y_true, y_score, *, sample_weight=None): """Coverage error measure. @@ -1115,7 +1108,6 @@ def coverage_error(y_true, y_score, *, sample_weight=None): return np.average(coverage, weights=sample_weight) -@_deprecate_positional_args def label_ranking_loss(y_true, y_score, *, sample_weight=None): """Compute Ranking loss measure. @@ -1318,7 +1310,6 @@ def _check_dcg_target_type(y_true): supported_fmt, y_type)) -@_deprecate_positional_args def dcg_score(y_true, y_score, *, k=None, log_base=2, sample_weight=None, ignore_ties=False): """Compute Discounted Cumulative Gain. @@ -1475,7 +1466,6 @@ def _ndcg_sample_scores(y_true, y_score, k=None, ignore_ties=False): return gain -@_deprecate_positional_args def ndcg_score(y_true, y_score, *, k=None, sample_weight=None, ignore_ties=False): """Compute Normalized Discounted Cumulative Gain. 
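Similarly, as a sketch outside the patch: ``ndcg_score`` takes its truncation depth ``k`` by keyword only. Assumes scikit-learn 1.0+ and made-up relevance scores:

    import numpy as np
    from sklearn.metrics import ndcg_score

    true_relevance = np.asarray([[10, 0, 0, 1, 5]])
    scores = np.asarray([[0.1, 0.2, 0.3, 4.0, 70.0]])

    # Ranking quality of ``scores`` against ``true_relevance``, truncated at 4.
    print(round(ndcg_score(true_relevance, scores, k=4), 3))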
diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index c2a0e7f7f033b..ba3edab2f61cb 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -30,7 +30,6 @@ from ..utils.validation import (check_array, check_consistent_length, _num_samples) from ..utils.validation import column_or_1d -from ..utils.validation import _deprecate_positional_args from ..utils.validation import _check_sample_weight from ..utils.stats import _weighted_percentile from ..exceptions import UndefinedMetricWarning @@ -123,7 +122,6 @@ def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric"): return y_type, y_true, y_pred, multioutput -@_deprecate_positional_args def mean_absolute_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average'): @@ -354,7 +352,6 @@ def mean_absolute_percentage_error(y_true, y_pred, return np.average(output_errors, weights=multioutput) -@_deprecate_positional_args def mean_squared_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average', squared=True): @@ -434,7 +431,6 @@ def mean_squared_error(y_true, y_pred, *, return np.average(output_errors, weights=multioutput) -@_deprecate_positional_args def mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average'): @@ -501,7 +497,6 @@ def mean_squared_log_error(y_true, y_pred, *, multioutput=multioutput) -@_deprecate_positional_args def median_absolute_error(y_true, y_pred, *, multioutput='uniform_average', sample_weight=None): """Median absolute error regression loss. @@ -575,7 +570,6 @@ def median_absolute_error(y_true, y_pred, *, multioutput='uniform_average', return np.average(output_errors, weights=multioutput) -@_deprecate_positional_args def explained_variance_score(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average'): @@ -667,7 +661,6 @@ def explained_variance_score(y_true, y_pred, *, return np.average(output_scores, weights=avg_weights) -@_deprecate_positional_args def r2_score(y_true, y_pred, *, sample_weight=None, multioutput="uniform_average"): """:math:`R^2` (coefficient of determination) regression score function. @@ -839,7 +832,6 @@ def max_error(y_true, y_pred): return np.max(np.abs(y_true - y_pred)) -@_deprecate_positional_args def mean_tweedie_deviance(y_true, y_pred, *, sample_weight=None, power=0): """Mean Tweedie deviance regression loss. @@ -904,7 +896,6 @@ def mean_tweedie_deviance(y_true, y_pred, *, sample_weight=None, power=0): return np.average(dev, weights=sample_weight) -@_deprecate_positional_args def mean_poisson_deviance(y_true, y_pred, *, sample_weight=None): """Mean Poisson deviance regression loss. @@ -942,7 +933,6 @@ def mean_poisson_deviance(y_true, y_pred, *, sample_weight=None): ) -@_deprecate_positional_args def mean_gamma_deviance(y_true, y_pred, *, sample_weight=None): """Mean Gamma deviance regression loss. diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 39c4523f9bde6..63427b01d7fc2 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -43,7 +43,6 @@ from .cluster import fowlkes_mallows_score from ..utils.multiclass import type_of_target -from ..utils.validation import _deprecate_positional_args from ..base import is_regressor @@ -397,7 +396,6 @@ def _passthrough_scorer(estimator, *args, **kwargs): return estimator.score(*args, **kwargs) -@_deprecate_positional_args def check_scoring(estimator, scoring=None, *, allow_none=False): """Determine scorer from user options. 
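A further aside: ``check_scoring`` above, and ``make_scorer`` in the next hunk, keep only the estimator or metric positional; their own options must be named, while extra keyword arguments to ``make_scorer`` are forwarded to the metric. Sketch, assuming scikit-learn 1.0+:

    from sklearn.metrics import check_scoring, fbeta_score, make_scorer
    from sklearn.tree import DecisionTreeClassifier

    ftwo_scorer = make_scorer(fbeta_score, beta=2)  # beta=2 forwarded to fbeta_score
    scorer = check_scoring(DecisionTreeClassifier(), scoring=ftwo_scorer)

Such a scorer can then be passed as ``scoring=ftwo_scorer`` to model-selection tools like ``GridSearchCV``.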
@@ -534,7 +532,6 @@ def _check_multimetric_scoring(estimator, scoring): return scorers -@_deprecate_positional_args def make_scorer(score_func, *, greater_is_better=True, needs_proba=False, needs_threshold=False, **kwargs): """Make a scorer from a performance metric or loss function. diff --git a/sklearn/metrics/cluster/_bicluster.py b/sklearn/metrics/cluster/_bicluster.py index e267b44cee229..b58cc8ac77805 100644 --- a/sklearn/metrics/cluster/_bicluster.py +++ b/sklearn/metrics/cluster/_bicluster.py @@ -2,7 +2,6 @@ from scipy.optimize import linear_sum_assignment from ...utils.validation import check_consistent_length, check_array -from ...utils.validation import _deprecate_positional_args __all__ = ["consensus_score"] @@ -45,7 +44,6 @@ def _pairwise_similarity(a, b, similarity): return result -@_deprecate_positional_args def consensus_score(a, b, *, similarity="jaccard"): """The similarity of two sets of biclusters. diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 19d1552518db4..ccc8077a3aab9 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -25,7 +25,6 @@ from ._expected_mutual_info_fast import expected_mutual_information from ...utils.fixes import _astype_copy_false from ...utils.multiclass import type_of_target -from ...utils.validation import _deprecate_positional_args from ...utils.validation import check_array, check_consistent_length @@ -84,7 +83,6 @@ def _generalized_average(U, V, average_method): "'arithmetic', or 'max'") -@_deprecate_positional_args def contingency_matrix(labels_true, labels_pred, *, eps=None, sparse=False, dtype=np.int64): """Build a contingency matrix describing the relationship between labels. @@ -390,7 +388,6 @@ def adjusted_rand_score(labels_true, labels_pred): (tp + fp) * (fp + tn)) -@_deprecate_positional_args def homogeneity_completeness_v_measure(labels_true, labels_pred, *, beta=1.0): """Compute the homogeneity and completeness and V-Measure scores at once. @@ -611,7 +608,6 @@ def completeness_score(labels_true, labels_pred): return homogeneity_completeness_v_measure(labels_true, labels_pred)[1] -@_deprecate_positional_args def v_measure_score(labels_true, labels_pred, *, beta=1.0): """V-measure cluster labeling given a ground truth. @@ -711,7 +707,6 @@ def v_measure_score(labels_true, labels_pred, *, beta=1.0): beta=beta)[2] -@_deprecate_positional_args def mutual_info_score(labels_true, labels_pred, *, contingency=None): """Mutual Information between two clusterings. @@ -799,7 +794,6 @@ def mutual_info_score(labels_true, labels_pred, *, contingency=None): return np.clip(mi.sum(), 0.0, None) -@_deprecate_positional_args def adjusted_mutual_info_score(labels_true, labels_pred, *, average_method='arithmetic'): """Adjusted Mutual Information between two clusterings. @@ -920,7 +914,6 @@ def adjusted_mutual_info_score(labels_true, labels_pred, *, return ami -@_deprecate_positional_args def normalized_mutual_info_score(labels_true, labels_pred, *, average_method='arithmetic'): """Normalized Mutual Information between two clusterings. @@ -1021,7 +1014,6 @@ def normalized_mutual_info_score(labels_true, labels_pred, *, return nmi -@_deprecate_positional_args def fowlkes_mallows_score(labels_true, labels_pred, *, sparse=False): """Measure the similarity of two clusterings of a set of points. 
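One more illustrative sketch (not from the patch): the clustering metrics above now require ``average_method`` by name, which is precisely what the test updates in the hunks that follow switch to. Assumes scikit-learn 1.0+:

    from sklearn.metrics import (adjusted_mutual_info_score,
                                 normalized_mutual_info_score)

    labels_a = [0, 0, 1, 1]
    labels_b = [1, 1, 0, 0]  # same partition, label ids permuted

    print(adjusted_mutual_info_score(labels_a, labels_b,
                                     average_method="arithmetic"))  # 1.0
    print(normalized_mutual_info_score(labels_a, labels_b,
                                       average_method="max"))       # 1.0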
diff --git a/sklearn/metrics/cluster/_unsupervised.py b/sklearn/metrics/cluster/_unsupervised.py index c597277a55b31..2b94557626486 100644 --- a/sklearn/metrics/cluster/_unsupervised.py +++ b/sklearn/metrics/cluster/_unsupervised.py @@ -16,7 +16,6 @@ from ..pairwise import pairwise_distances_chunked from ..pairwise import pairwise_distances from ...preprocessing import LabelEncoder -from ...utils.validation import _deprecate_positional_args def check_number_of_labels(n_labels, n_samples): @@ -35,7 +34,6 @@ def check_number_of_labels(n_labels, n_samples): "to n_samples - 1 (inclusive)" % n_labels) -@_deprecate_positional_args def silhouette_score(X, labels, *, metric='euclidean', sample_size=None, random_state=None, **kwds): """Compute the mean Silhouette Coefficient of all samples. @@ -149,7 +147,6 @@ def _silhouette_reduce(D_chunk, start, labels, label_freqs): return intra_clust_dists, inter_clust_dists -@_deprecate_positional_args def silhouette_samples(X, labels, *, metric='euclidean', **kwds): """Compute the Silhouette Coefficient for each sample. diff --git a/sklearn/metrics/cluster/tests/test_common.py b/sklearn/metrics/cluster/tests/test_common.py index a0d87ad4baa61..48c7c24218d83 100644 --- a/sklearn/metrics/cluster/tests/test_common.py +++ b/sklearn/metrics/cluster/tests/test_common.py @@ -1,4 +1,5 @@ from functools import partial +from itertools import chain import pytest import numpy as np @@ -128,7 +129,7 @@ def test_normalized_output(metric_name): # 0.22 AMI and NMI changes @pytest.mark.filterwarnings('ignore::FutureWarning') @pytest.mark.parametrize( - "metric_name", dict(SUPERVISED_METRICS, **UNSUPERVISED_METRICS) + "metric_name", chain(SUPERVISED_METRICS, UNSUPERVISED_METRICS) ) def test_permute_labels(metric_name): # All clustering metrics do not change score due to permutations of labels @@ -151,7 +152,7 @@ def test_permute_labels(metric_name): # 0.22 AMI and NMI changes @pytest.mark.filterwarnings('ignore::FutureWarning') @pytest.mark.parametrize( - "metric_name", dict(SUPERVISED_METRICS, **UNSUPERVISED_METRICS) + "metric_name", chain(SUPERVISED_METRICS, UNSUPERVISED_METRICS) ) # For all clustering metrics Input parameters can be both # in the form of arrays lists, positive, negative or string diff --git a/sklearn/metrics/cluster/tests/test_supervised.py b/sklearn/metrics/cluster/tests/test_supervised.py index c6d3d2f808843..c4e0149224d2d 100644 --- a/sklearn/metrics/cluster/tests/test_supervised.py +++ b/sklearn/metrics/cluster/tests/test_supervised.py @@ -80,17 +80,20 @@ def test_perfect_matches(): means = {"min", "geometric", "arithmetic", "max"} for score_func in score_funcs_with_changing_means: for mean in means: - assert score_func([], [], mean) == pytest.approx(1.0) - assert score_func([0], [1], mean) == pytest.approx(1.0) - assert score_func([0, 0, 0], [0, 0, 0], mean) == pytest.approx(1.0) - assert score_func( - [0, 1, 0], [42, 7, 42], mean) == pytest.approx(1.0) - assert score_func( - [0., 1., 0.], [42., 7., 42.], mean) == pytest.approx(1.0) - assert score_func( - [0., 1., 2.], [42., 7., 2.], mean) == pytest.approx(1.0) - assert score_func( - [0, 1, 2], [42, 7, 2], mean) == pytest.approx(1.0) + assert score_func([], [], + average_method=mean) == pytest.approx(1.0) + assert score_func([0], [1], + average_method=mean) == pytest.approx(1.0) + assert score_func([0, 0, 0], [0, 0, 0], + average_method=mean) == pytest.approx(1.0) + assert score_func([0, 1, 0], [42, 7, 42], + average_method=mean) == pytest.approx(1.0) + assert score_func([0., 1., 0.], [42., 7., 42.], 
+ average_method=mean) == pytest.approx(1.0) + assert score_func([0., 1., 2.], [42., 7., 2.], + average_method=mean) == pytest.approx(1.0) + assert score_func([0, 1, 2], [42, 7, 2], + average_method=mean) == pytest.approx(1.0) def test_homogeneous_but_not_complete_labeling(): @@ -296,9 +299,11 @@ def test_exactly_zero_info_score(): labels_a, labels_b) == pytest.approx(0.0) for method in ["min", "geometric", "arithmetic", "max"]: assert adjusted_mutual_info_score( - labels_a, labels_b, method) == pytest.approx(0.0) + labels_a, labels_b, + average_method=method) == pytest.approx(0.0) assert normalized_mutual_info_score( - labels_a, labels_b, method) == pytest.approx(0.0) + labels_a, labels_b, + average_method=method) == pytest.approx(0.0) def test_v_measure_and_mutual_information(seed=36): diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 45eb256d59f67..c9e9f60d8aaf3 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -28,7 +28,6 @@ from ..utils.extmath import row_norms, safe_sparse_dot from ..preprocessing import normalize from ..utils._mask import _get_mask -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed from ..utils.fixes import sp_version, parse_version @@ -61,7 +60,6 @@ def _return_float_dtype(X, Y): return X, Y, dtype -@_deprecate_positional_args def check_pairwise_arrays(X, Y, *, precomputed=False, dtype=None, accept_sparse='csr', force_all_finite=True, copy=False): @@ -199,7 +197,6 @@ def check_paired_arrays(X, Y): # Pairwise distances -@_deprecate_positional_args def euclidean_distances(X, Y=None, *, Y_norm_squared=None, squared=False, X_norm_squared=None): """ @@ -352,7 +349,6 @@ def _euclidean_distances(X, Y, X_norm_squared=None, Y_norm_squared=None, return distances if squared else np.sqrt(distances, out=distances) -@_deprecate_positional_args def nan_euclidean_distances(X, Y=None, *, squared=False, missing_values=np.nan, copy=True): """Calculate the euclidean distances in the presence of missing values. @@ -543,7 +539,6 @@ def _argmin_min_reduce(dist, start): return indices, values -@_deprecate_positional_args def pairwise_distances_argmin_min(X, Y, *, axis=1, metric="euclidean", metric_kwargs=None): """Compute minimum distances between one point and a set of points. @@ -630,7 +625,6 @@ def pairwise_distances_argmin_min(X, Y, *, axis=1, metric="euclidean", return indices, values -@_deprecate_positional_args def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs=None): """Compute minimum distances between one point and a set of points. @@ -752,7 +746,6 @@ def haversine_distances(X, Y=None): return DistanceMetric.get_metric('haversine').pairwise(X, Y) -@_deprecate_positional_args def manhattan_distances(X, Y=None, *, sum_over_features=True): """Compute the L1 distances between the vectors in X and Y. @@ -949,7 +942,6 @@ def paired_cosine_distances(X, Y): 'cityblock': paired_manhattan_distances} -@_deprecate_positional_args def paired_distances(X, Y, *, metric="euclidean", **kwds): """ Computes the paired distances between X and Y. 
@@ -1499,7 +1491,6 @@ def _precompute_metric_params(X, Y, metric=None, **kwds): return {} -@_deprecate_positional_args def pairwise_distances_chunked(X, Y=None, *, reduce_func=None, metric='euclidean', n_jobs=None, working_memory=None, **kwds): @@ -1664,7 +1655,6 @@ def pairwise_distances_chunked(X, Y=None, *, reduce_func=None, yield D_chunk -@_deprecate_positional_args def pairwise_distances(X, Y=None, metric="euclidean", *, n_jobs=None, force_all_finite=True, **kwds): """Compute the distance matrix from a vector array X and optional Y. @@ -1887,7 +1877,6 @@ def kernel_metrics(): } -@_deprecate_positional_args def pairwise_kernels(X, Y=None, metric="linear", *, filter_params=False, n_jobs=None, **kwds): """Compute the kernel between arrays X and optional array Y. diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 7b634e88f2275..feed701f6cead 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -246,13 +246,13 @@ def test_precision_recall_f_binary_single_class(): assert 1. == precision_score([1, 1], [1, 1]) assert 1. == recall_score([1, 1], [1, 1]) assert 1. == f1_score([1, 1], [1, 1]) - assert 1. == fbeta_score([1, 1], [1, 1], 0) + assert 1. == fbeta_score([1, 1], [1, 1], beta=0) assert 0. == precision_score([-1, -1], [-1, -1]) assert 0. == recall_score([-1, -1], [-1, -1]) assert 0. == f1_score([-1, -1], [-1, -1]) - assert 0. == fbeta_score([-1, -1], [-1, -1], float('inf')) - assert fbeta_score([-1, -1], [-1, -1], float('inf')) == pytest.approx( + assert 0. == fbeta_score([-1, -1], [-1, -1], beta=float('inf')) + assert fbeta_score([-1, -1], [-1, -1], beta=float('inf')) == pytest.approx( fbeta_score([-1, -1], [-1, -1], beta=1e5)) diff --git a/sklearn/mixture/_bayesian_mixture.py b/sklearn/mixture/_bayesian_mixture.py index bd1954ddc15c8..b733c91baf99e 100644 --- a/sklearn/mixture/_bayesian_mixture.py +++ b/sklearn/mixture/_bayesian_mixture.py @@ -15,7 +15,6 @@ from ._gaussian_mixture import _estimate_gaussian_parameters from ._gaussian_mixture import _estimate_log_gaussian_prob from ..utils import check_array -from ..utils.validation import _deprecate_positional_args def _log_dirichlet_norm(dirichlet_concentration): @@ -325,7 +324,6 @@ class BayesianGaussianMixture(BaseMixture): inference for Dirichlet process mixtures". Bayesian analysis 1.1 `_ """ - @_deprecate_positional_args def __init__(self, *, n_components=1, covariance_type='full', tol=1e-3, reg_covar=1e-6, max_iter=100, n_init=1, init_params='kmeans', weight_concentration_prior_type='dirichlet_process', diff --git a/sklearn/mixture/_gaussian_mixture.py b/sklearn/mixture/_gaussian_mixture.py index 4bb14f9ca5bd7..777141be4feb8 100644 --- a/sklearn/mixture/_gaussian_mixture.py +++ b/sklearn/mixture/_gaussian_mixture.py @@ -11,7 +11,6 @@ from ._base import BaseMixture, _check_shape from ..utils import check_array from ..utils.extmath import row_norms -from ..utils.validation import _deprecate_positional_args ############################################################################### @@ -604,7 +603,6 @@ class GaussianMixture(BaseMixture): BayesianGaussianMixture : Gaussian mixture model fit with a variational inference. 
""" - @_deprecate_positional_args def __init__(self, n_components=1, *, covariance_type='full', tol=1e-3, reg_covar=1e-6, max_iter=100, n_init=1, init_params='kmeans', weights_init=None, means_init=None, precisions_init=None, diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index d4444ce09dcb5..07ad3d7dbafe5 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -37,7 +37,6 @@ from ..utils.random import sample_without_replacement from ..utils._tags import _safe_tags from ..utils.validation import indexable, check_is_fitted, _check_fit_params -from ..utils.validation import _deprecate_positional_args from ..utils.metaestimators import if_delegate_has_method from ..utils.fixes import delayed from ..metrics._scorer import _check_multimetric_scoring @@ -239,7 +238,6 @@ class ParameterSampler: ... {'b': 1.038159, 'a': 2}] True """ - @_deprecate_positional_args def __init__(self, param_distributions, n_iter, *, random_state=None): if not isinstance(param_distributions, (Mapping, Iterable)): raise TypeError('Parameter distribution is not a dict or ' @@ -340,7 +338,6 @@ class BaseSearchCV(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta): """ @abstractmethod - @_deprecate_positional_args def __init__(self, estimator, *, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score=np.nan, @@ -648,7 +645,6 @@ def _check_refit_for_multimetric(self, scores): and not callable(self.refit)): raise ValueError(multimetric_refit_msg) - @_deprecate_positional_args def fit(self, X, y=None, *, groups=None, **fit_params): """Run fit with all sets of parameters. @@ -1206,7 +1202,6 @@ class GridSearchCV(BaseSearchCV): """ _required_parameters = ["estimator", "param_grid"] - @_deprecate_positional_args def __init__(self, estimator, param_grid, *, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', @@ -1541,7 +1536,6 @@ class RandomizedSearchCV(BaseSearchCV): """ _required_parameters = ["estimator", "param_distributions"] - @_deprecate_positional_args def __init__(self, estimator, param_distributions, *, n_iter=10, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 13edbeef071f5..5eaeb5df5be8e 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -27,7 +27,6 @@ from ..utils import _approximate_mode from ..utils.validation import _num_samples, column_or_1d from ..utils.validation import check_array -from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import type_of_target from ..base import _pprint @@ -272,7 +271,6 @@ class _BaseKFold(BaseCrossValidator, metaclass=ABCMeta): """Base class for KFold, GroupKFold, and StratifiedKFold""" @abstractmethod - @_deprecate_positional_args def __init__(self, n_splits, *, shuffle, random_state): if not isinstance(n_splits, numbers.Integral): raise ValueError('The number of folds must be of Integral type. ' @@ -426,7 +424,6 @@ class KFold(_BaseKFold): RepeatedKFold : Repeats K-Fold n times. """ - @_deprecate_positional_args def __init__(self, n_splits=5, *, shuffle=False, random_state=None): super().__init__(n_splits=n_splits, shuffle=shuffle, @@ -635,7 +632,6 @@ class StratifiedKFold(_BaseKFold): -------- RepeatedStratifiedKFold : Repeats Stratified K-Fold n times. 
""" - @_deprecate_positional_args def __init__(self, n_splits=5, *, shuffle=False, random_state=None): super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state) @@ -1009,7 +1005,6 @@ class TimeSeriesSplit(_BaseKFold): with a test set of size ``n_samples//(n_splits + 1)`` by default, where ``n_samples`` is the number of samples. """ - @_deprecate_positional_args def __init__(self, n_splits=5, *, @@ -1339,7 +1334,6 @@ class _RepeatedSplits(metaclass=ABCMeta): Constructor parameters for cv. Must not contain random_state and shuffle. """ - @_deprecate_positional_args def __init__(self, cv, *, n_repeats=10, random_state=None, **cvargs): if not isinstance(n_repeats, numbers.Integral): raise ValueError("Number of repetitions must be of Integral type.") @@ -1467,7 +1461,6 @@ class RepeatedKFold(_RepeatedSplits): -------- RepeatedStratifiedKFold : Repeats Stratified K-Fold n times. """ - @_deprecate_positional_args def __init__(self, *, n_splits=5, n_repeats=10, random_state=None): super().__init__( KFold, n_repeats=n_repeats, @@ -1523,7 +1516,6 @@ class RepeatedStratifiedKFold(_RepeatedSplits): -------- RepeatedKFold : Repeats K-Fold n times. """ - @_deprecate_positional_args def __init__(self, *, n_splits=5, n_repeats=10, random_state=None): super().__init__( StratifiedKFold, n_repeats=n_repeats, random_state=random_state, @@ -1532,7 +1524,6 @@ def __init__(self, *, n_splits=5, n_repeats=10, random_state=None): class BaseShuffleSplit(metaclass=ABCMeta): """Base class for ShuffleSplit and StratifiedShuffleSplit""" - @_deprecate_positional_args def __init__(self, n_splits=10, *, test_size=None, train_size=None, random_state=None): self.n_splits = n_splits @@ -1666,7 +1657,6 @@ class ShuffleSplit(BaseShuffleSplit): TRAIN: [3 4 1] TEST: [5 2] TRAIN: [3 5 1] TEST: [2 4] """ - @_deprecate_positional_args def __init__(self, n_splits=10, *, test_size=None, train_size=None, random_state=None): super().__init__( @@ -1757,7 +1747,6 @@ class GroupShuffleSplit(ShuffleSplit): TRAIN: [2 3 4 5 6 7] TEST: [0 1] TRAIN: [0 1 5 6 7] TEST: [2 3 4] ''' - @_deprecate_positional_args def __init__(self, n_splits=5, *, test_size=None, train_size=None, random_state=None): super().__init__( @@ -1873,7 +1862,6 @@ class StratifiedShuffleSplit(BaseShuffleSplit): TRAIN: [4 1 0] TEST: [2 3 5] TRAIN: [0 5 1] TEST: [3 4 2] """ - @_deprecate_positional_args def __init__(self, n_splits=10, *, test_size=None, train_size=None, random_state=None): super().__init__( @@ -2206,7 +2194,6 @@ def split(self, X=None, y=None, groups=None): yield train, test -@_deprecate_positional_args def check_cv(cv=5, y=None, *, classifier=False): """Input checker utility for building a cross-validator diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 9765303a30b8d..e473db977bb30 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -24,7 +24,6 @@ from ..utils import indexable, check_random_state, _safe_indexing from ..utils.validation import _check_fit_params from ..utils.validation import _num_samples -from ..utils.validation import _deprecate_positional_args from ..utils.fixes import delayed from ..utils.metaestimators import _safe_split from ..metrics import check_scoring @@ -38,7 +37,6 @@ 'permutation_test_score', 'learning_curve', 'validation_curve'] -@_deprecate_positional_args def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', 
return_train_score=False, @@ -317,7 +315,6 @@ def _normalize_score_results(scores, scaler_score_key='score'): return {scaler_score_key: scores} -@_deprecate_positional_args def cross_val_score(estimator, X, y=None, *, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', error_score=np.nan): @@ -722,7 +719,6 @@ def _score(estimator, X_test, y_test, scorer, error_score="raise"): return scores -@_deprecate_positional_args def cross_val_predict(estimator, X, y=None, *, groups=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', method='predict'): @@ -1059,7 +1055,6 @@ def _check_is_permutation(indices, n_samples): return True -@_deprecate_positional_args def permutation_test_score(estimator, X, y, *, groups=None, cv=None, n_permutations=100, n_jobs=None, random_state=0, verbose=0, scoring=None, fit_params=None): @@ -1224,7 +1219,6 @@ def _shuffle(y, groups, random_state): return _safe_indexing(y, indices) -@_deprecate_positional_args def learning_curve(estimator, X, y, *, groups=None, train_sizes=np.linspace(0.1, 1.0, 5), cv=None, scoring=None, exploit_incremental_learning=False, @@ -1534,7 +1528,6 @@ def _incremental_fit_estimator(estimator, X, y, classes, train, test, return np.array(ret).T -@_deprecate_positional_args def validation_curve(estimator, X, y, *, param_name, param_range, groups=None, cv=None, scoring=None, n_jobs=None, pre_dispatch="all", verbose=0, error_score=np.nan, fit_params=None): diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index c66d8e1836ac9..98d173f141d96 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -697,15 +697,15 @@ def test_stratified_shuffle_split_init(): y = np.asarray([0, 1, 1, 1, 2, 2, 2]) # Check that error is raised if there is a class with only one sample with pytest.raises(ValueError): - next(StratifiedShuffleSplit(3, 0.2).split(X, y)) + next(StratifiedShuffleSplit(3, test_size=0.2).split(X, y)) # Check that error is raised if the test set size is smaller than n_classes with pytest.raises(ValueError): - next(StratifiedShuffleSplit(3, 2).split(X, y)) + next(StratifiedShuffleSplit(3, test_size=2).split(X, y)) # Check that error is raised if the train set size is smaller than # n_classes with pytest.raises(ValueError): - next(StratifiedShuffleSplit(3, 3, 2).split(X, y)) + next(StratifiedShuffleSplit(3, test_size=3, train_size=2).split(X, y)) X = np.arange(9) y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2]) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index da29fdd4daf11..d75556bf60ab4 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -51,7 +51,6 @@ from .utils.validation import _num_samples from .utils.validation import check_is_fitted from .utils.validation import check_X_y, check_array -from .utils.validation import _deprecate_positional_args from .utils.multiclass import (_check_partial_fit_first_call, check_classification_targets, _ovr_decision_function) @@ -245,7 +244,6 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin, sklearn.preprocessing.MultiLabelBinarizer : Transform iterable of iterables to binary indicator matrix. 
""" - @_deprecate_positional_args def __init__(self, estimator, *, n_jobs=None): self.estimator = estimator self.n_jobs = n_jobs @@ -609,7 +607,6 @@ class OneVsOneClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): >>> clf.predict(X_test[:10]) array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1]) """ - @_deprecate_positional_args def __init__(self, estimator, *, n_jobs=None): self.estimator = estimator self.n_jobs = n_jobs @@ -867,7 +864,6 @@ class OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): Hastie T., Tibshirani R., Friedman J., page 606 (second-edition) 2008. """ - @_deprecate_positional_args def __init__(self, estimator, *, code_size=1.5, random_state=None, n_jobs=None): self.estimator = estimator diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 9b64d28f41eb8..e78683fea3835 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -25,7 +25,7 @@ from .utils import check_array, check_X_y, check_random_state from .utils.metaestimators import if_delegate_has_method from .utils.validation import (check_is_fitted, has_fit_parameter, - _check_fit_params, _deprecate_positional_args) + _check_fit_params) from .utils.multiclass import check_classification_targets from .utils.fixes import delayed @@ -65,7 +65,6 @@ class _MultiOutputEstimator(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta): @abstractmethod - @_deprecate_positional_args def __init__(self, estimator, *, n_jobs=None): self.estimator = estimator self.n_jobs = n_jobs @@ -260,7 +259,6 @@ class MultiOutputRegressor(RegressorMixin, _MultiOutputEstimator): >>> clf.predict(X[[0]]) array([[176..., 35..., 57...]]) """ - @_deprecate_positional_args def __init__(self, estimator, *, n_jobs=None): super().__init__(estimator, n_jobs=n_jobs) @@ -339,7 +337,6 @@ class MultiOutputClassifier(ClassifierMixin, _MultiOutputEstimator): >>> clf.predict(X[-2:]) array([[1, 1, 0], [1, 1, 1]]) """ - @_deprecate_positional_args def __init__(self, estimator, *, n_jobs=None): super().__init__(estimator, n_jobs=n_jobs) @@ -440,7 +437,6 @@ def _more_tags(self): class _BaseChain(BaseEstimator, metaclass=ABCMeta): - @_deprecate_positional_args def __init__(self, base_estimator, *, order=None, cv=None, random_state=None): self.base_estimator = base_estimator diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 70f5993f98b1a..7e936ac3a0c8e 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -32,7 +32,6 @@ from .utils.multiclass import _check_partial_fit_first_call from .utils.validation import check_is_fitted, check_non_negative from .utils.validation import _check_sample_weight -from .utils.validation import _deprecate_positional_args __all__ = ['BernoulliNB', 'GaussianNB', 'MultinomialNB', 'ComplementNB', @@ -189,7 +188,6 @@ class labels known to the classifier [1] """ - @_deprecate_positional_args def __init__(self, *, priors=None, var_smoothing=1e-9): self.priors = priors self.var_smoothing = var_smoothing @@ -795,7 +793,6 @@ class MultinomialNB(_BaseDiscreteNB): https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html """ - @_deprecate_positional_args def __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None): self.alpha = alpha self.fit_prior = fit_prior @@ -920,7 +917,6 @@ class ComplementNB(_BaseDiscreteNB): https://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf """ - @_deprecate_positional_args def __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None, norm=False): self.alpha = alpha @@ -1047,7 +1043,6 @@ class 
BernoulliNB(_BaseDiscreteNB): naive Bayes -- Which naive Bayes? 3rd Conf. on Email and Anti-Spam (CEAS). """ - @_deprecate_positional_args def __init__(self, *, alpha=1.0, binarize=.0, fit_prior=True, class_prior=None): self.alpha = alpha @@ -1182,7 +1177,6 @@ class CategoricalNB(_BaseDiscreteNB): [3] """ - @_deprecate_positional_args def __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None, min_categories=None): self.alpha = alpha diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 83078e9f77ba9..29ab582c15ab9 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -17,7 +17,6 @@ from ._base import _check_weights, _get_weights from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin from ..base import ClassifierMixin -from ..utils.validation import _deprecate_positional_args class KNeighborsClassifier(KNeighborsMixin, @@ -144,7 +143,6 @@ class KNeighborsClassifier(KNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ - @_deprecate_positional_args def __init__(self, n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None): @@ -404,7 +402,6 @@ class RadiusNeighborsClassifier(RadiusNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ - @_deprecate_positional_args def __init__(self, radius=1.0, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', outlier_label=None, metric_params=None, n_jobs=None, diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index a23b45399f05c..7676d42d62c18 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -8,7 +8,7 @@ from ._base import NeighborsBase from ._unsupervised import NearestNeighbors from ..base import TransformerMixin -from ..utils.validation import check_is_fitted, _deprecate_positional_args +from ..utils.validation import check_is_fitted def _check_params(X, metric, p, metric_params): @@ -36,7 +36,6 @@ def _query_include_self(X, include_self, mode): return X -@_deprecate_positional_args def kneighbors_graph(X, n_neighbors, *, mode='connectivity', metric='minkowski', p=2, metric_params=None, include_self=False, n_jobs=None): @@ -113,7 +112,6 @@ def kneighbors_graph(X, n_neighbors, *, mode='connectivity', return X.kneighbors_graph(X=query, n_neighbors=n_neighbors, mode=mode) -@_deprecate_positional_args def radius_neighbors_graph(X, radius, *, mode='connectivity', metric='minkowski', p=2, metric_params=None, include_self=False, n_jobs=None): @@ -300,7 +298,6 @@ class KNeighborsTransformer(KNeighborsMixin, ... KNeighborsTransformer(n_neighbors=5, mode='distance'), ... Isomap(neighbors_algorithm='precomputed')) """ - @_deprecate_positional_args def __init__(self, *, mode='distance', n_neighbors=5, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=1): @@ -483,7 +480,6 @@ class RadiusNeighborsTransformer(RadiusNeighborsMixin, ... RadiusNeighborsTransformer(radius=42.0, mode='distance'), ... 
DBSCAN(min_samples=30, metric='precomputed')) """ - @_deprecate_positional_args def __init__(self, *, mode='distance', radius=1., algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=1): diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 5a5ad55d3261c..816b023e0f23e 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -9,7 +9,6 @@ from ..base import BaseEstimator from ..utils import check_random_state from ..utils.validation import _check_sample_weight, check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..utils.extmath import row_norms from ._ball_tree import BallTree, DTYPE @@ -94,7 +93,6 @@ class KernelDensity(BaseEstimator): >>> log_density array([-1.52955942, -1.51462041, -1.60244657]) """ - @_deprecate_positional_args def __init__(self, *, bandwidth=1.0, algorithm='auto', kernel='gaussian', metric="euclidean", atol=0, rtol=0, breadth_first=True, leaf_size=40, metric_params=None): diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py index 29bf1a5e73f91..941b9de781f9a 100644 --- a/sklearn/neighbors/_lof.py +++ b/sklearn/neighbors/_lof.py @@ -10,7 +10,6 @@ from ..base import OutlierMixin from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..utils import check_array __all__ = ["LocalOutlierFactor"] @@ -177,7 +176,6 @@ class LocalOutlierFactor(KNeighborsMixin, .. [1] Breunig, M. M., Kriegel, H. P., Ng, R. T., & Sander, J. (2000, May). LOF: identifying density-based local outliers. In ACM sigmod record. """ - @_deprecate_positional_args def __init__(self, n_neighbors=20, *, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, contamination="auto", novelty=False, n_jobs=None): diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index a4ef02b687d97..5951b66ea7dbf 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -23,7 +23,6 @@ from ..utils.multiclass import check_classification_targets from ..utils.random import check_random_state from ..utils.validation import check_is_fitted, check_array, check_scalar -from ..utils.validation import _deprecate_positional_args from ..exceptions import ConvergenceWarning @@ -162,7 +161,6 @@ class NeighborhoodComponentsAnalysis(TransformerMixin, BaseEstimator): """ - @_deprecate_positional_args def __init__(self, n_components=None, *, init='auto', warm_start=False, max_iter=50, tol=1e-5, callback=None, verbose=0, random_state=None): diff --git a/sklearn/neighbors/_nearest_centroid.py b/sklearn/neighbors/_nearest_centroid.py index 0c726cdc0a62c..c5f6a612b0395 100644 --- a/sklearn/neighbors/_nearest_centroid.py +++ b/sklearn/neighbors/_nearest_centroid.py @@ -16,7 +16,6 @@ from ..metrics.pairwise import pairwise_distances from ..preprocessing import LabelEncoder from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..utils.sparsefuncs import csc_median_axis_0 from ..utils.multiclass import check_classification_targets @@ -86,7 +85,6 @@ class NearestCentroid(ClassifierMixin, BaseEstimator): """ - @_deprecate_positional_args def __init__(self, metric='euclidean', *, shrink_threshold=None): self.metric = metric self.shrink_threshold = shrink_threshold diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 62d6cf33575e4..96beb1ee022af 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -17,7 +17,6 @@ from 
._base import _get_weights, _check_weights from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin from ..base import RegressorMixin -from ..utils.validation import _deprecate_positional_args from ..utils.deprecation import deprecated @@ -143,7 +142,6 @@ class KNeighborsRegressor(KNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ - @_deprecate_positional_args def __init__(self, n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None): @@ -342,7 +340,6 @@ class RadiusNeighborsRegressor(RadiusNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ - @_deprecate_positional_args def __init__(self, radius=1.0, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None): diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index a6af48d9ed341..0f14c56e8bac2 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -2,7 +2,6 @@ from ._base import NeighborsBase from ._base import KNeighborsMixin from ._base import RadiusNeighborsMixin -from ..utils.validation import _deprecate_positional_args class NearestNeighbors(KNeighborsMixin, @@ -111,7 +110,6 @@ class NearestNeighbors(KNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm """ - @_deprecate_positional_args def __init__(self, *, n_neighbors=5, radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=None): diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index e349dfd844f96..72120ad369275 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -25,7 +25,7 @@ from ..utils import column_or_1d from ..exceptions import ConvergenceWarning from ..utils.extmath import safe_sparse_dot -from ..utils.validation import check_is_fitted, _deprecate_positional_args +from ..utils.validation import check_is_fitted from ..utils.multiclass import _check_partial_fit_first_call, unique_labels from ..utils.multiclass import type_of_target from ..utils.optimize import _check_optimize_result @@ -943,7 +943,6 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): Kingma, Diederik, and Jimmy Ba. "Adam: A method for stochastic optimization." arXiv preprint arXiv:1412.6980 (2014). """ - @_deprecate_positional_args def __init__(self, hidden_layer_sizes=(100,), activation="relu", *, solver='adam', alpha=0.0001, batch_size='auto', learning_rate="constant", @@ -1366,7 +1365,6 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): Kingma, Diederik, and Jimmy Ba. "Adam: A method for stochastic optimization." arXiv preprint arXiv:1412.6980 (2014). 
""" - @_deprecate_positional_args def __init__(self, hidden_layer_sizes=(100,), activation="relu", *, solver='adam', alpha=0.0001, batch_size='auto', learning_rate="constant", diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index 7aa64c503bb21..b69a2c496a2c9 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -20,7 +20,7 @@ from ..utils import gen_even_slices from ..utils.extmath import safe_sparse_dot from ..utils.extmath import log_logistic -from ..utils.validation import check_is_fitted, _deprecate_positional_args +from ..utils.validation import check_is_fitted class BernoulliRBM(TransformerMixin, BaseEstimator): @@ -106,7 +106,6 @@ class BernoulliRBM(TransformerMixin, BaseEstimator): Approximations to the Likelihood Gradient. International Conference on Machine Learning (ICML) 2008 """ - @_deprecate_positional_args def __init__(self, n_components=256, *, learning_rate=0.1, batch_size=10, n_iter=10, verbose=0, random_state=None): self.n_components = n_components diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 024bfe4f1dd38..e2ff6806ff3da 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -26,7 +26,6 @@ from .utils.deprecation import deprecated from .utils._tags import _safe_tags from .utils.validation import check_memory -from .utils.validation import _deprecate_positional_args from .utils.fixes import delayed from .utils.metaestimators import _BaseComposition @@ -110,7 +109,6 @@ class Pipeline(_BaseComposition): # BaseEstimator interface _required_parameters = ['steps'] - @_deprecate_positional_args def __init__(self, steps, *, memory=None, verbose=False): self.steps = steps self.memory = memory @@ -846,7 +844,6 @@ class FeatureUnion(TransformerMixin, _BaseComposition): """ _required_parameters = ["transformer_list"] - @_deprecate_positional_args def __init__(self, transformer_list, *, n_jobs=None, transformer_weights=None, verbose=False): self.transformer_list = transformer_list diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index befd3e61b96fc..393693fc87d2d 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -28,7 +28,7 @@ min_max_axis) from ..utils.validation import (check_is_fitted, check_random_state, _check_sample_weight, - FLOAT_DTYPES, _deprecate_positional_args) + FLOAT_DTYPES) from ._encoders import OneHotEncoder @@ -106,7 +106,6 @@ def _handle_zeros_in_scale(scale, copy=True, constant_mask=None): return scale -@_deprecate_positional_args def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True): """Standardize a dataset along any axis. @@ -344,7 +343,6 @@ class MinMaxScaler(TransformerMixin, BaseEstimator): `. """ - @_deprecate_positional_args def __init__(self, feature_range=(0, 1), *, copy=True, clip=False): self.feature_range = feature_range self.copy = copy @@ -492,7 +490,6 @@ def _more_tags(self): return {'allow_nan': True} -@_deprecate_positional_args def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True): """Transform features by scaling each feature to a given range. @@ -707,7 +704,6 @@ class StandardScaler(TransformerMixin, BaseEstimator): `. """ # noqa - @_deprecate_positional_args def __init__(self, *, copy=True, with_mean=True, with_std=True): self.with_mean = with_mean self.with_std = with_std @@ -1026,7 +1022,6 @@ class MaxAbsScaler(TransformerMixin, BaseEstimator): `. 
""" - @_deprecate_positional_args def __init__(self, *, copy=True): self.copy = copy @@ -1161,7 +1156,6 @@ def _more_tags(self): return {'allow_nan': True} -@_deprecate_positional_args def maxabs_scale(X, *, axis=0, copy=True): """Scale each feature to the [-1, 1] range without breaking the sparsity. @@ -1337,7 +1331,6 @@ class RobustScaler(TransformerMixin, BaseEstimator): https://en.wikipedia.org/wiki/Median https://en.wikipedia.org/wiki/Interquartile_range """ - @_deprecate_positional_args def __init__(self, *, with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True, unit_variance=False): self.with_centering = with_centering @@ -1471,7 +1464,6 @@ def _more_tags(self): return {'allow_nan': True} -@_deprecate_positional_args def robust_scale(X, *, axis=0, with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True, unit_variance=False): """Standardize a dataset along any axis @@ -1579,7 +1571,6 @@ def robust_scale(X, *, axis=0, with_centering=True, with_scaling=True, return X -@_deprecate_positional_args def normalize(X, norm='l2', *, axis=1, copy=True, return_norm=False): """Scale input vectors individually to unit norm (vector length). @@ -1738,7 +1729,6 @@ class Normalizer(TransformerMixin, BaseEstimator): normalize : Equivalent function without the estimator API. """ - @_deprecate_positional_args def __init__(self, norm='l2', *, copy=True): self.norm = norm self.copy = copy @@ -1790,7 +1780,6 @@ def _more_tags(self): return {'stateless': True} -@_deprecate_positional_args def binarize(X, *, threshold=0.0, copy=True): """Boolean thresholding of array-like or scipy.sparse matrix. @@ -1894,7 +1883,6 @@ class Binarizer(TransformerMixin, BaseEstimator): binarize : Equivalent function without the estimator API. """ - @_deprecate_positional_args def __init__(self, *, threshold=0.0, copy=True): self.threshold = threshold self.copy = copy @@ -2241,7 +2229,6 @@ class QuantileTransformer(TransformerMixin, BaseEstimator): `. """ - @_deprecate_positional_args def __init__(self, *, n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False, subsample=int(1e5), random_state=None, copy=True): @@ -2560,7 +2547,6 @@ def _more_tags(self): return {'allow_nan': True} -@_deprecate_positional_args def quantile_transform(X, *, axis=0, n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False, @@ -2779,7 +2765,6 @@ class PowerTransformer(TransformerMixin, BaseEstimator): .. [2] G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the Royal Statistical Society B, 26, 211-252 (1964). 
""" - @_deprecate_positional_args def __init__(self, method='yeo-johnson', *, standardize=True, copy=True): self.method = method self.standardize = standardize @@ -3057,7 +3042,6 @@ def _more_tags(self): return {'allow_nan': True} -@_deprecate_positional_args def power_transform(X, method='yeo-johnson', *, standardize=True, copy=True): """ Power transforms are a family of parametric, monotonic transformations diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 9ce95a97544a5..d7565ff2fb4b3 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -15,7 +15,6 @@ from ..base import BaseEstimator, TransformerMixin from ..utils.validation import check_array from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args class KBinsDiscretizer(TransformerMixin, BaseEstimator): @@ -125,7 +124,6 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): """ - @_deprecate_positional_args def __init__(self, n_bins=5, *, encode='onehot', strategy='quantile', dtype=None): self.n_bins = n_bins diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index ba1d48df175ee..385b4ed83d3eb 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -10,7 +10,6 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array, is_scalar_nan from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..utils._mask import _get_mask from ..utils._encode import _encode, _check_unknown, _unique @@ -330,7 +329,6 @@ class OneHotEncoder(_BaseEncoder): [1., 0., 1., 0.]]) """ - @_deprecate_positional_args def __init__(self, *, categories='auto', drop=None, sparse=True, dtype=np.float64, handle_unknown='error'): self.categories = categories @@ -741,7 +739,6 @@ class OrdinalEncoder(_BaseEncoder): ['Female', 2]], dtype=object) """ - @_deprecate_positional_args def __init__(self, *, categories='auto', dtype=np.float64, handle_unknown='error', unknown_value=None): self.categories = categories diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index ca176aeb87a10..25975add1baf2 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -2,7 +2,6 @@ from ..base import BaseEstimator, TransformerMixin from ..utils.validation import _allclose_dense_sparse -from ..utils.validation import _deprecate_positional_args def _identity(X): @@ -84,7 +83,6 @@ class FunctionTransformer(TransformerMixin, BaseEstimator): [1.0986..., 1.3862...]]) """ - @_deprecate_positional_args def __init__(self, func=None, inverse_func=None, *, validate=False, accept_sparse=False, check_inverse=True, kw_args=None, inv_kw_args=None): diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index 2b43dfffe716d..d07b7997ad36a 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -21,7 +21,6 @@ from ..utils.validation import check_array from ..utils.validation import check_is_fitted from ..utils.validation import _num_samples -from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import unique_labels from ..utils.multiclass import type_of_target from ..utils._encode import _encode, _unique @@ -257,7 +256,6 @@ class LabelBinarizer(TransformerMixin, BaseEstimator): scheme. 
""" - @_deprecate_positional_args def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False): if neg_label >= pos_label: raise ValueError("neg_label={0} must be strictly less than " @@ -406,7 +404,6 @@ def _more_tags(self): return {'X_types': ['1dlabels']} -@_deprecate_positional_args def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False): """Binarize labels in a one-vs-all fashion. @@ -720,7 +717,6 @@ class MultiLabelBinarizer(TransformerMixin, BaseEstimator): scheme. """ - @_deprecate_positional_args def __init__(self, *, classes=None, sparse_output=False): self.classes = classes self.sparse_output = sparse_output diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index d1ec49d7539bf..44ac0d2175c4c 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -13,8 +13,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array from ..utils.fixes import linspace -from ..utils.validation import (check_is_fitted, FLOAT_DTYPES, - _deprecate_positional_args) +from ..utils.validation import check_is_fitted, FLOAT_DTYPES from ._csr_polynomial_expansion import _csr_polynomial_expansion @@ -99,7 +98,6 @@ class PolynomialFeatures(TransformerMixin, BaseEstimator): See :ref:`examples/linear_model/plot_polynomial_interpolation.py ` """ - @_deprecate_positional_args def __init__(self, degree=2, *, interaction_only=False, include_bias=True, order='C'): self.degree = degree diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 8e968088e8141..06e4839e50eca 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -39,7 +39,6 @@ from .utils.extmath import safe_sparse_dot from .utils.random import sample_without_replacement from .utils.validation import check_is_fitted -from .utils.validation import _deprecate_positional_args from .exceptions import DataDimensionalityWarning @@ -48,7 +47,6 @@ "johnson_lindenstrauss_min_dim"] -@_deprecate_positional_args def johnson_lindenstrauss_min_dim(n_samples, *, eps=0.1): """Find a 'safe' number of components to randomly project to. @@ -477,7 +475,6 @@ class GaussianRandomProjection(BaseRandomProjection): SparseRandomProjection """ - @_deprecate_positional_args def __init__(self, n_components='auto', *, eps=0.1, random_state=None): super().__init__( n_components=n_components, @@ -618,7 +615,6 @@ class SparseRandomProjection(BaseRandomProjection): https://users.soe.ucsc.edu/~optas/papers/jl.pdf """ - @_deprecate_positional_args def __init__(self, n_components='auto', *, density='auto', eps=0.1, dense_output=False, random_state=None): super().__init__( diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index 8ba99b9603e05..e89dfab9310ab 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -68,7 +68,6 @@ from ..utils.extmath import safe_sparse_dot from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..exceptions import ConvergenceWarning @@ -106,7 +105,6 @@ class BaseLabelPropagation(ClassifierMixin, BaseEstimator, metaclass=ABCMeta): for more details. 
""" - @_deprecate_positional_args def __init__(self, kernel='rbf', *, gamma=20, n_neighbors=7, alpha=1, max_iter=30, tol=1e-3, n_jobs=None): @@ -382,7 +380,6 @@ class LabelPropagation(BaseLabelPropagation): _variant = 'propagation' - @_deprecate_positional_args def __init__(self, kernel='rbf', *, gamma=20, n_neighbors=7, max_iter=1000, tol=1e-3, n_jobs=None): super().__init__(kernel=kernel, gamma=gamma, @@ -496,7 +493,6 @@ class LabelSpreading(BaseLabelPropagation): _variant = 'spreading' - @_deprecate_positional_args def __init__(self, kernel='rbf', *, gamma=20, n_neighbors=7, alpha=0.2, max_iter=30, tol=1e-3, n_jobs=None): diff --git a/sklearn/svm/_bounds.py b/sklearn/svm/_bounds.py index b35728041f6cf..97cbd6d5be355 100644 --- a/sklearn/svm/_bounds.py +++ b/sklearn/svm/_bounds.py @@ -6,11 +6,9 @@ from ..preprocessing import LabelBinarizer from ..utils.validation import check_consistent_length, check_array -from ..utils.validation import _deprecate_positional_args from ..utils.extmath import safe_sparse_dot -@_deprecate_positional_args def l1_min_c(X, y, *, loss='squared_hinge', fit_intercept=True, intercept_scaling=1.0): """ diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index b151f5267da50..050855c25c06a 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -5,7 +5,6 @@ from ..linear_model._base import LinearClassifierMixin, SparseCoefMixin, \ LinearModel from ..utils.validation import _num_samples -from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import check_classification_targets @@ -178,7 +177,6 @@ class LinearSVC(LinearClassifierMixin, >>> print(clf.predict([[0, 0, 0, 0]])) [1] """ - @_deprecate_positional_args def __init__(self, penalty='l2', loss='squared_hinge', *, dual=True, tol=1e-4, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, @@ -373,7 +371,6 @@ class LinearSVR(RegressorMixin, LinearModel): various loss functions and regularization regimes. 
""" - @_deprecate_positional_args def __init__(self, *, epsilon=0.0, tol=1e-4, C=1.0, loss='epsilon_insensitive', fit_intercept=True, intercept_scaling=1., dual=True, verbose=0, @@ -645,7 +642,6 @@ class SVC(BaseSVC): _impl = 'c_svc' - @_deprecate_positional_args def __init__(self, *, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=1e-3, cache_size=200, class_weight=None, @@ -866,7 +862,6 @@ class NuSVC(BaseSVC): _impl = 'nu_svc' - @_deprecate_positional_args def __init__(self, *, nu=0.5, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=1e-3, cache_size=200, class_weight=None, verbose=False, max_iter=-1, @@ -1033,7 +1028,6 @@ class SVR(RegressorMixin, BaseLibSVM): _impl = 'epsilon_svr' - @_deprecate_positional_args def __init__(self, *, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=1e-3, C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1): @@ -1186,7 +1180,6 @@ class NuSVR(RegressorMixin, BaseLibSVM): _impl = 'nu_svr' - @_deprecate_positional_args def __init__(self, *, nu=0.5, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, tol=1e-3, cache_size=200, verbose=False, max_iter=-1): @@ -1325,7 +1318,6 @@ class OneClassSVM(OutlierMixin, BaseLibSVM): _impl = 'one_class' - @_deprecate_positional_args def __init__(self, *, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=1e-3, nu=0.5, shrinking=True, cache_size=200, verbose=False, max_iter=-1): diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index de5aebfa8a6e3..a79a850f3b7c7 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -36,7 +36,6 @@ from ..utils import compute_sample_weight from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ._criterion import Criterion from ._splitter import Splitter @@ -89,7 +88,6 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): """ @abstractmethod - @_deprecate_positional_args def __init__(self, *, criterion, splitter, @@ -851,7 +849,6 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): array([ 1. , 0.93..., 0.86..., 0.93..., 0.93..., 0.93..., 0.93..., 1. , 0.93..., 1. ]) """ - @_deprecate_positional_args def __init__(self, *, criterion="gini", splitter="best", @@ -1212,7 +1209,6 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): array([-0.39..., -0.46..., 0.02..., 0.06..., -0.50..., 0.16..., 0.11..., -0.73..., -0.30..., -0.00...]) """ - @_deprecate_positional_args def __init__(self, *, criterion="squared_error", splitter="best", @@ -1525,7 +1521,6 @@ class ExtraTreeClassifier(DecisionTreeClassifier): >>> cls.score(X_test, y_test) 0.8947... """ - @_deprecate_positional_args def __init__(self, *, criterion="gini", splitter="random", @@ -1756,7 +1751,6 @@ class ExtraTreeRegressor(DecisionTreeRegressor): >>> reg.score(X_test, y_test) 0.33... """ - @_deprecate_positional_args def __init__(self, *, criterion="squared_error", splitter="random", diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index 17680db2b855d..a9763128c3a7e 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -17,7 +17,6 @@ import numpy as np from ..utils.validation import check_is_fitted -from ..utils.validation import _deprecate_positional_args from ..base import is_classifier from . 
import _criterion @@ -76,7 +75,6 @@ def __repr__(self): SENTINEL = Sentinel() -@_deprecate_positional_args def plot_tree(decision_tree, *, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, impurity=True, node_ids=False, proportion=False, rounded=False, precision=3, @@ -648,7 +646,6 @@ def recurse(self, node, tree, ax, scale_x, scale_y, height, depth=0): ax.annotate("\n (...) \n", xy_parent, xy, **kwargs) -@_deprecate_positional_args def export_graphviz(decision_tree, out_file=None, *, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, leaves_parallel=False, impurity=True, @@ -804,7 +801,6 @@ def compute_depth_(current_node, current_depth, return max(depths) -@_deprecate_positional_args def export_text(decision_tree, *, feature_names=None, max_depth=10, spacing=3, decimals=2, show_weights=False): """Build a text report showing the rules of a decision tree. diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index a6e30a9941756..4cb6f85fb5d0a 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -63,7 +63,7 @@ "ExtraTreeRegressor": ExtraTreeRegressor, } -ALL_TREES = dict() +ALL_TREES: dict = dict() ALL_TREES.update(CLF_TREES) ALL_TREES.update(REG_TREES) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 972d56f66d900..c1f7c2e641502 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -30,8 +30,7 @@ assert_all_finite, check_random_state, column_or_1d, check_array, check_consistent_length, check_X_y, indexable, - check_symmetric, check_scalar, - _deprecate_positional_args) + check_symmetric, check_scalar) from .. import get_config @@ -632,7 +631,6 @@ def shuffle(*arrays, random_state=None, n_samples=None): random_state=random_state) -@_deprecate_positional_args def safe_sqr(X, *, copy=True): """Element wise squaring of array-likes and sparse matrices. @@ -672,7 +670,6 @@ def _chunk_generator(gen, chunksize): return -@_deprecate_positional_args def gen_batches(n, batch_size, *, min_batch_size=0): """Generator to create slices containing batch_size elements, from 0 to n. @@ -726,7 +723,6 @@ def gen_batches(n, batch_size, *, min_batch_size=0): yield slice(start, n) -@_deprecate_positional_args def gen_even_slices(n, n_packs, *, n_samples=None): """Generator to create n_packs slices going up to n. @@ -914,7 +910,6 @@ def _print_elapsed_time(source, message=None): timeit.default_timer() - start)) -@_deprecate_positional_args def get_chunk_n_rows(row_bytes, *, max_n_rows=None, working_memory=None): """Calculates how many rows can be processed within working_memory. diff --git a/sklearn/utils/class_weight.py b/sklearn/utils/class_weight.py index 47d1dd25860dd..0daebccd51322 100644 --- a/sklearn/utils/class_weight.py +++ b/sklearn/utils/class_weight.py @@ -4,10 +4,7 @@ import numpy as np -from .validation import _deprecate_positional_args - -@_deprecate_positional_args def compute_class_weight(class_weight, *, classes, y): """Estimate class weights for unbalanced datasets. @@ -72,7 +69,6 @@ def compute_class_weight(class_weight, *, classes, y): return weight -@_deprecate_positional_args def compute_sample_weight(class_weight, y, *, indices=None): """Estimate sample weights by class for unbalanced datasets. 
diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index c72c54bd1aa4d..13d24486cbc79 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -21,7 +21,6 @@ from .fixes import np_version, parse_version from .sparsefuncs_fast import csr_row_norms from .validation import check_array -from .validation import _deprecate_positional_args def squared_norm(x): @@ -116,7 +115,6 @@ def density(w, **kwargs): return d -@_deprecate_positional_args def safe_sparse_dot(a, b, *, dense_output=False): """Dot product that handle the sparse matrix case correctly. @@ -158,7 +156,6 @@ def safe_sparse_dot(a, b, *, dense_output=False): return ret -@_deprecate_positional_args def randomized_range_finder(A, *, size, n_iter, power_iteration_normalizer='auto', random_state=None): @@ -243,7 +240,6 @@ def randomized_range_finder(A, *, size, n_iter, return Q -@_deprecate_positional_args def randomized_svd(M, n_components, *, n_oversamples=10, n_iter='auto', power_iteration_normalizer='auto', transpose='auto', flip_sign=True, random_state='warn'): @@ -409,7 +405,6 @@ def randomized_svd(M, n_components, *, n_oversamples=10, n_iter='auto', return U[:, :n_components], s[:n_components], Vt[:n_components, :] -@_deprecate_positional_args def _randomized_eigsh(M, n_components, *, n_oversamples=10, n_iter='auto', power_iteration_normalizer='auto', selection='module', random_state=None): @@ -555,7 +550,6 @@ def _randomized_eigsh(M, n_components, *, n_oversamples=10, n_iter='auto', return eigvals, eigvecs -@_deprecate_positional_args def weighted_mode(a, w, *, axis=0): """Returns an array of the weighted modal (most common) value in a. diff --git a/sklearn/utils/graph.py b/sklearn/utils/graph.py index b98fd6ac0baa0..8d5d6782b46f4 100644 --- a/sklearn/utils/graph.py +++ b/sklearn/utils/graph.py @@ -13,13 +13,11 @@ from scipy import sparse from .graph_shortest_path import graph_shortest_path # noqa -from .validation import _deprecate_positional_args ############################################################################### # Path and connected component analysis. # Code adapted from networkx -@_deprecate_positional_args def single_source_shortest_path_length(graph, source, *, cutoff=None): """Return the shortest path length from source to all reachable nodes. diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py index fcd7a3f3fe54e..3f85fc39e3053 100644 --- a/sklearn/utils/sparsefuncs.py +++ b/sklearn/utils/sparsefuncs.py @@ -5,7 +5,6 @@ # License: BSD 3 clause import scipy.sparse as sp import numpy as np -from .validation import _deprecate_positional_args from .sparsefuncs_fast import ( csr_mean_variance_axis0 as _csr_mean_var_axis0, @@ -120,7 +119,6 @@ def mean_variance_axis(X, axis, weights=None, return_sum_weights=False): _raise_typeerror(X) -@_deprecate_positional_args def incr_mean_variance_axis(X, *, axis, last_mean, last_var, last_n, weights=None): """Compute incremental mean and variance along an axis on a CSR or diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 536d585caa8b7..acfc8f5d10db2 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -32,7 +32,7 @@ FLOAT_DTYPES = (np.float64, np.float32, np.float16) -def _deprecate_positional_args(func=None, *, version="1.0 (renaming of 0.25)"): +def _deprecate_positional_args(func=None, *, version="1.1 (renaming of 0.26)"): """Decorator for methods that issues warnings for positional arguments. 
Using the keyword-only argument syntax in pep 3102, arguments after the @@ -42,7 +42,7 @@ def _deprecate_positional_args(func=None, *, version="1.0 (renaming of 0.25)"): ---------- func : callable, default=None Function to check arguments on. - version : callable, default="1.0 (renaming of 0.25)" + version : callable, default="1.1 (renaming of 0.26)" The version when positional arguments will result in error. """ def _inner_deprecate_positional_args(f): @@ -111,7 +111,6 @@ def _assert_all_finite(X, allow_nan=False, msg_dtype=None): raise ValueError("Input contains NaN") -@_deprecate_positional_args def assert_all_finite(X, *, allow_nan=False): """Throw a ValueError if X contains NaN or infinity. @@ -124,7 +123,6 @@ def assert_all_finite(X, *, allow_nan=False): _assert_all_finite(X.data if sp.issparse(X) else X, allow_nan) -@_deprecate_positional_args def as_float_array(X, *, copy=True, force_all_finite=True): """Converts an array-like to an array of floats. @@ -458,7 +456,6 @@ def _ensure_no_complex_data(array): "{}\n".format(array)) -@_deprecate_positional_args def check_array(array, accept_sparse=False, *, accept_large_sparse=True, dtype="numeric", order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_samples=1, @@ -761,7 +758,6 @@ def _check_large_sparse(X, accept_large_sparse=False): % indices_datatype) -@_deprecate_positional_args def check_X_y(X, y, accept_sparse=False, *, accept_large_sparse=True, dtype="numeric", order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=False, @@ -890,7 +886,6 @@ def check_X_y(X, y, accept_sparse=False, *, accept_large_sparse=True, return X, y -@_deprecate_positional_args def column_or_1d(y, *, warn=False): """ Ravel column or 1d numpy array, else raises an error. @@ -971,7 +966,6 @@ def has_fit_parameter(estimator, parameter): return parameter in signature(estimator.fit).parameters -@_deprecate_positional_args def check_symmetric(array, *, tol=1E-10, raise_warning=True, raise_exception=False): """Make sure that array is 2D, square and symmetric. @@ -1031,7 +1025,6 @@ def check_symmetric(array, *, tol=1E-10, raise_warning=True, return array -@_deprecate_positional_args def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all): """Perform is_fitted validation for estimator. From 5073d692f04dea88d595252a6cc0382509b6947d Mon Sep 17 00:00:00 2001 From: groceryheist Date: Fri, 14 May 2021 15:40:16 -0700 Subject: [PATCH 398/478] DOC Clarify wording in ensemble.rst (#20094) --- doc/modules/ensemble.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 21610228b9b37..91fc892f79d0a 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -761,12 +761,12 @@ the parameter ``loss``: * Classification - * Binomial deviance (``'deviance'``): The negative binomial - log-likelihood loss function for binary classification (provides + * Binomial deviance (``'deviance'``): The binomial + negative log-likelihood loss function for binary classification (provides probability estimates). The initial model is given by the log odds-ratio. - * Multinomial deviance (``'deviance'``): The negative multinomial - log-likelihood loss function for multi-class classification with + * Multinomial deviance (``'deviance'``): The multinomial + negative log-likelihood loss function for multi-class classification with ``n_classes`` mutually exclusive classes. It provides probability estimates. 
The initial model is given by the prior probability of each class. At each iteration ``n_classes`` From d73822f84f2832dcc25f0ff58769f60871a78025 Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Sun, 16 May 2021 06:49:42 -0700 Subject: [PATCH 399/478] DOC Add notes about the location of function body. (#20095) --- sklearn/svm/src/libsvm/svm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/svm/src/libsvm/svm.h b/sklearn/svm/src/libsvm/svm.h index 0e509c61c37ed..a1634119858f1 100644 --- a/sklearn/svm/src/libsvm/svm.h +++ b/sklearn/svm/src/libsvm/svm.h @@ -118,7 +118,7 @@ struct svm_csr_model /* 0 if svm_model is created by svm_train */ }; - +/* svm_ functions are defined by libsvm_template.cpp from generic versions in svm.cpp */ struct svm_model *svm_train(const struct svm_problem *prob, const struct svm_parameter *param, int *status, BlasFunctions *blas_functions); void svm_cross_validation(const struct svm_problem *prob, const struct svm_parameter *param, int nr_fold, double *target, BlasFunctions *blas_functions); @@ -145,6 +145,7 @@ void svm_set_print_string_function(void (*print_func)(const char *)); /* sparse version */ +/* svm_csr_ functions are defined by libsvm_template.cpp from generic versions in svm.cpp */ struct svm_csr_model *svm_csr_train(const struct svm_csr_problem *prob, const struct svm_parameter *param, int *status, BlasFunctions *blas_functions); void svm_csr_cross_validation(const struct svm_csr_problem *prob, const struct svm_parameter *param, int nr_fold, double *target, BlasFunctions *blas_functions); From 29e21a33ad61ecc840c816bfd0a29921e2b64940 Mon Sep 17 00:00:00 2001 From: tom1092 <33375092+tom1092@users.noreply.github.com> Date: Mon, 17 May 2021 10:47:09 +0200 Subject: [PATCH 400/478] DOC Update hinge loss function in SVM (#20077) Co-authored-by: Guillaume Lemaitre --- doc/modules/svm.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst index 57d2cfb3cb7a7..fcf1d3e23976b 100644 --- a/doc/modules/svm.rst +++ b/doc/modules/svm.rst @@ -677,7 +677,7 @@ The primal problem can be equivalently formulated as .. math:: - \min_ {w, b} \frac{1}{2} w^T w + C \sum_{i=1}\max(0, y_i (w^T \phi(x_i) + b)), + \min_ {w, b} \frac{1}{2} w^T w + C \sum_{i=1}\max(0, 1 - y_i (w^T \phi(x_i) + b)), where we make use of the `hinge loss `_. This is the form that is From 053d2d1af477d9dc17e69162b9f2298c0fda5905 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Mon, 17 May 2021 04:52:34 -0400 Subject: [PATCH 401/478] CI Uses minimum version for doc-min-dependencies (#20057) --- .circleci/config.yml | 6 ++++++ build_tools/circle/build_doc.sh | 6 +++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index bc4acd8a35fcb..b407e8b15dd38 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -16,6 +16,9 @@ jobs: - SCIKIT_IMAGE_VERSION: 'min' - SPHINX_VERSION: 'min' - PANDAS_VERSION: 'min' + - SPHINX_GALLERY_VERSION: 'min' + - NUMPYDOC_VERSION: 'min' + - SPHINX_PROMPT_VERSION: 'min' steps: - checkout - run: ./build_tools/circle/checkout_merge_commit.sh @@ -57,6 +60,9 @@ jobs: - SCIKIT_IMAGE_VERSION: 'latest' - SPHINX_VERSION: 'min' - PANDAS_VERSION: 'latest' + - SPHINX_GALLERY_VERSION: 'latest' + - NUMPYDOC_VERSION: 'latest' + - SPHINX_PROMPT_VERSION: 'latest' steps: - checkout - run: ./build_tools/circle/checkout_merge_commit.sh diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index 37afb1841d368..563d09fc0b7bd 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -177,9 +177,9 @@ conda create -n $CONDA_ENV_NAME --yes --quiet \ joblib memory_profiler packaging seaborn pillow pytest coverage source activate testenv -pip install sphinx-gallery -pip install numpydoc -pip install sphinx-prompt +pip install "$(get_dep sphinx-gallery $SPHINX_GALLERY_VERSION)" +pip install "$(get_dep numpydoc $NUMPYDOC_VERSION)" +pip install "$(get_dep sphinx-prompt $SPHINX_PROMPT_VERSION)" # Set parallelism to 3 to overlap IO bound tasks with CPU bound tasks on CI # workers with 2 cores when building the compiled extensions of scikit-learn. From 40b45e6b7c65e08311a13cc7a8528c3988e3b405 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Tue, 18 May 2021 16:09:36 +0200 Subject: [PATCH 402/478] [MRG] CI Push Scipy minimum version to 1.1.0. Remove Python 3.6 from builds. (#20069) * Push scipy min version to 1.0.0 * Update all ubuntu images to 20.04 focal. * Add ubuntu images 18.04 bionic and scipy fron conda-forge. * Fix conditions. * Pin python 3.6 for ubuntu bionic. * Change pipeline name. * Change matrix element name. * Keep python 3.9 from system not conda in Ubuntu 20.04. * Remove python directive when unnecessary. * Cleanup. * Downgrade to python 3.6 as scipy 1.0.0 is incompatible with 3.8. * Fix comment. * Fix comment. * Pin pytest again as we are forced to use 3.6. * Move to conda installer for 32bit linux. * Install miniconda for ubuntu 32bit. * Install wget for ubuntu 32bit. * Revert 32bit OS to ubuntu bionic 18.04. * Install scipy from pip in 32bit system. * Fix doctest failures. * Revert example rendering. * Relax pytest version in ubuntu install. * Skip failing tests. * Put comment at the right place. * Remove python3.6. Ubuntu32 still needs to be adapted. * Push numpy and scipy min versions for compatibility with 3.7. * Push matplotlib min version for compatibility with 3.7. Install numpy via pip in 32bit linux. * Install numpy before scipy in Linux 32bit. * Pass numpy version to linux32. * Test 32bit architecture on debian buster (still exists for 32bit with python 3.7). * Install matplotlib from distribution. * Syntax error... * Stick to the numpy debian version to avoid Expected 124 from C header, got 112 from PyObject error. * Clean comments. * Revert skip in doctest to check with new dependencies. * Rename distrib. * Skip again... * Fix test on check_array. * Remove comment and fix lint at the same time. * Clean import. 
* Increase atol in test_derivatives to make the test pass in py37_conda_openblas environment. * Avoid sparse matrix dependent on scipy version. * Skip docstring test for pandas versions less then 1.1.0. * Fix lint error. * Empty commit to force checks. * Add minimal dependencies in changelog. * Update to python 3.7 CircleCI and Travis builds. * Move to debian buster for python3.7 dependencies. * Fix the container tag. * Lower the minimal pandas version for compatibility with python 3.7. --- .circleci/config.yml | 10 +-- .travis.yml | 10 +-- azure-pipelines.yml | 66 +++++++++++-------- build_tools/azure/install.sh | 4 +- build_tools/azure/posix-32.yml | 4 +- build_tools/azure/test_script.sh | 2 +- doc/conftest.py | 5 ++ doc/modules/sgd.rst | 2 +- .../supervised_learning.rst | 2 +- doc/whats_new/v1.0.rst | 6 ++ pyproject.toml | 2 +- sklearn/_min_dependencies.py | 11 ++-- sklearn/decomposition/_truncated_svd.py | 15 +++-- .../tests/test_loss.py | 2 +- sklearn/utils/tests/test_validation.py | 11 +--- 15 files changed, 80 insertions(+), 72 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index b407e8b15dd38..f4ee4e4cf1dfb 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -3,12 +3,12 @@ version: 2 jobs: doc-min-dependencies: docker: - - image: circleci/python:3.7.3-stretch + - image: circleci/python:3.7.7-buster environment: - OMP_NUM_THREADS: 2 - MKL_NUM_THREADS: 2 - CONDA_ENV_NAME: testenv - - PYTHON_VERSION: 3.6 + - PYTHON_VERSION: 3.7 - NUMPY_VERSION: 'min' - SCIPY_VERSION: 'min' - MATPLOTLIB_VERSION: 'min' @@ -47,7 +47,7 @@ jobs: doc: docker: - - image: circleci/python:3.7.3-stretch + - image: circleci/python:3.7.7-buster environment: - OMP_NUM_THREADS: 2 - MKL_NUM_THREADS: 2 @@ -96,7 +96,7 @@ jobs: lint: docker: - - image: circleci/python:3.6 + - image: circleci/python:3.7 steps: - checkout - run: ./build_tools/circle/checkout_merge_commit.sh @@ -130,7 +130,7 @@ jobs: deploy: docker: - - image: circleci/python:3.6 + - image: circleci/python:3.7 steps: - checkout - run: ./build_tools/circle/checkout_merge_commit.sh diff --git a/.travis.yml b/.travis.yml index 1e6ed78d28ac2..09f05b57eecfa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -40,19 +40,11 @@ jobs: - CPU_COUNT=4 # Linux environments to build the scikit-learn wheels for the ARM64 - # architecture and Python 3.6 and newer. This is used both at release time + # architecture and Python 3.7 and newer. This is used both at release time # with the manual trigger in the commit message in the release branch and as # a scheduled task to build the weekly dev build on the main branch. The # weekly frequency is meant to avoid depleting the Travis CI credits too # fast. 
- - python: 3.6 - os: linux - arch: arm64 - if: type = cron or commit_message =~ /\[cd build\]/ - env: - - BUILD_WHEEL=true - - CIBW_BUILD=cp36-manylinux_aarch64 - - python: 3.7 os: linux arch: arm64 diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 412de99f5e57d..31baf41ff4cb1 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -11,7 +11,7 @@ jobs: - job: git_commit displayName: Get Git Commit pool: - vmImage: ubuntu-18.04 + vmImage: ubuntu-20.04 steps: - bash: | set -ex @@ -38,7 +38,7 @@ jobs: ) displayName: Linting pool: - vmImage: ubuntu-18.04 + vmImage: ubuntu-20.04 steps: - task: UsePythonVersion@0 inputs: @@ -57,7 +57,7 @@ jobs: - template: build_tools/azure/posix.yml parameters: name: Linux_Nightly - vmImage: ubuntu-18.04 + vmImage: ubuntu-20.04 dependsOn: [git_commit, linting] condition: | and( @@ -83,7 +83,7 @@ jobs: - template: build_tools/azure/posix.yml parameters: name: Linux_Nightly_ICC - vmImage: ubuntu-18.04 + vmImage: ubuntu-20.04 dependsOn: [git_commit, linting] condition: | and( @@ -105,7 +105,7 @@ jobs: - template: build_tools/azure/posix.yml parameters: name: Linux_Runs - vmImage: ubuntu-18.04 + vmImage: ubuntu-20.04 dependsOn: [git_commit] condition: | and( @@ -119,10 +119,30 @@ jobs: BLAS: 'mkl' COVERAGE: 'true' +# Check compilation with Ubuntu bionic 18.04 LTS and scipy from conda-forge - template: build_tools/azure/posix.yml parameters: - name: Linux + name: Ubuntu_Bionic vmImage: ubuntu-18.04 + dependsOn: [git_commit, linting] + condition: | + and( + succeeded(), + not(contains(dependencies['git_commit']['outputs']['commit.message'], '[ci skip]')), + ne(variables['Build.Reason'], 'Schedule') + ) + matrix: + py37_conda: + DISTRIB: 'conda' + PYTHON_VERSION: '3.7' + BLAS: 'openblas' + COVERAGE: 'false' + BUILD_WITH_ICC: 'false' + +- template: build_tools/azure/posix.yml + parameters: + name: Linux + vmImage: ubuntu-20.04 dependsOn: [linting, git_commit] condition: | and( @@ -132,32 +152,23 @@ jobs: ) matrix: # Linux environment to test that scikit-learn can be built against - # versions of numpy, scipy with ATLAS that comes with Ubuntu Bionic 18.04 - # i.e. numpy 1.13.3 and scipy 0.19 - py36_ubuntu_atlas: + # versions of numpy, scipy with ATLAS that comes with Ubuntu Focal 20.04 + # i.e. numpy 1.17.4 and scipy 1.3.3 + ubuntu_atlas: DISTRIB: 'ubuntu' - PYTHON_VERSION: '3.6' JOBLIB_VERSION: 'min' PANDAS_VERSION: 'none' THREADPOOLCTL_VERSION: 'min' - PYTEST_VERSION: 'min' - PYTEST_XDIST_VERSION: 'none' COVERAGE: 'false' - # Linux + Python 3.6 build with OpenBLAS and without SITE_JOBLIB - py36_conda_openblas: + # Linux + Python 3.7 build with OpenBLAS and without SITE_JOBLIB + py37_conda_openblas: DISTRIB: 'conda' - PYTHON_VERSION: '3.6' + PYTHON_VERSION: '3.7' BLAS: 'openblas' NUMPY_VERSION: 'min' SCIPY_VERSION: 'min' MATPLOTLIB_VERSION: 'min' - # latest version of joblib available in conda for Python 3.6 - JOBLIB_VERSION: '0.13.2' THREADPOOLCTL_VERSION: '2.0.0' - # temporary pin pytest due to unknown failure with pytest 5.4 and - # python 3.6 - PYTEST_VERSION: 'min' - PYTEST_XDIST_VERSION: 'none' # Linux environment to test the latest available dependencies and MKL. # It runs tests requiring lightgbm, pandas and PyAMG. 
pylatest_pip_openblas_pandas: @@ -171,7 +182,7 @@ jobs: - template: build_tools/azure/posix-32.yml parameters: name: Linux32 - vmImage: ubuntu-18.04 + vmImage: ubuntu-20.04 dependsOn: [linting, git_commit] condition: | and( @@ -180,14 +191,11 @@ jobs: ne(variables['Build.Reason'], 'Schedule') ) matrix: - py36_ubuntu_atlas_32bit: - DISTRIB: 'ubuntu-32' - PYTHON_VERSION: '3.6' + debian_atlas_32bit: + DISTRIB: 'debian-32' JOBLIB_VERSION: 'min' # disable pytest xdist due to unknown bug with 32-bit container PYTEST_XDIST_VERSION: 'none' - # temporary pin pytest due to unknown failure with pytest 5.4 and - # python 3.6 PYTEST_VERSION: 'min' THREADPOOLCTL_VERSION: 'min' @@ -231,6 +239,6 @@ jobs: PYTHON_ARCH: '64' PYTEST_VERSION: '*' COVERAGE: 'true' - py36_pip_openblas_32bit: - PYTHON_VERSION: '3.6' + py37_pip_openblas_32bit: + PYTHON_VERSION: '3.7' PYTHON_ARCH: '32' diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index d2711d6bd610e..048ffe300ee2a 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -70,9 +70,9 @@ elif [[ "$DISTRIB" == "ubuntu" ]]; then python -m pip install $(get_dep cython $CYTHON_VERSION) \ $(get_dep joblib $JOBLIB_VERSION) -elif [[ "$DISTRIB" == "ubuntu-32" ]]; then +elif [[ "$DISTRIB" == "debian-32" ]]; then apt-get update - apt-get install -y python3-dev python3-scipy python3-matplotlib libatlas3-base libatlas-base-dev python3-virtualenv python3-pandas ccache + apt-get install -y python3-dev python3-numpy python3-scipy python3-matplotlib libatlas3-base libatlas-base-dev python3-virtualenv python3-pandas ccache python3 -m virtualenv --system-site-packages --python=python3 $VIRTUALENV source $VIRTUALENV/bin/activate diff --git a/build_tools/azure/posix-32.yml b/build_tools/azure/posix-32.yml index 5e4689a2505e5..039236a70fbe5 100644 --- a/build_tools/azure/posix-32.yml +++ b/build_tools/azure/posix-32.yml @@ -45,7 +45,7 @@ jobs: -w /io --detach --name skcontainer - -e DISTRIB=ubuntu-32 + -e DISTRIB=debian-32 -e TEST_DIR=/temp_dir -e JUNITXML=$JUNITXML -e VIRTUALENV=testvenv @@ -63,7 +63,7 @@ jobs: -e OMP_NUM_THREADS=$OMP_NUM_THREADS -e OPENBLAS_NUM_THREADS=$OPENBLAS_NUM_THREADS -e SKLEARN_SKIP_NETWORK_TESTS=$SKLEARN_SKIP_NETWORK_TESTS - i386/ubuntu:18.04 + i386/debian:10.9 sleep 1000000 displayName: 'Start container' - script: > diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index 858d691b38216..6e05d7d858e52 100755 --- a/build_tools/azure/test_script.sh +++ b/build_tools/azure/test_script.sh @@ -4,7 +4,7 @@ set -e if [[ "$DISTRIB" =~ ^conda.* ]]; then source activate $VIRTUALENV -elif [[ "$DISTRIB" == "ubuntu" ]] || [[ "$DISTRIB" == "ubuntu-32" ]]; then +elif [[ "$DISTRIB" == "ubuntu" ]] || [[ "$DISTRIB" == "debian-32" ]]; then source $VIRTUALENV/bin/activate fi diff --git a/doc/conftest.py b/doc/conftest.py index 5468184bf5509..a2770e5d36a10 100644 --- a/doc/conftest.py +++ b/doc/conftest.py @@ -7,6 +7,7 @@ from sklearn.utils import IS_PYPY from sklearn.utils._testing import SkipTest from sklearn.utils._testing import check_skip_network +from sklearn.utils.fixes import parse_version from sklearn.datasets import get_data_home from sklearn.datasets._base import _pkl_filepath from sklearn.datasets._twenty_newsgroups import CACHE_NAME @@ -80,6 +81,10 @@ def setup_grid_search(): def setup_preprocessing(): try: import pandas # noqa + if parse_version(pandas.__version__) < parse_version('1.1.0'): + raise SkipTest( + "Skipping preprocessing.rst, pandas version < 1.1.0" + ) except ImportError: 
raise SkipTest("Skipping preprocessing.rst, pandas not installed") diff --git a/doc/modules/sgd.rst b/doc/modules/sgd.rst index 0a1d8407e64ae..0b618289b84ec 100644 --- a/doc/modules/sgd.rst +++ b/doc/modules/sgd.rst @@ -130,7 +130,7 @@ Using ``loss="log"`` or ``loss="modified_huber"`` enables the :math:`P(y|x)` per sample :math:`x`:: >>> clf = SGDClassifier(loss="log", max_iter=5).fit(X, y) - >>> clf.predict_proba([[1., 1.]]) + >>> clf.predict_proba([[1., 1.]]) # doctest: +SKIP array([[0.00..., 0.99...]]) The concrete penalty can be set via the ``penalty`` parameter. diff --git a/doc/tutorial/statistical_inference/supervised_learning.rst b/doc/tutorial/statistical_inference/supervised_learning.rst index 3d87830fa0b26..e326b614472de 100644 --- a/doc/tutorial/statistical_inference/supervised_learning.rst +++ b/doc/tutorial/statistical_inference/supervised_learning.rst @@ -173,7 +173,7 @@ Linear models: :math:`y = X\beta + \epsilon` >>> regr = linear_model.LinearRegression() >>> regr.fit(diabetes_X_train, diabetes_y_train) LinearRegression() - >>> print(regr.coef_) + >>> print(regr.coef_) # doctest: +SKIP [ 0.30349955 -237.63931533 510.53060544 327.73698041 -814.13170937 492.81458798 102.84845219 184.60648906 743.51961675 76.09517222] diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index f94e7001fdc97..87b0441bade5f 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -12,6 +12,12 @@ Version 1.0.0 .. include:: changelog_legend.inc +Minimal dependencies +-------------------- + +Version 1.0.0 of scikit-learn requires python 3.7+, numpy 1.14.5+ and +scipy 1.1.0+. Optional minimal dependency is matplotlib 2.2.2+. + Enforcing keyword-only arguments -------------------------------- diff --git a/pyproject.toml b/pyproject.toml index c55c68b3182b8..84468f65341da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,5 +11,5 @@ requires = [ # see: https://github.com/scipy/oldest-supported-numpy/blob/master/setup.cfg "oldest-supported-numpy", - "scipy>=0.19.1", + "scipy>=1.1.0", ] diff --git a/sklearn/_min_dependencies.py b/sklearn/_min_dependencies.py index 56d44586cdc6d..aa01b7fdfa352 100644 --- a/sklearn/_min_dependencies.py +++ b/sklearn/_min_dependencies.py @@ -5,12 +5,11 @@ # numpy scipy and cython should by in sync with pyproject.toml if platform.python_implementation() == 'PyPy': - SCIPY_MIN_VERSION = '1.1.0' NUMPY_MIN_VERSION = '1.19.0' else: - SCIPY_MIN_VERSION = '0.19.1' - NUMPY_MIN_VERSION = '1.13.3' + NUMPY_MIN_VERSION = '1.14.5' +SCIPY_MIN_VERSION = '1.1.0' JOBLIB_MIN_VERSION = '0.11' THREADPOOLCTL_MIN_VERSION = '2.0.0' PYTEST_MIN_VERSION = '5.0.1' @@ -26,9 +25,9 @@ 'joblib': (JOBLIB_MIN_VERSION, 'install'), 'threadpoolctl': (THREADPOOLCTL_MIN_VERSION, 'install'), 'cython': (CYTHON_MIN_VERSION, 'build'), - 'matplotlib': ('2.1.1', 'benchmark, docs, examples, tests'), - 'scikit-image': ('0.13', 'docs, examples, tests'), - 'pandas': ('0.25.0', 'benchmark, docs, examples, tests'), + 'matplotlib': ('2.2.2', 'benchmark, docs, examples, tests'), + 'scikit-image': ('0.14', 'docs, examples, tests'), + 'pandas': ('0.23.4', 'benchmark, docs, examples, tests'), 'seaborn': ('0.9.0', 'docs, examples'), 'memory_profiler': ('0.57.0', 'benchmark, docs'), 'pytest': (PYTEST_MIN_VERSION, 'tests'), diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 74239567dee48..7aa36c59da00e 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -87,18 +87,21 @@ class 
TruncatedSVD(TransformerMixin, BaseEstimator): Examples -------- >>> from sklearn.decomposition import TruncatedSVD - >>> from scipy.sparse import random as sparse_random - >>> X = sparse_random(100, 100, density=0.01, format='csr', - ... random_state=42) + >>> from scipy.sparse import csr_matrix + >>> import numpy as np + >>> np.random.seed(0) + >>> X_dense = np.random.rand(100, 100) + >>> X_dense[:, 2 * np.arange(50)] = 0 + >>> X = csr_matrix(X_dense) >>> svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42) >>> svd.fit(X) TruncatedSVD(n_components=5, n_iter=7, random_state=42) >>> print(svd.explained_variance_ratio_) - [0.0646... 0.0633... 0.0639... 0.0535... 0.0406...] + [0.0157... 0.0512... 0.0499... 0.0479... 0.0453...] >>> print(svd.explained_variance_ratio_.sum()) - 0.286... + 0.2102... >>> print(svd.singular_values_) - [1.553... 1.512... 1.510... 1.370... 1.199...] + [35.2410... 4.5981... 4.5420... 4.4486... 4.3288...] See Also -------- diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 345e72c642668..9f4294a101700 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -98,7 +98,7 @@ def fprime2(x: np.ndarray) -> np.ndarray: optimum = optimum.ravel() assert_allclose(loss.inverse_link_function(optimum), y_true) assert_allclose(func(optimum), 0, atol=1e-14) - assert_allclose(get_gradients(y_true, optimum), 0, atol=1e-7) + assert_allclose(get_gradients(y_true, optimum), 0, atol=1e-6) @pytest.mark.parametrize('loss, n_classes, prediction_dim', [ diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 66f7d9ae77687..c244d6f6caffc 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -24,7 +24,7 @@ from sklearn.utils import check_X_y from sklearn.utils import deprecated from sklearn.utils._mocking import MockDataFrame -from sklearn.utils.fixes import np_version, parse_version +from sklearn.utils.fixes import parse_version from sklearn.utils.estimator_checks import _NotAnArray from sklearn.random_projection import _sparse_random_matrix from sklearn.linear_model import ARDRegression @@ -49,7 +49,6 @@ _num_features, FLOAT_DTYPES) from sklearn.utils.validation import _check_fit_params -from sklearn.utils.fixes import parse_version import sklearn @@ -345,7 +344,7 @@ def test_check_array(): assert isinstance(result, np.ndarray) -# TODO: Check for error in 1.1 when implicit conversation is removed +# TODO: Check for error in 1.1 when implicit conversion is removed @pytest.mark.parametrize("X", [ [['1', '2'], ['3', '4']], np.array([['1', '2'], ['3', '4']], dtype='U'), @@ -368,14 +367,10 @@ def test_check_array_numeric_warns(X): [['11', '12'], ['13', 'xx']], np.array([['11', '12'], ['13', 'xx']], dtype='U'), np.array([['11', '12'], ['13', 'xx']], dtype='S'), - [[b'a', b'b'], [b'c', b'd']], - np.array([[b'a', b'b'], [b'c', b'd']], dtype='V1') + [[b'a', b'b'], [b'c', b'd']] ]) def test_check_array_dtype_numeric_errors(X): """Error when string-ike array can not be converted""" - if (np_version < parse_version("1.14") - and hasattr(X, "dtype") and X.dtype.kind == "V"): - pytest.skip("old numpy would convert V dtype into float silently") expected_warn_msg = "Unable to convert array of bytes/strings" with pytest.raises(ValueError, match=expected_warn_msg): check_array(X, dtype="numeric") From ca6caa28ab92cbf75a3cc2a411d2a225abd9a4ce 
Mon Sep 17 00:00:00 2001 From: Ashvith Shetty Date: Wed, 19 May 2021 00:39:11 +0530 Subject: [PATCH 403/478] TST Removed the estimators from the IGNORED list in test_fit_docstring_attributes (#20103) --- sklearn/tests/test_docstring_parameters.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 099c27341927e..cc10f11fcd574 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -288,13 +288,6 @@ def test_fit_docstring_attributes(name, Estimator): with ignore_warnings(category=FutureWarning): assert hasattr(est, attr.name) - IGNORED = {'Birch', 'LarsCV', 'Lasso', - 'OrthogonalMatchingPursuit'} - - if Estimator.__name__ in IGNORED: - pytest.xfail( - reason="Estimator has too many undocumented attributes.") - fit_attr = [k for k in est.__dict__.keys() if k.endswith('_') and not k.startswith('_')] fit_attr_names = [attr.name for attr in attributes] From 094992b7a784491abd1ec0ce011e9f7956f09397 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=20Dupr=C3=A9=20la=20Tour?= Date: Tue, 18 May 2021 15:28:03 -0700 Subject: [PATCH 404/478] DOC fix new line alignment --- doc/related_projects.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/related_projects.rst b/doc/related_projects.rst index fb02ea8beaf0d..5d50196000e44 100644 --- a/doc/related_projects.rst +++ b/doc/related_projects.rst @@ -96,7 +96,7 @@ enhance the functionality of scikit-learn's estimators. cross-validated parameter search using any of these strategies. - `sklearn-deap `_ Use evolutionary - algorithms instead of gridsearch in scikit-learn. + algorithms instead of gridsearch in scikit-learn. **Model export for production** From 2a43ed2bfc4614e449c1dadb87ce95a21e7e3457 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Wed, 19 May 2021 12:20:25 +0200 Subject: [PATCH 405/478] CI Fix min dependencies for scikit-image (#20108) --- build_tools/circle/build_doc.sh | 2 +- sklearn/_min_dependencies.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index 563d09fc0b7bd..3935b9a8deaa8 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -172,11 +172,11 @@ conda create -n $CONDA_ENV_NAME --yes --quiet \ "$(get_dep cython $CYTHON_VERSION)" \ "$(get_dep matplotlib $MATPLOTLIB_VERSION)" \ "$(get_dep sphinx $SPHINX_VERSION)" \ - "$(get_dep scikit-image $SCIKIT_IMAGE_VERSION)" \ "$(get_dep pandas $PANDAS_VERSION)" \ joblib memory_profiler packaging seaborn pillow pytest coverage source activate testenv +pip install "$(get_dep scikit-image $SCIKIT_IMAGE_VERSION)" pip install "$(get_dep sphinx-gallery $SPHINX_GALLERY_VERSION)" pip install "$(get_dep numpydoc $NUMPYDOC_VERSION)" pip install "$(get_dep sphinx-prompt $SPHINX_PROMPT_VERSION)" diff --git a/sklearn/_min_dependencies.py b/sklearn/_min_dependencies.py index aa01b7fdfa352..d878a04eb4523 100644 --- a/sklearn/_min_dependencies.py +++ b/sklearn/_min_dependencies.py @@ -26,7 +26,7 @@ 'threadpoolctl': (THREADPOOLCTL_MIN_VERSION, 'install'), 'cython': (CYTHON_MIN_VERSION, 'build'), 'matplotlib': ('2.2.2', 'benchmark, docs, examples, tests'), - 'scikit-image': ('0.14', 'docs, examples, tests'), + 'scikit-image': ('0.14.5', 'docs, examples, tests'), 'pandas': ('0.23.4', 'benchmark, docs, examples, tests'), 'seaborn': ('0.9.0', 'docs, examples'), 'memory_profiler': ('0.57.0', 'benchmark, docs'), From 
3014fcfcd0253ccfd7831bf85a36b763189a6417 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Carlos=20Alfaro=20Jim=C3=A9nez?= Date: Wed, 19 May 2021 15:29:30 +0200 Subject: [PATCH 406/478] MNT Move parameter validation from `__init__` to `fit` in `neighbors` module (#20072) --- doc/whats_new/v1.0.rst | 6 +++++ sklearn/neighbors/_base.py | 10 ++++---- sklearn/neighbors/_classification.py | 8 +++++-- sklearn/neighbors/_regression.py | 8 +++++-- sklearn/neighbors/tests/test_neighbors.py | 28 +++++++++++++++-------- 5 files changed, 40 insertions(+), 20 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 87b0441bade5f..34e9f0670ba81 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -415,6 +415,12 @@ Changelog - |FIX| :class:`neighbors.DistanceMetric` subclasses now support readonly memory-mapped datasets. :pr:`19883` by `Julien Jerphanion `. +- |FIX| :class:`neighbors.NearestNeighbors`, :class:`neighbors.KNeighborsClassifier`, + :class:`neighbors.RadiusNeighborsClassifier`, :class:`neighbors.KNeighborsRegressor` + and :class:`neighbors.RadiusNeighborsRegressor` does not validate `weights` in + `__init__` and validates `weights` in `fit` instead. :pr:`20072` by + :user:`Juan Carlos Alfaro Jiménez `. + :mod:`sklearn.pipeline` ....................... diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 9a222762ec615..c6438165aba1a 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -58,14 +58,13 @@ def _check_weights(weights): """Check to make sure weights are valid""" - if weights in (None, 'uniform', 'distance'): - return weights - elif callable(weights): - return weights - else: + if (weights not in (None, 'uniform', 'distance') and + not callable(weights)): raise ValueError("weights not recognized: should be 'uniform', " "'distance', or a callable function") + return weights + def _get_weights(dist, weights): """Get the weights from an array of distances and a parameter ``weights`` @@ -312,7 +311,6 @@ def __init__(self, n_neighbors=None, radius=None, self.metric_params = metric_params self.p = p self.n_jobs = n_jobs - self._check_algorithm_metric() def _check_algorithm_metric(self): if self.algorithm not in ['auto', 'brute', diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 29ab582c15ab9..1fd1fb01c9762 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -152,7 +152,7 @@ def __init__(self, n_neighbors=5, *, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs) - self.weights = _check_weights(weights) + self.weights = weights def fit(self, X, y): """Fit the k-nearest neighbors classifier from the training dataset. @@ -172,6 +172,8 @@ def fit(self, X, y): self : KNeighborsClassifier The fitted k-nearest neighbors classifier. """ + self.weights = _check_weights(self.weights) + return self._fit(X, y) def predict(self, X): @@ -412,7 +414,7 @@ def __init__(self, radius=1.0, *, weights='uniform', leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs) - self.weights = _check_weights(weights) + self.weights = weights self.outlier_label = outlier_label def fit(self, X, y): @@ -433,6 +435,8 @@ def fit(self, X, y): self : RadiusNeighborsClassifier The fitted radius neighbors classifier. 
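(A minimal sketch of the behaviour described in the changelog entry above, not part of the patch itself; the estimator, the ``weights='blah'`` value and the error message come from the diff, while the toy data is illustrative. Validation of ``weights`` now happens in ``fit`` rather than in ``__init__``.)

    import numpy as np
    from sklearn.neighbors import KNeighborsClassifier

    X = np.array([[0.0], [1.0], [2.0], [3.0]])
    y = np.array([0, 0, 1, 1])

    clf = KNeighborsClassifier(weights="blah")  # no error at construction time
    try:
        clf.fit(X, y)                           # _check_weights runs here now
    except ValueError as exc:
        print(exc)  # weights not recognized: should be 'uniform', 'distance', or a callable function
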
""" + self.weights = _check_weights(self.weights) + self._fit(X, y) classes_ = self.classes_ diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 96beb1ee022af..be60abcc64cb5 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -150,7 +150,7 @@ def __init__(self, n_neighbors=5, *, weights='uniform', algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs) - self.weights = _check_weights(weights) + self.weights = weights def _more_tags(self): # For cross-validation routines to split data correctly @@ -183,6 +183,8 @@ def fit(self, X, y): self : KNeighborsRegressor The fitted k-nearest neighbors regressor. """ + self.weights = _check_weights(self.weights) + return self._fit(X, y) def predict(self, X): @@ -349,7 +351,7 @@ def __init__(self, radius=1.0, *, weights='uniform', leaf_size=leaf_size, p=p, metric=metric, metric_params=metric_params, n_jobs=n_jobs) - self.weights = _check_weights(weights) + self.weights = weights def fit(self, X, y): """Fit the radius neighbors regressor from the training dataset. @@ -369,6 +371,8 @@ def fit(self, X, y): self : RadiusNeighborsRegressor The fitted radius neighbors regressor. """ + self.weights = _check_weights(self.weights) + return self._fit(X, y) def predict(self, X): diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 5df7a6419b0b5..555687b7ea74a 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1167,24 +1167,28 @@ def test_radius_neighbors_graph_sparse(seed=36): def test_neighbors_badargs(): # Test bad argument values: these should all raise ValueErrors - with pytest.raises(ValueError): - neighbors.NearestNeighbors(algorithm='blah') - X = rng.random_sample((10, 2)) Xsparse = csr_matrix(X) X3 = rng.random_sample((10, 3)) y = np.ones(10) + est = neighbors.NearestNeighbors(algorithm='blah') + with pytest.raises(ValueError): + est.fit(X) + for cls in (neighbors.KNeighborsClassifier, neighbors.RadiusNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.RadiusNeighborsRegressor): + est = cls(weights='blah') with pytest.raises(ValueError): - cls(weights='blah') + est.fit(X, y) + est = cls(p=-1) with pytest.raises(ValueError): - cls(p=-1) + est.fit(X, y) + est = cls(algorithm='blah') with pytest.raises(ValueError): - cls(algorithm='blah') + est.fit(X, y) nbrs = cls(algorithm='ball_tree', metric='haversine') with pytest.raises(ValueError): @@ -1253,10 +1257,11 @@ def test_neighbors_metrics(n_samples=20, n_features=3, # KD tree doesn't support all metrics if (algorithm == 'kd_tree' and metric not in neighbors.KDTree.valid_metrics): + est = neighbors.NearestNeighbors(algorithm=algorithm, + metric=metric, + metric_params=metric_params) with pytest.raises(ValueError): - neighbors.NearestNeighbors(algorithm=algorithm, - metric=metric, - metric_params=metric_params) + est.fit(X) continue neigh = neighbors.NearestNeighbors(n_neighbors=n_neighbors, algorithm=algorithm, @@ -1359,8 +1364,11 @@ def test_valid_brute_metric_for_auto_algorithm(): def test_metric_params_interface(): + X = rng.rand(5, 5) + y = rng.randint(0, 2, 5) + est = neighbors.KNeighborsClassifier(metric_params={'p': 3}) with pytest.warns(SyntaxWarning): - neighbors.KNeighborsClassifier(metric_params={'p': 3}) + est.fit(X, y) def test_predict_sparse_ball_kd_tree(): From 1ac047d29a43bd1556d5c90e40376340a08bc3a6 Mon Sep 17 00:00:00 2001 From: Eleni Markou Date: Wed, 19 
May 2021 16:34:22 +0300 Subject: [PATCH 407/478] DOC Replace broken link in clustering.rst (#20102) --- doc/modules/clustering.rst | 2 +- sklearn/metrics/cluster/_supervised.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 7f9fe2a7bd12e..0245c48920f11 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1653,7 +1653,7 @@ Drawbacks * E. B. Fowkles and C. L. Mallows, 1983. "A method for comparing two hierarchical clusterings". Journal of the American Statistical Association. - http://wildfire.stat.ucla.edu/pdflibrary/fowlkes.pdf + https://www.tandfonline.com/doi/abs/10.1080/01621459.1983.10478008 * `Wikipedia entry for the Fowlkes-Mallows Index `_ diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index ccc8077a3aab9..7814e7ba50e1c 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -1076,7 +1076,7 @@ def fowlkes_mallows_score(labels_true, labels_pred, *, sparse=False): .. [1] `E. B. Fowkles and C. L. Mallows, 1983. "A method for comparing two hierarchical clusterings". Journal of the American Statistical Association - `_ + `_ .. [2] `Wikipedia entry for the Fowlkes-Mallows Index `_ From 1b6a651296787bcfa850f443a85308f62dffdf47 Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Thu, 20 May 2021 01:12:57 +0100 Subject: [PATCH 408/478] TST Changes assert to pytest style in decomposition, datasets, covariance, compose (#20104) Co-authored-by: Alihan Zihna --- .../compose/tests/test_column_transformer.py | 81 +++++---- .../tests/test_robust_covariance.py | 14 +- sklearn/datasets/tests/test_openml.py | 160 +++++++++--------- .../datasets/tests/test_samples_generator.py | 29 ++-- sklearn/decomposition/tests/test_nmf.py | 92 ++++++---- 5 files changed, 206 insertions(+), 170 deletions(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 9278d67296ec5..b672885dad645 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -9,7 +9,6 @@ import pytest from numpy.testing import assert_allclose -from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_allclose_dense_sparse from sklearn.utils._testing import assert_almost_equal @@ -540,14 +539,17 @@ def test_column_transformer_error_msg_1D(): X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T col_trans = ColumnTransformer([('trans', StandardScaler(), 0)]) - assert_raise_message(ValueError, "1D data passed to a transformer", - col_trans.fit, X_array) - assert_raise_message(ValueError, "1D data passed to a transformer", - col_trans.fit_transform, X_array) + msg = '1D data passed to a transformer' + with pytest.raises(ValueError, match=msg): + col_trans.fit(X_array) + + with pytest.raises(ValueError, match=msg): + col_trans.fit_transform(X_array) col_trans = ColumnTransformer([('trans', TransRaise(), 0)]) for func in [col_trans.fit, col_trans.fit_transform]: - assert_raise_message(ValueError, "specific message", func, X_array) + with pytest.raises(ValueError, match="specific message"): + func(X_array) def test_2D_transformer_output(): @@ -556,11 +558,13 @@ def test_2D_transformer_output(): # if one transformer is dropped, test that name is still correct ct = ColumnTransformer([('trans1', 'drop', 0), ('trans2', TransNo2D(), 1)]) - 
assert_raise_message(ValueError, "the 'trans2' transformer should be 2D", - ct.fit_transform, X_array) + + msg = "the 'trans2' transformer should be 2D" + with pytest.raises(ValueError, match=msg): + ct.fit_transform(X_array) # because fit is also doing transform, this raises already on fit - assert_raise_message(ValueError, "the 'trans2' transformer should be 2D", - ct.fit, X_array) + with pytest.raises(ValueError, match=msg): + ct.fit(X_array) def test_2D_transformer_output_pandas(): @@ -571,11 +575,12 @@ def test_2D_transformer_output_pandas(): # if one transformer is dropped, test that name is still correct ct = ColumnTransformer([('trans1', TransNo2D(), 'col1')]) - assert_raise_message(ValueError, "the 'trans1' transformer should be 2D", - ct.fit_transform, X_df) + msg = "the 'trans1' transformer should be 2D" + with pytest.raises(ValueError, match=msg): + ct.fit_transform(X_df) # because fit is also doing transform, this raises already on fit - assert_raise_message(ValueError, "the 'trans1' transformer should be 2D", - ct.fit, X_df) + with pytest.raises(ValueError, match=msg): + ct.fit(X_df) @pytest.mark.parametrize("remainder", ['drop', 'passthrough']) @@ -585,14 +590,14 @@ def test_column_transformer_invalid_columns(remainder): # general invalid for col in [1.5, ['string', 1], slice(1, 's'), np.array([1.])]: ct = ColumnTransformer([('trans', Trans(), col)], remainder=remainder) - assert_raise_message(ValueError, "No valid specification", - ct.fit, X_array) + with pytest.raises(ValueError, match="No valid specification"): + ct.fit(X_array) # invalid for arrays for col in ['string', ['string', 'other'], slice('a', 'b')]: ct = ColumnTransformer([('trans', Trans(), col)], remainder=remainder) - assert_raise_message(ValueError, "Specifying the columns", - ct.fit, X_array) + with pytest.raises(ValueError, match="Specifying the columns"): + ct.fit(X_array) # transformed n_features does not match fitted n_features col = [0, 1] @@ -621,9 +626,9 @@ def predict(self, X): X_array = np.array([[0, 1, 2], [2, 4, 6]]).T ct = ColumnTransformer([('trans', NoTrans(), [0])]) - assert_raise_message(TypeError, - "All estimators should implement fit and transform", - ct.fit, X_array) + msg = "All estimators should implement fit and transform" + with pytest.raises(TypeError, match=msg): + ct.fit(X_array) def test_make_column_transformer(): @@ -659,13 +664,13 @@ def test_make_column_transformer_kwargs(): assert ct.remainder == 'drop' assert ct.sparse_threshold == 0.5 # invalid keyword parameters should raise an error message - assert_raise_message( - TypeError, + msg = re.escape( "make_column_transformer() got an unexpected " - "keyword argument 'transformer_weights'", - make_column_transformer, (scaler, 'first'), (norm, ['second']), - transformer_weights={'pca': 10, 'Transf': 1} + "keyword argument 'transformer_weights'" ) + with pytest.raises(TypeError, match=msg): + make_column_transformer((scaler, 'first'), (norm, ['second']), + transformer_weights={'pca': 10, 'Transf': 1}) def test_make_column_transformer_remainder_transformer(): @@ -893,10 +898,11 @@ def test_column_transformer_special_strings(): for val in [None, 'other']: ct = ColumnTransformer( [('trans1', Trans(), [0]), ('trans2', None, [1])]) - assert_raise_message(TypeError, "All estimators should implement", - ct.fit_transform, X_array) - assert_raise_message(TypeError, "All estimators should implement", - ct.fit, X_array) + msg = "All estimators should implement" + with pytest.raises(TypeError, match=msg): + ct.fit_transform(X_array) + with 
pytest.raises(TypeError, match=msg): + ct.fit(X_array) def test_column_transformer_remainder(): @@ -946,14 +952,15 @@ def test_column_transformer_remainder(): # error on invalid arg ct = ColumnTransformer([('trans1', Trans(), [0])], remainder=1) - assert_raise_message( - ValueError, - "remainder keyword needs to be one of \'drop\', \'passthrough\', " - "or estimator.", ct.fit, X_array) - assert_raise_message( - ValueError, + msg = ( "remainder keyword needs to be one of \'drop\', \'passthrough\', " - "or estimator.", ct.fit_transform, X_array) + "or estimator." + ) + with pytest.raises(ValueError, match=msg): + ct.fit(X_array) + + with pytest.raises(ValueError, match=msg): + ct.fit_transform(X_array) # check default for make_column_transformer ct = make_column_transformer((Trans(), [0])) diff --git a/sklearn/covariance/tests/test_robust_covariance.py b/sklearn/covariance/tests/test_robust_covariance.py index 01f32563710aa..1a6a1508170e7 100644 --- a/sklearn/covariance/tests/test_robust_covariance.py +++ b/sklearn/covariance/tests/test_robust_covariance.py @@ -10,7 +10,6 @@ import pytest from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_raise_message from sklearn import datasets from sklearn.covariance import empirical_covariance, MinCovDet @@ -43,15 +42,17 @@ def test_mcd(): def test_fast_mcd_on_invalid_input(): X = np.arange(100) - assert_raise_message(ValueError, 'Expected 2D array, got 1D array instead', - fast_mcd, X) + msg = 'Expected 2D array, got 1D array instead' + with pytest.raises(ValueError, match=msg): + fast_mcd(X) def test_mcd_class_on_invalid_input(): X = np.arange(100) mcd = MinCovDet() - assert_raise_message(ValueError, 'Expected 2D array, got 1D array instead', - mcd.fit, X) + msg = 'Expected 2D array, got 1D array instead' + with pytest.raises(ValueError, match=msg): + mcd.fit(X) def launch_mcd_on_dataset(n_samples, n_features, n_outliers, tol_loc, tol_cov, @@ -133,7 +134,8 @@ def test_mcd_support_covariance_is_zero(): msg = ('The covariance matrix of the support data is equal to 0, try to ' 'increase support_fraction') for X in [X_1, X_2]: - assert_raise_message(ValueError, msg, MinCovDet().fit, X) + with pytest.raises(ValueError, match=msg): + MinCovDet().fit(X) def test_mcd_increasing_det_warning(): diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index dac0762eb2160..663d2ae3088ed 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -22,8 +22,6 @@ _get_local_path, _retry_with_clean_cache, _feature_to_dtype) -from sklearn.utils._testing import (assert_warns_message, - assert_raise_message) from sklearn.utils import is_scalar_nan from sklearn.utils._testing import assert_allclose, assert_array_equal from urllib.error import HTTPError @@ -888,21 +886,20 @@ def test_fetch_openml_australian(monkeypatch, gzip_response): expected_features = 14 expected_missing = 0 _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) - assert_warns_message( - UserWarning, - "Version 1 of dataset Australian is inactive,", - _fetch_dataset_from_openml, - **{'data_id': data_id, 'data_name': data_name, - 'data_version': data_version, - 'target_column': target_column, - 'expected_observations': expected_observations, - 'expected_features': expected_features, - 'expected_missing': expected_missing, - 'expect_sparse': True, - 'expected_data_dtype': np.float64, - 'expected_target_dtype': object, - 'compare_default_target': False} # numpy 
specific check - ) + msg = "Version 1 of dataset Australian is inactive," + with pytest.warns(UserWarning, match=msg): + _fetch_dataset_from_openml( + **{'data_id': data_id, 'data_name': data_name, + 'data_version': data_version, + 'target_column': target_column, + 'expected_observations': expected_observations, + 'expected_features': expected_features, + 'expected_missing': expected_missing, + 'expect_sparse': True, + 'expected_data_dtype': np.float64, + 'expected_target_dtype': object, + 'compare_default_target': False} # numpy specific check + ) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1095,14 +1092,14 @@ def test_fetch_openml_inactive(monkeypatch, gzip_response): # fetch inactive dataset by id data_id = 40675 _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) - glas2 = assert_warns_message( - UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml, - data_id=data_id, cache=False, as_frame=False) + msg = "Version 1 of dataset glass2 is inactive," + with pytest.warns(UserWarning, match=msg): + glas2 = fetch_openml(data_id=data_id, cache=False, as_frame=False) # fetch inactive dataset by name and version assert glas2.data.shape == (163, 9) - glas2_by_version = assert_warns_message( - UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml, - data_id=None, name="glass2", version=1, cache=False, as_frame=False) + with pytest.warns(UserWarning, match=msg): + glas2_by_version = fetch_openml(data_id=None, name='glass2', + cache=False, version=1, as_frame=False) assert int(glas2_by_version.details['id']) == data_id @@ -1112,8 +1109,9 @@ def test_fetch_nonexiting(monkeypatch, gzip_response): data_id = 40675 _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) # Note that we only want to search by name (not data id) - assert_raise_message(ValueError, "No active dataset glass2 found", - fetch_openml, name='glass2', cache=False) + msg = "No active dataset glass2 found" + with pytest.raises(ValueError, match=msg): + fetch_openml(name='glass2', cache=False) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1122,10 +1120,10 @@ def test_raises_illegal_multitarget(monkeypatch, gzip_response): targets = ['sepalwidth', 'class'] _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) # Note that we only want to search by name (not data id) - assert_raise_message(ValueError, - "Can only handle homogeneous multi-target datasets,", - fetch_openml, data_id=data_id, - target_column=targets, cache=False) + msg = "Can only handle homogeneous multi-target datasets," + with pytest.raises(ValueError, match=msg): + fetch_openml(data_id=data_id, target_column=targets, + cache=False) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1135,23 +1133,27 @@ def test_warn_ignore_attribute(monkeypatch, gzip_response): expected_ignore_msg = "target_column={} has flag is_ignore." 
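# (Aside, not part of this patch or of the test file: a self-contained
# reminder of the idiom used throughout this conversion. ``pytest.raises`` and
# ``pytest.warns`` treat ``match`` as a regular expression that is searched in
# the message, which is why literal messages containing metacharacters such as
# '(' are wrapped in ``re.escape`` elsewhere in the diff. The message below is
# borrowed from the NMF tests further down; the raised ValueError is
# illustrative.)
import re
import pytest

msg = "got (n_components=1.5)"   # '(' and ')' would otherwise be regex groups
with pytest.raises(ValueError, match=re.escape(msg)):
    raise ValueError("Invalid parameter: got (n_components=1.5)")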
_monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) # single column test - assert_warns_message(UserWarning, expected_row_id_msg.format('MouseID'), - fetch_openml, data_id=data_id, - target_column='MouseID', - cache=False, as_frame=False) - assert_warns_message(UserWarning, expected_ignore_msg.format('Genotype'), - fetch_openml, data_id=data_id, - target_column='Genotype', - cache=False, as_frame=False) + target_col = 'MouseID' + msg = expected_row_id_msg.format(target_col) + with pytest.warns(UserWarning, match=msg): + fetch_openml(data_id=data_id, target_column=target_col, + cache=False, as_frame=False) + target_col = 'Genotype' + msg = expected_ignore_msg.format(target_col) + with pytest.warns(UserWarning, match=msg): + fetch_openml(data_id=data_id, target_column=target_col, + cache=False, as_frame=False) # multi column test - assert_warns_message(UserWarning, expected_row_id_msg.format('MouseID'), - fetch_openml, data_id=data_id, - target_column=['MouseID', 'class'], - cache=False, as_frame=False) - assert_warns_message(UserWarning, expected_ignore_msg.format('Genotype'), - fetch_openml, data_id=data_id, - target_column=['Genotype', 'class'], - cache=False, as_frame=False) + target_col = 'MouseID' + msg = expected_row_id_msg.format(target_col) + with pytest.warns(UserWarning, match=msg): + fetch_openml(data_id=data_id, target_column=[target_col, 'class'], + cache=False, as_frame=False) + target_col = 'Genotype' + msg = expected_ignore_msg.format(target_col) + with pytest.warns(UserWarning, match=msg): + fetch_openml(data_id=data_id, target_column=[target_col, 'class'], + cache=False, as_frame=False) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1159,73 +1161,77 @@ def test_string_attribute_without_dataframe(monkeypatch, gzip_response): data_id = 40945 _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) # single column test - assert_raise_message(ValueError, - ('STRING attributes are not supported for ' - 'array representation. Try as_frame=True'), - fetch_openml, data_id=data_id, cache=False, - as_frame=False) + msg = ( + 'STRING attributes are not supported for ' + 'array representation. Try as_frame=True' + ) + with pytest.raises(ValueError, match=msg): + fetch_openml(data_id=data_id, cache=False, as_frame=False) @pytest.mark.parametrize('gzip_response', [True, False]) def test_dataset_with_openml_error(monkeypatch, gzip_response): data_id = 1 _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) - assert_warns_message( - UserWarning, + msg = ( "OpenML registered a problem with the dataset. It might be unusable. " - "Error:", - fetch_openml, data_id=data_id, cache=False, as_frame=False + "Error:" ) + with pytest.warns(UserWarning, match=msg): + fetch_openml(data_id=data_id, cache=False, as_frame=False) @pytest.mark.parametrize('gzip_response', [True, False]) def test_dataset_with_openml_warning(monkeypatch, gzip_response): data_id = 3 _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) - assert_warns_message( - UserWarning, + msg = ( "OpenML raised a warning on the dataset. It might be unusable. 
" - "Warning:", - fetch_openml, data_id=data_id, cache=False, as_frame=False + "Warning:" ) + with pytest.warns(UserWarning, match=msg): + fetch_openml(data_id=data_id, cache=False, as_frame=False) @pytest.mark.parametrize('gzip_response', [True, False]) def test_illegal_column(monkeypatch, gzip_response): data_id = 61 _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) - assert_raise_message(KeyError, "Could not find target_column=", - fetch_openml, data_id=data_id, - target_column='undefined', cache=False) + msg = "Could not find target_column=" + with pytest.raises(KeyError, match=msg): + fetch_openml(data_id=data_id, target_column='undefined', cache=False) - assert_raise_message(KeyError, "Could not find target_column=", - fetch_openml, data_id=data_id, - target_column=['undefined', 'class'], - cache=False) + with pytest.raises(KeyError, match=msg): + fetch_openml(data_id=data_id, target_column=['undefined', 'class'], + cache=False) @pytest.mark.parametrize('gzip_response', [True, False]) def test_fetch_openml_raises_missing_values_target(monkeypatch, gzip_response): data_id = 2 _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) - assert_raise_message(ValueError, "Target column ", - fetch_openml, data_id=data_id, target_column='family') + msg = 'Target column ' + with pytest.raises(ValueError, match=msg): + fetch_openml(data_id=data_id, target_column='family') def test_fetch_openml_raises_illegal_argument(): - assert_raise_message(ValueError, "Dataset data_id=", - fetch_openml, data_id=-1, name="name") + msg = 'Dataset data_id=' + with pytest.raises(ValueError, match=msg): + fetch_openml(data_id=-1, name="name") - assert_raise_message(ValueError, "Dataset data_id=", - fetch_openml, data_id=-1, name=None, - version="version") + with pytest.raises(ValueError, match=msg): + fetch_openml(data_id=-1, name=None, version="version") - assert_raise_message(ValueError, "Dataset data_id=", - fetch_openml, data_id=-1, name="name", - version="version") + with pytest.raises(ValueError, match=msg): + fetch_openml(data_id=-1, name="name", version="version") - assert_raise_message(ValueError, "Neither name nor data_id are provided. " - "Please provide name or data_id.", fetch_openml) + msg = ( + "Neither name nor data_id are provided. " + "Please provide name or data_id." + ) + with pytest.raises(ValueError, match=msg): + fetch_openml() @pytest.mark.parametrize('gzip_response', [True, False]) diff --git a/sklearn/datasets/tests/test_samples_generator.py b/sklearn/datasets/tests/test_samples_generator.py index fcdb1222bd116..df8989b69f59c 100644 --- a/sklearn/datasets/tests/test_samples_generator.py +++ b/sklearn/datasets/tests/test_samples_generator.py @@ -1,4 +1,5 @@ +import re from collections import defaultdict from functools import partial @@ -9,7 +10,6 @@ from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_raise_message from sklearn.datasets import make_classification from sklearn.datasets import make_multilabel_classification @@ -337,21 +337,22 @@ def test_make_blobs_error(): n_samples = [20, 20, 20] centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]]) cluster_stds = np.array([0.05, 0.2, 0.4]) - wrong_centers_msg = ("Length of `n_samples` not consistent " - "with number of centers. 
Got n_samples = {} " - "and centers = {}".format(n_samples, centers[:-1])) - assert_raise_message(ValueError, wrong_centers_msg, - make_blobs, n_samples, centers=centers[:-1]) - wrong_std_msg = ("Length of `clusters_std` not consistent with " - "number of centers. Got centers = {} " - "and cluster_std = {}".format(centers, cluster_stds[:-1])) - assert_raise_message(ValueError, wrong_std_msg, - make_blobs, n_samples, - centers=centers, cluster_std=cluster_stds[:-1]) + wrong_centers_msg = re.escape( + "Length of `n_samples` not consistent with number of centers. " + f"Got n_samples = {n_samples} and centers = {centers[:-1]}" + ) + with pytest.raises(ValueError, match=wrong_centers_msg): + make_blobs(n_samples, centers=centers[:-1]) + wrong_std_msg = re.escape( + "Length of `clusters_std` not consistent with number of centers. " + f"Got centers = {centers} and cluster_std = {cluster_stds[:-1]}" + ) + with pytest.raises(ValueError, match=wrong_std_msg): + make_blobs(n_samples, centers=centers, cluster_std=cluster_stds[:-1]) wrong_type_msg = ("Parameter `centers` must be array-like. " "Got {!r} instead".format(3)) - assert_raise_message(ValueError, wrong_type_msg, - make_blobs, n_samples, centers=3) + with pytest.raises(ValueError, match=wrong_type_msg): + make_blobs(n_samples, centers=3) def test_make_friedman1(): diff --git a/sklearn/decomposition/tests/test_nmf.py b/sklearn/decomposition/tests/test_nmf.py index 88c1ba406ad99..8bf0feb0b630d 100644 --- a/sklearn/decomposition/tests/test_nmf.py +++ b/sklearn/decomposition/tests/test_nmf.py @@ -1,3 +1,5 @@ +import re + import numpy as np import scipy.sparse as sp @@ -8,7 +10,6 @@ import pytest -from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_almost_equal @@ -45,34 +46,43 @@ def test_parameter_checking(): # FIXME : should be removed in 1.1 init = 'nndsvda' msg = "Invalid solver parameter: got 'spam' instead of one of" - assert_raise_message(ValueError, msg, NMF(solver=name, init=init).fit, A) + with pytest.raises(ValueError, match=msg): + NMF(solver=name, init=init).fit(A) msg = "Invalid init parameter: got 'spam' instead of one of" - assert_raise_message(ValueError, msg, NMF(init=name).fit, A) + with pytest.raises(ValueError, match=msg): + NMF(init=name).fit(A) msg = "Invalid regularization parameter: got 'spam' instead of one of" - assert_raise_message(ValueError, msg, NMF(regularization=name, - init=init).fit, A) + with pytest.raises(ValueError, match=msg): + NMF(regularization=name, init=init).fit(A) msg = "Invalid beta_loss parameter: got 'spam' instead of one" - assert_raise_message(ValueError, msg, NMF(solver='mu', init=init, - beta_loss=name).fit, A) - msg = "Invalid beta_loss parameter: solver 'cd' does not handle " - msg += "beta_loss = 1.0" - assert_raise_message(ValueError, msg, NMF(solver='cd', init=init, - beta_loss=1.0).fit, A) + with pytest.raises(ValueError, match=msg): + NMF(solver='mu', init=init, beta_loss=name).fit(A) + msg = ( + "Invalid beta_loss parameter: solver 'cd' does not handle " + "beta_loss = 1.0" + ) + with pytest.raises(ValueError, match=msg): + NMF(solver='cd', init=init, beta_loss=1.0).fit(A) msg = "Negative values in data passed to" - assert_raise_message(ValueError, msg, NMF(init=init).fit, -A) - assert_raise_message(ValueError, msg, nmf._initialize_nmf, -A, - 2, 'nndsvd') + with pytest.raises(ValueError, match=msg): + NMF(init=init).fit(-A) + 
with pytest.raises(ValueError, match=msg): + nmf._initialize_nmf(-A, 2, 'nndsvd') clf = NMF(2, tol=0.1, init=init).fit(A) - assert_raise_message(ValueError, msg, clf.transform, -A) + with pytest.raises(ValueError, match=msg): + clf.transform(-A) for init in ['nndsvd', 'nndsvda', 'nndsvdar']: - msg = ("init = '{}' can only be used when " - "n_components <= min(n_samples, n_features)" - .format(init)) - assert_raise_message(ValueError, msg, NMF(3, init=init).fit, A) - assert_raise_message(ValueError, msg, nmf._initialize_nmf, A, - 3, init) + msg = re.escape( + "init = '{}' can only be used when " + "n_components <= min(n_samples, n_features)" + .format(init) + ) + with pytest.raises(ValueError, match=msg): + NMF(3, init=init).fit(A) + with pytest.raises(ValueError, match=msg): + nmf._initialize_nmf(A, 3, init) def test_initialize_close(): @@ -257,21 +267,30 @@ def test_non_negative_factorization_checking(): A = np.ones((2, 2)) # Test parameters checking is public function nnmf = non_negative_factorization - msg = ("Number of components must be a positive integer; " - "got (n_components=1.5)") - assert_raise_message(ValueError, msg, nnmf, A, A, A, 1.5, init='random') - msg = ("Number of components must be a positive integer; " - "got (n_components='2')") - assert_raise_message(ValueError, msg, nnmf, A, A, A, '2', init='random') - msg = "Negative values in data passed to NMF (input H)" - assert_raise_message(ValueError, msg, nnmf, A, A, -A, 2, init='custom') - msg = "Negative values in data passed to NMF (input W)" - assert_raise_message(ValueError, msg, nnmf, A, -A, A, 2, init='custom') - msg = "Array passed to NMF (input H) is full of zeros" - assert_raise_message(ValueError, msg, nnmf, A, A, 0 * A, 2, init='custom') + msg = re.escape( + "Number of components must be a positive integer; " + "got (n_components=1.5)" + ) + with pytest.raises(ValueError, match=msg): + nnmf(A, A, A, 1.5, init='random') + msg = re.escape( + "Number of components must be a positive integer; " + "got (n_components='2')" + ) + with pytest.raises(ValueError, match=msg): + nnmf(A, A, A, '2', init='random') + msg = re.escape("Negative values in data passed to NMF (input H)") + with pytest.raises(ValueError, match=msg): + nnmf(A, A, -A, 2, init='custom') + msg = re.escape("Negative values in data passed to NMF (input W)") + with pytest.raises(ValueError, match=msg): + nnmf(A, -A, A, 2, init='custom') + msg = re.escape("Array passed to NMF (input H) is full of zeros") + with pytest.raises(ValueError, match=msg): + nnmf(A, A, 0 * A, 2, init='custom') msg = "Invalid regularization parameter: got 'spam' instead of one of" - assert_raise_message(ValueError, msg, nnmf, A, A, 0 * A, 2, init='custom', - regularization='spam') + with pytest.raises(ValueError, match=msg): + nnmf(A, A, 0 * A, 2, init='custom', regularization='spam') def _beta_divergence_dense(X, W, H, beta): @@ -425,7 +444,8 @@ def _assert_nmf_no_nan(X, beta_loss): msg = "When beta_loss <= 0 and X contains zeros, the solver may diverge." 
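# (Aside, not part of this patch: why the guard exercised below exists. For
# beta_loss == 0 the beta-divergence reduces to the Itakura-Saito divergence,
# whose log(x / y) term is infinite when an entry x of X is exactly zero, and
# for beta_loss < 0 the x**beta_loss term blows up as well, so the solver
# rejects zeros unless the data is shifted, as the test does with X + 1e-9.
# The strictly positive toy matrix below is illustrative.)
import numpy as np
from sklearn.decomposition import NMF

rng = np.random.RandomState(0)
X_pos = np.abs(rng.randn(6, 5)) + 1e-9   # strictly positive data
NMF(n_components=2, solver="mu", beta_loss="itakura-saito",
    init="random", max_iter=300, random_state=0).fit(X_pos)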
for beta_loss in (-0.6, 0.): - assert_raise_message(ValueError, msg, _assert_nmf_no_nan, X, beta_loss) + with pytest.raises(ValueError, match=msg): + _assert_nmf_no_nan(X, beta_loss) _assert_nmf_no_nan(X + 1e-9, beta_loss) for beta_loss in (0.2, 1., 1.2, 2., 2.5): From c67518350f91072f9d37ed09c5ef7edf555b6cf6 Mon Sep 17 00:00:00 2001 From: yoch Date: Thu, 20 May 2021 17:27:08 +0300 Subject: [PATCH 409/478] DOC use reshape instead of manually reshaping in plot_color_quantization (#19960) --- examples/cluster/plot_color_quantization.py | 24 ++++++++------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/examples/cluster/plot_color_quantization.py b/examples/cluster/plot_color_quantization.py index ccc45eff73306..384e58f75e328 100644 --- a/examples/cluster/plot_color_quantization.py +++ b/examples/cluster/plot_color_quantization.py @@ -50,36 +50,30 @@ print("Fitting model on a small sub-sample of the data") t0 = time() -image_array_sample = shuffle(image_array, random_state=0)[:1000] +image_array_sample = shuffle(image_array, random_state=0, n_samples=1_000) kmeans = KMeans(n_clusters=n_colors, random_state=0).fit(image_array_sample) -print("done in %0.3fs." % (time() - t0)) +print(f"done in {time() - t0:0.3f}s.") # Get labels for all points print("Predicting color indices on the full image (k-means)") t0 = time() labels = kmeans.predict(image_array) -print("done in %0.3fs." % (time() - t0)) +print(f"done in {time() - t0:0.3f}s.") -codebook_random = shuffle(image_array, random_state=0)[:n_colors] +codebook_random = shuffle(image_array, random_state=0, n_samples=n_colors) print("Predicting color indices on the full image (random)") t0 = time() labels_random = pairwise_distances_argmin(codebook_random, image_array, axis=0) -print("done in %0.3fs." % (time() - t0)) +print(f"done in {time() - t0:0.3f}s.") def recreate_image(codebook, labels, w, h): """Recreate the (compressed) image from the code book & labels""" - d = codebook.shape[1] - image = np.zeros((w, h, d)) - label_idx = 0 - for i in range(w): - for j in range(h): - image[i][j] = codebook[labels[label_idx]] - label_idx += 1 - return image + return codebook[labels].reshape(w, h, -1) + # Display all results, alongside original image plt.figure(1) @@ -91,12 +85,12 @@ def recreate_image(codebook, labels, w, h): plt.figure(2) plt.clf() plt.axis('off') -plt.title('Quantized image (64 colors, K-Means)') +plt.title(f'Quantized image ({n_colors} colors, K-Means)') plt.imshow(recreate_image(kmeans.cluster_centers_, labels, w, h)) plt.figure(3) plt.clf() plt.axis('off') -plt.title('Quantized image (64 colors, Random)') +plt.title(f'Quantized image ({n_colors} colors, Random)') plt.imshow(recreate_image(codebook_random, labels_random, w, h)) plt.show() From e8e719dc8acfb58446f0d1fa92e5f9ef7dd1ad0c Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Fri, 21 May 2021 14:41:15 +0200 Subject: [PATCH 410/478] [DOC] Update roadmap. (#20116) --- doc/roadmap.rst | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/doc/roadmap.rst b/doc/roadmap.rst index 30c9f58339502..2bead90522739 100644 --- a/doc/roadmap.rst +++ b/doc/roadmap.rst @@ -70,16 +70,16 @@ the document up to date as we work on these issues. 
* document current handling * column reordering issue :issue:`7242` - * avoiding unnecessary conversion to ndarray :issue:`12147` + * avoiding unnecessary conversion to ndarray |ss| :issue:`12147` |se| * returning DataFrames from transformers :issue:`5523` - * getting DataFrames from dataset loaders :issue:`10733`, + * getting DataFrames from dataset loaders |ss| :issue:`10733` |se|, |ss| :issue:`13902` |se| - * Sparse currently not considered :issue:`12800` + * Sparse currently not considered |ss| :issue:`12800` |se| #. Improved handling of categorical features * Tree-based models should be able to handle both continuous and categorical - features :issue:`12866` and :issue:`15550`. + features :issue:`12866` and |ss| :issue:`15550` |se|. * |ss| In dataset loaders :issue:`13902` |se| * As generic transformers to be used with ColumnTransforms (e.g. ordinal encoding supervised by correlation with target variable) :issue:`5853`, @@ -89,7 +89,7 @@ the document up to date as we work on these issues. #. Improved handling of missing data * Making sure meta-estimators are lenient towards missing data, - :issue:`15319` + |ss| :issue:`15319` |se| * Non-trivial imputers |ss| :issue:`11977`, :issue:`12852` |se| * Learners directly handling missing data |ss| :issue:`13911` |se| * An amputation sample generator to make parts of a dataset go missing @@ -125,19 +125,20 @@ the document up to date as we work on these issues. components * More flexible estimator checks that do not select by estimator name - :issue:`6599` :issue:`6715` - * Example of how to develop an estimator or a meta-estimator, :issue:`14582` + |ss| :issue:`6599` |se| :issue:`6715` + * Example of how to develop an estimator or a meta-estimator, + |ss| :issue:`14582` |se| * More self-sufficient running of scikit-learn-contrib or a similar resource #. Support resampling and sample reduction * Allow subsampling of majority classes (in a pipeline?) :issue:`3855` - * Implement random forests with resampling :issue:`8732` + * Implement random forests with resampling :issue:`13227` #. Better interfaces for interactive development - * |ss| __repr__ |se| and HTML visualisations of estimators - |ss| :issue:`6323` |se| and :pr:`14180`. + * |ss| __repr__ and HTML visualisations of estimators + :issue:`6323` and :pr:`14180` |se|. * Include plotting tools, not just as examples. :issue:`9173` #. Improved tools for model diagnostics and basic inference @@ -249,7 +250,7 @@ Subpackage-specific goals * perhaps we want to be able to get back more than multiple metrics * the handling of random states in CV splitters is a poor design and contradicts the validation of similar parameters in estimators, - :issue:`15177` + `SLEP011 `_ * exploit warm-starting and path algorithms so the benefits of `EstimatorCV` objects can be accessed via `GridSearchCV` and used in Pipelines. :issue:`1626` From 36a4dcafedbcbb112e1d96fd04e73ba922523bae Mon Sep 17 00:00:00 2001 From: Bryan Chen Date: Fri, 21 May 2021 09:59:40 -0400 Subject: [PATCH 411/478] DOC Add Evalml to scikit-learn related projects (#20109) --- doc/related_projects.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/related_projects.rst b/doc/related_projects.rst index 5d50196000e44..0cef93f0fd196 100644 --- a/doc/related_projects.rst +++ b/doc/related_projects.rst @@ -58,6 +58,12 @@ enhance the functionality of scikit-learn's estimators. it can stream minibatches, use data checkpoints, build funky pipelines, and serialize models with custom per-step savers. 
+- `EvalML `_ + EvalML is an AutoML library which builds, optimizes, and evaluates + machine learning pipelines using domain-specific objective functions. + It incorporates multiple modeling libraries under one API, and + the objects that EvalML creates use an sklearn-compatible API. + **Experimentation frameworks** - `Sacred `_ Tool to help you configure, From 5081c2fcd28863dfaa28e4633b39a1c2a2906e3c Mon Sep 17 00:00:00 2001 From: Nate Parsons <4307001+thehomebrewnerd@users.noreply.github.com> Date: Sat, 22 May 2021 08:03:10 -0500 Subject: [PATCH 412/478] DOC Update Featuretools link in Related Project Page (#20120) --- doc/related_projects.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/related_projects.rst b/doc/related_projects.rst index 0cef93f0fd196..033d53ddb94ee 100644 --- a/doc/related_projects.rst +++ b/doc/related_projects.rst @@ -46,7 +46,7 @@ enhance the functionality of scikit-learn's estimators. preprocessors as well as the estimators. Works as a drop-in replacement for a scikit-learn estimator. -- `Featuretools `_ +- `Featuretools `_ A framework to perform automated feature engineering. It can be used for transforming temporal and relational datasets into feature matrices for machine learning. From aa898de885ed4861a03e4f79b28f92f70914643d Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Sat, 22 May 2021 19:55:22 +0100 Subject: [PATCH 413/478] TST Changes assert to pytest style in svm, manifold, linear_model, feature_extraction, decomposition (#19999) Changed the assert_raises, assert_raise_message, assert_warns in the following files: * test_factor_analysis.py * test_text.py * test_bayes.py * test_ransac.py * test_sag.py * test_locally_linear.py * test_bounds.py * test_sparse.py * test_svm.py --- .../tests/test_factor_analysis.py | 12 +++--- sklearn/feature_extraction/tests/test_text.py | 37 +++++++++++-------- sklearn/linear_model/tests/test_bayes.py | 4 +- sklearn/linear_model/tests/test_ransac.py | 13 ++++--- sklearn/linear_model/tests/test_sag.py | 26 +++++++------ sklearn/manifold/tests/test_locally_linear.py | 7 ++-- sklearn/svm/tests/test_bounds.py | 7 ++-- sklearn/svm/tests/test_sparse.py | 8 ++-- sklearn/svm/tests/test_svm.py | 27 +++++++++----- 9 files changed, 80 insertions(+), 61 deletions(-) diff --git a/sklearn/decomposition/tests/test_factor_analysis.py b/sklearn/decomposition/tests/test_factor_analysis.py index f889e49ea4a3a..45d4de948039d 100644 --- a/sklearn/decomposition/tests/test_factor_analysis.py +++ b/sklearn/decomposition/tests/test_factor_analysis.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from sklearn.utils._testing import assert_warns -from sklearn.utils._testing import assert_raises from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.exceptions import ConvergenceWarning @@ -69,14 +67,16 @@ def test_factor_analysis(): with pytest.raises(ValueError): fa.fit(X[:, :2]) - f = lambda x, y: np.abs(getattr(x, y)) # sign will not be equal + def f(x, y): + return np.abs(getattr(x, y)) # sign will not be equal fa1, fa2 = fas for attr in ['loglike_', 'components_', 'noise_variance_']: assert_almost_equal(f(fa1, attr), f(fa2, attr)) fa1.max_iter = 1 fa1.verbose = True - assert_warns(ConvergenceWarning, fa1.fit, X) + with pytest.warns(ConvergenceWarning): + fa1.fit(X) # Test get_covariance and get_precision with n_components == n_features # with n_components < n_features and with n_components == 0 @@ -101,8 +101,8 @@ def test_factor_analysis(): 
assert not np.allclose(results[rot1], results[rot2]) assert np.allclose(projections[rot1], projections[rot2], atol=3) - assert_raises(ValueError, - FactorAnalysis(rotation='not_implemented').fit_transform, X) + with pytest.raises(ValueError): + FactorAnalysis(rotation="not_implemented").fit_transform(X) # test against R's psych::principal with rotate="varimax" # (i.e., the values below stem from rotating the components in R) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 767b04ddb5d95..0033ae84948ac 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -29,7 +29,6 @@ from numpy.testing import assert_array_equal from sklearn.utils import IS_PYPY from sklearn.utils._testing import (assert_almost_equal, - assert_raise_message, fails_if_pypy, assert_allclose_dense_sparse, skip_if_32bit) @@ -1093,7 +1092,8 @@ def func(): hv = HashingVectorizer() hv.fit_transform(['hello world', np.nan, 'hello hello']) - assert_raise_message(exception, message, func) + with pytest.raises(exception, match=message): + func() def test_tfidfvectorizer_binary(): @@ -1127,11 +1127,16 @@ def test_vectorizer_string_object_as_input(Vectorizer): message = ("Iterable over raw text documents expected, " "string object received.") vec = Vectorizer() - assert_raise_message( - ValueError, message, vec.fit_transform, "hello world!") - assert_raise_message(ValueError, message, vec.fit, "hello world!") + + with pytest.raises(ValueError, match=message): + vec.fit_transform("hello world!") + + with pytest.raises(ValueError, match=message): + vec.fit("hello world!") vec.fit(["some text", "some other text"]) - assert_raise_message(ValueError, message, vec.transform, "hello world!") + + with pytest.raises(ValueError, match=message): + vec.transform("hello world!") @pytest.mark.parametrize("X_dtype", [np.float32, np.float64]) @@ -1186,20 +1191,22 @@ def test_vectorizers_invalid_ngram_range(vec): # vectorizers could be initialized with invalid ngram range # test for raising error message invalid_range = vec.ngram_range - message = ("Invalid value for ngram_range=%s " - "lower boundary larger than the upper boundary." - % str(invalid_range)) + message = re.escape( + f"Invalid value for ngram_range={invalid_range} " + "lower boundary larger than the upper boundary." 
+ ) if isinstance(vec, HashingVectorizer) and IS_PYPY: pytest.xfail(reason='HashingVectorizer is not supported on PyPy') - assert_raise_message( - ValueError, message, vec.fit, ["good news everyone"]) - assert_raise_message( - ValueError, message, vec.fit_transform, ["good news everyone"]) + with pytest.raises(ValueError, match=message): + vec.fit(['good news everyone']) + + with pytest.raises(ValueError, match=message): + vec.fit_transform(['good news everyone']) if isinstance(vec, HashingVectorizer): - assert_raise_message( - ValueError, message, vec.transform, ["good news everyone"]) + with pytest.raises(ValueError, match=message): + vec.transform(['good news everyone']) def _check_stop_words_consistency(estimator): diff --git a/sklearn/linear_model/tests/test_bayes.py b/sklearn/linear_model/tests/test_bayes.py index a22a0243cdcb7..fab87c5adf007 100644 --- a/sklearn/linear_model/tests/test_bayes.py +++ b/sklearn/linear_model/tests/test_bayes.py @@ -13,7 +13,6 @@ from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_less -from sklearn.utils._testing import assert_raise_message from sklearn.utils import check_random_state from sklearn.linear_model import BayesianRidge, ARDRegression from sklearn.linear_model import Ridge @@ -29,7 +28,8 @@ def test_n_iter(): y = np.array([1, 2, 6, 8, 10]) clf = BayesianRidge(n_iter=0) msg = "n_iter should be greater than or equal to 1." - assert_raise_message(ValueError, msg, clf.fit, X, y) + with pytest.raises(ValueError, match=msg): + clf.fit(X, y) def test_bayesian_ridge_scores(): diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index 071a67efcf28f..da7167c0feb2a 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -6,7 +6,6 @@ from numpy.testing import assert_array_equal from sklearn.utils import check_random_state -from sklearn.utils._testing import assert_raises_regexp from sklearn.utils._testing import assert_allclose from sklearn.datasets import make_regression from sklearn.linear_model import LinearRegression, RANSACRegressor @@ -159,7 +158,8 @@ def test_ransac_resid_thresh_no_inliers(): max_trials=5) msg = ("RANSAC could not find a valid consensus set") - assert_raises_regexp(ValueError, msg, ransac_estimator.fit, X, y) + with pytest.raises(ValueError, match=msg): + ransac_estimator.fit(X, y) assert ransac_estimator.n_skips_no_inliers_ == 5 assert ransac_estimator.n_skips_invalid_data_ == 0 assert ransac_estimator.n_skips_invalid_model_ == 0 @@ -175,7 +175,8 @@ def is_data_valid(X, y): max_trials=5) msg = ("RANSAC could not find a valid consensus set") - assert_raises_regexp(ValueError, msg, ransac_estimator.fit, X, y) + with pytest.raises(ValueError, match=msg): + ransac_estimator.fit(X, y) assert ransac_estimator.n_skips_no_inliers_ == 0 assert ransac_estimator.n_skips_invalid_data_ == 5 assert ransac_estimator.n_skips_invalid_model_ == 0 @@ -191,7 +192,8 @@ def is_model_valid(estimator, X, y): max_trials=5) msg = ("RANSAC could not find a valid consensus set") - assert_raises_regexp(ValueError, msg, ransac_estimator.fit, X, y) + with pytest.raises(ValueError, match=msg): + ransac_estimator.fit(X, y) assert ransac_estimator.n_skips_no_inliers_ == 0 assert ransac_estimator.n_skips_invalid_data_ == 0 assert ransac_estimator.n_skips_invalid_model_ == 5 @@ -208,7 +210,8 @@ def is_data_valid(X, y): max_skips=3) msg = ("RANSAC skipped more 
iterations than `max_skips`") - assert_raises_regexp(ValueError, msg, ransac_estimator.fit, X, y) + with pytest.raises(ValueError, match=msg): + ransac_estimator.fit(X, y) assert ransac_estimator.n_skips_no_inliers_ == 0 assert ransac_estimator.n_skips_invalid_data_ == 4 assert ransac_estimator.n_skips_invalid_model_ == 0 diff --git a/sklearn/linear_model/tests/test_sag.py b/sklearn/linear_model/tests/test_sag.py index 22432185cc09b..62a7175271bd8 100644 --- a/sklearn/linear_model/tests/test_sag.py +++ b/sklearn/linear_model/tests/test_sag.py @@ -4,6 +4,7 @@ # License: BSD 3 clause import math +import re import pytest import numpy as np import scipy.sparse as sp @@ -19,7 +20,6 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose -from sklearn.utils._testing import assert_raise_message from sklearn.utils import compute_class_weight from sklearn.utils import check_random_state from sklearn.preprocessing import LabelEncoder, LabelBinarizer @@ -449,8 +449,8 @@ def test_get_auto_step_size(): assert_almost_equal(step_size_log, step_size_log_, decimal=4) msg = 'Unknown loss function for SAG solver, got wrong instead of' - assert_raise_message(ValueError, msg, get_auto_step_size, - max_squared_sum_, alpha, "wrong", fit_intercept) + with pytest.raises(ValueError, match=msg): + get_auto_step_size(max_squared_sum_, alpha, "wrong", fit_intercept) @pytest.mark.parametrize("seed", range(3)) # locally tested with 1000 seeds @@ -737,11 +737,9 @@ def test_classifier_single_class(): X = [[1, 2], [3, 4]] y = [1, 1] - assert_raise_message(ValueError, - "This solver needs samples of at least 2 classes " - "in the data", - LogisticRegression(solver='sag').fit, - X, y) + msg = "This solver needs samples of at least 2 classes in the data" + with pytest.raises(ValueError, match=msg): + LogisticRegression(solver='sag').fit(X, y) def test_step_size_alpha_error(): @@ -749,15 +747,19 @@ def test_step_size_alpha_error(): y = [1, -1] fit_intercept = False alpha = 1. - msg = ("Current sag implementation does not handle the case" - " step_size * alpha_scaled == 1") + msg = re.escape( + "Current sag implementation does not handle the case" + " step_size * alpha_scaled == 1" + ) clf1 = LogisticRegression(solver='sag', C=1. 
/ alpha, fit_intercept=fit_intercept) - assert_raise_message(ZeroDivisionError, msg, clf1.fit, X, y) + with pytest.raises(ZeroDivisionError, match=msg): + clf1.fit(X, y) clf2 = Ridge(fit_intercept=fit_intercept, solver='sag', alpha=alpha) - assert_raise_message(ZeroDivisionError, msg, clf2.fit, X, y) + with pytest.raises(ZeroDivisionError, match=msg): + clf2.fit(X, y) def test_multinomial_loss(): diff --git a/sklearn/manifold/tests/test_locally_linear.py b/sklearn/manifold/tests/test_locally_linear.py index 952da3ef41163..dc5df2f8896aa 100644 --- a/sklearn/manifold/tests/test_locally_linear.py +++ b/sklearn/manifold/tests/test_locally_linear.py @@ -8,7 +8,6 @@ from sklearn import neighbors, manifold from sklearn.manifold._locally_linear import barycenter_kneighbors_graph from sklearn.utils._testing import ignore_warnings -from sklearn.utils._testing import assert_raise_message eigen_solvers = ['dense', 'arpack'] @@ -106,11 +105,13 @@ def test_lle_init_parameters(): clf = manifold.LocallyLinearEmbedding(eigen_solver="error") msg = "unrecognized eigen_solver 'error'" - assert_raise_message(ValueError, msg, clf.fit, X) + with pytest.raises(ValueError, match=msg): + clf.fit(X) clf = manifold.LocallyLinearEmbedding(method="error") msg = "unrecognized method 'error'" - assert_raise_message(ValueError, msg, clf.fit, X) + with pytest.raises(ValueError, match=msg): + clf.fit(X) def test_pipeline(): diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 423d5ed7a7fba..70e6152d7fdea 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -9,8 +9,6 @@ from sklearn.linear_model import LogisticRegression from sklearn.svm._newrand import set_seed_wrap, bounded_rand_int_wrap -from sklearn.utils._testing import assert_raise_message - dense_X = [[-1, 0], [0, 1], [1, 1], [1, 1]] sparse_X = sp.csr_matrix(dense_X) @@ -38,8 +36,9 @@ def test_l1_min_c(loss, X_label, Y_label, intercept_label): def test_l1_min_c_l2_loss(): # loss='l2' should raise ValueError - assert_raise_message(ValueError, "loss type not in", - l1_min_c, dense_X, Y1, loss="l2") + msg = 'loss type not in' + with pytest.raises(ValueError, match=msg): + l1_min_c(dense_X, Y1, loss="l2") def check_l1_min_c(X, y, loss, fit_intercept=True, intercept_scaling=None): diff --git a/sklearn/svm/tests/test_sparse.py b/sklearn/svm/tests/test_sparse.py index bb935e55e1912..5e1196fa84faf 100644 --- a/sklearn/svm/tests/test_sparse.py +++ b/sklearn/svm/tests/test_sparse.py @@ -9,8 +9,7 @@ from sklearn.svm.tests import test_svm from sklearn.exceptions import ConvergenceWarning from sklearn.utils.extmath import safe_sparse_dot -from sklearn.utils._testing import (assert_raise_message, ignore_warnings, - skip_if_32bit) +from sklearn.utils._testing import ignore_warnings, skip_if_32bit # test sample 1 @@ -69,7 +68,8 @@ def check_svm_model_equal(dense_svm, sparse_svm, X_train, y_train, X_test): sparse_svm.predict_proba(X_test), 4) msg = "cannot use sparse input in 'SVC' trained on dense data" if sparse.isspmatrix(X_test): - assert_raise_message(ValueError, msg, dense_svm.predict, X_test) + with pytest.raises(ValueError, match=msg): + dense_svm.predict(X_test) @skip_if_32bit @@ -148,7 +148,7 @@ def test_svc_iris(): for k in ('linear', 'poly', 'rbf'): sp_clf = svm.SVC(kernel=k).fit(iris.data, iris.target) clf = svm.SVC(kernel=k).fit(iris.data.toarray(), - iris.target) + iris.target) assert_array_almost_equal(clf.support_vectors_, sp_clf.support_vectors_.toarray()) diff --git a/sklearn/svm/tests/test_svm.py 
b/sklearn/svm/tests/test_svm.py index 3fe57ad1b8375..97411c8c3c81b 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -19,7 +19,6 @@ from sklearn.metrics import f1_score from sklearn.metrics.pairwise import rbf_kernel from sklearn.utils import check_random_state -from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import ignore_warnings from sklearn.utils.validation import _num_samples from sklearn.utils import shuffle @@ -123,7 +122,8 @@ def test_precomputed(): # same as before, but using a callable function instead of the kernel # matrix. kernel is just a linear kernel - kfunc = lambda x, y: np.dot(x, y.T) + def kfunc(x, y): + return np.dot(x, y.T) clf = svm.SVC(kernel=kfunc) clf.fit(np.array(X), Y) pred = clf.predict(T) @@ -739,13 +739,16 @@ def test_linear_svx_uppercase_loss_penality_raises_error(): X, y = [[0.0], [1.0]], [0, 1] - assert_raise_message(ValueError, "loss='SQuared_hinge' is not supported", - svm.LinearSVC(loss="SQuared_hinge").fit, X, y) + msg = "loss='SQuared_hinge' is not supported" + with pytest.raises(ValueError, match=msg): + svm.LinearSVC(loss="SQuared_hinge").fit(X, y) - assert_raise_message(ValueError, - ("The combination of penalty='L2'" - " and loss='squared_hinge' is not supported"), - svm.LinearSVC(penalty="L2").fit, X, y) + msg = ( + "The combination of penalty='L2'" + " and loss='squared_hinge' is not supported" + ) + with pytest.raises(ValueError, match=msg): + svm.LinearSVC(penalty="L2").fit(X, y) def test_linearsvc(): @@ -1043,10 +1046,12 @@ def test_linear_svc_intercept_scaling(): for i in [-1, 0]: lsvc = svm.LinearSVC(intercept_scaling=i) + msg = ('Intercept scaling is %r but needs to be greater than 0.' ' To disable fitting an intercept,' ' set fit_intercept=False.' % lsvc.intercept_scaling) - assert_raise_message(ValueError, msg, lsvc.fit, X, Y) + with pytest.raises(ValueError, match=msg): + lsvc.fit(X, Y) def test_lsvc_intercept_scaling_zero(): @@ -1076,7 +1081,9 @@ def test_hasattr_predict_proba(): G.probability = True assert hasattr(G, 'predict_proba') msg = "predict_proba is not available when fitted with probability=False" - assert_raise_message(NotFittedError, msg, G.predict_proba, iris.data) + + with pytest.raises(NotFittedError, match=msg): + G.predict_proba(iris.data) def test_decision_function_shape_two_class(): From 5b7136f04068e7dcdf5ae8ec4aa729107ee905c0 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Mon, 24 May 2021 13:46:45 -0400 Subject: [PATCH 414/478] MNT Update license year to 2021 (#20126) --- COPYING | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/COPYING b/COPYING index 558c4c1245615..62bab0b0b5961 100644 --- a/COPYING +++ b/COPYING @@ -1,6 +1,6 @@ BSD 3-Clause License -Copyright (c) 2007-2020 The scikit-learn developers. +Copyright (c) 2007-2021 The scikit-learn developers. All rights reserved. 
Redistribution and use in source and binary forms, with or without From 88be3c1357f98b3f11e6fa1bf20e0ff249b6362e Mon Sep 17 00:00:00 2001 From: tliu68 <54865879+tliu68@users.noreply.github.com> Date: Tue, 25 May 2021 14:23:07 +0800 Subject: [PATCH 415/478] Fix GaussianMixture UnboundLocalError (#20030) --- doc/whats_new/v1.0.rst | 9 ++++ sklearn/mixture/_base.py | 6 +-- .../mixture/tests/test_gaussian_mixture.py | 45 +++++++++++++++++++ 3 files changed, 57 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 34e9f0670ba81..4e7ade1083921 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -377,6 +377,15 @@ Changelog :pr:`18328` by :user:`Albert Villanova del Moral ` and :user:`Alonso Silva Allende `. +:mod:`sklearn.mixture` +.............................. + +- |Fix| Ensure that the best parameters are set appropriately + in the case of divergency for :class:`mixture.GaussianMixture` and + :class:`mixture.BayesianGaussianMixture`. + :pr:`20030` by :user:`Tingshan Liu ` and + :user:`Benjamin Pedigo `. + :mod:`sklearn.model_selection` .............................. diff --git a/sklearn/mixture/_base.py b/sklearn/mixture/_base.py index 6acb6c2e09292..d3414c33eb5d0 100644 --- a/sklearn/mixture/_base.py +++ b/sklearn/mixture/_base.py @@ -203,7 +203,7 @@ def fit_predict(self, X, y=None): do_init = not(self.warm_start and hasattr(self, 'converged_')) n_init = self.n_init if do_init else 1 - max_lower_bound = -np.infty + max_lower_bound = -np.inf self.converged_ = False random_state = check_random_state(self.random_state) @@ -215,7 +215,7 @@ def fit_predict(self, X, y=None): if do_init: self._initialize_parameters(X, random_state) - lower_bound = (-np.infty if do_init else self.lower_bound_) + lower_bound = (-np.inf if do_init else self.lower_bound_) for n_iter in range(1, self.max_iter + 1): prev_lower_bound = lower_bound @@ -234,7 +234,7 @@ def fit_predict(self, X, y=None): self._print_verbose_msg_init_end(lower_bound) - if lower_bound > max_lower_bound: + if lower_bound > max_lower_bound or max_lower_bound == -np.inf: max_lower_bound = lower_bound best_params = self._get_parameters() best_n_iter = n_iter diff --git a/sklearn/mixture/tests/test_gaussian_mixture.py b/sklearn/mixture/tests/test_gaussian_mixture.py index 2d8dc81e54275..c8e85823260cd 100644 --- a/sklearn/mixture/tests/test_gaussian_mixture.py +++ b/sklearn/mixture/tests/test_gaussian_mixture.py @@ -1040,3 +1040,48 @@ def test_init(): max_iter=1, random_state=random_state).fit(X) assert gmm2.lower_bound_ >= gmm1.lower_bound_ + + +def test_gaussian_mixture_setting_best_params(): + """`GaussianMixture`'s best_parameters, `n_iter_` and `lower_bound_` + must be set appropriately in the case of divergence. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/18216 + """ + rnd = np.random.RandomState(0) + n_samples = 30 + X = rnd.uniform(size=(n_samples, 3)) + + # following initialization parameters were found to lead to divergence + means_init = np.array([ + [0.670637869618158, 0.21038256107384043, 0.12892629765485303], + [0.09394051075844147, 0.5759464955561779, 0.929296197576212], + [0.5033230372781258, 0.9569852381759425, 0.08654043447295741], + [0.18578301420435747, 0.5531158970919143, 0.19388943970532435], + [0.4548589928173794, 0.35182513658825276, 0.568146063202464], + [0.609279894978321, 0.7929063819678847, 0.9620097270828052], + ]) + precisions_init = np.array([999999.999604483, 999999.9990869573, + 553.7603944542167, 204.78596008931834, + 15.867423501783637, 85.4595728389735]) + weights_init = [0.03333333333333341, 0.03333333333333341, + 0.06666666666666674, 0.06666666666666674, + 0.7000000000000001, 0.10000000000000007] + + gmm = GaussianMixture(covariance_type="spherical", reg_covar=0, + means_init=means_init, weights_init=weights_init, + random_state=rnd, n_components=len(weights_init), + precisions_init=precisions_init) + # ensure that no error is thrown during fit + gmm.fit(X) + + # check that the fit did not converge + assert not gmm.converged_ + + # check that parameters are set for gmm + for attr in [ + "weights_", "means_", "covariances_", "precisions_cholesky_", + "n_iter_", "lower_bound_", + ]: + assert hasattr(gmm, attr) From c1cc67dd06d31a9b110377afe0c94b0cd50848d5 Mon Sep 17 00:00:00 2001 From: David Dale Date: Tue, 25 May 2021 15:02:37 +0300 Subject: [PATCH 416/478] FEA Add QuantileRegressor estimator (#9978) Co-authored-by: David Dale Co-authored-by: Christian Lorentzen --- doc/modules/classes.rst | 1 + doc/modules/linear_model.rst | 77 +++++ doc/whats_new/v1.0.rst | 5 + .../linear_model/plot_quantile_regression.py | 110 +++++++ sklearn/linear_model/__init__.py | 2 + sklearn/linear_model/_quantile.py | 280 ++++++++++++++++++ sklearn/linear_model/tests/test_quantile.py | 254 ++++++++++++++++ 7 files changed, 729 insertions(+) create mode 100644 examples/linear_model/plot_quantile_regression.py create mode 100644 sklearn/linear_model/_quantile.py create mode 100644 sklearn/linear_model/tests/test_quantile.py diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 5462e06f81214..cdeb6f0523422 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -839,6 +839,7 @@ Any estimator using the Huber loss would also be robust to outliers, e.g. :template: class.rst linear_model.HuberRegressor + linear_model.QuantileRegressor linear_model.RANSACRegressor linear_model.TheilSenRegressor diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index f1f376dc641c9..7fc14693c198d 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -1423,6 +1423,83 @@ Note that this estimator is different from the R implementation of Robust Regres squares implementation with weights given to each sample on the basis of how much the residual is greater than a certain threshold. +.. _quantile_regression: + +Quantile Regression +=================== + +Quantile regression estimates the median or other quantiles of :math:`y` +conditional on :math:`X`, while ordinary least squares (OLS) estimates the +conditional mean. + +As a linear model, the :class:`QuantileRegressor` gives linear predictions +:math:`\hat{y}(w, X) = Xw` for the :math:`q`-th quantile, :math:`q \in (0, 1)`. 
+The weights or coefficients :math:`w` are then found by the following +minimization problem: + +.. math:: + \min_{w} {\frac{1}{n_{\text{samples}}} + \sum_i PB_q(y_i - X_i w) + \alpha ||w||_1}. + +This consists of the pinball loss (also known as linear loss), +see also :class:`~sklearn.metrics.mean_pinball_loss`, + +.. math:: + PB_q(t) = q \max(t, 0) + (1 - q) \max(-t, 0) = + \begin{cases} + q t, & t > 0, \\ + 0, & t = 0, \\ + (1-q) t, & t < 0 + \end{cases} + +and the L1 penalty controlled by parameter ``alpha``, similar to +:class:`Lasso`. + +As the pinball loss is only linear in the residuals, quantile regression is +much more robust to outliers than squared error based estimation of the mean. +Somewhat in between is the :class:`HuberRegressor`. + +Quantile regression may be useful if one is interested in predicting an +interval instead of point prediction. Sometimes, prediction intervals are +calculated based on the assumption that prediction error is distributed +normally with zero mean and constant variance. Quantile regression provides +sensible prediction intervals even for errors with non-constant (but +predictable) variance or non-normal distribution. + +.. figure:: /auto_examples/linear_model/images/sphx_glr_plot_quantile_regression_001.png + :target: ../auto_examples/linear_model/plot_quantile_regression.html + :align: center + :scale: 50% + +Based on minimizing the pinball loss, conditional quantiles can also be +estimated by models other than linear models. For example, +:class:`~sklearn.ensemble.GradientBoostingRegressor` can predict conditional +quantiles if its parameter ``loss`` is set to ``"quantile"`` and parameter +``alpha`` is set to the quantile that should be predicted. See the example in +:ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py`. + +Most implementations of quantile regression are based on linear programming +problem. The current implementation is based on +:func:`scipy.optimize.linprog`. + +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_linear_model_plot_quantile_regression.py` + +.. topic:: References: + + * Koenker, R., & Bassett Jr, G. (1978). `Regression quantiles. + `_ + Econometrica: journal of the Econometric Society, 33-50. + + * Portnoy, S., & Koenker, R. (1997). The Gaussian hare and the Laplacian + tortoise: computability of squared-error versus absolute-error estimators. + Statistical Science, 12, 279-300. https://doi.org/10.1214/ss/1030037960 + + * Koenker, R. (2005). Quantile Regression. + Cambridge University Press. https://doi.org/10.1017/CBO9780511754098 + + .. _polynomial_regression: Polynomial regression: extending linear models with basis functions diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 4e7ade1083921..29a4bce98ecb0 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -282,6 +282,11 @@ Changelog :mod:`sklearn.linear_model` ........................... +- |Feature| Added :class:`linear_model.QuantileRegressor` which implements + linear quantile regression with L1 penalty. + :pr:`9978` by :user:`David Dale ` and + :user:`Christian Lorentzen `. + - |Feature| The new :class:`linear_model.SGDOneClassSVM` provides an SGD implementation of the linear One-Class SVM. 
Combined with kernel approximation techniques, this implementation approximates the solution of diff --git a/examples/linear_model/plot_quantile_regression.py b/examples/linear_model/plot_quantile_regression.py new file mode 100644 index 0000000000000..8af7785cc6733 --- /dev/null +++ b/examples/linear_model/plot_quantile_regression.py @@ -0,0 +1,110 @@ +""" +=================== +Quantile regression +=================== +This example illustrates how quantile regression can predict non-trivial +conditional quantiles. + +The left figure shows the case when the error distribution is normal, +but has non-constant variance, i.e. with heteroscedasticity. + +The right figure shows an example of an asymmetric error distribution, +namely the Pareto distribution. +""" +print(__doc__) +# Authors: David Dale +# Christian Lorentzen +# License: BSD 3 clause +import numpy as np +import matplotlib.pyplot as plt + +from sklearn.linear_model import QuantileRegressor, LinearRegression +from sklearn.metrics import mean_absolute_error, mean_squared_error +from sklearn.model_selection import cross_val_score + + +def plot_points_highlighted(x, y, model_low, model_high, ax): + """Plot points with highlighting.""" + mask = y <= model_low.predict(X) + ax.scatter(x[mask], y[mask], c="k", marker="x") + mask = y > model_high.predict(X) + ax.scatter(x[mask], y[mask], c="k", marker="x") + mask = (y > model_low.predict(X)) & (y <= model_high.predict(X)) + ax.scatter(x[mask], y[mask], c="k") + + +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5), sharey=True) + +rng = np.random.RandomState(42) +x = np.linspace(0, 10, 100) +X = x[:, np.newaxis] +y = 10 + 0.5 * x + rng.normal(loc=0, scale=0.5 + 0.5 * x, size=x.shape[0]) +y_mean = 10 + 0.5 * x +ax1.plot(x, y_mean, "k--") + +quantiles = [0.05, 0.5, 0.95] +models = [] +for quantile in quantiles: + qr = QuantileRegressor(quantile=quantile, alpha=0) + qr.fit(X, y) + ax1.plot(x, qr.predict(X)) + models.append(qr) + +plot_points_highlighted(x, y, models[0], models[2], ax1) +ax1.set_xlabel("x") +ax1.set_ylabel("y") +ax1.set_title("Quantiles of heteroscedastic Normal distributed target") +ax1.legend(["true mean"] + quantiles) + + +a = 5 +y = 10 + 0.5 * x + 10 * (rng.pareto(a, size=x.shape[0]) - 1 / (a - 1)) +ax2.plot(x, y_mean, "k--") + +models = [] +for quantile in quantiles: + qr = QuantileRegressor(quantile=quantile, alpha=0) + qr.fit(X, y) + ax2.plot([0, 10], qr.predict([[0], [10]])) + models.append(qr) + +plot_points_highlighted(x, y, models[0], models[2], ax2) +ax2.set_xlabel("x") +ax2.set_ylabel("y") +ax2.set_title("Quantiles of asymmetric Pareto distributed target") +ax2.legend(["true mean"] + quantiles, loc="lower right") +ax2.yaxis.set_tick_params(labelbottom=True) + +plt.show() + +# %% +# Note that both targets have the same mean value, indicated by the dashed +# black line. As the Normal distribution is symmetric, mean and median are +# identical and the predicted 0.5 quantile almost hits the true mean. +# In the Pareto case, the difference between predicted median and true mean +# is evident. We also marked the points below the 0.05 and above 0.95 +# predicted quantiles by small crosses. You might count them and consider +# that we have 100 samples in total. +# +# The second part of the example shows that LinearRegression minimizes MSE +# in order to predict the mean, while QuantileRegressor with `quantile=0.5` +# minimizes MAE in order to predict the median. Both do their own job well. 
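# A small numerical aside (a minimal illustrative sketch; the toy sample below
# is made up and is not taken from the patch above): the constant that
# minimizes the squared error of a sample is its mean, while the constant that
# minimizes the absolute error is its median, which is the point the
# comparison code that follows illustrates on real predictions.
import numpy as np

toy_sample = np.array([1.0, 2.0, 3.0, 4.0, 100.0])
grid = np.linspace(toy_sample.min(), toy_sample.max(), num=10_000)
# brute-force search for the constant prediction minimizing each loss
mse_minimizer = grid[np.argmin([np.mean((toy_sample - c) ** 2) for c in grid])]
mae_minimizer = grid[np.argmin([np.mean(np.abs(toy_sample - c)) for c in grid])]
print(f"MSE minimizer ~ {mse_minimizer:.2f} (mean = {toy_sample.mean():.2f})")
print(f"MAE minimizer ~ {mae_minimizer:.2f} (median = {np.median(toy_sample):.2f})")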
+ +models = [LinearRegression(), QuantileRegressor(alpha=0)] +names = ["OLS", "Quantile"] + +print("# In-sample performance") +for model_name, model in zip(names, models): + print(model_name + ":") + model.fit(X, y) + mae = mean_absolute_error(model.predict(X), y) + rmse = np.sqrt(mean_squared_error(model.predict(X), y)) + print(f"MAE = {mae:.4} RMSE = {rmse:.4}") +print("\n# Cross-validated performance") +for model_name, model in zip(names, models): + print(model_name + ":") + mae = -cross_val_score(model, X, y, cv=3, + scoring="neg_mean_absolute_error").mean() + rmse = np.sqrt(-cross_val_score(model, X, y, cv=3, + scoring="neg_mean_squared_error").mean()) + print(f"MAE = {mae:.4} RMSE = {rmse:.4}") diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index f715e30795961..02e8cafaa7b88 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -28,6 +28,7 @@ from ._passive_aggressive import PassiveAggressiveRegressor from ._perceptron import Perceptron +from ._quantile import QuantileRegressor from ._ransac import RANSACRegressor from ._theil_sen import TheilSenRegressor @@ -59,6 +60,7 @@ 'PassiveAggressiveClassifier', 'PassiveAggressiveRegressor', 'Perceptron', + 'QuantileRegressor', 'Ridge', 'RidgeCV', 'RidgeClassifier', diff --git a/sklearn/linear_model/_quantile.py b/sklearn/linear_model/_quantile.py new file mode 100644 index 0000000000000..bf8fea4552c9d --- /dev/null +++ b/sklearn/linear_model/_quantile.py @@ -0,0 +1,280 @@ +# Authors: David Dale +# Christian Lorentzen +# License: BSD 3 clause +import warnings + +import numpy as np +from scipy.optimize import linprog + +from ..base import BaseEstimator, RegressorMixin +from ._base import LinearModel +from ..exceptions import ConvergenceWarning +from ..utils.validation import _check_sample_weight +from ..utils.fixes import sp_version, parse_version + + +class QuantileRegressor(LinearModel, RegressorMixin, BaseEstimator): + """Linear regression model that predicts conditional quantiles. + + The linear :class:`QuantileRegressor` optimizes the pinball loss for a + desired `quantile` and is robust to outliers. + + This model uses an L1 regularization like + :class:`~sklearn.linear_model.Lasso`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + quantile : float, default=0.5 + The quantile that the model tries to predict. It must be strictly + between 0 and 1. If 0.5 (default), the model predicts the 50% + quantile, i.e. the median. + + alpha : float, default=1.0 + Regularization constant that multiplies the L1 penalty term. + + fit_intercept : bool, default=True + Whether or not to fit the intercept. + + solver : {'highs-ds', 'highs-ipm', 'highs', 'interior-point', \ + 'revised simplex'}, default='interior-point' + Method used by :func:`scipy.optimize.linprog` to solve the linear + programming formulation. Note that the highs methods are recommended + for usage with `scipy>=1.6.0` because they are the fastest ones. + + solver_options : dict, default=None + Additional parameters passed to :func:`scipy.optimize.linprog` as + options. If `None` and if `solver='interior-point'`, then + `{"lstsq": True}` is passed to :func:`scipy.optimize.linprog` for the + sake of stability. + + Attributes + ---------- + coef_ : array of shape (n_features,) + Estimated coefficients for the features. + + intercept_ : float + The intercept of the model, aka bias term. + + n_iter_ : int + The actual number of iterations performed by the solver. 
+ + See Also + -------- + Lasso : The Lasso is a linear model that estimates sparse coefficients + with l1 regularization. + HuberRegressor : Linear regression model that is robust to outliers. + + Examples + -------- + >>> from sklearn.linear_model import QuantileRegressor + >>> import numpy as np + >>> n_samples, n_features = 10, 2 + >>> rng = np.random.RandomState(0) + >>> y = rng.randn(n_samples) + >>> X = rng.randn(n_samples, n_features) + >>> reg = QuantileRegressor(quantile=0.8).fit(X, y) + >>> np.mean(y <= reg.predict(X)) + 0.8 + """ + + def __init__( + self, + *, + quantile=0.5, + alpha=1.0, + fit_intercept=True, + solver="interior-point", + solver_options=None, + ): + self.quantile = quantile + self.alpha = alpha + self.fit_intercept = fit_intercept + self.solver = solver + self.solver_options = solver_options + + def fit(self, X, y, sample_weight=None): + """Fit the model according to the given training data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + self : object + Returns self. + """ + X, y = self._validate_data( + X, y, accept_sparse=False, y_numeric=True, multi_output=False + ) + sample_weight = _check_sample_weight(sample_weight, X) + + n_features = X.shape[1] + n_params = n_features + + if self.fit_intercept: + n_params += 1 + # Note that centering y and X with _preprocess_data does not work + # for quantile regression. + + # The objective is defined as 1/n * sum(pinball loss) + alpha * L1. + # So we rescale the penalty term, which is equivalent. + if self.alpha >= 0: + alpha = np.sum(sample_weight) * self.alpha + else: + raise ValueError( + f"Penalty alpha must be a non-negative number, " + f"got {self.alpha}" + ) + + if self.quantile >= 1.0 or self.quantile <= 0.0: + raise ValueError( + f"Quantile should be strictly between 0.0 and 1.0, got " + f"{self.quantile}" + ) + + if not isinstance(self.fit_intercept, bool): + raise ValueError( + f"The argument fit_intercept must be bool, " + f"got {self.fit_intercept}" + ) + + if self.solver not in ( + "highs-ds", + "highs-ipm", + "highs", + "interior-point", + "revised simplex", + ): + raise ValueError( + f"Invalid value for argument solver, got {self.solver}" + ) + elif self.solver == "revised simplex" and sp_version < parse_version( + "1.3.0" + ): + raise ValueError( + f"Solver 'revised simplex' is only available " + f"with scipy>=1.3.0, got {sp_version}" + ) + elif self.solver in ( + "highs-ds", + "highs-ipm", + "highs", + ) and sp_version < parse_version("1.6.0"): + raise ValueError( + f"Solver {self.solver} is only available " + f"with scipy>=1.6.0, got {sp_version}" + ) + + if self.solver_options is not None and not isinstance( + self.solver_options, dict + ): + raise ValueError( + f"Invalid value for argument solver_options, " + f"must be None or a dictionary, got " + f"{self.solver_options}" + ) + + # make default solver more stable + if self.solver_options is None and self.solver == "interior-point": + solver_options = {"lstsq": True} + else: + solver_options = self.solver_options + + # Use linear programming formulation of quantile regression + # min_x c x + # A_eq x = b_eq + # 0 <= x + # x = (s0, s, t0, t, u, v) = slack variables + # intercept = s0 + t0 + # coef = s + t + # c = (alpha * 1_p, alpha * 1_p, quantile * 1_n, (1-quantile) * 1_n) + # residual = y - X@coef - intercept = u - v + # A_eq = (1_n, 
X, -1_n, -X, diag(1_n), -diag(1_n)) + # b_eq = y + # p = n_features + fit_intercept + # n = n_samples + # 1_n = vector of length n with entries equal one + # see https://stats.stackexchange.com/questions/384909/ + # + # Filtering out zero samples weights from the beginning makes life + # easier for the linprog solver. + mask = sample_weight != 0 + n_mask = int(np.sum(mask)) # use n_mask instead of n_samples + c = np.concatenate( + [ + np.full(2 * n_params, fill_value=alpha), + sample_weight[mask] * self.quantile, + sample_weight[mask] * (1 - self.quantile), + ] + ) + if self.fit_intercept: + # do not penalize the intercept + c[0] = 0 + c[n_params] = 0 + + A_eq = np.concatenate( + [ + np.ones((n_mask, 1)), + X[mask], + -np.ones((n_mask, 1)), + -X[mask], + np.eye(n_mask), + -np.eye(n_mask), + ], + axis=1, + ) + else: + A_eq = np.concatenate( + [X[mask], -X[mask], np.eye(n_mask), -np.eye(n_mask)], axis=1 + ) + + b_eq = y[mask] + + result = linprog( + c=c, + A_eq=A_eq, + b_eq=b_eq, + method=self.solver, + options=solver_options, + ) + solution = result.x + if not result.success: + failure = { + 1: "Iteration limit reached.", + 2: "Problem appears to be infeasible.", + 3: "Problem appears to be unbounded.", + 4: "Numerical difficulties encountered.", + } + warnings.warn( + f"Linear programming for QuantileRegressor did not succeed.\n" + f"Status is {result.status}: " + + failure.setdefault(result.status, "unknown reason") + "\n" + + "Result message of linprog:\n" + result.message, + ConvergenceWarning + ) + + # positive slack - negative slack + # solution is an array with (params_pos, params_neg, u, v) + params = solution[:n_params] - solution[n_params:2 * n_params] + + self.n_iter_ = result.nit + + if self.fit_intercept: + self.coef_ = params[1:] + self.intercept_ = params[0] + else: + self.coef_ = params + self.intercept_ = 0.0 + return self diff --git a/sklearn/linear_model/tests/test_quantile.py b/sklearn/linear_model/tests/test_quantile.py new file mode 100644 index 0000000000000..6118889f4d1b6 --- /dev/null +++ b/sklearn/linear_model/tests/test_quantile.py @@ -0,0 +1,254 @@ +# Authors: David Dale +# Christian Lorentzen +# License: BSD 3 clause + +import numpy as np +import pytest +from pytest import approx +from scipy.optimize import minimize + +from sklearn.datasets import make_regression +from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model import HuberRegressor, QuantileRegressor +from sklearn.metrics import mean_pinball_loss +from sklearn.utils._testing import assert_allclose +from sklearn.utils.fixes import parse_version, sp_version + + +@pytest.fixture +def X_y_data(): + X, y = make_regression(n_samples=10, n_features=1, random_state=0, noise=1) + return X, y + + +@pytest.mark.parametrize( + "params, err_msg", + [ + ({"quantile": 2}, "Quantile should be strictly between 0.0 and 1.0"), + ({"quantile": 1}, "Quantile should be strictly between 0.0 and 1.0"), + ({"quantile": 0}, "Quantile should be strictly between 0.0 and 1.0"), + ({"quantile": -1}, "Quantile should be strictly between 0.0 and 1.0"), + ({"alpha": -1.5}, "Penalty alpha must be a non-negative number"), + ({"fit_intercept": "blah"}, "The argument fit_intercept must be bool"), + ({"fit_intercept": 0}, "The argument fit_intercept must be bool"), + ({"solver": "blah"}, "Invalid value for argument solver"), + ( + {"solver_options": "blah"}, + "Invalid value for argument solver_options", + ), + ], +) +def test_init_parameters_validation(X_y_data, params, err_msg): + """Test that invalid init parameters 
raise errors.""" + X, y = X_y_data + with pytest.raises(ValueError, match=err_msg): + QuantileRegressor(**params).fit(X, y) + + +@pytest.mark.parametrize("solver", ("highs-ds", "highs-ipm", "highs")) +@pytest.mark.skipif(sp_version >= parse_version('1.6.0'), + reason="Solvers are available as of scipy 1.6.0") +def test_too_new_solver_methods_raise_error(X_y_data, solver): + """Test that highs solver raises for scipy<1.6.0.""" + X, y = X_y_data + with pytest.raises(ValueError, match="scipy>=1.6.0"): + QuantileRegressor(solver=solver).fit(X, y) + + +@pytest.mark.parametrize( + "quantile, alpha, intercept, coef", + [ + # for 50% quantile w/o regularization, any slope in [1, 10] is okay + [0.5, 0, 1, None], + # if positive error costs more, the slope is maximal + [0.51, 0, 1, 10], + # if negative error costs more, the slope is minimal + [0.49, 0, 1, 1], + # for a small lasso penalty, the slope is also minimal + [0.5, 0.01, 1, 1], + # for a large lasso penalty, the model predicts the constant median + [0.5, 100, 2, 0], + ], +) +def test_quantile_toy_example(quantile, alpha, intercept, coef): + # test how different parameters affect a small intuitive example + X = [[0], [1], [1]] + y = [1, 2, 11] + model = QuantileRegressor(quantile=quantile, alpha=alpha).fit(X, y) + assert_allclose(model.intercept_, intercept, atol=1e-2) + if coef is not None: + assert_allclose(model.coef_[0], coef, atol=1e-2) + if alpha < 100: + assert model.coef_[0] >= 1 + assert model.coef_[0] <= 10 + + +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_quantile_equals_huber_for_low_epsilon(fit_intercept): + X, y = make_regression( + n_samples=100, n_features=20, random_state=0, noise=1.0 + ) + alpha = 1e-4 + huber = HuberRegressor( + epsilon=1 + 1e-4, alpha=alpha, fit_intercept=fit_intercept + ).fit(X, y) + quant = QuantileRegressor(alpha=alpha, fit_intercept=fit_intercept).fit( + X, y + ) + assert_allclose(huber.coef_, quant.coef_, atol=1e-1) + if fit_intercept: + assert huber.intercept_ == approx(quant.intercept_, abs=1e-1) + # check that we still predict fraction + assert np.mean(y < quant.predict(X)) == approx(0.5, abs=1e-1) + + +@pytest.mark.parametrize("q", [0.5, 0.9, 0.05]) +def test_quantile_estimates_calibration(q): + # Test that model estimates percentage of points below the prediction + X, y = make_regression( + n_samples=1000, n_features=20, random_state=0, noise=1.0 + ) + quant = QuantileRegressor( + quantile=q, + alpha=0, + solver_options={"lstsq": False}, + ).fit(X, y) + assert np.mean(y < quant.predict(X)) == approx(q, abs=1e-2) + + +def test_quantile_sample_weight(): + # test that with unequal sample weights we still estimate weighted fraction + n = 1000 + X, y = make_regression( + n_samples=n, n_features=5, random_state=0, noise=10.0 + ) + weight = np.ones(n) + # when we increase weight of upper observations, + # estimate of quantile should go up + weight[y > y.mean()] = 100 + quant = QuantileRegressor( + quantile=0.5, + alpha=1e-8, + solver_options={"lstsq": False} + ) + quant.fit(X, y, sample_weight=weight) + fraction_below = np.mean(y < quant.predict(X)) + assert fraction_below > 0.5 + weighted_fraction_below = np.average(y < quant.predict(X), weights=weight) + assert weighted_fraction_below == approx(0.5, abs=3e-2) + + +@pytest.mark.parametrize("quantile", [0.2, 0.5, 0.8]) +def test_asymmetric_error(quantile): + """Test quantile regression for asymmetric distributed targets.""" + n_samples = 1000 + rng = np.random.RandomState(42) + # take care that X @ coef + intercept > 0 + X = 
np.concatenate( + ( + np.abs(rng.randn(n_samples)[:, None]), + -rng.randint(2, size=(n_samples, 1)), + ), + axis=1, + ) + intercept = 1.23 + coef = np.array([0.5, -2]) + # For an exponential distribution with rate lambda, e.g. exp(-lambda * x), + # the quantile at level q is: + # quantile(q) = - log(1 - q) / lambda + # scale = 1/lambda = -quantile(q) / log(1-q) + y = rng.exponential( + scale=-(X @ coef + intercept) / np.log(1 - quantile), size=n_samples + ) + model = QuantileRegressor( + quantile=quantile, + alpha=0, + solver="interior-point", + solver_options={"tol": 1e-5}, + ).fit(X, y) + assert model.intercept_ == approx(intercept, rel=0.2) + assert_allclose(model.coef_, coef, rtol=0.6) + assert_allclose(np.mean(model.predict(X) > y), quantile) + + # Now compare to Nelder-Mead optimization with L1 penalty + alpha = 0.01 + model.set_params(alpha=alpha).fit(X, y) + model_coef = np.r_[model.intercept_, model.coef_] + + def func(coef): + loss = mean_pinball_loss(y, X @ coef[1:] + coef[0], alpha=quantile) + L1 = np.sum(np.abs(coef[1:])) + return loss + alpha * L1 + + res = minimize( + fun=func, + x0=[1, 0, -1], + method="Nelder-Mead", + tol=1e-12, + options={"maxiter": 2000}, + ) + + assert func(model_coef) == approx(func(res.x), rel=1e-3) + assert_allclose(model.intercept_, res.x[0], rtol=1e-3) + assert_allclose(model.coef_, res.x[1:], rtol=1e-3) + assert_allclose(np.mean(model.predict(X) > y), quantile, rtol=8e-3) + + +@pytest.mark.parametrize("quantile", [0.2, 0.5, 0.8]) +def test_equivariance(quantile): + """Test equivariace of quantile regression. + + See Koenker (2005) Quantile Regression, Chapter 2.2.3. + """ + rng = np.random.RandomState(42) + n_samples, n_features = 100, 5 + X, y = make_regression( + n_samples=n_samples, + n_features=n_features, + n_informative=n_features, + noise=0, + random_state=rng, + shuffle=False, + ) + # make y asymmetric + y += rng.exponential(scale=100, size=y.shape) + params = dict(alpha=0, solver_options={"lstsq": True, "tol": 1e-10}) + model1 = QuantileRegressor(quantile=quantile, **params).fit(X, y) + + # coef(q; a*y, X) = a * coef(q; y, X) + a = 2.5 + model2 = QuantileRegressor(quantile=quantile, **params).fit(X, a * y) + assert model2.intercept_ == approx(a * model1.intercept_, rel=1e-5) + assert_allclose(model2.coef_, a * model1.coef_, rtol=1e-5) + + # coef(1-q; -a*y, X) = -a * coef(q; y, X) + model2 = QuantileRegressor(quantile=1 - quantile, **params).fit(X, -a * y) + assert model2.intercept_ == approx(-a * model1.intercept_, rel=1e-5) + assert_allclose(model2.coef_, -a * model1.coef_, rtol=1e-5) + + # coef(q; y + X @ g, X) = coef(q; y, X) + g + g_intercept, g_coef = rng.randn(), rng.randn(n_features) + model2 = QuantileRegressor(quantile=quantile, **params) + model2.fit(X, y + X @ g_coef + g_intercept) + assert model2.intercept_ == approx(model1.intercept_ + g_intercept) + assert_allclose(model2.coef_, model1.coef_ + g_coef, rtol=1e-6) + + # coef(q; y, X @ A) = A^-1 @ coef(q; y, X) + A = rng.randn(n_features, n_features) + model2 = QuantileRegressor(quantile=quantile, **params) + model2.fit(X @ A, y) + assert model2.intercept_ == approx(model1.intercept_, rel=1e-5) + assert_allclose(model2.coef_, np.linalg.solve(A, model1.coef_), rtol=1e-5) + + +def test_linprog_failure(): + """Test that linprog fails.""" + X = np.linspace(0, 10, num=10).reshape(-1, 1) + y = np.linspace(0, 10, num=10) + reg = QuantileRegressor( + alpha=0, solver="interior-point", solver_options={"maxiter": 1} + ) + + msg = "Linear programming for QuantileRegressor did not succeed." 
+ with pytest.warns(ConvergenceWarning, match=msg): + reg.fit(X, y) From 6ec090f3935e1c7cf3d836ddb47f9db8502a98aa Mon Sep 17 00:00:00 2001 From: Eleni Markou Date: Wed, 26 May 2021 16:27:04 +0300 Subject: [PATCH 417/478] DOC fix broken links in faq.rst and glossary.rst (#20122) --- doc/faq.rst | 1 - doc/glossary.rst | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/faq.rst b/doc/faq.rst index 4038106bc93d7..43ef246594de1 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -20,7 +20,6 @@ sy-kit learn. sci stands for science! Why scikit? ------------ There are multiple scikits, which are scientific toolboxes built around SciPy. -You can find a list at ``_. Apart from scikit-learn, another popular one is `scikit-image `_. How can I contribute to scikit-learn? diff --git a/doc/glossary.rst b/doc/glossary.rst index a43eda4a79b67..ba924387bc5eb 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -171,7 +171,7 @@ General Concepts one-hot encode categorical features. See also :ref:`preprocessing_categorical_features` and the `categorical-encoding - `_ + `_ package for tools related to encoding categorical features. clone From 7c212a2966a942d22da8935f1068e49a45ebc340 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 16:18:15 +0200 Subject: [PATCH 418/478] EXA improve the example for QuantileRegressor (#20133) --- .../linear_model/plot_quantile_regression.py | 352 ++++++++++++++---- 1 file changed, 281 insertions(+), 71 deletions(-) diff --git a/examples/linear_model/plot_quantile_regression.py b/examples/linear_model/plot_quantile_regression.py index 8af7785cc6733..404d7a314d553 100644 --- a/examples/linear_model/plot_quantile_regression.py +++ b/examples/linear_model/plot_quantile_regression.py @@ -2,6 +2,7 @@ =================== Quantile regression =================== + This example illustrates how quantile regression can predict non-trivial conditional quantiles. @@ -11,100 +12,309 @@ The right figure shows an example of an asymmetric error distribution, namely the Pareto distribution. """ + print(__doc__) + # Authors: David Dale # Christian Lorentzen +# Guillaume Lemaitre # License: BSD 3 clause + +# %% +# Dataset generation +# ------------------ +# +# To illustrate the behaviour of quantile regression, we will generate two +# synthetic datasets. The true generative random processess for both datasets +# will be composed by the same expected value with a linear relationship with a +# single feature `x`. import numpy as np + +rng = np.random.RandomState(42) +x = np.linspace(start=0, stop=10, num=100) +X = x[:, np.newaxis] +y_true_mean = 10 + 0.5 * x + +# %% +# We will create two subsequent problems by changing the distribution of the +# target `y` while keeping the same expected value: +# +# - in the first case, a heteroscedastic Normal noise is added; +# - in the second case, an asymmetric Pareto noise is added. +y_normal = y_true_mean + rng.normal( + loc=0, scale=0.5 + 0.5 * x, size=x.shape[0] +) +a = 5 +y_pareto = y_true_mean + 10 * (rng.pareto(a, size=x.shape[0]) - 1 / (a - 1)) + +# %% +# Let's first visualize the datasets as well as the distribution of the +# residuals `y - mean(y)`. 
 import matplotlib.pyplot as plt
 
-from sklearn.linear_model import QuantileRegressor, LinearRegression
-from sklearn.metrics import mean_absolute_error, mean_squared_error
-from sklearn.model_selection import cross_val_score
+_, axs = plt.subplots(
+    nrows=2, ncols=2, figsize=(15, 11), sharex="row", sharey="row"
+)
+axs[0, 0].plot(x, y_true_mean, label="True mean")
+axs[0, 0].scatter(
+    x, y_normal, color="black", alpha=0.5, label="Observations"
+)
+axs[1, 0].hist(y_true_mean - y_normal, edgecolor="black")
 
-def plot_points_highlighted(x, y, model_low, model_high, ax):
-    """Plot points with highlighting."""
-    mask = y <= model_low.predict(X)
-    ax.scatter(x[mask], y[mask], c="k", marker="x")
-    mask = y > model_high.predict(X)
-    ax.scatter(x[mask], y[mask], c="k", marker="x")
-    mask = (y > model_low.predict(X)) & (y <= model_high.predict(X))
-    ax.scatter(x[mask], y[mask], c="k")
+axs[0, 1].plot(x, y_true_mean, label="True mean")
+axs[0, 1].scatter(
+    x, y_pareto, color="black", alpha=0.5, label="Observations"
+)
+axs[1, 1].hist(y_true_mean - y_pareto, edgecolor="black")
 
-fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
+axs[0, 0].set_title("Dataset with heteroscedastic Normal distributed targets")
+axs[0, 1].set_title("Dataset with asymmetric Pareto distributed target")
+axs[1, 0].set_title(
+    "Residuals distribution for heteroscedastic Normal distributed targets"
+)
+axs[1, 1].set_title(
+    "Residuals distribution for asymmetric Pareto distributed target"
+)
+axs[0, 0].legend()
+axs[0, 1].legend()
+axs[0, 0].set_ylabel("y")
+axs[1, 0].set_ylabel("Counts")
+axs[0, 1].set_xlabel("x")
+axs[0, 0].set_xlabel("x")
+axs[1, 0].set_xlabel("Residuals")
+_ = axs[1, 1].set_xlabel("Residuals")
 
-rng = np.random.RandomState(42)
-x = np.linspace(0, 10, 100)
-X = x[:, np.newaxis]
-y = 10 + 0.5 * x + rng.normal(loc=0, scale=0.5 + 0.5 * x, size=x.shape[0])
-y_mean = 10 + 0.5 * x
-ax1.plot(x, y_mean, "k--")
+# %%
+# With the heteroscedastic Normal distributed target, we observe that the
+# variance of the noise is increasing when the value of the feature `x` is
+# increasing.
+#
+# With the asymmetric Pareto distributed target, we observe that the positive
+# residuals are bounded.
+#
+# These types of noisy targets make the estimation via
+# :class:`~sklearn.linear_model.LinearRegression` less efficient, i.e. we need
+# more data to get stable results and, in addition, large outliers can have a
+# huge impact on the fitted coefficients. (Stated otherwise: in a setting with
+# constant variance, ordinary least squares estimators converge much faster to
+# the *true* coefficients with increasing sample size.)
+#
+# In this asymmetric setting, the median or different quantiles give additional
+# insights. On top of that, median estimation is much more robust to outliers
+# and heavy-tailed distributions. But note that extreme quantiles are estimated
+# by very few data points. The 95% quantile is more or less estimated from the
+# 5% largest values and is thus also a bit sensitive to outliers.
+#
+# In the remainder of this tutorial, we will show how
+# :class:`~sklearn.linear_model.QuantileRegressor` can be used in practice and
+# give some intuition about the properties of the fitted models. Finally,
+# we will compare both :class:`~sklearn.linear_model.QuantileRegressor`
+# and :class:`~sklearn.linear_model.LinearRegression`.
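# A minimal sketch (not from the patch itself) of the pinball loss that
# quantile regression minimizes: under-predictions are weighted by the target
# quantile `q` and over-predictions by `1 - q`. It should agree with
# `sklearn.metrics.mean_pinball_loss`, which the quantile regression tests
# earlier in this patch series rely on.
import numpy as np

def pinball_loss(y_true, y_pred, q):
    # max(q * diff, (q - 1) * diff) equals q * diff for under-predictions
    # (diff > 0) and (1 - q) * |diff| for over-predictions (diff < 0)
    diff = y_true - y_pred
    return np.mean(np.maximum(q * diff, (q - 1) * diff))

# With q=0.5 this is half of the mean absolute error, which is why the median
# regressor is later compared against MAE in this example.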
+# +# Fitting a `QuantileRegressor` +# ----------------------------- +# +# In this section, we want to estimate the conditional median as well as +# a low and high quantile fixed at 5% and 95%, respectively. Thus, we will get +# three linear models, one for each quantile. +# +# We will use the quantiles at 5% and 95% to find the outliers in the training +# sample beyond the central 90% interval. +from sklearn.linear_model import QuantileRegressor quantiles = [0.05, 0.5, 0.95] -models = [] +predictions = {} +out_bounds_predictions = np.zeros_like(y_true_mean, dtype=np.bool_) for quantile in quantiles: qr = QuantileRegressor(quantile=quantile, alpha=0) - qr.fit(X, y) - ax1.plot(x, qr.predict(X)) - models.append(qr) + y_pred = qr.fit(X, y_normal).predict(X) + predictions[quantile] = y_pred -plot_points_highlighted(x, y, models[0], models[2], ax1) -ax1.set_xlabel("x") -ax1.set_ylabel("y") -ax1.set_title("Quantiles of heteroscedastic Normal distributed target") -ax1.legend(["true mean"] + quantiles) + if quantile == min(quantiles): + out_bounds_predictions = np.logical_or( + out_bounds_predictions, y_pred >= y_normal + ) + elif quantile == max(quantiles): + out_bounds_predictions = np.logical_or( + out_bounds_predictions, y_pred <= y_normal + ) +# %% +# Now, we can plot the three linear models and the distinguished samples that +# are within the central 90% interval from samples that are outside this +# interval. +plt.plot(X, y_true_mean, color="black", linestyle="dashed", label="True mean") -a = 5 -y = 10 + 0.5 * x + 10 * (rng.pareto(a, size=x.shape[0]) - 1 / (a - 1)) -ax2.plot(x, y_mean, "k--") +for quantile, y_pred in predictions.items(): + plt.plot(X, y_pred, label=f"Quantile: {quantile}") + +plt.scatter( + x[out_bounds_predictions], + y_normal[out_bounds_predictions], + color="black", + marker="+", + alpha=0.5, + label="Outside interval", +) +plt.scatter( + x[~out_bounds_predictions], + y_normal[~out_bounds_predictions], + color="black", + alpha=0.5, + label="Inside interval", +) -models = [] +plt.legend() +plt.xlabel("x") +plt.ylabel("y") +_ = plt.title("Quantiles of heteroscedastic Normal distributed target") + +# %% +# Since the noise is still Normally distributed, in particular is symmetric, +# the true conditional mean and the true conditional median coincide. Indeed, +# we see that the estimated median almost hits the true mean. We observe the +# effect of having an increasing noise variance on the 5% and 95% quantiles: +# the slopes of those quantiles are very different and the interval between +# them becomes wider with increasing `x`. +# +# To get an additional intuition regarding the meaning of the 5% and 95% +# quantiles estimators, one can count the number of samples above and below the +# predicted quantiles (represented by a cross on the above plot), considering +# that we have a total of 100 samples. +# +# We can repeat the same experiment using the asymmetric Pareto distributed +# target. 
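# Illustrative sketch (not from the patch itself) of the sample counting
# described a few lines up, reusing the `predictions` and `y_normal` objects
# defined at this point of the example (still the Normal-noise case here).
import numpy as np

for q, y_pred_q in predictions.items():
    # fraction of observations at or below the fitted q-quantile line;
    # it should be close to q itself
    print(q, np.mean(y_normal <= y_pred_q))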
+quantiles = [0.05, 0.5, 0.95] +predictions = {} +out_bounds_predictions = np.zeros_like(y_true_mean, dtype=np.bool_) for quantile in quantiles: qr = QuantileRegressor(quantile=quantile, alpha=0) - qr.fit(X, y) - ax2.plot([0, 10], qr.predict([[0], [10]])) - models.append(qr) + y_pred = qr.fit(X, y_pareto).predict(X) + predictions[quantile] = y_pred + + if quantile == min(quantiles): + out_bounds_predictions = np.logical_or( + out_bounds_predictions, y_pred >= y_pareto + ) + elif quantile == max(quantiles): + out_bounds_predictions = np.logical_or( + out_bounds_predictions, y_pred <= y_pareto + ) + +# %% +plt.plot(X, y_true_mean, color="black", linestyle="dashed", label="True mean") + +for quantile, y_pred in predictions.items(): + plt.plot(X, y_pred, label=f"Quantile: {quantile}") + +plt.scatter( + x[out_bounds_predictions], + y_pareto[out_bounds_predictions], + color="black", + marker="+", + alpha=0.5, + label="Outside interval", +) +plt.scatter( + x[~out_bounds_predictions], + y_pareto[~out_bounds_predictions], + color="black", + alpha=0.5, + label="Inside interval", +) + +plt.legend() +plt.xlabel("x") +plt.ylabel("y") +_ = plt.title("Quantiles of asymmetric Pareto distributed target") -plot_points_highlighted(x, y, models[0], models[2], ax2) -ax2.set_xlabel("x") -ax2.set_ylabel("y") -ax2.set_title("Quantiles of asymmetric Pareto distributed target") -ax2.legend(["true mean"] + quantiles, loc="lower right") -ax2.yaxis.set_tick_params(labelbottom=True) -plt.show() +# %% +# Due to the asymmetry of the distribution of the noise, we observe that the +# true mean and estimated conditional median are different. We also observe +# that each quantile model has different parameters to better fit the desired +# quantile. Note that ideally, all quantiles would be parallel in this case, +# which would become more visible with more data points or less extreme +# quantiles, e.g. 10% and 90%. +# +# Comparing `QuantileRegressor` and `LinearRegression` +# ---------------------------------------------------- +# +# In this section, we will linger on the difference regarding the error that +# :class:`~sklearn.linear_model.QuantileRegressor` and +# :class:`~sklearn.linear_model.LinearRegression` are minimizing. +# +# Indeed, :class:`~sklearn.linear_model.LinearRegression` is a least squares +# approach minimizing the mean squared error (MSE) between the training and +# predicted targets. In contrast, +# :class:`~sklearn.linear_model.QuantileRegressor` with `quantile=0.5` +# minimizes the mean absolute error (MAE) instead. +# +# Let's first compute the training errors of such models in terms of mean +# squared error and mean absolute error. We will use the asymmetric Pareto +# distributed target to make it more interesting as mean and median are not +# equal. 
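# A tiny standalone check (not from the patch itself) of the claim above that
# the mean minimizes the squared error while the median minimizes the absolute
# error, using an asymmetric toy sample and a grid of constant predictions.
import numpy as np

toy = np.array([1.0, 2.0, 3.0, 4.0, 15.0])
grid = np.linspace(0, 15, 1501)
best_mse = grid[np.argmin([np.mean((toy - c) ** 2) for c in grid])]
best_mae = grid[np.argmin([np.mean(np.abs(toy - c)) for c in grid])]
print(best_mse, toy.mean())      # both approximately 5.0
print(best_mae, np.median(toy))  # both approximately 3.0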
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import mean_absolute_error
+from sklearn.metrics import mean_squared_error
+
+linear_regression = LinearRegression()
+quantile_regression = QuantileRegressor(quantile=0.5, alpha=0)
+
+y_pred_lr = linear_regression.fit(X, y_pareto).predict(X)
+y_pred_qr = quantile_regression.fit(X, y_pareto).predict(X)
+
+print(
+    f"""Training error (in-sample performance)
+    {linear_regression.__class__.__name__}:
+    MAE = {mean_absolute_error(y_pareto, y_pred_lr):.3f}
+    MSE = {mean_squared_error(y_pareto, y_pred_lr):.3f}
+    {quantile_regression.__class__.__name__}:
+    MAE = {mean_absolute_error(y_pareto, y_pred_qr):.3f}
+    MSE = {mean_squared_error(y_pareto, y_pred_qr):.3f}
+    """
+)
+
+# %%
+# On the training set, we see that MAE is lower for
+# :class:`~sklearn.linear_model.QuantileRegressor` than
+# :class:`~sklearn.linear_model.LinearRegression`. In contrast to that, MSE is
+# lower for :class:`~sklearn.linear_model.LinearRegression` than
+# :class:`~sklearn.linear_model.QuantileRegressor`. These results confirm that
+# MAE is the loss minimized by :class:`~sklearn.linear_model.QuantileRegressor`
+# while MSE is the loss minimized by
+# :class:`~sklearn.linear_model.LinearRegression`.
+#
+# We can make a similar evaluation but looking at the test error obtained by
+# cross-validation.
+from sklearn.model_selection import cross_validate
+
+cv_results_lr = cross_validate(
+    linear_regression,
+    X,
+    y_pareto,
+    cv=3,
+    scoring=["neg_mean_absolute_error", "neg_mean_squared_error"],
+)
+cv_results_qr = cross_validate(
+    quantile_regression,
+    X,
+    y_pareto,
+    cv=3,
+    scoring=["neg_mean_absolute_error", "neg_mean_squared_error"],
+)
+print(
+    f"""Test error (cross-validated performance)
+    {linear_regression.__class__.__name__}:
+    MAE = {-cv_results_lr["test_neg_mean_absolute_error"].mean():.3f}
+    MSE = {-cv_results_lr["test_neg_mean_squared_error"].mean():.3f}
+    {quantile_regression.__class__.__name__}:
+    MAE = {-cv_results_qr["test_neg_mean_absolute_error"].mean():.3f}
+    MSE = {-cv_results_qr["test_neg_mean_squared_error"].mean():.3f}
+    """
+)
 
 # %%
-# Note that both targets have the same mean value, indicated by the dashed
-# black line. As the Normal distribution is symmetric, mean and median are
-# identical and the predicted 0.5 quantile almost hits the true mean.
-# In the Pareto case, the difference between predicted median and true mean
-# is evident. We also marked the points below the 0.05 and above 0.95
-# predicted quantiles by small crosses. You might count them and consider
-# that we have 100 samples in total.
-#
-# The second part of the example shows that LinearRegression minimizes MSE
-# in order to predict the mean, while QuantileRegressor with `quantile=0.5`
-# minimizes MAE in order to predict the median. Both do their own job well.
- -models = [LinearRegression(), QuantileRegressor(alpha=0)] -names = ["OLS", "Quantile"] - -print("# In-sample performance") -for model_name, model in zip(names, models): - print(model_name + ":") - model.fit(X, y) - mae = mean_absolute_error(model.predict(X), y) - rmse = np.sqrt(mean_squared_error(model.predict(X), y)) - print(f"MAE = {mae:.4} RMSE = {rmse:.4}") -print("\n# Cross-validated performance") -for model_name, model in zip(names, models): - print(model_name + ":") - mae = -cross_val_score(model, X, y, cv=3, - scoring="neg_mean_absolute_error").mean() - rmse = np.sqrt(-cross_val_score(model, X, y, cv=3, - scoring="neg_mean_squared_error").mean()) - print(f"MAE = {mae:.4} RMSE = {rmse:.4}") +# We reach similar conclusions on the out-of-sample evaluation. From 7c873713df056a9554dd545b0d5f0be93630219b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 26 May 2021 17:45:01 +0200 Subject: [PATCH 419/478] DOC change figure in user guide of quantile regression --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 7fc14693c198d..4b76c35245d36 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -1466,7 +1466,7 @@ normally with zero mean and constant variance. Quantile regression provides sensible prediction intervals even for errors with non-constant (but predictable) variance or non-normal distribution. -.. figure:: /auto_examples/linear_model/images/sphx_glr_plot_quantile_regression_001.png +.. figure:: /auto_examples/linear_model/images/sphx_glr_plot_quantile_regression_002.png :target: ../auto_examples/linear_model/plot_quantile_regression.html :align: center :scale: 50% From 3c72fe50513d886e454ec7f64cdba7f44f2fbd95 Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Thu, 27 May 2021 07:44:12 +0100 Subject: [PATCH 420/478] TST Changes assert_raises to raises in sklearn/utils/test_estimator_checks.py (#20138) Co-authored-by: Alihan Zihna --- sklearn/utils/tests/test_estimator_checks.py | 154 +++++++++---------- 1 file changed, 77 insertions(+), 77 deletions(-) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index 4792f50f2baef..301ba2ffd6776 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -12,8 +12,7 @@ from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.utils import deprecated from sklearn.utils._testing import ( - assert_raises, - assert_raises_regex, + raises, assert_warns, ignore_warnings, MinimalClassifier, @@ -413,7 +412,8 @@ def test_not_an_array_array_function(): raise SkipTest("array_function protocol not supported in numpy <1.17") not_array = _NotAnArray(np.ones(10)) msg = "Don't want to call array_function sum!" 
- assert_raises_regex(TypeError, msg, np.sum, not_array) + with raises(TypeError, match=msg): + np.sum(not_array) # always returns True assert np.may_share_memory(not_array, None) @@ -437,92 +437,93 @@ def test_check_estimator(): # check that we have a set_params and can clone msg = "Passing a class was deprecated" - assert_raises_regex(TypeError, msg, check_estimator, object) + with raises(TypeError, match=msg): + check_estimator(object) msg = ( "Parameter 'p' of estimator 'HasMutableParameters' is of type " "object which is not allowed" ) # check that the "default_constructible" test checks for mutable parameters check_estimator(HasImmutableParameters()) # should pass - assert_raises_regex( - AssertionError, msg, check_estimator, HasMutableParameters() - ) + with raises(AssertionError, match=msg): + check_estimator(HasMutableParameters()) # check that values returned by get_params match set_params msg = "get_params result does not match what was passed to set_params" - assert_raises_regex(AssertionError, msg, check_estimator, - ModifiesValueInsteadOfRaisingError()) + with raises(AssertionError, match=msg): + check_estimator(ModifiesValueInsteadOfRaisingError()) assert_warns(UserWarning, check_estimator, RaisesErrorInSetParams()) - assert_raises_regex(AssertionError, msg, check_estimator, - ModifiesAnotherValue()) + with raises(AssertionError, match=msg): + check_estimator(ModifiesAnotherValue()) # check that we have a fit method msg = "object has no attribute 'fit'" - assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator()) + with raises(AttributeError, match=msg): + check_estimator(BaseEstimator()) # check that fit does input validation msg = "Did not raise" - assert_raises_regex(AssertionError, msg, check_estimator, - BaseBadClassifier()) + with raises(AssertionError, match=msg): + check_estimator(BaseBadClassifier()) # check that sample_weights in fit accepts pandas.Series type try: from pandas import Series # noqa msg = ("Estimator NoSampleWeightPandasSeriesType raises error if " "'sample_weight' parameter is of type pandas.Series") - assert_raises_regex( - ValueError, msg, check_estimator, NoSampleWeightPandasSeriesType()) + with raises(ValueError, match=msg): + check_estimator(NoSampleWeightPandasSeriesType()) except ImportError: pass # check that predict does input validation (doesn't accept dicts in input) msg = "Estimator doesn't check for NaN and inf in predict" - assert_raises_regex(AssertionError, msg, check_estimator, - NoCheckinPredict()) + with raises(AssertionError, match=msg): + check_estimator(NoCheckinPredict()) # check that estimator state does not change # at transform/predict/predict_proba time msg = 'Estimator changes __dict__ during predict' - assert_raises_regex(AssertionError, msg, check_estimator, ChangesDict()) + with raises(AssertionError, match=msg): + check_estimator(ChangesDict()) # check that `fit` only changes attribures that # are private (start with an _ or end with a _). msg = ('Estimator ChangesWrongAttribute should not change or mutate ' 'the parameter wrong_attribute from 0 to 1 during fit.') - assert_raises_regex(AssertionError, msg, - check_estimator, ChangesWrongAttribute()) + with raises(AssertionError, match=msg): + check_estimator(ChangesWrongAttribute()) check_estimator(ChangesUnderscoreAttribute()) # check that `fit` doesn't add any public attribute msg = (r'Estimator adds public attribute\(s\) during the fit method.' 
' Estimators are only allowed to add private attributes' ' either started with _ or ended' ' with _ but wrong_attribute added') - assert_raises_regex(AssertionError, msg, - check_estimator, SetsWrongAttribute()) + with raises(AssertionError, match=msg): + check_estimator(SetsWrongAttribute()) # check for sample order invariance name = NotInvariantSampleOrder.__name__ method = 'predict' msg = ("{method} of {name} is not invariant when applied to a dataset" "with different sample order.").format(method=method, name=name) - assert_raises_regex(AssertionError, msg, - check_estimator, NotInvariantSampleOrder()) + with raises(AssertionError, match=msg): + check_estimator(NotInvariantSampleOrder()) # check for invariant method name = NotInvariantPredict.__name__ method = 'predict' msg = ("{method} of {name} is not invariant when applied " "to a subset.").format(method=method, name=name) - assert_raises_regex(AssertionError, msg, - check_estimator, NotInvariantPredict()) + with raises(AssertionError, match=msg): + check_estimator(NotInvariantPredict()) # check for sparse matrix input handling name = NoSparseClassifier.__name__ msg = "Estimator %s doesn't seem to fail gracefully on sparse data" % name - assert_raises_regex( - AssertionError, msg, check_estimator, NoSparseClassifier() - ) + with raises(AssertionError, match=msg): + check_estimator(NoSparseClassifier()) # Large indices test on bad estimator msg = ('Estimator LargeSparseNotSupportedClassifier doesn\'t seem to ' r'support \S{3}_64 matrix, and is not failing gracefully.*') - assert_raises_regex(AssertionError, msg, check_estimator, - LargeSparseNotSupportedClassifier()) + with raises(AssertionError, match=msg): + check_estimator(LargeSparseNotSupportedClassifier()) # does error on binary_only untagged estimator msg = 'Only 2 classes are supported' - assert_raises_regex(ValueError, msg, check_estimator, - UntaggedBinaryClassifier()) + with raises(ValueError, match=msg): + check_estimator(UntaggedBinaryClassifier()) # non-regression test for estimators transforming to sparse data check_estimator(SparseTransformer()) @@ -537,8 +538,8 @@ def test_check_estimator(): # Check regressor with requires_positive_y estimator tag msg = 'negative y values not supported!' 
- assert_raises_regex(ValueError, msg, check_estimator, - RequiresPositiveYRegressor()) + with raises(ValueError, match=msg): + check_estimator(RequiresPositiveYRegressor()) # Does not raise error on classifier with poor_score tag check_estimator(PoorScoreLogisticRegression()) @@ -547,7 +548,8 @@ def test_check_estimator(): def test_check_outlier_corruption(): # should raise AssertionError decision = np.array([0., 1., 1.5, 2.]) - assert_raises(AssertionError, check_outlier_corruption, 1, 2, decision) + with raises(AssertionError): + check_outlier_corruption(1, 2, decision) # should pass decision = np.array([0., 1., 1., 2.]) check_outlier_corruption(1, 2, decision) @@ -555,8 +557,8 @@ def test_check_outlier_corruption(): def test_check_estimator_transformer_no_mixin(): # check that TransformerMixin is not required for transformer tests to run - assert_raises_regex(AttributeError, '.*fit_transform.*', - check_estimator, BadTransformerWithoutMixin()) + with raises(AttributeError, '.*fit_transform.*'): + check_estimator(BadTransformerWithoutMixin()) def test_check_estimator_clones(): @@ -593,8 +595,8 @@ def test_check_estimators_unfitted(): # check that a ValueError/AttributeError is raised when calling predict # on an unfitted estimator msg = "Did not raise" - assert_raises_regex(AssertionError, msg, check_estimators_unfitted, - "estimator", NoSparseClassifier()) + with raises(AssertionError, match=msg): + check_estimators_unfitted("estimator", NoSparseClassifier()) # check that CorrectNotFittedError inherit from either ValueError # or AttributeError @@ -610,19 +612,22 @@ class NonConformantEstimatorNoParamSet(BaseEstimator): def __init__(self, you_should_set_this_=None): pass - assert_raises_regex(AssertionError, - "Estimator estimator_name should not set any" - " attribute apart from parameters during init." - r" Found attributes \['you_should_not_set_this_'\].", - check_no_attributes_set_in_init, - 'estimator_name', - NonConformantEstimatorPrivateSet()) - assert_raises_regex(AttributeError, - "Estimator estimator_name should store all " - "parameters as an attribute during init.", - check_no_attributes_set_in_init, - 'estimator_name', - NonConformantEstimatorNoParamSet()) + msg = ( + "Estimator estimator_name should not set any" + " attribute apart from parameters during init." + r" Found attributes \['you_should_not_set_this_'\]." 
+ ) + with raises(AssertionError, match=msg): + check_no_attributes_set_in_init('estimator_name', + NonConformantEstimatorPrivateSet()) + + msg = ( + "Estimator estimator_name should store all parameters as an attribute" + " during init" + ) + with raises(AttributeError, match=msg): + check_no_attributes_set_in_init('estimator_name', + NonConformantEstimatorNoParamSet()) def test_check_estimator_pairwise(): @@ -639,32 +644,24 @@ def test_check_estimator_pairwise(): def test_check_classifier_data_not_an_array(): - assert_raises_regex(AssertionError, - 'Not equal to tolerance', - check_classifier_data_not_an_array, - 'estimator_name', - EstimatorInconsistentForPandas()) + with raises(AssertionError, match='Not equal to tolerance'): + check_classifier_data_not_an_array('estimator_name', + EstimatorInconsistentForPandas()) def test_check_regressor_data_not_an_array(): - assert_raises_regex(AssertionError, - 'Not equal to tolerance', - check_regressor_data_not_an_array, - 'estimator_name', - EstimatorInconsistentForPandas()) + with raises(AssertionError, match='Not equal to tolerance'): + check_regressor_data_not_an_array('estimator_name', + EstimatorInconsistentForPandas()) def test_check_estimator_get_tags_default_keys(): estimator = EstimatorMissingDefaultTags() err_msg = (r"EstimatorMissingDefaultTags._get_tags\(\) is missing entries" r" for the following default tags: {'allow_nan'}") - assert_raises_regex( - AssertionError, - err_msg, - check_estimator_get_tags_default_keys, - estimator.__class__.__name__, - estimator, - ) + with raises(AssertionError, match=err_msg): + check_estimator_get_tags_default_keys(estimator.__class__.__name__, + estimator) # noop check when _get_tags is not available estimator = MinimalTransformer() @@ -688,12 +685,15 @@ def run_tests_without_pytest(): def test_check_class_weight_balanced_linear_classifier(): # check that ill-computed balanced weights raises an exception - assert_raises_regex(AssertionError, - "Classifier estimator_name is not computing" - " class_weight=balanced properly.", - check_class_weight_balanced_linear_classifier, - 'estimator_name', - BadBalancedWeightsClassifier) + msg = ( + "Classifier estimator_name is not computing class_weight=balanced " + "properly" + ) + with raises(AssertionError, match=msg): + check_class_weight_balanced_linear_classifier( + 'estimator_name', + BadBalancedWeightsClassifier + ) def test_all_estimators_all_public(): From 07880f0ad3c6716772603559f8e1d07c01dc0929 Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Thu, 27 May 2021 11:29:22 +0200 Subject: [PATCH 421/478] DOC Update minimal versions for dependencies (#20143) --- README.rst | 11 ++++++----- doc/install.rst | 3 ++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index ebc4339b2ab58..3c685fa4af13e 100644 --- a/README.rst +++ b/README.rst @@ -26,13 +26,13 @@ .. |DOI| image:: https://zenodo.org/badge/21369/scikit-learn/scikit-learn.svg .. _DOI: https://zenodo.org/badge/latestdoi/21369/scikit-learn/scikit-learn -.. |PythonMinVersion| replace:: 3.6 -.. |NumPyMinVersion| replace:: 1.13.3 -.. |SciPyMinVersion| replace:: 0.19.1 +.. |PythonMinVersion| replace:: 3.7 +.. |NumPyMinVersion| replace:: 1.14.5 +.. |SciPyMinVersion| replace:: 1.1.0 .. |JoblibMinVersion| replace:: 0.11 .. |ThreadpoolctlMinVersion| replace:: 2.0.0 -.. |MatplotlibMinVersion| replace:: 2.1.1 -.. |Scikit-ImageMinVersion| replace:: 0.13 +.. |MatplotlibMinVersion| replace:: 2.2.2 +.. |Scikit-ImageMinVersion| replace:: 0.14.5 .. 
|PandasMinVersion| replace:: 0.25.0 .. |SeabornMinVersion| replace:: 0.9.0 .. |PytestMinVersion| replace:: 5.0.1 @@ -70,6 +70,7 @@ scikit-learn requires: **Scikit-learn 0.20 was the last version to support Python 2.7 and Python 3.4.** scikit-learn 0.23 and later require Python 3.6 or newer. +scikit-learn 1.0 and later require Python 3.7 or newer. Scikit-learn plotting capabilities (i.e., functions start with ``plot_`` and classes end with "Display") require Matplotlib (>= |MatplotlibMinVersion|). diff --git a/doc/install.rst b/doc/install.rst index 7912cc4dc4df6..d0b0f50e78f90 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -142,7 +142,8 @@ purpose. Scikit-learn 0.20 was the last version to support Python 2.7 and Python 3.4. Scikit-learn 0.21 supported Python 3.5-3.7. Scikit-learn 0.22 supported Python 3.5-3.8. - Scikit-learn now requires Python 3.6 or newer. + Scikit-learn 0.23 - 0.24 require Python 3.6 or newer. + Scikit-learn 1.0 and later requires Python 3.7 or newer. .. note:: From 99472deef6b87197049d173657ccb7939b938f3e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 27 May 2021 12:45:06 +0200 Subject: [PATCH 422/478] MAINT silence spurious mypy error (#20147) --- sklearn/metrics/tests/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 66df47a778b38..63e37f5590959 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -821,7 +821,7 @@ def test_regression_thresholded_inf_nan_input(metric, y_true, y_score): # Add an additional case for classification only # non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/6809 - [([np.nan, 1, 2], [1, 2, 3])] + [([np.nan, 1, 2], [1, 2, 3])] # type: ignore ) def test_classification_inf_nan_input(metric, y_true, y_score): """check that classification metrics raise a message mentioning the From 67f6a5c6d9ac2a7051d6009237a59462faffd04e Mon Sep 17 00:00:00 2001 From: naozin555 <37050583+naozin555@users.noreply.github.com> Date: Thu, 27 May 2021 20:44:21 +0900 Subject: [PATCH 423/478] Add missing link to user guide in PolynomialFeatures API documentation (#20146) --- sklearn/preprocessing/_polynomial.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 44ac0d2175c4c..ac4703dbb4cb2 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -30,6 +30,8 @@ class PolynomialFeatures(TransformerMixin, BaseEstimator): For example, if an input sample is two dimensional and of the form [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2]. + Read more in the :ref:`User Guide `. + Parameters ---------- degree : int, default=2 From aa86c83b8e31df0367b299e85c88828f48eb1940 Mon Sep 17 00:00:00 2001 From: Venkatachalam N Date: Thu, 27 May 2021 18:58:24 +0530 Subject: [PATCH 424/478] ENH Allowing sparse inputs for prediction in AffinityPropagation (#20117) --- doc/whats_new/v1.0.rst | 5 +++++ sklearn/cluster/_affinity_propagation.py | 2 +- .../tests/test_affinity_propagation.py | 19 +++++++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 29a4bce98ecb0..7255fe82ff628 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -150,6 +150,11 @@ Changelog - |Efficiency| :class:`cluster.MiniBatchKMeans` is now faster in multicore settings. 
:pr:`17622` by :user:`Jérémie du Boisberranger `. +- |Enhancement| The `predict` and `fit_predict` methods of + :class:`cluster.AffinityPropagation` now accept sparse data type for input + data. + :pr:`20117` by :user:`Venkatachalam Natchiappan ` + - |Fix| Fixed a bug in :class:`cluster.MiniBatchKMeans` where the sample weights were partially ignored when the input is sparse. :pr:`17622` by :user:`Jérémie du Boisberranger `. diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index ccae0b7538b58..59620ab31f63d 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -436,7 +436,7 @@ def predict(self, X): Cluster labels. """ check_is_fitted(self) - X = self._validate_data(X, reset=False) + X = self._validate_data(X, reset=False, accept_sparse='csr') if not hasattr(self, "cluster_centers_"): raise ValueError("Predict method is not supported when " "affinity='precomputed'.") diff --git a/sklearn/cluster/tests/test_affinity_propagation.py b/sklearn/cluster/tests/test_affinity_propagation.py index ae2806bf38e59..a42a8112782a5 100644 --- a/sklearn/cluster/tests/test_affinity_propagation.py +++ b/sklearn/cluster/tests/test_affinity_propagation.py @@ -238,6 +238,25 @@ def test_affinity_propagation_float32(): assert_array_equal(afp.labels_, expected) +def test_sparse_input_for_predict(): + # Test to make sure sparse inputs are accepted for predict + # (non-regression test for issue #20049) + af = AffinityPropagation(affinity="euclidean", random_state=42) + af.fit(X) + labels = af.predict(csr_matrix((2, 2))) + assert_array_equal(labels, (2, 2)) + + +def test_sparse_input_for_fit_predict(): + # Test to make sure sparse inputs are accepted for fit_predict + # (non-regression test for issue #20049) + af = AffinityPropagation(affinity="euclidean", random_state=42) + rng = np.random.RandomState(42) + X = csr_matrix(rng.randint(0, 2, size=(5, 5))) + labels = af.fit_predict(X) + assert_array_equal(labels, (0, 1, 1, 2, 3)) + + # TODO: Remove in 1.1 def test_affinity_propagation_pairwise_is_deprecated(): afp = AffinityPropagation(affinity='precomputed') From 495ff48a7cc9a2dc913fd1a7018f2ea36205655e Mon Sep 17 00:00:00 2001 From: kobaski Date: Thu, 27 May 2021 22:43:13 +0900 Subject: [PATCH 425/478] [MRG] resolve ambiguity of the nested cross-val example (#20148) * resolve ambiguity of the nested cross-val example * Update examples/model_selection/plot_nested_cross_validation_iris.py make it more explict Co-authored-by: Olivier Grisel Co-authored-by: Olivier Grisel --- examples/model_selection/plot_nested_cross_validation_iris.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/model_selection/plot_nested_cross_validation_iris.py b/examples/model_selection/plot_nested_cross_validation_iris.py index d6aea44e6c546..a2c53841bc4da 100644 --- a/examples/model_selection/plot_nested_cross_validation_iris.py +++ b/examples/model_selection/plot_nested_cross_validation_iris.py @@ -80,11 +80,12 @@ outer_cv = KFold(n_splits=4, shuffle=True, random_state=i) # Non_nested parameter search and scoring - clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=inner_cv) + clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=outer_cv) clf.fit(X_iris, y_iris) non_nested_scores[i] = clf.best_score_ # Nested CV with parameter optimization + clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=inner_cv) nested_score = cross_val_score(clf, X=X_iris, y=y_iris, cv=outer_cv) nested_scores[i] = 
nested_score.mean() From c9d223ccc58e2569b8e67f1d0217dd57a93ec07f Mon Sep 17 00:00:00 2001 From: jnboehm Date: Thu, 27 May 2021 16:00:56 +0200 Subject: [PATCH 426/478] [MRG] Expand documentation of random_state for spectral methods (#17314) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jan Niklas Böhm Co-authored-by: Thomas J. Fan Co-authored-by: Olivier Grisel --- sklearn/cluster/_spectral.py | 34 +++++++++++++++++-------- sklearn/manifold/_spectral_embedding.py | 32 +++++++++++++++++------ 2 files changed, 48 insertions(+), 18 deletions(-) diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index a1371b925595d..cda6dac64ee54 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -197,11 +197,18 @@ def spectral_clustering(affinity, *, n_clusters=8, n_components=None, used. random_state : int, RandomState instance, default=None - A pseudo random number generator used for the initialization of the - lobpcg eigenvectors decomposition when eigen_solver == 'amg' and by - the K-Means initialization. Use an int to make the randomness - deterministic. - See :term:`Glossary `. + A pseudo random number generator used for the initialization + of the lobpcg eigenvectors decomposition when `eigen_solver == + 'amg'`, and for the K-Means initialization. Use an int to make + the results deterministic across calls (See + :term:`Glossary `). + + .. note:: + When using `eigen_solver == 'amg'`, + it is necessary to also fix the global numpy seed with + `np.random.seed(int)` to get deterministic results. See + https://github.com/pyamg/pyamg/issues/139 for further + information. n_init : int, default=10 Number of time the k-means algorithm will be run with different @@ -322,11 +329,18 @@ class SpectralClustering(ClusterMixin, BaseEstimator): Number of eigenvectors to use for the spectral embedding random_state : int, RandomState instance, default=None - A pseudo random number generator used for the initialization of the - lobpcg eigenvectors decomposition when ``eigen_solver='amg'`` and by - the K-Means initialization. Use an int to make the randomness - deterministic. - See :term:`Glossary `. + A pseudo random number generator used for the initialization + of the lobpcg eigenvectors decomposition when `eigen_solver == + 'amg'`, and for the K-Means initialization. Use an int to make + the results deterministic across calls (See + :term:`Glossary `). + + .. note:: + When using `eigen_solver == 'amg'`, + it is necessary to also fix the global numpy seed with + `np.random.seed(int)` to get deterministic results. See + https://github.com/pyamg/pyamg/issues/139 for further + information. n_init : int, default=10 Number of time the k-means algorithm will be run with different diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index 7fd371ee5af2f..49e64401b6c00 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -178,10 +178,18 @@ def spectral_embedding(adjacency, *, n_components=8, eigen_solver=None, used. random_state : int, RandomState instance or None, default=None - Determines the random number generator used for the initialization of - the lobpcg eigenvectors decomposition when ``solver`` == 'amg'. Pass - an int for reproducible results across multiple function calls. - See :term: `Glossary `. 
+ A pseudo random number generator used for the initialization + of the lobpcg eigen vectors decomposition when `eigen_solver == + 'amg'`, and for the K-Means initialization. Use an int to make + the results deterministic across calls (See + :term:`Glossary `). + + .. note:: + When using `eigen_solver == 'amg'`, + it is necessary to also fix the global numpy seed with + `np.random.seed(int)` to get deterministic results. See + https://github.com/pyamg/pyamg/issues/139 for further + information. eigen_tol : float, default=0.0 Stopping criterion for eigendecomposition of the Laplacian matrix @@ -396,10 +404,18 @@ class SpectralEmbedding(BaseEstimator): 1/n_features. random_state : int, RandomState instance or None, default=None - Determines the random number generator used for the initialization of - the lobpcg eigenvectors when ``solver`` == 'amg'. Pass an int for - reproducible results across multiple function calls. - See :term: `Glossary `. + A pseudo random number generator used for the initialization + of the lobpcg eigen vectors decomposition when `eigen_solver == + 'amg'`, and for the K-Means initialization. Use an int to make + the results deterministic across calls (See + :term:`Glossary `). + + .. note:: + When using `eigen_solver == 'amg'`, + it is necessary to also fix the global numpy seed with + `np.random.seed(int)` to get deterministic results. See + https://github.com/pyamg/pyamg/issues/139 for further + information. eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None The eigenvalue decomposition strategy to use. AMG requires pyamg From bc91f01e541de34adf084b07d5154db15cab9b58 Mon Sep 17 00:00:00 2001 From: michalkrawczyk Date: Fri, 28 May 2021 12:25:55 +0200 Subject: [PATCH 427/478] DOC improve penalty/solver/muticlass support in LogisticRegression* (#19855) Co-authored-by: Guillaume Lemaitre --- sklearn/linear_model/_logistic.py | 114 ++++++++++++++++++++---------- 1 file changed, 77 insertions(+), 37 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index c9f1f42f1eeec..abca6bb30e71f 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -1041,10 +1041,17 @@ class LogisticRegression(LinearClassifierMixin, Parameters ---------- penalty : {'l1', 'l2', 'elasticnet', 'none'}, default='l2' - Used to specify the norm used in the penalization. The 'newton-cg', - 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is - only supported by the 'saga' solver. If 'none' (not supported by the - liblinear solver), no regularization is applied. + Specify the norm of the penalty: + + - `'none'`: no penalty is added; + - `'l2'`: add a L2 penalty term and it is the default choice; + - `'l1'`: add a L1 penalty term; + - `'elasticnet'`: both L1 and L2 penalty terms are added. + + .. warning:: + Some penalties may not work with some solvers. See the parameter + `solver` below, to know the compatibility between the penalty and + solver. .. versionadded:: 0.19 l1 penalty with SAGA solver (allowing 'multinomial' + L1) @@ -1100,21 +1107,38 @@ class LogisticRegression(LinearClassifierMixin, solver : {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, \ default='lbfgs' - Algorithm to use in the optimization problem. - - - For small datasets, 'liblinear' is a good choice, whereas 'sag' and - 'saga' are faster for large ones. - - For multiclass problems, only 'newton-cg', 'sag', 'saga' and 'lbfgs' - handle multinomial loss; 'liblinear' is limited to one-versus-rest - schemes. 
- - 'newton-cg', 'lbfgs', 'sag' and 'saga' handle L2 or no penalty - - 'liblinear' and 'saga' also handle L1 penalty - - 'saga' also supports 'elasticnet' penalty - - 'liblinear' does not support setting ``penalty='none'`` - - Note that 'sag' and 'saga' fast convergence is only guaranteed on - features with approximately the same scale. You can - preprocess the data with a scaler from sklearn.preprocessing. + Algorithm to use in the optimization problem. Default is 'lbfgs'. + To choose a solver, you might want to consider the following aspects: + + - For small datasets, 'liblinear' is a good choice, whereas 'sag' + and 'saga' are faster for large ones; + - For multiclass problems, only 'newton-cg', 'sag', 'saga' and + 'lbfgs' handle multinomial loss; + - 'liblinear' is limited to one-versus-rest schemes. + + .. warning:: + The choice of the algorithm depends on the penalty chosen: + Supported penalties by solver: + + - 'newton-cg' - ['l2', 'none'] + - 'lbfgs' - ['l2', 'none'] + - 'liblinear' - ['l1', 'l2'] + - 'sag' - ['l2', 'none'] + - 'saga' - ['elasticnet', 'l1', 'l2', 'none'] + + .. note:: + 'sag' and 'saga' fast convergence is only guaranteed on + features with approximately the same scale. You can + preprocess the data with a scaler from :mod:`sklearn.preprocessing`. + + .. seealso:: + Refer to the User Guide for more information regarding + :class:`LogisticRegression` and more specifically the + `Table `_ + summarazing solver/penalty supports. + .. versionadded:: 0.17 Stochastic Average Gradient descent solver. @@ -1549,9 +1573,16 @@ class LogisticRegressionCV(LogisticRegression, n_samples > n_features. penalty : {'l1', 'l2', 'elasticnet'}, default='l2' - Used to specify the norm used in the penalization. The 'newton-cg', - 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is - only supported by the 'saga' solver. + Specify the norm of the penalty: + + - `'l2'`: add a L2 penalty term (used by default); + - `'l1'`: add a L1 penalty term; + - `'elasticnet'`: both L1 and L2 penalty terms are added. + + .. warning:: + Some penalties may not work with some solvers. See the parameter + `solver` below, to know the compatibility between the penalty and + solver. scoring : str or callable, default=None A string (see model evaluation documentation) or @@ -1563,21 +1594,30 @@ class LogisticRegressionCV(LogisticRegression, solver : {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, \ default='lbfgs' - Algorithm to use in the optimization problem. - - - For small datasets, 'liblinear' is a good choice, whereas 'sag' and - 'saga' are faster for large ones. - - For multiclass problems, only 'newton-cg', 'sag', 'saga' and 'lbfgs' - handle multinomial loss; 'liblinear' is limited to one-versus-rest - schemes. - - 'newton-cg', 'lbfgs' and 'sag' only handle L2 penalty, whereas - 'liblinear' and 'saga' handle L1 penalty. - - 'liblinear' might be slower in LogisticRegressionCV because it does - not handle warm-starting. - - Note that 'sag' and 'saga' fast convergence is only guaranteed on - features with approximately the same scale. You can preprocess the data - with a scaler from sklearn.preprocessing. + Algorithm to use in the optimization problem. Default is 'lbfgs'. 
+ To choose a solver, you might want to consider the following aspects: + + - For small datasets, 'liblinear' is a good choice, whereas 'sag' + and 'saga' are faster for large ones; + - For multiclass problems, only 'newton-cg', 'sag', 'saga' and + 'lbfgs' handle multinomial loss; + - 'liblinear' might be slower in :class:`LogisticRegressionCV` + because it does not handle warm-starting. 'liblinear' is + limited to one-versus-rest schemes. + + .. warning:: + The choice of the algorithm depends on the penalty chosen: + + - 'newton-cg' - ['l2'] + - 'lbfgs' - ['l2'] + - 'liblinear' - ['l1', 'l2'] + - 'sag' - ['l2'] + - 'saga' - ['elasticnet', 'l1', 'l2'] + + .. note:: + 'sag' and 'saga' fast convergence is only guaranteed on features + with approximately the same scale. You can preprocess the data with + a scaler from :mod:`sklearn.preprocessing`. .. versionadded:: 0.17 Stochastic Average Gradient descent solver. From 9406b3d2a4715fc71005194d30e0256c897453a0 Mon Sep 17 00:00:00 2001 From: naozin555 <37050583+naozin555@users.noreply.github.com> Date: Fri, 28 May 2021 20:53:10 +0900 Subject: [PATCH 428/478] TST Removed assert_warns_message from feature_selection/tests (#20158) --- .../feature_selection/tests/test_feature_select.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index 852c8228b2a76..b5e289cee9a00 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -12,9 +12,7 @@ from sklearn.utils._testing import assert_almost_equal, _convert_container from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_warns from sklearn.utils._testing import ignore_warnings -from sklearn.utils._testing import assert_warns_message from sklearn.utils import safe_mask from sklearn.datasets import make_classification, make_regression @@ -271,8 +269,8 @@ def test_select_kbest_zero(): support = univariate_filter.get_support() gtruth = np.zeros(10, dtype=bool) assert_array_equal(support, gtruth) - X_selected = assert_warns_message(UserWarning, 'No features were selected', - univariate_filter.transform, X) + with pytest.warns(UserWarning, match="No features were selected"): + X_selected = univariate_filter.transform(X) assert X_selected.shape == (20, 0) @@ -620,7 +618,8 @@ def test_f_classif_constant_feature(): X, y = make_classification(n_samples=10, n_features=5) X[:, 0] = 2.0 - assert_warns(UserWarning, f_classif, X, y) + with pytest.warns(UserWarning): + f_classif(X, y) def test_no_feature_selected(): @@ -639,8 +638,8 @@ def test_no_feature_selected(): ] for selector in strict_selectors: assert_array_equal(selector.get_support(), np.zeros(10)) - X_selected = assert_warns_message( - UserWarning, 'No features were selected', selector.transform, X) + with pytest.warns(UserWarning, match="No features were selected"): + X_selected = selector.transform(X) assert X_selected.shape == (40, 0) From 3a64fecd1f1d30a17998b254f94613adee48a930 Mon Sep 17 00:00:00 2001 From: Whidou Date: Fri, 28 May 2021 13:59:21 +0200 Subject: [PATCH 429/478] DOC Improve the description of california_housing (#20160) Co-authored-by: Whidou --- sklearn/datasets/descr/california_housing.rst | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/sklearn/datasets/descr/california_housing.rst 
b/sklearn/datasets/descr/california_housing.rst
index 9ab3b679b68f5..494803a125d12 100644
--- a/sklearn/datasets/descr/california_housing.rst
+++ b/sklearn/datasets/descr/california_housing.rst
@@ -10,27 +10,33 @@ California Housing dataset
     :Number of Attributes: 8 numeric, predictive attributes and the target
 
     :Attribute Information:
-        - MedInc        median income in block
-        - HouseAge      median house age in block
-        - AveRooms      average number of rooms
-        - AveBedrms     average number of bedrooms
-        - Population    block population
-        - AveOccup      average house occupancy
-        - Latitude      house block latitude
-        - Longitude     house block longitude
+        - MedInc        median income in block group
+        - HouseAge      median house age in block group
+        - AveRooms      average number of rooms per household
+        - AveBedrms     average number of bedrooms per household
+        - Population    block group population
+        - AveOccup      average number of household members
+        - Latitude      block group latitude
+        - Longitude     block group longitude
 
     :Missing Attribute Values: None
 
 This dataset was obtained from the StatLib repository.
-http://lib.stat.cmu.edu/datasets/
+https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html
 
-The target variable is the median house value for California districts.
+The target variable is the median house value for California districts,
+expressed in hundreds of thousands of dollars ($100,000).
 
 This dataset was derived from the 1990 U.S. census, using one row per census
 block group. A block group is the smallest geographical unit for which the U.S.
 Census Bureau publishes sample data (a block group typically has a population
 of 600 to 3,000 people).
 
+A household is a group of people residing within a home. Since the average
+number of rooms and bedrooms in this dataset are provided per household, these
+columns may take surprisingly large values for block groups with few households
+and many empty houses, such as vacation resorts.
+
 It can be downloaded/loaded using the
 :func:`sklearn.datasets.fetch_california_housing` function.
""" +# %% print(__doc__) -from time import time -import matplotlib.pyplot as plt - +# %% +# Loading the data and model fitting +# ---------------------------------- +# First, we load the olivetti faces dataset and limit the dataset to contain +# only the first five classes. Then we train a random forest on the dataset +# and evaluate the impurity-based feature importance. One drawback of this +# method is that it cannot be evaluated on a separate test set. For this +# example, we are interested in representing the information learned from +# the full dataset. Also, we'll set the number of cores to use for the tasks. from sklearn.datasets import fetch_olivetti_faces -from sklearn.ensemble import ExtraTreesClassifier -# Number of cores to use to perform parallel fitting of the forest model -n_jobs = 1 +# %% +# We select the number of cores to use to perform parallel fitting of +# the forest model. `-1` means use all available cores. +n_jobs = -1 +# %% # Load the faces dataset data = fetch_olivetti_faces() X, y = data.data, data.target -mask = y < 5 # Limit to 5 classes +# %% +# Limit the dataset to 5 classes. +mask = y < 5 X = X[mask] y = y[mask] -# Build a forest and compute the pixel importances -print("Fitting ExtraTreesClassifier on faces data with %d cores..." % n_jobs) -t0 = time() -forest = ExtraTreesClassifier(n_estimators=1000, - max_features=128, - n_jobs=n_jobs, - random_state=0) +# %% +# A random forest classifier will be fitted to compute the feature importances. +from sklearn.ensemble import RandomForestClassifier + +forest = RandomForestClassifier( + n_estimators=750, n_jobs=n_jobs, random_state=42) forest.fit(X, y) -print("done in %0.3fs" % (time() - t0)) + +# %% +# Feature importance based on mean decrease in impurity (MDI) +# ----------------------------------------------------------- +# Feature importances are provided by the fitted attribute +# `feature_importances_` and they are computed as the mean and standard +# deviation of accumulation of the impurity decrease within each tree. +# +# .. warning:: +# Impurity-based feature importances can be misleading for high cardinality +# features (many unique values). See :ref:`permutation_importance` as +# an alternative. +import time +import matplotlib.pyplot as plt + +start_time = time.time() +img_shape = data.images[0].shape importances = forest.feature_importances_ -importances = importances.reshape(data.images[0].shape) +elapsed_time = time.time() - start_time -# Plot pixel importances -plt.matshow(importances, cmap=plt.cm.hot) -plt.title("Pixel importances with forests of trees") +print(f"Elapsed time to compute the importances: " + f"{elapsed_time:.3f} seconds") +imp_reshaped = importances.reshape(img_shape) +plt.matshow(imp_reshaped, cmap=plt.cm.hot) +plt.title("Pixel importances using impurity values") +plt.colorbar() plt.show() + +# %% +# Can you still recognize a face? + +# %% +# The limitations of MDI is not a problem for this dataset because: +# +# 1. All features are (ordered) numeric and will thus not suffer the +# cardinality bias +# 2. We are only interested to represent knowledge of the forest acquired +# on the training set. +# +# If these two conditions are not met, it is recommended to instead use +# the :func:`~sklearn.inspection.permutation_importance`. 
From eea26e7e81bc4120ed00d8bb39f58100747cecdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Mon, 31 May 2021 11:37:14 +0200 Subject: [PATCH 431/478] MNT Clean deprecations for 1.0 | pairwise_distances (#19325) * cln deprecations pairwise_distances * cln match --- sklearn/metrics/pairwise.py | 20 +++------- sklearn/metrics/tests/test_pairwise.py | 51 ++++++++++---------------- 2 files changed, 25 insertions(+), 46 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index c9e9f60d8aaf3..5257f1bc6b95f 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1468,25 +1468,17 @@ def _precompute_metric_params(X, Y, metric=None, **kwds): if X is Y: V = np.var(X, axis=0, ddof=1, dtype=dtype) else: - warnings.warn( - "from version 1.0 (renaming of 0.25), pairwise_distances for " - "metric='seuclidean' will require V to be specified if Y is " - "passed.", - FutureWarning - ) - V = np.var(np.vstack([X, Y]), axis=0, ddof=1, dtype=dtype) + raise ValueError( + "The 'V' parameter is required for the seuclidean metric " + "when Y is passed.") return {'V': V} if metric == "mahalanobis" and 'VI' not in kwds: if X is Y: VI = np.linalg.inv(np.cov(X.T)).T else: - warnings.warn( - "from version 1.0 (renaming of 0.25), pairwise_distances for " - "metric='mahalanobis' will require VI to be specified if Y " - "is passed.", - FutureWarning - ) - VI = np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T + raise ValueError( + "The 'VI' parameter is required for the mahalanobis metric " + "when Y is passed.") return {'VI': VI} return {} diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 1ff62af04c05f..fba887d63b084 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -1323,44 +1323,31 @@ def test_check_preserve_type(): @pytest.mark.parametrize("metric", ["seuclidean", "mahalanobis"]) @pytest.mark.parametrize("dist_function", [pairwise_distances, pairwise_distances_chunked]) -@pytest.mark.parametrize("y_is_x", [True, False], ids=["Y is X", "Y is not X"]) -def test_pairwise_distances_data_derived_params(n_jobs, metric, dist_function, - y_is_x): +def test_pairwise_distances_data_derived_params(n_jobs, metric, dist_function): # check that pairwise_distances give the same result in sequential and # parallel, when metric has data-derived parameters. 
with config_context(working_memory=0.1): # to have more than 1 chunk rng = np.random.RandomState(0) X = rng.random_sample((100, 10)) - if y_is_x: - Y = X - expected_dist_default_params = squareform(pdist(X, metric=metric)) - if metric == "seuclidean": - params = {'V': np.var(X, axis=0, ddof=1)} - else: - params = {'VI': np.linalg.inv(np.cov(X.T)).T} - else: - Y = rng.random_sample((100, 10)) - expected_dist_default_params = cdist(X, Y, metric=metric) - if metric == "seuclidean": - params = {'V': np.var(np.vstack([X, Y]), axis=0, ddof=1)} - else: - params = {'VI': np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T} - - expected_dist_explicit_params = cdist(X, Y, metric=metric, **params) - # TODO: Remove warn_checker in 1.0 - if y_is_x: - warn_checker = pytest.warns(None) - else: - warn_checker = pytest.warns(FutureWarning, - match="to be specified if Y is passed") - with warn_checker: - dist = np.vstack(tuple(dist_function(X, Y, - metric=metric, - n_jobs=n_jobs))) - - assert_allclose(dist, expected_dist_explicit_params) - assert_allclose(dist, expected_dist_default_params) + expected_dist = squareform(pdist(X, metric=metric)) + dist = np.vstack(tuple(dist_function(X, metric=metric, n_jobs=n_jobs))) + + assert_allclose(dist, expected_dist) + + +@pytest.mark.parametrize("metric", ["seuclidean", "mahalanobis"]) +def test_pairwise_distances_data_derived_params_error(metric): + # check that pairwise_distances raises an error when Y is passed but + # metric has data-derived params that are not provided by the user. + rng = np.random.RandomState(0) + X = rng.random_sample((100, 10)) + Y = rng.random_sample((100, 10)) + + with pytest.raises(ValueError, + match=fr"The '(V|VI)' parameter is required for the " + fr"{metric} metric"): + pairwise_distances(X, Y, metric=metric) @pytest.mark.parametrize( From 1c36b49eb266d72d4211cd29c9a645e690925538 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 31 May 2021 14:49:14 +0200 Subject: [PATCH 432/478] MNT avoid pandas deprecation warning in test_validation.py (#20171) --- sklearn/utils/tests/test_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index c244d6f6caffc..ae2d5181f35a6 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -456,7 +456,7 @@ def test_check_array_pandas_dtype_casting(): # check that we handle pandas dtypes in a semi-reasonable way # this is actually tricky because we can't really know that this # should be integer ahead of converting it. - cat_df = pd.DataFrame([pd.Categorical([1, 2, 3])]) + cat_df = pd.DataFrame({"cat_col": pd.Categorical([1, 2, 3])}) assert (check_array(cat_df).dtype == np.int64) assert (check_array(cat_df, dtype=FLOAT_DTYPES).dtype == np.float64) From c8753d4174be948aefa2edfbe0f2e17a6b2bccb3 Mon Sep 17 00:00:00 2001 From: Takeshi Oura Date: Mon, 31 May 2021 23:33:03 +0900 Subject: [PATCH 433/478] ENH Preserving dtype for numpy.float32 in Least Angle Regression (#20155) --- doc/whats_new/v1.0.rst | 4 ++ sklearn/linear_model/_least_angle.py | 21 ++++++-- .../linear_model/tests/test_least_angle.py | 53 ++++++++++++++++++- 3 files changed, 72 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 7255fe82ff628..525f3439860ef 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -344,6 +344,10 @@ Changelog is now faster. This is especially noticeable on large sparse input. 
:pr:`19734` by :user:`Fred Robinson `. +- |Enhancement| `fit` method preserves dtype for numpy.float32 in + :class:`Lars`, :class:`LassoLars`, :class:`LassoLars`, :class:`LarsCV` and + :class:`LassoLarsCV`. :pr:`20155` by :user:`Takeshi Oura `. + :mod:`sklearn.manifold` ....................... diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 0932d0bd1aee3..3485344b99e02 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -476,12 +476,23 @@ def _lars_path_solver( max_features = min(max_iter, n_features) + dtypes = set(a.dtype for a in (X, y, Xy, Gram) if a is not None) + if len(dtypes) == 1: + # use the precision level of input data if it is consistent + return_dtype = next(iter(dtypes)) + else: + # fallback to double precision otherwise + return_dtype = np.float64 + if return_path: - coefs = np.zeros((max_features + 1, n_features)) - alphas = np.zeros(max_features + 1) + coefs = np.zeros((max_features + 1, n_features), dtype=return_dtype) + alphas = np.zeros(max_features + 1, dtype=return_dtype) else: - coef, prev_coef = np.zeros(n_features), np.zeros(n_features) - alpha, prev_alpha = np.array([0.]), np.array([0.]) # better ideas? + coef, prev_coef = (np.zeros(n_features, dtype=return_dtype), + np.zeros(n_features, dtype=return_dtype)) + alpha, prev_alpha = (np.array([0.], dtype=return_dtype), + np.array([0.], dtype=return_dtype)) + # above better ideas? n_iter, n_active = 0, 0 active, indices = list(), np.arange(n_features) @@ -948,7 +959,7 @@ def _fit(self, X, y, max_iter, alpha, fit_path, Xy=None): self.alphas_ = [] self.n_iter_ = [] - self.coef_ = np.empty((n_targets, n_features)) + self.coef_ = np.empty((n_targets, n_features), dtype=X.dtype) if fit_path: self.active_ = [] diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index 4321c39b45e92..656b7e3fef718 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -14,7 +14,7 @@ from sklearn import linear_model, datasets from sklearn.linear_model._least_angle import _lars_path_residues from sklearn.linear_model import LassoLarsIC, lars_path -from sklearn.linear_model import Lars, LassoLars +from sklearn.linear_model import Lars, LassoLars, LarsCV, LassoLarsCV # TODO: use another dataset that has multiple drops diabetes = datasets.load_diabetes() @@ -777,3 +777,54 @@ def test_copy_X_with_auto_gram(): linear_model.lars_path(X, y, Gram='auto', copy_X=True, method='lasso') # X did not change assert_allclose(X, X_before) + + +@pytest.mark.parametrize("LARS, has_coef_path, args", + ((Lars, True, {}), + (LassoLars, True, {}), + (LassoLarsIC, False, {}), + (LarsCV, True, {}), + # max_iter=5 is for avoiding ConvergenceWarning + (LassoLarsCV, True, {"max_iter": 5}))) +@pytest.mark.parametrize("dtype", (np.float32, np.float64)) +def test_lars_dtype_match(LARS, has_coef_path, args, dtype): + # The test ensures that the fit method preserves input dtype + rng = np.random.RandomState(0) + X = rng.rand(6, 6).astype(dtype) + y = rng.rand(6).astype(dtype) + + model = LARS(**args) + model.fit(X, y) + assert model.coef_.dtype == dtype + if has_coef_path: + assert model.coef_path_.dtype == dtype + assert model.intercept_.dtype == dtype + + +@pytest.mark.parametrize("LARS, has_coef_path, args", + ((Lars, True, {}), + (LassoLars, True, {}), + (LassoLarsIC, False, {}), + (LarsCV, True, {}), + # max_iter=5 is for avoiding ConvergenceWarning + (LassoLarsCV, 
True, {"max_iter": 5}))) +def test_lars_numeric_consistency(LARS, has_coef_path, args): + # The test ensures numerical consistency between trained coefficients + # of float32 and float64. + rtol = 1e-5 + atol = 1e-5 + + rng = np.random.RandomState(0) + X_64 = rng.rand(6, 6) + y_64 = rng.rand(6) + + model_64 = LARS(**args).fit(X_64, y_64) + model_32 = LARS(**args).fit(X_64.astype(np.float32), + y_64.astype(np.float32)) + + assert_allclose(model_64.coef_, model_32.coef_, rtol=rtol, atol=atol) + if has_coef_path: + assert_allclose(model_64.coef_path_, model_32.coef_path_, + rtol=rtol, atol=atol) + assert_allclose(model_64.intercept_, model_32.intercept_, + rtol=rtol, atol=atol) From 7bb3e22b3c454a59619a56c314be04b4b303e09a Mon Sep 17 00:00:00 2001 From: Alihan Zihna Date: Mon, 31 May 2021 20:36:23 +0100 Subject: [PATCH 434/478] TST change load_boston in test_base to make_* (#20174) Co-authored-by: maikia Co-authored-by: Alihan Zihna --- sklearn/tests/test_base.py | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index c91419bf10a0e..3556f2fa20219 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -294,26 +294,24 @@ def test_set_params_updates_valid_params(): assert gscv.estimator.C == 42.0 -def test_score_sample_weight(): - +@pytest.mark.parametrize("tree,dataset", [ + (DecisionTreeClassifier(max_depth=2, random_state=0), + datasets.make_classification(random_state=0)), + (DecisionTreeRegressor(max_depth=2, random_state=0), + datasets.make_regression(random_state=0)), +]) +def test_score_sample_weight(tree, dataset): rng = np.random.RandomState(0) - - # test both ClassifierMixin and RegressorMixin - estimators = [DecisionTreeClassifier(max_depth=2), - DecisionTreeRegressor(max_depth=2)] - sets = [datasets.load_iris(), - datasets.load_boston()] - - for est, ds in zip(estimators, sets): - est.fit(ds.data, ds.target) - # generate random sample weights - sample_weight = rng.randint(1, 10, size=len(ds.target)) - # check that the score with and without sample weights are different - assert (est.score(ds.data, ds.target) != - est.score(ds.data, ds.target, - sample_weight=sample_weight)), ( - "Unweighted and weighted scores " - "are unexpectedly equal") + # check that the score with and without sample weights are different + X, y = dataset + + tree.fit(X, y) + # generate random sample weights + sample_weight = rng.randint(1, 10, size=len(y)) + score_unweighted = tree.score(X, y) + score_weighted = tree.score(X, y, sample_weight=sample_weight) + msg = "Unweighted and weighted scores are unexpectedly equal" + assert score_unweighted != score_weighted, msg def test_clone_pandas_dataframe(): From 56f4b836275c49fb40b5642f7f4c69da009c6e93 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 1 Jun 2021 10:33:08 +0200 Subject: [PATCH 435/478] Fix number of splines in legend of example plot (#20142) --- examples/linear_model/plot_polynomial_interpolation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/linear_model/plot_polynomial_interpolation.py b/examples/linear_model/plot_polynomial_interpolation.py index 34972b9522c68..c6cd1f9d591bd 100644 --- a/examples/linear_model/plot_polynomial_interpolation.py +++ b/examples/linear_model/plot_polynomial_interpolation.py @@ -128,7 +128,7 @@ def f(x): splt = SplineTransformer(n_knots=4, degree=3).fit(X_train) axes[1].plot(x_plot, splt.transform(X_plot)) -axes[1].legend(axes[1].lines, [f"spline 
{n}" for n in range(4)]) +axes[1].legend(axes[1].lines, [f"spline {n}" for n in range(6)]) axes[1].set_title("SplineTransformer") # plot knots of spline @@ -138,7 +138,7 @@ def f(x): # %% # In the left plot, we recognize the lines corresponding to simple monomials -# from ``x**0`` to ``x**3``. In the right figure, we see the four B-spline +# from ``x**0`` to ``x**3``. In the right figure, we see the six B-spline # basis functions of ``degree=3`` and also the four knot positions that were # chosen during ``fit``. Note that there are ``degree`` number of additional # knots each to the left and to the right of the fitted interval. These are From 337f47abc6e74b91cfbda7709c641176719f979d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 1 Jun 2021 10:44:22 +0200 Subject: [PATCH 436/478] DOC fix hyperlink for some users --- doc/whats_new/v1.0.rst | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 525f3439860ef..de2449d32ed5f 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -164,7 +164,8 @@ Changelog :pr:`17622` by :user:`Jérémie du Boisberranger `. - |FIX| :class:`cluster.AgglomerativeClustering` now supports readonly - memory-mapped datasets. :pr:`19883` by `Julien Jerphanion `. + memory-mapped datasets. + :pr:`19883` by :user:`Julien Jerphanion `. - |API| :class:`cluster.Birch` attributes, `fit_` and `partial_fit_`, are deprecated and will be removed in 1.2. :pr:`19297` by `Thomas Fan`_. @@ -274,8 +275,8 @@ Changelog - |Feature| :func:`feature_selection.r_regression` computes Pearson's R correlation coefficients between the features and the target. - :pr:`17169` by `Dmytro Lituiev ` - and `Julien Jerphanion `. + :pr:`17169` by :user:`Dmytro Lituiev ` + and :user:`Julien Jerphanion `. :mod:`sklearn.inspection` ......................... @@ -407,8 +408,8 @@ Changelog :class:`model_selection.StratifiedKFold` and `model_selection.GroupKFold`, providing an ability to split data preserving the distribution of classes in each split while keeping each group within a single split. - :pr:`18649` by `Leandro Hermida ` and - `Rodion Martynov `. + :pr:`18649` by :user:`Leandro Hermida ` and + :user:`Rodion Martynov `. :mod:`sklearn.naive_bayes` .......................... @@ -436,7 +437,7 @@ Changelog :user:`Julien Jerphanion `. - |FIX| :class:`neighbors.DistanceMetric` subclasses now support readonly - memory-mapped datasets. :pr:`19883` by `Julien Jerphanion `. + memory-mapped datasets. :pr:`19883` by :user:`Julien Jerphanion `. - |FIX| :class:`neighbors.NearestNeighbors`, :class:`neighbors.KNeighborsClassifier`, :class:`neighbors.RadiusNeighborsClassifier`, :class:`neighbors.KNeighborsRegressor` From 1e24ea2b0df0cd828cd5dcbc6ee8e00ef0642c52 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 1 Jun 2021 11:01:32 +0200 Subject: [PATCH 437/478] DOC fix missing hyperlink in whats new --- doc/whats_new/v1.0.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index de2449d32ed5f..6ecc421bafd48 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -405,9 +405,10 @@ Changelog .............................. - |Feature| added :class:`model_selection.StratifiedGroupKFold`, that combines - :class:`model_selection.StratifiedKFold` and `model_selection.GroupKFold`, - providing an ability to split data preserving the distribution of classes in - each split while keeping each group within a single split. 
+ :class:`model_selection.StratifiedKFold` and + :class:`model_selection.GroupKFold`, providing an ability to split data + preserving the distribution of classes in each split while keeping each + group within a single split. :pr:`18649` by :user:`Leandro Hermida ` and :user:`Rodion Martynov `. From 8bc36080d9855d29e1fcbc86da46a9e89e86c046 Mon Sep 17 00:00:00 2001 From: KurumeYuta <84881778+KurumeYuta@users.noreply.github.com> Date: Tue, 1 Jun 2021 18:38:16 +0900 Subject: [PATCH 438/478] [MRG] Fix Sparse PCA optimization task #19775 (#20153) --- doc/modules/decomposition.rst | 14 ++++++++----- sklearn/decomposition/_dict_learning.py | 26 ++++++++++++++++++------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst index fd51f60d8bfc6..0939318050d5c 100644 --- a/doc/modules/decomposition.rst +++ b/doc/modules/decomposition.rst @@ -230,12 +230,14 @@ problem solved is a PCA problem (dictionary learning) with an .. math:: (U^*, V^*) = \underset{U, V}{\operatorname{arg\,min\,}} & \frac{1}{2} - ||X-UV||_2^2+\alpha||V||_1 \\ + ||X-UV||_{\text{Fro}}^2+\alpha||V||_{1,1} \\ \text{subject to } & ||U_k||_2 = 1 \text{ for all } 0 \leq k < n_{components} - -The sparsity-inducing :math:`\ell_1` norm also prevents learning +:math:`||.||_{\text{Fro}}` stands for the Frobenius norm and :math:`||.||_{1,1}` +stands for the entry-wise matrix norm which is the sum of the absolute values +of all the entries in the matrix. +The sparsity-inducing :math:`||.||_{1,1}` matrix norm also prevents learning components from noise when few training samples are available. The degree of penalization (and thus sparsity) can be adjusted through the hyperparameter ``alpha``. Small values lead to a gently regularized @@ -510,7 +512,7 @@ dictionary fixed, and then updating the dictionary to best fit the sparse code. .. math:: (U^*, V^*) = \underset{U, V}{\operatorname{arg\,min\,}} & \frac{1}{2} - ||X-UV||_2^2+\alpha||U||_1 \\ + ||X-UV||_{\text{Fro}}^2+\alpha||U||_{1,1} \\ \text{subject to } & ||V_k||_2 = 1 \text{ for all } 0 \leq k < n_{\mathrm{atoms}} @@ -525,7 +527,9 @@ dictionary fixed, and then updating the dictionary to best fit the sparse code. .. centered:: |pca_img2| |dict_img2| - +:math:`||.||_{\text{Fro}}` stands for the Frobenius norm and :math:`||.||_{1,1}` +stands for the entry-wise matrix norm which is the sum of the absolute values +of all the entries in the matrix. After using such a procedure to fit the dictionary, the transform is simply a sparse coding step that shares the same implementation with all dictionary learning objects (see :ref:`SparseCoder`). diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 1c48542a1c9ec..80b64570b3401 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -436,11 +436,13 @@ def dict_learning(X, n_components, *, alpha, max_iter=100, tol=1e-8, Finds the best dictionary and the corresponding sparse code for approximating the data matrix X by solving:: - (U^*, V^*) = argmin 0.5 || X - U V ||_2^2 + alpha * || U ||_1 + (U^*, V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1 (U,V) with || V_k ||_2 = 1 for all 0 <= k < n_components - where V is the dictionary and U is the sparse code. + where V is the dictionary and U is the sparse code. ||.||_Fro stands for + the Frobenius norm and ||.||_1,1 stands for the entry-wise matrix norm + which is the sum of the absolute values of all the entries in the matrix. 
Read more in the :ref:`User Guide `. @@ -637,12 +639,14 @@ def dict_learning_online(X, n_components=2, *, alpha=1, n_iter=100, Finds the best dictionary and the corresponding sparse code for approximating the data matrix X by solving:: - (U^*, V^*) = argmin 0.5 || X - U V ||_2^2 + alpha * || U ||_1 + (U^*, V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1 (U,V) with || V_k ||_2 = 1 for all 0 <= k < n_components - where V is the dictionary and U is the sparse code. This is - accomplished by repeatedly iterating over mini-batches by slicing + where V is the dictionary and U is the sparse code. ||.||_Fro stands for + the Frobenius norm and ||.||_1,1 stands for the entry-wise matrix norm + which is the sum of the absolute values of all the entries in the matrix. + This is accomplished by repeatedly iterating over mini-batches by slicing the input data. Read more in the :ref:`User Guide `. @@ -1137,10 +1141,14 @@ class DictionaryLearning(_BaseSparseCoding, BaseEstimator): Solves the optimization problem:: - (U^*,V^*) = argmin 0.5 || X - U V ||_2^2 + alpha * || U ||_1 + (U^*,V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1 (U,V) with || V_k ||_2 = 1 for all 0 <= k < n_components + ||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for + the entry-wise matrix norm which is the sum of the absolute values + of all the entries in the matrix. + Read more in the :ref:`User Guide `. Parameters @@ -1367,10 +1375,14 @@ class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): Solves the optimization problem:: - (U^*,V^*) = argmin 0.5 || X - U V ||_2^2 + alpha * || U ||_1 + (U^*,V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1 (U,V) with || V_k ||_2 = 1 for all 0 <= k < n_components + ||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for + the entry-wise matrix norm which is the sum of the absolute values + of all the entries in the matrix. + Read more in the :ref:`User Guide `. Parameters From 777ac15e67ff4c5ff1be89e3db6f1c385a1b415e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Tue, 1 Jun 2021 19:14:04 +0200 Subject: [PATCH 439/478] MNT add n_features_in_ through the feature_extraction module (#20180) --- sklearn/feature_extraction/tests/test_text.py | 9 --------- sklearn/feature_extraction/text.py | 15 ++++++++------- sklearn/tests/test_common.py | 1 - sklearn/utils/estimator_checks.py | 3 ++- 4 files changed, 10 insertions(+), 18 deletions(-) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 0033ae84948ac..324d4f0875854 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -504,15 +504,6 @@ def test_vectorizer(): with pytest.raises(ValueError): t3.transform(counts_train) - # test idf transform with incompatible n_features - X = [[1, 1, 5], - [1, 1, 0]] - t3.fit(X) - X_incompt = [[1, 3], - [1, 3]] - with pytest.raises(ValueError): - t3.transform(X_incompt) - # L1-normalized term frequencies sum to one assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 00debc059440c..fed5c16ffbb54 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1386,6 +1386,11 @@ class TfidfTransformer(TransformerMixin, BaseEstimator): .. 
versionadded:: 0.20 + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 1.0 + Examples -------- >>> from sklearn.feature_extraction.text import TfidfTransformer @@ -1436,7 +1441,7 @@ def fit(self, X, y=None): X : sparse matrix of shape n_samples, n_features) A matrix of term/token counts. """ - X = check_array(X, accept_sparse=('csr', 'csc')) + X = self._validate_data(X, accept_sparse=('csr', 'csc')) if not sp.issparse(X): X = sp.csr_matrix(X) dtype = X.dtype if X.dtype in FLOAT_DTYPES else np.float64 @@ -1476,7 +1481,8 @@ def transform(self, X, copy=True): ------- vectors : sparse matrix of shape (n_samples, n_features) """ - X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES, copy=copy) + X = self._validate_data(X, accept_sparse='csr', + dtype=FLOAT_DTYPES, copy=copy, reset=False) if not sp.issparse(X): X = sp.csr_matrix(X, dtype=np.float64) @@ -1493,11 +1499,6 @@ def transform(self, X, copy=True): check_is_fitted(self, attributes=["idf_"], msg='idf vector is not fitted') - expected_n_features = self._idf_diag.shape[0] - if n_features != expected_n_features: - raise ValueError("Input has n_features=%d while the model" - " has been trained with n_features=%d" % ( - n_features, expected_n_features)) # *= doesn't work X = X * self._idf_diag diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 8ec4125547722..6fd57c9e8d4fc 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -262,7 +262,6 @@ def test_search_cv(estimator, check, request): # check_classifiers_train would need to be updated with the error message N_FEATURES_IN_AFTER_FIT_MODULES_TO_IGNORE = { 'compose', - 'feature_extraction', 'model_selection', 'multiclass', 'multioutput', diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index f0c0383a7bfe8..2058c8308ec29 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -3121,7 +3121,8 @@ def check_n_features_in_after_fitting(name, estimator_orig): # Make sure that n_features_in are checked after fitting tags = _safe_tags(estimator_orig) - if "2darray" not in tags["X_types"] or tags["no_validation"]: + if ("2darray" not in tags["X_types"] and "sparse" not in tags["X_types"] or + tags["no_validation"]): return rng = np.random.RandomState(0) From c09be6ab8cf5366daea4a59ffe33cd437f58d4a7 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 1 Jun 2021 13:41:00 -0400 Subject: [PATCH 440/478] TST Fixes test and mis-matched pandas version (#20149) --- README.rst | 2 +- doc/whats_new/v1.0.rst | 2 +- sklearn/_min_dependencies.py | 4 ++-- sklearn/tests/test_min_dependencies_readme.py | 4 +++- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index 3c685fa4af13e..cf625bcd0f30d 100644 --- a/README.rst +++ b/README.rst @@ -27,7 +27,7 @@ .. _DOI: https://zenodo.org/badge/latestdoi/21369/scikit-learn/scikit-learn .. |PythonMinVersion| replace:: 3.7 -.. |NumPyMinVersion| replace:: 1.14.5 +.. |NumPyMinVersion| replace:: 1.14.6 .. |SciPyMinVersion| replace:: 1.1.0 .. |JoblibMinVersion| replace:: 0.11 .. 
|ThreadpoolctlMinVersion| replace:: 2.0.0 diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 6ecc421bafd48..930f99bda4cbb 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -15,7 +15,7 @@ Version 1.0.0 Minimal dependencies -------------------- -Version 1.0.0 of scikit-learn requires python 3.7+, numpy 1.14.5+ and +Version 1.0.0 of scikit-learn requires python 3.7+, numpy 1.14.6+ and scipy 1.1.0+. Optional minimal dependency is matplotlib 2.2.2+. Enforcing keyword-only arguments diff --git a/sklearn/_min_dependencies.py b/sklearn/_min_dependencies.py index d878a04eb4523..6a6ff13c479d1 100644 --- a/sklearn/_min_dependencies.py +++ b/sklearn/_min_dependencies.py @@ -7,7 +7,7 @@ if platform.python_implementation() == 'PyPy': NUMPY_MIN_VERSION = '1.19.0' else: - NUMPY_MIN_VERSION = '1.14.5' + NUMPY_MIN_VERSION = '1.14.6' SCIPY_MIN_VERSION = '1.1.0' JOBLIB_MIN_VERSION = '0.11' @@ -27,7 +27,7 @@ 'cython': (CYTHON_MIN_VERSION, 'build'), 'matplotlib': ('2.2.2', 'benchmark, docs, examples, tests'), 'scikit-image': ('0.14.5', 'docs, examples, tests'), - 'pandas': ('0.23.4', 'benchmark, docs, examples, tests'), + 'pandas': ('0.25.0', 'benchmark, docs, examples, tests'), 'seaborn': ('0.9.0', 'docs, examples'), 'memory_profiler': ('0.57.0', 'benchmark, docs'), 'pytest': (PYTEST_MIN_VERSION, 'tests'), diff --git a/sklearn/tests/test_min_dependencies_readme.py b/sklearn/tests/test_min_dependencies_readme.py index f3958a88b6158..45825a18092a1 100644 --- a/sklearn/tests/test_min_dependencies_readme.py +++ b/sklearn/tests/test_min_dependencies_readme.py @@ -37,9 +37,11 @@ def test_min_dependencies_readme(): continue package, version = matched.group(2), matched.group(5) + package = package.lower() if package in dependent_packages: version = parse_version(version) min_version = parse_version(dependent_packages[package][0]) - assert version == min_version + assert version == min_version, (f"{package} has a mismatched " + "version") From 952b10fd549c4562fb47fb4e357b3e4b375f4fb5 Mon Sep 17 00:00:00 2001 From: Jonathan Schneider Date: Tue, 1 Jun 2021 19:45:16 +0200 Subject: [PATCH 441/478] [MRG] DOC Improve documentation of Latent Dirichlet Allocation (#20181) Rename `max_iters` to public name `max_doc_update_iter` to prevent confusion between max_iter and max_iters Improve parameter documentation --- sklearn/decomposition/_lda.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 34432557814c2..75b123a118338 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -29,7 +29,7 @@ def _update_doc_distribution(X, exp_topic_word_distr, doc_topic_prior, - max_iters, + max_doc_update_iter, mean_change_tol, cal_sstats, random_state): """E-step: update document-topic distribution. @@ -45,7 +45,7 @@ def _update_doc_distribution(X, exp_topic_word_distr, doc_topic_prior, doc_topic_prior : float Prior of document topic distribution `theta`. - max_iters : int + max_doc_update_iter : int Max number of iterations for updating document topic distribution in the E-step. 
@@ -105,7 +105,7 @@ def _update_doc_distribution(X, exp_topic_word_distr, doc_topic_prior, exp_topic_word_d = exp_topic_word_distr[:, ids] # Iterate between `doc_topic_d` and `norm_phi` until convergence - for _ in range(0, max_iters): + for _ in range(0, max_doc_update_iter): last_d = doc_topic_d # The optimal phi_{dwk} is proportional to @@ -187,7 +187,9 @@ class LatentDirichletAllocation(TransformerMixin, BaseEstimator): called tau_0. max_iter : int, default=10 - The maximum number of iterations. + The maximum number of passes over the training data (aka epochs). + It only impacts the behavior in the :meth:`fit` method, and not the + :meth:`partial_fit` method. batch_size : int, default=128 Number of documents to use in each EM iteration. Only used in online From 5ff8632201d1faae8e6e0bc464d07b84c6df2578 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 1 Jun 2021 21:09:37 +0200 Subject: [PATCH 442/478] MNT add path as a static abstract method to LinearModelCV (#19970) --- sklearn/linear_model/_coordinate_descent.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 1d93a6695b0e0..da50a3a817a38 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -1176,6 +1176,11 @@ def _get_estimator(self): def _is_multitask(self): """Bool indicating if class is meant for multidimensional target.""" + @staticmethod + @abstractmethod + def path(X, y, **kwargs): + """Compute path with coordinate descent.""" + def fit(self, X, y): """Fit linear model with coordinate descent. From 07a0cf37c9c1efa1ae7fddc9a33d0ee8798e635e Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 1 Jun 2021 16:06:28 -0400 Subject: [PATCH 443/478] MNT Finish removing python 3.6 (#20185) --- README.rst | 4 ++-- doc/developers/advanced_installation.rst | 2 +- setup.py | 9 ++++----- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/README.rst b/README.rst index cf625bcd0f30d..b5ee90a304eff 100644 --- a/README.rst +++ b/README.rst @@ -17,8 +17,8 @@ .. |Nightly wheels| image:: https://github.com/scikit-learn/scikit-learn/workflows/Wheel%20builder/badge.svg?event=schedule .. _`Nightly wheels`: https://github.com/scikit-learn/scikit-learn/actions?query=workflow%3A%22Wheel+builder%22+event%3Aschedule -.. |PythonVersion| image:: https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9-blue -.. _PythonVersion: https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9-blue +.. |PythonVersion| image:: https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9-blue +.. _PythonVersion: https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9-blue .. |PyPi| image:: https://badge.fury.io/py/scikit-learn.svg .. 
_PyPi: https://badge.fury.io/py/scikit-learn diff --git a/doc/developers/advanced_installation.rst b/doc/developers/advanced_installation.rst index 489f6447d57c8..c1dec51723861 100644 --- a/doc/developers/advanced_installation.rst +++ b/doc/developers/advanced_installation.rst @@ -94,7 +94,7 @@ Runtime dependencies Scikit-learn requires the following dependencies both at build time and at runtime: -- Python (>= 3.6), +- Python (>= 3.7), - NumPy (>= |NumpyMinVersion|), - SciPy (>= |ScipyMinVersion|), - Joblib (>= |JoblibMinVersion|), diff --git a/setup.py b/setup.py index 9758f62de1301..91602bafca408 100755 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ try: import builtins except ImportError: - # Python 2 compat: just to be able to declare that Python >=3.6 is needed. + # Python 2 compat: just to be able to declare that Python >=3.7 is needed. import __builtin__ as builtins # This is a bit (!) hackish: we are setting a global variable so that the @@ -145,7 +145,7 @@ def build_extensions(self): except ImportError: # Numpy should not be a dependency just to be able to introspect - # that python 3.6 is required. + # that python 3.7 is required. pass @@ -251,7 +251,6 @@ def setup_package(): 'Operating System :: Unix', 'Operating System :: MacOS', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', @@ -261,7 +260,7 @@ def setup_package(): 'Implementation :: PyPy') ], cmdclass=cmdclass, - python_requires=">=3.6", + python_requires=">=3.7", install_requires=min_deps.tag_to_packages['install'], package_data={'': ['*.pxd']}, **extra_setuptools_args) @@ -280,7 +279,7 @@ def setup_package(): else: if sys.version_info < (3, 6): raise RuntimeError( - "Scikit-learn requires Python 3.6 or later. The current" + "Scikit-learn requires Python 3.7 or later. The current" " Python version is %s installed in %s." % (platform.python_version(), sys.executable)) From 6850c04186b88e88e9c8cd6eb673721af806e3da Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 1 Jun 2021 17:22:47 -0400 Subject: [PATCH 444/478] API Deprecates support for np.matrix in check_array (#20165) * API Deprecates support for np.matrix in check_array * DOC Adds whats new * ENH Adds link to numpy.matrix --- doc/whats_new/v1.0.rst | 10 ++++++++++ sklearn/utils/tests/test_validation.py | 19 +++++++++++++++++++ sklearn/utils/validation.py | 8 ++++++++ 3 files changed, 37 insertions(+) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 930f99bda4cbb..fac578bcb1b03 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -123,6 +123,9 @@ Changelog - For :class:`tree.ExtraTreeRegressor`, `criterion="mae"` is deprecated, use `"absolute_error"` instead. +- |API| `np.matrix` usage is deprecated in 1.0 and will raise a `TypeError` in + 1.2. :pr:`20165` by `Thomas Fan`_. + :mod:`sklearn.base` ................... @@ -512,6 +515,13 @@ Changelog precision of the computed variance was very poor when the real variance is exactly zero. :pr:`19766` by :user:`Jérémie du Boisberranger `. +:mod:`sklearn.validation` +......................... + +- |Fix| Support for `np.matrix` is deprecated in + :func:`~sklearn.utils.check_array` in 1.0 and will raise a `TypeError` in + 1.2. :pr:`20165` by `Thomas Fan`_. 
+ Code and Documentation Contributors ----------------------------------- diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index ae2d5181f35a6..b3e28d7deeeef 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -57,6 +57,9 @@ from sklearn.utils._testing import TempMemmap +# TODO: Remove np.matrix usage in 1.2 +@pytest.mark.filterwarnings( + "ignore:np.matrix usage is deprecated in 1.0:FutureWarning") @pytest.mark.filterwarnings( "ignore:the matrix subclass:PendingDeprecationWarning") def test_as_float_array(): @@ -115,6 +118,9 @@ def test_as_float_array_nan(X): assert_allclose_dense_sparse(X_converted, X) +# TODO: Remove np.matrix usage in 1.2 +@pytest.mark.filterwarnings( + "ignore:np.matrix usage is deprecated in 1.0:FutureWarning") @pytest.mark.filterwarnings( "ignore:the matrix subclass:PendingDeprecationWarning") def test_np_matrix(): @@ -1379,3 +1385,16 @@ def test_num_features_errors_scalars(X): ) with pytest.raises(TypeError, match=msg): _num_features(X) + + +# TODO: Remove in 1.2 +@pytest.mark.filterwarnings( + "ignore:the matrix subclass:PendingDeprecationWarning") +def test_check_array_deprecated_matrix(): + """Test that matrix support is deprecated in 1.0.""" + + X = np.matrix(np.arange(5)) + msg = ("np.matrix usage is deprecated in 1.0 and will raise a TypeError " + "in 1.2. Please convert to a numpy array with np.asarray.") + with pytest.warns(FutureWarning, match=msg): + check_array(X) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index acfc8f5d10db2..b7af987d60c83 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -543,6 +543,14 @@ def check_array(array, accept_sparse=False, *, accept_large_sparse=True, array_converted : object The converted and validated array. """ + if isinstance(array, np.matrix): + warnings.warn( + "np.matrix usage is deprecated in 1.0 and will raise a TypeError " + "in 1.2. Please convert to a numpy array with np.asarray. 
For " + "more information see: " + "https://numpy.org/doc/stable/reference/generated/numpy.matrix.html", # noqa + FutureWarning) + # store reference to original array to check if copy is needed when # function returns array_orig = array From 5d25ce13ae0fa8f1f9e02d046d1820b6dcfd6155 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Wed, 2 Jun 2021 17:41:41 +0200 Subject: [PATCH 445/478] TST enable test docstring params for feature extraction module (#20188) --- sklearn/tests/test_docstring_parameters.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index cc10f11fcd574..bceaa21801872 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -183,7 +183,6 @@ def _construct_searchcv_instance(SearchCV): 'discriminant_analysis', 'dummy', 'ensemble', - 'feature_extraction', 'feature_selection', 'gaussian_process', 'impute', From e203750cf085d229c1755873d7208b6813de0443 Mon Sep 17 00:00:00 2001 From: tsuga <2888173+tsuga@users.noreply.github.com> Date: Thu, 3 Jun 2021 10:54:41 +0900 Subject: [PATCH 446/478] DOC fix a reference in sklearn.ensemble.GradientBoostingRegressor (#20198) --- sklearn/ensemble/_gb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 54e4e510cd9b9..78fee588ecf4e 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -1612,7 +1612,7 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): HistGradientBoostingRegressor : Histogram-based Gradient Boosting Classification Tree. sklearn.tree.DecisionTreeRegressor : A decision tree regressor. - sklearn.tree.RandomForestRegressor : A random forest regressor. + sklearn.ensemble.RandomForestRegressor : A random forest regressor. Notes ----- From 64bafa313ef7afcbed74bbb0189da48ccf8e2230 Mon Sep 17 00:00:00 2001 From: Conner Shen Date: Thu, 3 Jun 2021 06:15:30 -0400 Subject: [PATCH 447/478] FIX mcc zero divsion (#19977) --- sklearn/metrics/_classification.py | 5 ++- sklearn/metrics/tests/test_classification.py | 35 ++++++++------------ sklearn/utils/_testing.py | 24 -------------- 3 files changed, 15 insertions(+), 49 deletions(-) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 434fd89f5bbd9..ada2af3f111e2 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -871,12 +871,11 @@ def matthews_corrcoef(y_true, y_pred, *, sample_weight=None): cov_ytyp = n_correct * n_samples - np.dot(t_sum, p_sum) cov_ypyp = n_samples ** 2 - np.dot(p_sum, p_sum) cov_ytyt = n_samples ** 2 - np.dot(t_sum, t_sum) - mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) - if np.isnan(mcc): + if cov_ypyp * cov_ytyt == 0: return 0. 
else: - return mcc + return cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) def zero_one_loss(y_true, y_pred, *, normalize=True, sample_weight=None): diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index feed701f6cead..df352a8031948 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -20,7 +20,6 @@ from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose -from sklearn.utils._testing import assert_warns_div0 from sklearn.utils._testing import assert_no_warnings from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import ignore_warnings @@ -622,7 +621,6 @@ def test_cohen_kappa(): weights="quadratic"), 0.9541, decimal=4) -@ignore_warnings def test_matthews_corrcoef_nan(): assert matthews_corrcoef([0], [1]) == 0.0 assert matthews_corrcoef([0, 0], [0, 1]) == 0.0 @@ -684,17 +682,11 @@ def test_matthews_corrcoef(): assert_almost_equal(matthews_corrcoef(y_true, y_true_inv2), -1) # For the zero vector case, the corrcoef cannot be calculated and should - # result in a RuntimeWarning - mcc = assert_warns_div0(matthews_corrcoef, [0, 0, 0, 0], [0, 0, 0, 0]) - - # But will output 0 - assert_almost_equal(mcc, 0.) + # output 0 + assert_almost_equal(matthews_corrcoef([0, 0, 0, 0], [0, 0, 0, 0]), 0.) # And also for any other vector with 0 variance - mcc = assert_warns_div0(matthews_corrcoef, y_true, ['a'] * len(y_true)) - - # But will output 0 - assert_almost_equal(mcc, 0.) + assert_almost_equal(matthews_corrcoef(y_true, ['a'] * len(y_true)), 0.) # These two vectors have 0 correlation and hence mcc should be 0 y_1 = [1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1] @@ -731,12 +723,15 @@ def test_matthews_corrcoef_multiclass(): assert_almost_equal(matthews_corrcoef(y_true, y_pred_min), -12 / np.sqrt(24 * 16)) - # Zero variance will result in an mcc of zero and a Runtime Warning + # Zero variance will result in an mcc of zero y_true = [0, 1, 2] y_pred = [3, 3, 3] - mcc = assert_warns_message(RuntimeWarning, 'invalid value encountered', - matthews_corrcoef, y_true, y_pred) - assert_almost_equal(mcc, 0.0) + assert_almost_equal(matthews_corrcoef(y_true, y_pred), 0.0) + + # Also for ground truth with zero variance + y_true = [3, 3, 3] + y_pred = [0, 1, 2] + assert_almost_equal(matthews_corrcoef(y_true, y_pred), 0.0) # These two vectors have 0 correlation and hence mcc should be 0 y_1 = [0, 1, 2, 0, 1, 2, 0, 1, 2] @@ -754,16 +749,12 @@ def test_matthews_corrcoef_multiclass(): sample_weight=sample_weight), -1) # For the zero vector case, the corrcoef cannot be calculated and should - # result in a RuntimeWarning + # output 0 y_true = [0, 0, 1, 2] y_pred = [0, 0, 1, 2] sample_weight = [1, 1, 0, 0] - mcc = assert_warns_message(RuntimeWarning, 'invalid value encountered', - matthews_corrcoef, y_true, y_pred, - sample_weight=sample_weight) - - # But will output 0 - assert_almost_equal(mcc, 0.) + assert_almost_equal(matthews_corrcoef(y_true, y_pred, + sample_weight=sample_weight), 0.) 
@pytest.mark.parametrize('n_points', [100, 10000]) diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 8fc77748740d5..55ea23afbf9ec 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -187,30 +187,6 @@ def check_in_message(msg): return message in msg return result -def assert_warns_div0(func, *args, **kw): - """Assume that numpy's warning for divide by zero is raised. - - Handles the case of platforms that do not support warning on divide by - zero. - - Parameters - ---------- - func - *args - **kw - """ - - with np.errstate(divide='warn', invalid='warn'): - try: - assert_warns(RuntimeWarning, np.divide, 1, np.zeros(1)) - except AssertionError: - # This platform does not report numpy divide by zeros - return func(*args, **kw) - return assert_warns_message(RuntimeWarning, - 'invalid value encountered', - func, *args, **kw) - - # To remove when we support numpy 1.7 def assert_no_warnings(func, *args, **kw): """ From 7f35724cfb72519e8e02cd44341c580e499d9fb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Thu, 3 Jun 2021 13:29:01 +0200 Subject: [PATCH 448/478] TST Add TransformedTargetRegressor to test_meta_estimators_delegate_data_validation (#20175) Co-authored-by: Guillaume Lemaitre --- sklearn/compose/_column_transformer.py | 6 ++++++ sklearn/compose/_target.py | 6 ++++++ sklearn/tests/test_common.py | 2 +- sklearn/tests/test_metaestimators.py | 2 +- 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 6c15b81be98c2..ada175c7f32c6 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -141,6 +141,12 @@ class ColumnTransformer(TransformerMixin, _BaseComposition): .. versionadded:: 1.0 + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying transformers expose such an attribute when fit. + + .. versionadded:: 0.24 + Notes ----- The order of the columns in the transformed feature matrix follows the diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index 12fe13ee848b9..af996623d8aa3 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -82,6 +82,12 @@ class TransformedTargetRegressor(RegressorMixin, BaseEstimator): transformer_ : object Transformer used in ``fit`` and ``predict``. + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying regressor exposes such an attribute when fit. + + .. 
versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 6fd57c9e8d4fc..848788647cf3f 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -261,7 +261,7 @@ def test_search_cv(estimator, check, request): # # check_classifiers_train would need to be updated with the error message N_FEATURES_IN_AFTER_FIT_MODULES_TO_IGNORE = { - 'compose', + 'feature_extraction', 'model_selection', 'multiclass', 'multioutput', diff --git a/sklearn/tests/test_metaestimators.py b/sklearn/tests/test_metaestimators.py index ad716c3e4cd2f..c7412c98d4290 100644 --- a/sklearn/tests/test_metaestimators.py +++ b/sklearn/tests/test_metaestimators.py @@ -169,7 +169,7 @@ def _generate_meta_estimator_instances_with_pipeline(): for _, Estimator in sorted(all_estimators()): sig = set(signature(Estimator).parameters) - if "estimator" in sig or "base_estimator" in sig: + if "estimator" in sig or "base_estimator" in sig or "regressor" in sig: if is_regressor(Estimator): estimator = make_pipeline(TfidfVectorizer(), Ridge()) param_grid = {"ridge__alpha": [0.1, 1.0]} From bd7ebf5aede5015c2127de6f7b670b446bb337a3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 3 Jun 2021 13:34:48 +0200 Subject: [PATCH 449/478] TST enable n_feature_in_ test for feature_extraction module --- sklearn/tests/test_common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 848788647cf3f..e891374f91051 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -261,7 +261,6 @@ def test_search_cv(estimator, check, request): # # check_classifiers_train would need to be updated with the error message N_FEATURES_IN_AFTER_FIT_MODULES_TO_IGNORE = { - 'feature_extraction', 'model_selection', 'multiclass', 'multioutput', From 3a23e26da6f61622f7ebfcf8dfda3575c38d50fb Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Thu, 3 Jun 2021 08:28:27 -0400 Subject: [PATCH 450/478] FIX Uses points instead of pixels in plot_tree (#20023) --- sklearn/tree/_export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index a9763128c3a7e..a4ba02d5f8932 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -611,7 +611,7 @@ def export(self, decision_tree, ax=None): def recurse(self, node, tree, ax, scale_x, scale_y, height, depth=0): import matplotlib.pyplot as plt kwargs = dict(bbox=self.bbox_args.copy(), ha='center', va='center', - zorder=100 - 10 * depth, xycoords='axes pixels', + zorder=100 - 10 * depth, xycoords='axes points', arrowprops=self.arrow_args.copy()) kwargs['arrowprops']['edgecolor'] = plt.rcParams['text.color'] From 6bfaceded8e99396ff18356c9b97a7a673b6d9e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Thu, 3 Jun 2021 14:41:23 +0200 Subject: [PATCH 451/478] MNT n_features_in through the multiclass module (#20193) --- sklearn/multiclass.py | 78 +++++++++++++++------- sklearn/tests/test_common.py | 1 - sklearn/tests/test_docstring_parameters.py | 4 +- sklearn/tests/test_metaestimators.py | 3 +- 4 files changed, 56 insertions(+), 30 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index d75556bf60ab4..99a6db2051030 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -50,13 +50,13 @@ from .utils._tags import _safe_tags from .utils.validation import _num_samples from .utils.validation import check_is_fitted -from .utils.validation import check_X_y, check_array +from .utils.validation import column_or_1d +from .utils.validation import _assert_all_finite from .utils.multiclass import (_check_partial_fit_first_call, check_classification_targets, _ovr_decision_function) from .utils.metaestimators import _safe_split, if_delegate_has_method from .utils.fixes import delayed -from .exceptions import NotFittedError from joblib import Parallel @@ -114,24 +114,28 @@ def _check_estimator(estimator): class _ConstantPredictor(BaseEstimator): def fit(self, X, y): + self._check_n_features(X, reset=True) self.y_ = y return self def predict(self, X): check_is_fitted(self) + self._check_n_features(X, reset=True) - return np.repeat(self.y_, X.shape[0]) + return np.repeat(self.y_, _num_samples(X)) def decision_function(self, X): check_is_fitted(self) + self._check_n_features(X, reset=True) - return np.repeat(self.y_, X.shape[0]) + return np.repeat(self.y_, _num_samples(X)) def predict_proba(self, X): check_is_fitted(self) + self._check_n_features(X, reset=True) return np.repeat([np.hstack([1 - self.y_, self.y_])], - X.shape[0], axis=0) + _num_samples(X), axis=0) class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin, @@ -219,6 +223,12 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin, multilabel_ : boolean Whether a OneVsRestClassifier is a multilabel classifier. + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + .. 
versionadded:: 0.24 + Examples -------- >>> import numpy as np @@ -282,6 +292,9 @@ def fit(self, X, y): self.label_binarizer_.classes_[i]]) for i, column in enumerate(columns)) + if hasattr(self.estimators_[0], "n_features_in_"): + self.n_features_in_ = self.estimators_[0].n_features_in_ + return self @if_delegate_has_method('estimator') @@ -338,6 +351,9 @@ def partial_fit(self, X, y, classes=None): delayed(_partial_fit_binary)(estimator, X, column) for estimator, column in zip(self.estimators_, columns)) + if hasattr(self.estimators_[0], "n_features_in_"): + self.n_features_in_ = self.estimators_[0].n_features_in_ + return self def predict(self, X): @@ -504,19 +520,6 @@ def _more_tags(self): def _first_estimator(self): return self.estimators_[0] - @property - def n_features_in_(self): - # For consistency with other estimators we raise a AttributeError so - # that hasattr() fails if the OVR estimator isn't fitted. - try: - check_is_fitted(self) - except NotFittedError as nfe: - raise AttributeError( - "{} object has no n_features_in_ attribute." - .format(self.__class__.__name__) - ) from nfe - return self.estimators_[0].n_features_in_ - def _fit_ovo_binary(estimator, X, y, i, j): """Fit a single binary estimator (one-vs-one).""" @@ -525,7 +528,7 @@ def _fit_ovo_binary(estimator, X, y, i, j): y_binary = np.empty(y.shape, int) y_binary[y == i] = 0 y_binary[y == j] = 1 - indcond = np.arange(X.shape[0])[cond] + indcond = np.arange(_num_samples(X))[cond] return _fit_binary(estimator, _safe_split(estimator, X, None, indices=indcond)[0], y_binary, classes=[i, j]), indcond @@ -593,6 +596,12 @@ class OneVsOneClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): (renaming of 0.25) and onward, `pairwise_indices_` will use the pairwise estimator tag instead. + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_iris @@ -626,6 +635,7 @@ def fit(self, X, y): ------- self """ + # We need to validate the data because we do a safe_indexing later. X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'], force_all_finite=False) check_classification_targets(y) @@ -642,6 +652,9 @@ def fit(self, X, y): self.estimators_ = estimators_indices[0] + if hasattr(self.estimators_[0], "n_features_in_"): + self.n_features_in_ = self.estimators_[0].n_features_in_ + pairwise = _is_pairwise(self) self.pairwise_indices_ = ( estimators_indices[1] if pairwise else None) @@ -686,8 +699,9 @@ def partial_fit(self, X, y, classes=None): "must be subset of {1}".format(np.unique(y), self.classes_)) - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], - force_all_finite=False) + X, y = self._validate_data( + X, y, accept_sparse=['csr', 'csc'], force_all_finite=False, + reset=_check_partial_fit_first_call(self, classes)) check_classification_targets(y) combinations = itertools.combinations(range(self.n_classes_), 2) self.estimators_ = Parallel( @@ -699,6 +713,9 @@ def partial_fit(self, X, y, classes=None): self.pairwise_indices_ = None + if hasattr(self.estimators_[0], "n_features_in_"): + self.n_features_in_ = self.estimators_[0].n_features_in_ + return self def predict(self, X): @@ -832,6 +849,12 @@ class OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): code_book_ : numpy array of shape [n_classes, code_size] Binary array containing the code of each class. 
+ n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.multiclass import OutputCodeClassifier @@ -886,7 +909,9 @@ def fit(self, X, y): ------- self """ - X, y = self._validate_data(X, y, accept_sparse=True) + y = column_or_1d(y, warn=True) + _assert_all_finite(y) + if self.code_size <= 0: raise ValueError("code_size should be greater than 0, got {0}" "".format(self.code_size)) @@ -897,6 +922,9 @@ def fit(self, X, y): self.classes_ = np.unique(y) n_classes = self.classes_.shape[0] + if n_classes == 0: + raise ValueError("OutputCodeClassifier can not be fit when no " + "class is present.") code_size_ = int(n_classes * self.code_size) # FIXME: there are more elaborate methods than generating the codebook @@ -912,12 +940,15 @@ def fit(self, X, y): classes_index = {c: i for i, c in enumerate(self.classes_)} Y = np.array([self.code_book_[classes_index[y[i]]] - for i in range(X.shape[0])], dtype=int) + for i in range(_num_samples(y))], dtype=int) self.estimators_ = Parallel(n_jobs=self.n_jobs)( delayed(_fit_binary)(self.estimator, X, Y[:, i]) for i in range(Y.shape[1])) + if hasattr(self.estimators_[0], "n_features_in_"): + self.n_features_in_ = self.estimators_[0].n_features_in_ + return self def predict(self, X): @@ -934,7 +965,6 @@ def predict(self, X): Predicted multi-class targets. """ check_is_fitted(self) - X = check_array(X, accept_sparse=True) Y = np.array([_predict_binary(e, X) for e in self.estimators_]).T pred = euclidean_distances(Y, self.code_book_).argmin(axis=1) return self.classes_[pred] diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index e891374f91051..bbffd7fa197cf 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -262,7 +262,6 @@ def test_search_cv(estimator, check, request): # check_classifiers_train would need to be updated with the error message N_FEATURES_IN_AFTER_FIT_MODULES_TO_IGNORE = { 'model_selection', - 'multiclass', 'multioutput', 'pipeline', } diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index bceaa21801872..97da48f1e6524 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -192,7 +192,6 @@ def _construct_searchcv_instance(SearchCV): 'linear_model', 'manifold', 'model_selection', - 'multiclass', 'multioutput', 'naive_bayes', 'neighbors', @@ -219,8 +218,7 @@ def test_fit_docstring_attributes(name, Estimator): 'CountVectorizer', 'DictVectorizer', 'FeatureUnion', 'GaussianRandomProjection', 'MultiOutputClassifier', 'MultiOutputRegressor', - 'NoSampleWeightWrapper', 'OneVsOneClassifier', - 'OutputCodeClassifier', 'Pipeline', 'RFE', 'RFECV', + 'NoSampleWeightWrapper', 'Pipeline', 'RFE', 'RFECV', 'RegressorChain', 'SelectFromModel', 'SparseCoder', 'SparseRandomProjection', 'SpectralBiclustering', 'StackingClassifier', diff --git a/sklearn/tests/test_metaestimators.py b/sklearn/tests/test_metaestimators.py index c7412c98d4290..9a19008c3b322 100644 --- a/sklearn/tests/test_metaestimators.py +++ b/sklearn/tests/test_metaestimators.py @@ -219,8 +219,7 @@ def _generate_meta_estimator_instances_with_pipeline(): "IterativeImputer", "MultiOutputClassifier", "MultiOutputRegressor", - "OneVsOneClassifier", - "OutputCodeClassifier", + "OneVsOneClassifier", # input validation can't be avoided "RANSACRegressor", "RFE", "RFECV", From 
1038024a438e2bc76e7e48edde7b7ca732dc506b Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 3 Jun 2021 09:47:34 -0400 Subject: [PATCH 452/478] CI Removes python 3.6 builds from wheel building (#20184) --- .github/workflows/wheels.yml | 5 +++-- pyproject.toml | 5 ++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index ffddf9ef88db3..a280c29c31683 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -36,7 +36,7 @@ jobs: name: Check build trigger run: bash build_tools/github/check_build_trigger.sh - # Build the wheels for Linux, Windows and macOS for Python 3.6 and newer + # Build the wheels for Linux, Windows and macOS for Python 3.7 and newer build_wheels: name: Build wheel for cp${{ matrix.python }}-${{ matrix.platform_id }}-${{ matrix.manylinux_image }} runs-on: ${{ matrix.os }} @@ -48,7 +48,7 @@ jobs: fail-fast: false matrix: os: [windows-latest, ubuntu-latest, macos-latest] - python: [36, 37, 38, 39] + python: [37, 38, 39] bitness: [32, 64] manylinux_image: [manylinux1, manylinux2010] include: @@ -102,6 +102,7 @@ jobs: CIBW_TEST_REQUIRES: pytest pandas threadpoolctl CIBW_TEST_COMMAND: bash {project}/build_tools/github/test_wheels.sh CIBW_TEST_COMMAND_WINDOWS: bash {project}/build_tools/github/test_windows_wheels.sh ${{ matrix.python }} ${{ matrix.bitness }} + CIBW_BUILD_VERBOSITY: 1 run: bash build_tools/github/build_wheels.sh diff --git a/pyproject.toml b/pyproject.toml index 84468f65341da..d172baaea7088 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,10 @@ requires = [ # wheels on PyPI # # see: https://github.com/scipy/oldest-supported-numpy/blob/master/setup.cfg - "oldest-supported-numpy", + "oldest-supported-numpy; python_version!='3.7' or platform_machine=='aarch64' or platform_system=='AIX' or platform_python_implementation == 'PyPy'", + + # Override oldest-supported-numpy setting because pandas 0.25.0 requires 1.14.6 + "numpy==1.14.6; python_version=='3.7' and platform_machine!='aarch64' and platform_system!='AIX' and platform_python_implementation != 'PyPy'", "scipy>=1.1.0", ] From 95f5fb48e161625027cd245f941d15148c9e7949 Mon Sep 17 00:00:00 2001 From: mlondschien <61679398+mlondschien@users.noreply.github.com> Date: Fri, 4 Jun 2021 14:08:05 +0200 Subject: [PATCH 453/478] FIX Fix typo in error message in `fetch_openml` (#20201) --- sklearn/datasets/_openml.py | 2 +- sklearn/datasets/tests/test_openml.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index ec3c3a9ae961d..2eedf57fa085e 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -840,7 +840,7 @@ def fetch_openml( raise ValueError( "Dataset data_id={} and version={} passed, but you can only " "specify a numeric data_id or a version, not " - "both.".format(data_id, name)) + "both.".format(data_id, version)) else: raise ValueError( "Neither name nor data_id are provided. 
Please provide name or " diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 663d2ae3088ed..39cd4c9ee1912 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1216,15 +1216,16 @@ def test_fetch_openml_raises_missing_values_target(monkeypatch, gzip_response): def test_fetch_openml_raises_illegal_argument(): - msg = 'Dataset data_id=' + msg = 'Dataset data_id=-1 and version=version passed, but you can only' with pytest.raises(ValueError, match=msg): - fetch_openml(data_id=-1, name="name") + fetch_openml(data_id=-1, name=None, version="version") + msg = "Dataset data_id=-1 and name=name passed, but you can only" with pytest.raises(ValueError, match=msg): - fetch_openml(data_id=-1, name=None, version="version") + fetch_openml(data_id=-1, name="nAmE") with pytest.raises(ValueError, match=msg): - fetch_openml(data_id=-1, name="name", version="version") + fetch_openml(data_id=-1, name="nAmE", version="version") msg = ( "Neither name nor data_id are provided. " From 7b965c7893089ce7f22aca383fa14521c69204c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Fauchereau?= Date: Fri, 4 Jun 2021 12:11:14 +0000 Subject: [PATCH 454/478] FIX Fix error when using Calibrated with Voting (#20087) --- doc/whats_new/v1.0.rst | 4 ++++ sklearn/calibration.py | 34 ++++++++++++++++++------------- sklearn/tests/test_calibration.py | 20 +++++++++++++++++- 3 files changed, 43 insertions(+), 15 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index fac578bcb1b03..fc7950f3590e0 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -138,6 +138,10 @@ Changelog :class:`calibration.CalibratedClassifierCV` can now properly be used on prefitted pipelines. :pr:`19641` by :user:`Alek Lefebvre `. +- |Fix| Fixed an error when using a ::class:`ensemble.VotingClassifier` + as `base_estimator` in ::class:`calibration.CalibratedClassifierCV`. + :pr:`20087` by :user:`Clément Fauchereau `. + :mod:`sklearn.cluster` ...................... 
diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 084f3bf242e3c..abdbed1bb797b 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -257,9 +257,10 @@ def fit(self, X, y, sample_weight=None): check_is_fitted(self.base_estimator, attributes=["classes_"]) self.classes_ = self.base_estimator.classes_ - pred_method = _get_prediction_method(base_estimator) + pred_method, method_name = _get_prediction_method(base_estimator) n_classes = len(self.classes_) - predictions = _compute_predictions(pred_method, X, n_classes) + predictions = _compute_predictions(pred_method, method_name, X, + n_classes) calibrated_classifier = _fit_calibrator( base_estimator, predictions, y, self.classes_, self.method, @@ -310,12 +311,13 @@ def fit(self, X, y, sample_weight=None): ) else: this_estimator = clone(base_estimator) - method_name = _get_prediction_method(this_estimator).__name__ + _, method_name = _get_prediction_method(this_estimator) pred_method = partial( cross_val_predict, estimator=this_estimator, X=X, y=y, cv=cv, method=method_name, n_jobs=self.n_jobs ) - predictions = _compute_predictions(pred_method, X, n_classes) + predictions = _compute_predictions(pred_method, method_name, X, + n_classes) if sample_weight is not None and supports_sw: this_estimator.fit(X, y, sample_weight) @@ -441,8 +443,9 @@ def _fit_classifier_calibrator_pair(estimator, X, y, train, test, supports_sw, estimator.fit(X_train, y_train) n_classes = len(classes) - pred_method = _get_prediction_method(estimator) - predictions = _compute_predictions(pred_method, X_test, n_classes) + pred_method, method_name = _get_prediction_method(estimator) + predictions = _compute_predictions(pred_method, method_name, X_test, + n_classes) calibrated_classifier = _fit_calibrator( estimator, predictions, y_test, classes, method, sample_weight=sw_test @@ -465,18 +468,21 @@ def _get_prediction_method(clf): ------- prediction_method : callable The prediction method. + method_name : str + The name of the prediction method. """ if hasattr(clf, 'decision_function'): method = getattr(clf, 'decision_function') + return method, 'decision_function' elif hasattr(clf, 'predict_proba'): method = getattr(clf, 'predict_proba') + return method, 'predict_proba' else: raise RuntimeError("'base_estimator' has no 'decision_function' or " "'predict_proba' method.") - return method -def _compute_predictions(pred_method, X, n_classes): +def _compute_predictions(pred_method, method_name, X, n_classes): """Return predictions for `X` and reshape binary outputs to shape (n_samples, 1). @@ -485,6 +491,9 @@ def _compute_predictions(pred_method, X, n_classes): pred_method : callable Prediction method. + method_name: str + Name of the prediction method + X : array-like or None Data used to obtain predictions. @@ -498,10 +507,6 @@ def _compute_predictions(pred_method, X, n_classes): (X.shape[0], 1). """ predictions = pred_method(X=X) - if hasattr(pred_method, '__name__'): - method_name = pred_method.__name__ - else: - method_name = signature(pred_method).parameters['method'].default if method_name == 'decision_function': if predictions.ndim == 1: @@ -634,8 +639,9 @@ def predict_proba(self, X): The predicted probabilities. Can be exact zeros. 
""" n_classes = len(self.classes) - pred_method = _get_prediction_method(self.base_estimator) - predictions = _compute_predictions(pred_method, X, n_classes) + pred_method, method_name = _get_prediction_method(self.base_estimator) + predictions = _compute_predictions(pred_method, method_name, X, + n_classes) label_encoder = LabelEncoder().fit(self.classes) pos_class_indices = label_encoder.transform( diff --git a/sklearn/tests/test_calibration.py b/sklearn/tests/test_calibration.py index 53d620b41031c..210d90f99f845 100644 --- a/sklearn/tests/test_calibration.py +++ b/sklearn/tests/test_calibration.py @@ -20,7 +20,8 @@ from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import KFold, cross_val_predict from sklearn.naive_bayes import MultinomialNB -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor, + VotingClassifier) from sklearn.svm import LinearSVC from sklearn.isotonic import IsotonicRegression from sklearn.feature_extraction import DictVectorizer @@ -607,3 +608,20 @@ def test_calibrated_classifier_cv_deprecation(data): calibrators, calib_clf.calibrated_classifiers_[0].calibrators ): assert clf1 is clf2 + + +def test_calibration_votingclassifier(): + # Check that `CalibratedClassifier` works with `VotingClassifier`. + # The method `predict_proba` from `VotingClassifier` is dynamically + # defined via a property that only works when voting="soft". + X, y = make_classification(n_samples=10, n_features=5, + n_classes=2, random_state=7) + vote = VotingClassifier( + estimators=[('dummy'+str(i), DummyClassifier()) for i in range(3)], + voting="soft" + ) + vote.fit(X, y) + + calib_clf = CalibratedClassifierCV(base_estimator=vote, cv="prefit") + # smoke test: should not raise an error + calib_clf.fit(X, y) From a1a6b3a9602283792ec4091cdb990be1afab9163 Mon Sep 17 00:00:00 2001 From: murata-yu <67666318+murata-yu@users.noreply.github.com> Date: Fri, 4 Jun 2021 22:23:05 +0900 Subject: [PATCH 455/478] FIX Fix RandomForestRegressor doesn't accept max_samples=1.0 (#20159) Co-authored-by: Olivier Grisel Co-authored-by: Thomas J. Fan --- doc/whats_new/v1.0.rst | 6 ++++ sklearn/ensemble/_forest.py | 17 ++++++----- sklearn/ensemble/tests/test_forest.py | 43 +++++++++++++++++++++++---- 3 files changed, 53 insertions(+), 13 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index fc7950f3590e0..ece6ff15ac51b 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -270,6 +270,12 @@ Changelog :class:`ensemble.StackingClassifier` and :class:`ensemble.StackingRegressor`. :pr:`19564` by `Thomas Fan`_. +- |Fix| Fixed the range of the argument max_samples to be (0.0, 1.0] + in :class:`ensemble.RandomForestClassifier`, + :class:`ensemble.RandomForestRegressor`, where `max_samples=1.0` is + interpreted as using all `n_samples` for bootstrapping. :pr:`20159` by + :user:`murata-yu`. + :mod:`sklearn.feature_extraction` ................................. 
diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 5a93acd0c0554..06ca0c171efc6 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -86,7 +86,7 @@ def _get_n_samples_bootstrap(n_samples, max_samples): max_samples : int or float The maximum number of samples to draw from the total available: - if float, this indicates a fraction of the total and should be - the interval `(0, 1)`; + the interval `(0.0, 1.0]`; - if int, this indicates the exact number of samples; - if None, this indicates the total number of samples. @@ -105,8 +105,8 @@ def _get_n_samples_bootstrap(n_samples, max_samples): return max_samples if isinstance(max_samples, numbers.Real): - if not (0 < max_samples < 1): - msg = "`max_samples` must be in range (0, 1) but got value {}" + if not (0 < max_samples <= 1): + msg = "`max_samples` must be in range (0.0, 1.0] but got value {}" raise ValueError(msg.format(max_samples)) return round(n_samples * max_samples) @@ -1163,7 +1163,7 @@ class RandomForestClassifier(ForestClassifier): - If None (default), then draw `X.shape[0]` samples. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. Thus, - `max_samples` should be in the interval `(0, 1)`. + `max_samples` should be in the interval `(0.0, 1.0]`. .. versionadded:: 0.22 @@ -1473,7 +1473,7 @@ class RandomForestRegressor(ForestRegressor): - If None (default), then draw `X.shape[0]` samples. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. Thus, - `max_samples` should be in the interval `(0, 1)`. + `max_samples` should be in the interval `(0.0, 1.0]`. .. versionadded:: 0.22 @@ -1557,6 +1557,7 @@ class RandomForestRegressor(ForestRegressor): >>> print(regr.predict([[0, 0, 0, 0]])) [-8.32987858] """ + def __init__(self, n_estimators=100, *, criterion="squared_error", @@ -1789,7 +1790,7 @@ class ExtraTreesClassifier(ForestClassifier): - If None (default), then draw `X.shape[0]` samples. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. Thus, - `max_samples` should be in the interval `(0, 1)`. + `max_samples` should be in the interval `(0.0, 1.0]`. .. versionadded:: 0.22 @@ -1873,6 +1874,7 @@ class labels (multi-output problem). >>> clf.predict([[0, 0, 0, 0]]) array([1]) """ + def __init__(self, n_estimators=100, *, criterion="gini", @@ -2095,7 +2097,7 @@ class ExtraTreesRegressor(ForestRegressor): - If None (default), then draw `X.shape[0]` samples. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. Thus, - `max_samples` should be in the interval `(0, 1)`. + `max_samples` should be in the interval `(0.0, 1.0]`. .. versionadded:: 0.22 @@ -2168,6 +2170,7 @@ class ExtraTreesRegressor(ForestRegressor): >>> reg.score(X_test, y_test) 0.2708... 
""" + def __init__(self, n_estimators=100, *, criterion="squared_error", diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index c74a1ca0c603e..52615d037cf63 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -50,6 +50,8 @@ from sklearn.svm import LinearSVC from sklearn.utils.validation import check_random_state +from sklearn.metrics import mean_squared_error + from sklearn.tree._classes import SPARSE_SPLITTERS @@ -1419,16 +1421,14 @@ def test_forest_degenerate_feature_importances(): 'max_samples, exc_type, exc_msg', [(int(1e9), ValueError, "`max_samples` must be in range 1 to 6 but got value 1000000000"), - (1.0, ValueError, - r"`max_samples` must be in range \(0, 1\) but got value 1.0"), (2.0, ValueError, - r"`max_samples` must be in range \(0, 1\) but got value 2.0"), + r"`max_samples` must be in range \(0.0, 1.0\] but got value 2.0"), (0.0, ValueError, - r"`max_samples` must be in range \(0, 1\) but got value 0.0"), + r"`max_samples` must be in range \(0.0, 1.0\] but got value 0.0"), (np.nan, ValueError, - r"`max_samples` must be in range \(0, 1\) but got value nan"), + r"`max_samples` must be in range \(0.0, 1.0\] but got value nan"), (np.inf, ValueError, - r"`max_samples` must be in range \(0, 1\) but got value inf"), + r"`max_samples` must be in range \(0.0, 1.0\] but got value inf"), ('str max_samples?!', TypeError, r"`max_samples` should be int or float, but got " r"type '\'"), @@ -1443,6 +1443,37 @@ def test_max_samples_exceptions(name, max_samples, exc_type, exc_msg): est.fit(X, y) +@pytest.mark.parametrize('name', FOREST_REGRESSORS) +def test_max_samples_boundary_regressors(name): + X_train, X_test, y_train, y_test = train_test_split( + X_reg, y_reg, train_size=0.7, test_size=0.3, random_state=0) + + ms_1_model = FOREST_REGRESSORS[name](max_samples=1.0, random_state=0) + ms_1_predict = ms_1_model.fit(X_train, y_train).predict(X_test) + + ms_None_model = FOREST_REGRESSORS[name](max_samples=None, random_state=0) + ms_None_predict = ms_None_model.fit(X_train, y_train).predict(X_test) + + ms_1_ms = mean_squared_error(ms_1_predict, y_test) + ms_None_ms = mean_squared_error(ms_None_predict, y_test) + + assert ms_1_ms == pytest.approx(ms_None_ms) + + +@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +def test_max_samples_boundary_classifiers(name): + X_train, X_test, y_train, _ = train_test_split( + X_large, y_large, random_state=0, stratify=y_large) + + ms_1_model = FOREST_CLASSIFIERS[name](max_samples=1.0, random_state=0) + ms_1_proba = ms_1_model.fit(X_train, y_train).predict_proba(X_test) + + ms_None_model = FOREST_CLASSIFIERS[name](max_samples=None, random_state=0) + ms_None_proba = ms_None_model.fit(X_train, y_train).predict_proba(X_test) + + np.testing.assert_allclose(ms_1_proba, ms_None_proba) + + def test_forest_y_sparse(): X = [[1, 2, 3]] y = csr_matrix([4, 5, 6]) From 36915ae390fab4742f98c82dc6802f072c4effa5 Mon Sep 17 00:00:00 2001 From: Brian Sun <52805678+bsun94@users.noreply.github.com> Date: Sat, 5 Jun 2021 22:45:23 -0400 Subject: [PATCH 456/478] ENH Adds Poisson criterion in RandomForestRegressor (#19836) Co-authored-by: Christian Lorentzen Co-authored-by: Alihan Zihna Co-authored-by: Alihan Zihna Co-authored-by: Chiara Marmo Co-authored-by: Olivier Grisel Co-authored-by: naozin555 <37050583+naozin555@users.noreply.github.com> Co-authored-by: Venkatachalam N Co-authored-by: Thomas J. 
Fan --- doc/whats_new/v1.0.rst | 4 ++ sklearn/ensemble/_forest.py | 18 +++++- sklearn/ensemble/tests/test_forest.py | 89 +++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index ece6ff15ac51b..b66c87815bae7 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -270,6 +270,10 @@ Changelog :class:`ensemble.StackingClassifier` and :class:`ensemble.StackingRegressor`. :pr:`19564` by `Thomas Fan`_. +- |Enhancement| Documented and tested support of the Poisson criterion for + :class:`ensemble.RandomForestRegressor`. :pr:`19836` by + :user:`Brian Sun `. + - |Fix| Fixed the range of the argument max_samples to be (0.0, 1.0] in :class:`ensemble.RandomForestClassifier`, :class:`ensemble.RandomForestRegressor`, where `max_samples=1.0` is diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 06ca0c171efc6..bc29c0362bb3e 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -323,6 +323,14 @@ def fit(self, X, y, sample_weight=None): # [:, np.newaxis] that does not. y = np.reshape(y, (-1, 1)) + if self.criterion == "poisson": + if np.any(y < 0): + raise ValueError("Some value(s) of y are negative which is " + "not allowed for Poisson regression.") + if np.sum(y) <= 0: + raise ValueError("Sum of y is not strictly positive which " + "is necessary for Poisson regression.") + self.n_outputs_ = y.shape[1] y, expanded_class_weight = self._validate_y_class_weight(y) @@ -1324,16 +1332,20 @@ class RandomForestRegressor(ForestRegressor): The default value of ``n_estimators`` changed from 10 to 100 in 0.22. - criterion : {"squared_error", "mse", "absolute_error", "mae"}, \ + criterion : {"squared_error", "mse", "absolute_error", "poisson"}, \ default="squared_error" The function to measure the quality of a split. Supported criteria are "squared_error" for the mean squared error, which is equal to - variance reduction as feature selection criterion, and "absolute_error" - for the mean absolute error. + variance reduction as feature selection criterion, "absolute_error" + for the mean absolute error, and "poisson" which uses reduction in + Poisson deviance to find splits. .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion. + .. versionadded:: 1.0 + Poisson criterion. + .. deprecated:: 1.0 Criterion "mse" was deprecated in v1.0 and will be removed in version 1.2. Use `criterion="squared_error"` which is equivalent. 
diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 52615d037cf63..6c4aa905abe55 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -27,6 +27,8 @@ import joblib from numpy.testing import assert_allclose +from sklearn.dummy import DummyRegressor +from sklearn.metrics import mean_poisson_deviance from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal @@ -185,6 +187,76 @@ def test_regression(name, criterion): check_regression_criterion(name, criterion) +def test_poisson_vs_mse(): + """Test that random forest with poisson criterion performs better than + mse for a poisson target.""" + rng = np.random.RandomState(42) + n_train, n_test, n_features = 500, 500, 10 + X = datasets.make_low_rank_matrix(n_samples=n_train + n_test, + n_features=n_features, random_state=rng) + X = np.abs(X) + X /= np.max(np.abs(X), axis=0) + # We create a log-linear Poisson model + coef = rng.uniform(low=-4, high=1, size=n_features) + y = rng.poisson(lam=np.exp(X @ coef)) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test, + random_state=rng) + + forest_poi = RandomForestRegressor( + criterion="poisson", + min_samples_leaf=10, + max_features="sqrt", + random_state=rng) + forest_mse = RandomForestRegressor( + criterion="squared_error", + min_samples_leaf=10, + max_features="sqrt", + random_state=rng) + + forest_poi.fit(X_train, y_train) + forest_mse.fit(X_train, y_train) + dummy = DummyRegressor(strategy="mean").fit(X_train, y_train) + + for X, y, val in [(X_train, y_train, "train"), (X_test, y_test, "test")]: + metric_poi = mean_poisson_deviance(y, forest_poi.predict(X)) + # squared_error forest might produce non-positive predictions => clip + # If y = 0 for those, the poisson deviance gets too good. + # If we drew more samples, we would eventually get y > 0 and the + # poisson deviance would explode, i.e. be undefined. Therefore, we do + # not clip to a tiny value like 1e-15, but to 0.1. This acts like a + # mild penalty to the non-positive predictions. + metric_mse = mean_poisson_deviance( + y, + np.clip(forest_mse.predict(X), 1e-6, None)) + metric_dummy = mean_poisson_deviance(y, dummy.predict(X)) + # As squared_error might correctly predict 0 in train set, its train + # score can be better than Poisson. This is no longer the case for the + # test set. But keep the above comment for clipping in mind. + if val == "test": + assert metric_poi < metric_mse + assert metric_poi < metric_dummy + + +@pytest.mark.parametrize('criterion', ('poisson', 'squared_error')) +def test_balance_property_random_forest(criterion): + """"Test that sum(y_pred)==sum(y_true) on the training set.""" + rng = np.random.RandomState(42) + n_train, n_test, n_features = 500, 500, 10 + X = datasets.make_low_rank_matrix(n_samples=n_train + n_test, + n_features=n_features, random_state=rng) + + coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0) + y = rng.poisson(lam=np.exp(X @ coef)) + + reg = RandomForestRegressor(criterion=criterion, + n_estimators=10, + bootstrap=False, + random_state=rng) + reg.fit(X, y) + + assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y)) + + def check_regressor_attributes(name): # Regression models should not have a classes_ attribute. 
r = FOREST_REGRESSORS[name](random_state=0) @@ -1367,6 +1439,23 @@ def test_min_impurity_decrease(): assert tree.min_impurity_decrease == 0.1 +def test_poisson_y_positive_check(): + est = RandomForestRegressor(criterion="poisson") + X = np.zeros((3, 3)) + + y = [-1, 1, 3] + err_msg = (r"Some value\(s\) of y are negative which is " + r"not allowed for Poisson regression.") + with pytest.raises(ValueError, match=err_msg): + est.fit(X, y) + + y = [0, 0, 0] + err_msg = (r"Sum of y is not strictly positive which " + r"is necessary for Poisson regression.") + with pytest.raises(ValueError, match=err_msg): + est.fit(X, y) + + # mypy error: Variable "DEFAULT_JOBLIB_BACKEND" is not valid type class MyBackend(DEFAULT_JOBLIB_BACKEND): # type: ignore def __init__(self, *args, **kwargs): From 9884ccd609b818e2a87ea1cb4dfde56a0b624860 Mon Sep 17 00:00:00 2001 From: Nanshan Li Date: Mon, 7 Jun 2021 00:58:35 +0800 Subject: [PATCH 457/478] TST Replace assert_warns from decomposition/tests (#20214) --- sklearn/decomposition/tests/test_fastica.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index 9f37ac25c2f76..4379b07697d0c 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -10,7 +10,6 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_warns from sklearn.decomposition import FastICA, fastica, PCA from sklearn.decomposition._fastica import _gs_decorrelation @@ -141,7 +140,9 @@ def test_fastica_nowhiten(): # test for issue #697 ica = FastICA(n_components=1, whiten=False, random_state=0) - assert_warns(UserWarning, ica.fit, m) + warn_msg = "Ignoring n_components with whiten=False." + with pytest.warns(UserWarning, match=warn_msg): + ica.fit(m) assert hasattr(ica, 'mixing_') @@ -164,9 +165,14 @@ def test_fastica_convergence_fail(): m = np.dot(mixing, s) # Do fastICA with tolerance 0. to ensure failing convergence - ica = FastICA(algorithm="parallel", n_components=2, random_state=rng, - max_iter=2, tol=0.) - assert_warns(ConvergenceWarning, ica.fit, m.T) + warn_msg = ( + "FastICA did not converge. Consider increasing tolerance " + "or the maximum number of iterations." + ) + with pytest.warns(ConvergenceWarning, match=warn_msg): + ica = FastICA(algorithm="parallel", n_components=2, random_state=rng, + max_iter=2, tol=0.) 
+ ica.fit(m.T) @pytest.mark.parametrize('add_noise', [True, False]) From 800aee6d48be102a27a6f6d3df1822e52c628951 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 7 Jun 2021 12:29:45 +0200 Subject: [PATCH 458/478] TST check n_features_in_ in pipeline module (#20192) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Olivier Grisel Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com> Co-authored-by: Olivier Grisel --- sklearn/pipeline.py | 16 ++++++++ sklearn/tests/test_common.py | 1 - sklearn/tests/test_docstring_parameters.py | 5 +-- sklearn/tests/test_metaestimators.py | 44 ++++++++++++++-------- 4 files changed, 47 insertions(+), 19 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index e2ff6806ff3da..090d157b069bf 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -83,6 +83,13 @@ class Pipeline(_BaseComposition): Read-only attribute to access any step parameter by user given name. Keys are step names and values are steps parameters. + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying first estimator in `steps` exposes such an attribute + when fit. + + .. versionadded:: 0.24 + See Also -------- make_pipeline : Convenience function for simplified pipeline construction. @@ -826,6 +833,15 @@ class FeatureUnion(TransformerMixin, _BaseComposition): If True, the time elapsed while fitting each transformer will be printed as it is completed. + Attributes + ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying first transformer in `transformer_list` exposes such an + attribute when fit. + + .. versionadded:: 0.24 + See Also -------- make_union : Convenience function for simplified feature union diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index bbffd7fa197cf..5e190437ca4a9 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -263,7 +263,6 @@ def test_search_cv(estimator, check, request): N_FEATURES_IN_AFTER_FIT_MODULES_TO_IGNORE = { 'model_selection', 'multioutput', - 'pipeline', } N_FEATURES_IN_AFTER_FIT_ESTIMATORS = [ diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 97da48f1e6524..74a3e91a52a32 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -196,7 +196,6 @@ def _construct_searchcv_instance(SearchCV): 'naive_bayes', 'neighbors', 'neural_network', - 'pipeline', 'preprocessing', 'random_projection', 'semi_supervised', @@ -215,10 +214,10 @@ def test_fit_docstring_attributes(name, Estimator): attributes = doc['Attributes'] IGNORED = {'ClassifierChain', 'ColumnTransformer', - 'CountVectorizer', 'DictVectorizer', 'FeatureUnion', + 'CountVectorizer', 'DictVectorizer', 'GaussianRandomProjection', 'MultiOutputClassifier', 'MultiOutputRegressor', - 'NoSampleWeightWrapper', 'Pipeline', 'RFE', 'RFECV', + 'NoSampleWeightWrapper', 'RFE', 'RFECV', 'RegressorChain', 'SelectFromModel', 'SparseCoder', 'SparseRandomProjection', 'SpectralBiclustering', 'StackingClassifier', diff --git a/sklearn/tests/test_metaestimators.py b/sklearn/tests/test_metaestimators.py index 9a19008c3b322..5e9057429fa94 100644 --- a/sklearn/tests/test_metaestimators.py +++ b/sklearn/tests/test_metaestimators.py @@ -21,6 +21,7 @@ from sklearn.exceptions import NotFittedError from sklearn.semi_supervised import 
SelfTrainingClassifier from sklearn.linear_model import Ridge, LogisticRegression +from sklearn.preprocessing import StandardScaler, MaxAbsScaler class DelegatorData: @@ -185,6 +186,19 @@ def _generate_meta_estimator_instances_with_pipeline(): else: yield Estimator(estimator) + elif "transformer_list" in sig: + # FeatureUnion + transformer_list = [ + ("trans1", make_pipeline(TfidfVectorizer(), MaxAbsScaler())), + ( + "trans2", + make_pipeline( + TfidfVectorizer(), StandardScaler(with_mean=False) + ), + ), + ] + yield Estimator(transformer_list) + elif "estimators" in sig: # stacking, voting if is_regressor(Estimator): @@ -211,21 +225,21 @@ def _generate_meta_estimator_instances_with_pipeline(): # They should be able to work on any data and delegate data validation to # their inner estimator(s). DATA_VALIDATION_META_ESTIMATORS_TO_IGNORE = [ - "AdaBoostClassifier", - "AdaBoostRegressor", - "BaggingClassifier", - "BaggingRegressor", - "ClassifierChain", - "IterativeImputer", - "MultiOutputClassifier", - "MultiOutputRegressor", - "OneVsOneClassifier", # input validation can't be avoided - "RANSACRegressor", - "RFE", - "RFECV", - "RegressorChain", - "SelfTrainingClassifier", - "SequentialFeatureSelector" # not applicable (2D data mandatory) + "AdaBoostClassifier", + "AdaBoostRegressor", + "BaggingClassifier", + "BaggingRegressor", + "ClassifierChain", + "IterativeImputer", + "MultiOutputClassifier", + "MultiOutputRegressor", + "OneVsOneClassifier", # input validation can't be avoided + "RANSACRegressor", + "RFE", + "RFECV", + "RegressorChain", + "SelfTrainingClassifier", + "SequentialFeatureSelector", # not applicable (2D data mandatory) ] DATA_VALIDATION_META_ESTIMATORS = [ From 778125645fbc84d6749c7b506662e12deb90c018 Mon Sep 17 00:00:00 2001 From: mlondschien <61679398+mlondschien@users.noreply.github.com> Date: Mon, 7 Jun 2021 16:45:22 +0200 Subject: [PATCH 459/478] Allow `n_knots=None` if knots are explicitly specified in `SplineTransformer` (#20191) Co-authored-by: Olivier Grisel --- sklearn/preprocessing/_polynomial.py | 15 +++++++++------ sklearn/preprocessing/tests/test_polynomial.py | 6 +++--- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index ac4703dbb4cb2..930e85c783711 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -350,7 +350,8 @@ class SplineTransformer(TransformerMixin, BaseEstimator): ---------- n_knots : int, default=5 Number of knots of the splines if `knots` equals one of - {'uniform', 'quantile'}. Must be larger or equal 2. + {'uniform', 'quantile'}. Must be larger or equal 2. Ignored if `knots` + is array-like. degree : int, default=3 The polynomial degree of the spline basis. 
Must be a non-negative @@ -546,15 +547,17 @@ def fit(self, X, y=None): ): raise ValueError("degree must be a non-negative integer.") - if not ( - isinstance(self.n_knots, numbers.Integral) and self.n_knots >= 2 - ): - raise ValueError("n_knots must be a positive integer >= 2.") - if isinstance(self.knots, str) and self.knots in [ "uniform", "quantile", ]: + if not ( + isinstance(self.n_knots, numbers.Integral) + and self.n_knots >= 2 + ): + raise ValueError("n_knots must be a positive integer >= 2, " + f"got: {self.n_knots}") + base_knots = self._get_base_knot_positions( X, n_knots=self.n_knots, knots=self.knots ) diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py index 1f70ec9854a54..b9be4e775b8d3 100644 --- a/sklearn/preprocessing/tests/test_polynomial.py +++ b/sklearn/preprocessing/tests/test_polynomial.py @@ -96,9 +96,9 @@ def test_spline_transformer_manual_knot_input(): """ X = np.arange(20).reshape(10, 2) knots = [[0.5, 1], [1.5, 2], [5, 10]] - st1 = SplineTransformer(degree=3, knots=knots).fit(X) + st1 = SplineTransformer(degree=3, knots=knots, n_knots=None).fit(X) knots = np.asarray(knots) - st2 = SplineTransformer(degree=3, knots=knots).fit(X) + st2 = SplineTransformer(degree=3, knots=knots, n_knots=None).fit(X) for i in range(X.shape[1]): assert_allclose(st1.bsplines_[i].t, st2.bsplines_[i].t) @@ -216,7 +216,7 @@ def test_spline_transformer_linear_regression(bias, intercept): ("uniform", 12, 8), ( [[-1.0, 0.0], [0, 1.0], [0.1, 2.0], [0.2, 3.0], [0.3, 4.0], [1, 5.0]], - 100, # this gets ignored. + None, 3 ) ]) From 673625b29466310fa86a06b0a1577150cd34cc8a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 7 Jun 2021 18:15:17 +0200 Subject: [PATCH 460/478] FIX make check_complex_data deterministic (#20221) --- sklearn/utils/estimator_checks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 2058c8308ec29..cb1c96adbd153 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -967,6 +967,7 @@ def check_complex_data(name, estimator_orig): # Something both valid for classification and regression y = rng.randint(low=0, high=2, size=10) + 1j estimator = clone(estimator_orig) + set_random_state(estimator, random_state=0) with raises(ValueError, match="Complex data not supported"): estimator.fit(X, y) From b15e312b29ddc9d527aa33002a0844b21e8dfb5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Mon, 7 Jun 2021 23:48:17 +0200 Subject: [PATCH 461/478] TST test_fit_docstring_attributes include properties (#20190) --- sklearn/cluster/_bicluster.py | 3 +++ sklearn/model_selection/_search.py | 8 ++++++ sklearn/tests/test_docstring_parameters.py | 29 ++++++++++++++++++++-- 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index c8ff1bb036662..9267052b48f75 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -255,6 +255,9 @@ class SpectralCoclustering(BaseSpectral): column_labels_ : array-like of shape (n_cols,) The bicluster label of each column. + biclusters_ : tuple of two ndarrays + The tuple contains the `rows_` and `columns_` arrays. 
+ Examples -------- >>> from sklearn.cluster import SpectralCoclustering diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 07ad3d7dbafe5..3ee0bcc4ec153 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -1177,6 +1177,10 @@ class GridSearchCV(BaseSearchCV): multimetric_ : bool Whether or not the scorers compute several metrics. + classes_ : ndarray of shape (n_classes,) + The classes labels. This is present only if ``refit`` is specified and + the underlying estimator is a classifier. + Notes ----- The parameters selected are those that maximize the score of the left out @@ -1499,6 +1503,10 @@ class RandomizedSearchCV(BaseSearchCV): multimetric_ : bool Whether or not the scorers compute several metrics. + classes_ : ndarray of shape (n_classes,) + The classes labels. This is present only if ``refit`` is specified and + the underlying estimator is a classifier. + Notes ----- The parameters selected are those that maximize the score of the held-out diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 74a3e91a52a32..a3a0605308c79 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -224,6 +224,7 @@ def test_fit_docstring_attributes(name, Estimator): 'StackingRegressor', 'TfidfVectorizer', 'VotingClassifier', 'VotingRegressor', 'SequentialFeatureSelector', 'HalvingGridSearchCV', 'HalvingRandomSearchCV'} + if Estimator.__name__ in IGNORED or Estimator.__name__.startswith('_'): pytest.skip("Estimator cannot be fit easily to test fit attributes") @@ -284,10 +285,34 @@ def test_fit_docstring_attributes(name, Estimator): with ignore_warnings(category=FutureWarning): assert hasattr(est, attr.name) - fit_attr = [k for k in est.__dict__.keys() if k.endswith('_') - and not k.startswith('_')] + fit_attr = _get_all_fitted_attributes(est) fit_attr_names = [attr.name for attr in attributes] undocumented_attrs = set(fit_attr).difference(fit_attr_names) undocumented_attrs = set(undocumented_attrs).difference(skipped_attributes) assert not undocumented_attrs,\ "Undocumented attributes: {}".format(undocumented_attrs) + + +def _get_all_fitted_attributes(estimator): + "Get all the fitted attributes of an estimator including properties" + # attributes + fit_attr = list(estimator.__dict__.keys()) + + # properties + with warnings.catch_warnings(): + warnings.filterwarnings("error", category=FutureWarning) + + for name in dir(estimator.__class__): + obj = getattr(estimator.__class__, name) + if not isinstance(obj, property): + continue + + # ignore properties that raises an AttributeError and deprecated + # properties + try: + getattr(estimator, name) + except (AttributeError, FutureWarning): + continue + fit_attr.append(name) + + return [k for k in fit_attr if k.endswith('_') and not k.startswith('_')] From 7f308675a75cabb2222d61b5d6f293e85c43581c Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 7 Jun 2021 19:39:34 -0400 Subject: [PATCH 462/478] FIX Uses the color max for colormap in ConfusionMatrixDisplay (#19784) --- doc/whats_new/v1.0.rst | 3 +++ sklearn/metrics/_plot/confusion_matrix.py | 2 +- .../_plot/tests/test_confusion_matrix_display.py | 14 ++++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index b66c87815bae7..02c77459ddc22 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -403,6 +403,9 @@ Changelog are integral. 
:pr:`9843` by :user:`Jon Crall `. +- |Fix| :meth:`metrics.ConfusionMatrixDisplay.plot` uses the correct max + for colormap. :pr:`19784` by `Thomas Fan`_. + - |Fix| Samples with zero `sample_weight` values do not affect the results from :func:`metrics.det_curve`, :func:`metrics.precision_recall_curve` and :func:`metrics.roc_curve`. diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index dd941a7e28e43..06d2d002a8191 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -119,7 +119,7 @@ def plot(self, *, include_values=True, cmap='viridis', n_classes = cm.shape[0] self.im_ = ax.imshow(cm, interpolation='nearest', cmap=cmap) self.text_ = None - cmap_min, cmap_max = self.im_.cmap(0), self.im_.cmap(256) + cmap_min, cmap_max = self.im_.cmap(0), self.im_.cmap(1.0) if include_values: self.text_ = np.empty_like(cm, dtype=object) diff --git a/sklearn/metrics/_plot/tests/test_confusion_matrix_display.py b/sklearn/metrics/_plot/tests/test_confusion_matrix_display.py index ed0bc04117396..b1498afae89ae 100644 --- a/sklearn/metrics/_plot/tests/test_confusion_matrix_display.py +++ b/sklearn/metrics/_plot/tests/test_confusion_matrix_display.py @@ -380,3 +380,17 @@ def test_confusion_matrix_with_unknown_labels(pyplot, constructor_name): display_labels = [tick.get_text() for tick in disp.ax_.get_xticklabels()] expected_labels = [str(i) for i in range(n_classes + 1)] assert_array_equal(expected_labels, display_labels) + + +def test_colormap_max(pyplot): + """Check that the max color is used for the color of the text.""" + + from matplotlib import cm + gray = cm.get_cmap('gray', 1024) + confusion_matrix = np.array([[1.0, 0.0], [0.0, 1.0]]) + + disp = ConfusionMatrixDisplay(confusion_matrix) + disp.plot(cmap=gray) + + color = disp.text_[1, 0].get_color() + assert_allclose(color, [1.0, 1.0, 1.0, 1.0]) From 5c3cb6b0af04344d41d542b718d682604d6aa685 Mon Sep 17 00:00:00 2001 From: solosilence Date: Tue, 8 Jun 2021 06:29:59 +0530 Subject: [PATCH 463/478] STY Changing .format method to f-string formatting (#20215) --- benchmarks/bench_20newsgroups.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/benchmarks/bench_20newsgroups.py b/benchmarks/bench_20newsgroups.py index 8efc740e937da..9546c8f1d6a39 100644 --- a/benchmarks/bench_20newsgroups.py +++ b/benchmarks/bench_20newsgroups.py @@ -46,18 +46,16 @@ print("20 newsgroups") print("=============") - print("X_train.shape = {0}".format(X_train.shape)) - print("X_train.format = {0}".format(X_train.format)) - print("X_train.dtype = {0}".format(X_train.dtype)) - print("X_train density = {0}" - "".format(X_train.nnz / np.product(X_train.shape))) - print("y_train {0}".format(y_train.shape)) - print("X_test {0}".format(X_test.shape)) - print("X_test.format = {0}".format(X_test.format)) - print("X_test.dtype = {0}".format(X_test.dtype)) - print("y_test {0}".format(y_test.shape)) + print(f"X_train.shape = {X_train.shape}") + print(f"X_train.format = {X_train.format}") + print(f"X_train.dtype = {X_train.dtype}") + print(f"X_train density = {X_train.nnz / np.product(X_train.shape)}") + print(f"y_train {y_train.shape}") + print(f"X_test {X_test.shape}") + print(f"X_test.format = {X_test.format}") + print(f"X_test.dtype = {X_test.dtype}") + print(f"y_test {y_test.shape}") print() - print("Classifier Training") print("===================") accuracy, train_time, test_time = {}, {}, {} From c53d33ea965edee4fd59f85181694efc437c0e8b Mon Sep 17 
00:00:00 2001 From: Shao Yang Hong Date: Wed, 9 Jun 2021 16:01:00 +0800 Subject: [PATCH 464/478] [MRG] Listed valid metrics in neighbors.rst (#19379) Co-authored-by: Julien Jerphanion Co-authored-by: Thomas J. Fan Co-authored-by: Chiara Marmo --- doc/modules/neighbors.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index bb84b79e8570a..f394f011af11a 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -464,6 +464,20 @@ leaf nodes. The level of this switch can be specified with the parameter ``leaf_size`` is not referenced for brute force queries. +Valid Metrics for Nearest Neighbor Algorithms +--------------------------------------------- + +For a list of available metrics, see the documentation of the :class:`DistanceMetric` +class. + +A list of valid metrics for any of the above algorithms can be obtained by using their +``valid_metric`` attribute. For example, valid metrics for ``KDTree`` can be generated by: + + >>> from sklearn.neighbors import KDTree + >>> print(sorted(KDTree.valid_metrics)) + ['chebyshev', 'cityblock', 'euclidean', 'infinity', 'l1', 'l2', 'manhattan', 'minkowski', 'p'] + + .. _nearest_centroid_classifier: Nearest Centroid Classifier From 45fc4b76f3ebcf36d3c470d80c85f652a7a0c322 Mon Sep 17 00:00:00 2001 From: Nanshan Li Date: Wed, 9 Jun 2021 17:04:58 +0800 Subject: [PATCH 465/478] DOC Document n_features_in_ in cluster (#20228) --- sklearn/cluster/_affinity_propagation.py | 3 +++ sklearn/cluster/_agglomerative.py | 6 ++++++ sklearn/cluster/_bicluster.py | 6 ++++++ sklearn/cluster/_birch.py | 3 +++ sklearn/cluster/_dbscan.py | 3 +++ sklearn/cluster/_kmeans.py | 6 ++++++ sklearn/cluster/_mean_shift.py | 3 +++ sklearn/cluster/_optics.py | 3 +++ sklearn/cluster/_spectral.py | 3 +++ sklearn/tests/test_docstring_parameters.py | 1 - 10 files changed, 36 insertions(+), 1 deletion(-) diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 59620ab31f63d..67c1fb42b650b 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -309,6 +309,9 @@ class AffinityPropagation(ClusterMixin, BaseEstimator): n_iter_ : int Number of iterations taken to converge. + n_features_in_ : int + Number of features seen during :term:`fit`. + Notes ----- For an example, see :ref:`examples/cluster/plot_affinity_propagation.py diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 4b0089b707233..05f57ff238bcf 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -773,6 +773,9 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): .. versionadded:: 0.21 ``n_connected_components_`` was added to replace ``n_components_``. + n_features_in_ : int + Number of features seen during :term:`fit`. + children_ : array-like of shape (n_samples-1, 2) The children of each non-leaf node. Values less than `n_samples` correspond to leaves of the tree which are the original samples. @@ -1039,6 +1042,9 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): .. versionadded:: 0.21 ``n_connected_components_`` was added to replace ``n_components_``. + n_features_in_ : int + Number of features seen during :term:`fit`. + children_ : array-like of shape (n_nodes-1, 2) The children of each non-leaf node. Values less than `n_features` correspond to leaves of the tree which are the original samples. 
diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index 9267052b48f75..1be7dd4e64186 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -258,6 +258,9 @@ class SpectralCoclustering(BaseSpectral): biclusters_ : tuple of two ndarrays The tuple contains the `rows_` and `columns_` arrays. + n_features_in_ : int + Number of features seen during :term:`fit`. + Examples -------- >>> from sklearn.cluster import SpectralCoclustering @@ -395,6 +398,9 @@ class SpectralBiclustering(BaseSpectral): column_labels_ : array-like of shape (n_cols,) Column partition labels. + n_features_in_ : int + Number of features seen during :term:`fit`. + Examples -------- >>> from sklearn.cluster import SpectralBiclustering diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index da1bf894f03f8..81c9312f1488a 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -401,6 +401,9 @@ class Birch(ClusterMixin, TransformerMixin, BaseEstimator): if partial_fit is used instead of fit, they are assigned to the last batch of data. + n_features_in_ : int + Number of features seen during :term:`fit`. + See Also -------- MiniBatchKMeans : Alternative implementation that does incremental updates diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index bbc3470256e90..abbb35e6e04af 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -217,6 +217,9 @@ class DBSCAN(ClusterMixin, BaseEstimator): Cluster labels for each point in the dataset given to fit(). Noisy samples are given the label -1. + n_features_in_ : int + Number of features seen during :term:`fit`. + Examples -------- >>> from sklearn.cluster import DBSCAN diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 8b24be6ace987..fc9ba7a868d10 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -766,6 +766,9 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): n_iter_ : int Number of iterations run. + n_features_in_ : int + Number of features seen during :term:`fit`. + See Also -------- MiniBatchKMeans : Alternative online implementation that does incremental @@ -1465,6 +1468,9 @@ class MiniBatchKMeans(KMeans): This attribute is deprecated in 0.24 and will be removed in 1.1 (renaming of 0.26). + n_features_in_ : int + Number of features seen during :term:`fit`. + See Also -------- KMeans : The classic implementation of the clustering method based on the diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index f48ef46e8dbef..147ec6c626eb0 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -311,6 +311,9 @@ class MeanShift(ClusterMixin, BaseEstimator): .. versionadded:: 0.22 + n_features_in_ : int + Number of features seen during :term:`fit`. + Examples -------- >>> from sklearn.cluster import MeanShift diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index af0e8531aa7b8..0f2b96346660b 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -178,6 +178,9 @@ class OPTICS(ClusterMixin, BaseEstimator): ``X[ordering_][start:end + 1]`` form a cluster. Only available when ``cluster_method='xi'``. + n_features_in_ : int + Number of features seen during :term:`fit`. + See Also -------- DBSCAN : A similar clustering for a specified neighborhood radius (eps). 
diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index cda6dac64ee54..de0192987f595 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -418,6 +418,9 @@ class SpectralClustering(ClusterMixin, BaseEstimator): labels_ : ndarray of shape (n_samples,) Labels of each point + n_features_in_ : int + Number of features seen during :term:`fit`. + Examples -------- >>> from sklearn.cluster import SpectralClustering diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index a3a0605308c79..85d8ad0cf6a36 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -176,7 +176,6 @@ def _construct_searchcv_instance(SearchCV): N_FEATURES_MODULES_TO_IGNORE = { - 'cluster', 'compose', 'covariance', 'decomposition', From a25382629b6c3a2bb41d486a45f9dde6ccd021dc Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 9 Jun 2021 11:22:59 +0200 Subject: [PATCH 466/478] TST make sure to test SearchCV on both classification and regression (#20202) --- sklearn/tests/test_common.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 5e190437ca4a9..6588c677854ac 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -12,6 +12,7 @@ import re import pkgutil from inspect import isgenerator +from itertools import product from functools import partial import pytest @@ -212,8 +213,11 @@ def test_class_support_removed(): def _generate_search_cv_instances(): - for SearchCV, (Estimator, param_grid) in zip( - [GridSearchCV, RandomizedSearchCV], + for SearchCV, (Estimator, param_grid) in product( + [ + GridSearchCV, + RandomizedSearchCV, + ], [ (Ridge, {"alpha": [0.1, 1.0]}), (LogisticRegression, {"C": [0.1, 1.0]}), @@ -221,8 +225,11 @@ def _generate_search_cv_instances(): ): yield SearchCV(Estimator(), param_grid) - for SearchCV, (Estimator, param_grid) in zip( - [GridSearchCV, RandomizedSearchCV], + for SearchCV, (Estimator, param_grid) in product( + [ + GridSearchCV, + RandomizedSearchCV, + ], [ (Ridge, {"ridge__alpha": [0.1, 1.0]}), (LogisticRegression, {"logisticregression__C": [0.1, 1.0]}), From 007da8db4a90de82aa6ca46fc51e33c846599994 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Wed, 9 Jun 2021 10:49:58 -0400 Subject: [PATCH 467/478] FIX Do not reset for non-fit in multiclass (#20205) --- sklearn/multiclass.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 99a6db2051030..ad420506a9694 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -114,25 +114,34 @@ def _check_estimator(estimator): class _ConstantPredictor(BaseEstimator): def fit(self, X, y): - self._check_n_features(X, reset=True) + check_params = dict(force_all_finite=False, dtype=None, + ensure_2d=False, accept_sparse=True) + self._validate_data(X, y, reset=True, + validate_separately=(check_params, check_params)) self.y_ = y return self def predict(self, X): check_is_fitted(self) - self._check_n_features(X, reset=True) + self._validate_data(X, force_all_finite=False, dtype=None, + accept_sparse=True, + ensure_2d=False, reset=False) return np.repeat(self.y_, _num_samples(X)) def decision_function(self, X): check_is_fitted(self) - self._check_n_features(X, reset=True) + self._validate_data(X, force_all_finite=False, dtype=None, + accept_sparse=True, + ensure_2d=False, reset=False) return np.repeat(self.y_, _num_samples(X)) def predict_proba(self, X): check_is_fitted(self) - self._check_n_features(X, reset=True) + self._validate_data(X, force_all_finite=False, dtype=None, + accept_sparse=True, + ensure_2d=False, reset=False) return np.repeat([np.hstack([1 - self.y_, self.y_])], _num_samples(X), axis=0) From 1cd282d600088d2547d827af72a99e036106417a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 9 Jun 2021 16:58:03 +0200 Subject: [PATCH 468/478] DOC add n_features_in_ in the documentation (#20236) --- sklearn/cluster/_affinity_propagation.py | 2 + sklearn/cluster/_agglomerative.py | 4 + sklearn/cluster/_bicluster.py | 4 + sklearn/cluster/_birch.py | 2 + sklearn/cluster/_dbscan.py | 2 + sklearn/cluster/_kmeans.py | 4 + sklearn/cluster/_mean_shift.py | 2 + sklearn/cluster/_optics.py | 2 + sklearn/cluster/_spectral.py | 2 + sklearn/covariance/_elliptic_envelope.py | 5 ++ sklearn/covariance/_empirical_covariance.py | 5 ++ sklearn/covariance/_graph_lasso.py | 10 +++ sklearn/covariance/_robust_covariance.py | 5 ++ sklearn/covariance/_shrunk_covariance.py | 15 ++++ sklearn/decomposition/_dict_learning.py | 10 +++ sklearn/decomposition/_factor_analysis.py | 5 ++ sklearn/decomposition/_fastica.py | 5 ++ sklearn/decomposition/_incremental_pca.py | 5 ++ sklearn/decomposition/_kernel_pca.py | 5 ++ sklearn/decomposition/_lda.py | 5 ++ sklearn/decomposition/_nmf.py | 5 ++ sklearn/decomposition/_pca.py | 5 ++ sklearn/decomposition/_sparse_pca.py | 10 +++ sklearn/decomposition/_truncated_svd.py | 5 ++ sklearn/discriminant_analysis.py | 10 +++ sklearn/dummy.py | 10 +++ sklearn/ensemble/_bagging.py | 10 +++ sklearn/ensemble/_forest.py | 25 +++++++ sklearn/ensemble/_gb.py | 10 +++ .../gradient_boosting.py | 8 ++ sklearn/ensemble/_iforest.py | 5 ++ sklearn/ensemble/_weight_boosting.py | 10 +++ .../_univariate_selection.py | 30 ++++++++ .../feature_selection/_variance_threshold.py | 5 ++ sklearn/gaussian_process/_gpc.py | 5 ++ sklearn/gaussian_process/_gpr.py | 5 ++ sklearn/impute/_base.py | 10 +++ sklearn/impute/_iterative.py | 5 ++ sklearn/impute/_knn.py | 5 ++ sklearn/kernel_approximation.py | 24 ++++++ sklearn/kernel_ridge.py | 5 ++ sklearn/linear_model/_base.py | 5 ++ sklearn/linear_model/_bayes.py | 10 +++ sklearn/linear_model/_coordinate_descent.py | 40 ++++++++++ sklearn/linear_model/_glm/glm.py 
| 15 ++++ sklearn/linear_model/_huber.py | 5 ++ sklearn/linear_model/_least_angle.py | 24 ++++++ sklearn/linear_model/_logistic.py | 9 +++ sklearn/linear_model/_omp.py | 10 +++ sklearn/linear_model/_passive_aggressive.py | 10 +++ sklearn/linear_model/_perceptron.py | 5 ++ sklearn/linear_model/_quantile.py | 5 ++ sklearn/linear_model/_ransac.py | 5 ++ sklearn/linear_model/_ridge.py | 20 +++++ sklearn/linear_model/_stochastic_gradient.py | 15 ++++ sklearn/linear_model/_theil_sen.py | 5 ++ sklearn/manifold/_isomap.py | 5 ++ sklearn/manifold/_locally_linear.py | 5 ++ sklearn/manifold/_mds.py | 5 ++ sklearn/manifold/_spectral_embedding.py | 5 ++ sklearn/manifold/_t_sne.py | 5 ++ .../_search_successive_halving.py | 14 ++++ sklearn/naive_bayes.py | 31 +++++++- sklearn/neighbors/_classification.py | 10 +++ sklearn/neighbors/_graph.py | 10 +++ sklearn/neighbors/_kde.py | 5 ++ sklearn/neighbors/_lof.py | 5 ++ sklearn/neighbors/_nca.py | 5 ++ sklearn/neighbors/_nearest_centroid.py | 5 ++ sklearn/neighbors/_regression.py | 10 +++ sklearn/neighbors/_unsupervised.py | 5 ++ .../neural_network/_multilayer_perceptron.py | 10 +++ sklearn/neural_network/_rbm.py | 5 ++ sklearn/preprocessing/_data.py | 75 +++++++++++++++---- sklearn/preprocessing/_discretization.py | 11 ++- sklearn/preprocessing/_polynomial.py | 45 ++++++----- sklearn/semi_supervised/_label_propagation.py | 10 +++ sklearn/semi_supervised/_self_training.py | 5 ++ sklearn/svm/_classes.py | 35 +++++++++ sklearn/tests/test_docstring_parameters.py | 52 +++++-------- sklearn/tree/_classes.py | 20 +++++ 81 files changed, 809 insertions(+), 73 deletions(-) diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 67c1fb42b650b..cf0da5c5bc0f3 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -312,6 +312,8 @@ class AffinityPropagation(ClusterMixin, BaseEstimator): n_features_in_ : int Number of features seen during :term:`fit`. + .. versionadded:: 0.24 + Notes ----- For an example, see :ref:`examples/cluster/plot_affinity_propagation.py diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 05f57ff238bcf..a1adb8492ab89 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -776,6 +776,8 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): n_features_in_ : int Number of features seen during :term:`fit`. + .. versionadded:: 0.24 + children_ : array-like of shape (n_samples-1, 2) The children of each non-leaf node. Values less than `n_samples` correspond to leaves of the tree which are the original samples. @@ -1045,6 +1047,8 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): n_features_in_ : int Number of features seen during :term:`fit`. + .. versionadded:: 0.24 + children_ : array-like of shape (n_nodes-1, 2) The children of each non-leaf node. Values less than `n_features` correspond to leaves of the tree which are the original samples. diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index 1be7dd4e64186..939f044002f2d 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -261,6 +261,8 @@ class SpectralCoclustering(BaseSpectral): n_features_in_ : int Number of features seen during :term:`fit`. + .. 
versionadded:: 0.24 + Examples -------- >>> from sklearn.cluster import SpectralCoclustering @@ -401,6 +403,8 @@ class SpectralBiclustering(BaseSpectral): n_features_in_ : int Number of features seen during :term:`fit`. + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.cluster import SpectralBiclustering diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 81c9312f1488a..fc4bfdcfc902d 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -404,6 +404,8 @@ class Birch(ClusterMixin, TransformerMixin, BaseEstimator): n_features_in_ : int Number of features seen during :term:`fit`. + .. versionadded:: 0.24 + See Also -------- MiniBatchKMeans : Alternative implementation that does incremental updates diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index abbb35e6e04af..e862ee1080ace 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -220,6 +220,8 @@ class DBSCAN(ClusterMixin, BaseEstimator): n_features_in_ : int Number of features seen during :term:`fit`. + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.cluster import DBSCAN diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index fc9ba7a868d10..6b54ec99ae825 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -769,6 +769,8 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): n_features_in_ : int Number of features seen during :term:`fit`. + .. versionadded:: 0.24 + See Also -------- MiniBatchKMeans : Alternative online implementation that does incremental @@ -1471,6 +1473,8 @@ class MiniBatchKMeans(KMeans): n_features_in_ : int Number of features seen during :term:`fit`. + .. versionadded:: 0.24 + See Also -------- KMeans : The classic implementation of the clustering method based on the diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index 147ec6c626eb0..619d52cb7313b 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -314,6 +314,8 @@ class MeanShift(ClusterMixin, BaseEstimator): n_features_in_ : int Number of features seen during :term:`fit`. + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.cluster import MeanShift diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index 0f2b96346660b..1d04ea7a3214f 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -181,6 +181,8 @@ class OPTICS(ClusterMixin, BaseEstimator): n_features_in_ : int Number of features seen during :term:`fit`. + .. versionadded:: 0.24 + See Also -------- DBSCAN : A similar clustering for a specified neighborhood radius (eps). diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index de0192987f595..8cdbd859fde02 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -421,6 +421,8 @@ class SpectralClustering(ClusterMixin, BaseEstimator): n_features_in_ : int Number of features seen during :term:`fit`. + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.cluster import SpectralClustering diff --git a/sklearn/covariance/_elliptic_envelope.py b/sklearn/covariance/_elliptic_envelope.py index ad7904dc7831a..3e0c6a41d5913 100644 --- a/sklearn/covariance/_elliptic_envelope.py +++ b/sklearn/covariance/_elliptic_envelope.py @@ -83,6 +83,11 @@ class EllipticEnvelope(OutlierMixin, MinCovDet): Mahalanobis distances of the training set (on which :meth:`fit` is called) observations. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/covariance/_empirical_covariance.py b/sklearn/covariance/_empirical_covariance.py index 02bddd0f50330..9c3d94c863c72 100644 --- a/sklearn/covariance/_empirical_covariance.py +++ b/sklearn/covariance/_empirical_covariance.py @@ -125,6 +125,11 @@ class EmpiricalCovariance(BaseEstimator): Estimated pseudo-inverse matrix. (stored only if store_precision is True) + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/covariance/_graph_lasso.py b/sklearn/covariance/_graph_lasso.py index 091d4f82e7e3e..398a8af72f3a9 100644 --- a/sklearn/covariance/_graph_lasso.py +++ b/sklearn/covariance/_graph_lasso.py @@ -339,6 +339,11 @@ class GraphicalLasso(EmpiricalCovariance): n_iter_ : int Number of iterations run. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np @@ -635,6 +640,11 @@ class GraphicalLassoCV(GraphicalLasso): n_iter_ : int Number of iterations run for the optimal alpha. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/covariance/_robust_covariance.py b/sklearn/covariance/_robust_covariance.py index 337ba23f19059..2323d14d3359a 100644 --- a/sklearn/covariance/_robust_covariance.py +++ b/sklearn/covariance/_robust_covariance.py @@ -582,6 +582,11 @@ class MinCovDet(EmpiricalCovariance): Mahalanobis distances of the training set (on which :meth:`fit` is called) observations. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/covariance/_shrunk_covariance.py b/sklearn/covariance/_shrunk_covariance.py index 5fe590b33a1db..a4dea261f2a45 100644 --- a/sklearn/covariance/_shrunk_covariance.py +++ b/sklearn/covariance/_shrunk_covariance.py @@ -91,6 +91,11 @@ class ShrunkCovariance(EmpiricalCovariance): Estimated pseudo inverse matrix. (stored only if store_precision is True) + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np @@ -356,6 +361,11 @@ class LedoitWolf(EmpiricalCovariance): Coefficient in the convex combination used for the computation of the shrunk estimate. Range is [0, 1]. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np @@ -533,6 +543,11 @@ class OAS(EmpiricalCovariance): coefficient in the convex combination used for the computation of the shrunk estimate. Range is [0, 1]. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 80b64570b3401..030ac06b454b1 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -1259,6 +1259,11 @@ class DictionaryLearning(_BaseSparseCoding, BaseEstimator): error_ : array vector of errors at each iteration + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Number of iterations run. 
@@ -1492,6 +1497,11 @@ class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): `A` `(n_components, n_components)` is the dictionary covariance matrix. `B` `(n_features, n_components)` is the data approximation matrix. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Number of iterations run. diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index 830e81e9268d5..f3167ff225584 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -120,6 +120,11 @@ class FactorAnalysis(TransformerMixin, BaseEstimator): mean_ : ndarray of shape (n_features,) Per-feature empirical mean, estimated from the training set. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_digits diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 6c374e6e420f8..5faf1985d3fc9 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -362,6 +362,11 @@ def my_g(x): mean_ : ndarray of shape(n_features,) The mean over features. Only set if `self.whiten` is True. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int If the algorithm is "deflation", n_iter is the maximum number of iterations run across all components. Else diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index 486d4a22d8cdb..b1221d69cf914 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -107,6 +107,11 @@ class IncrementalPCA(_BasePCA): batch_size_ : int Inferred batch size from ``batch_size``. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_digits diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 1e1cdb1722029..70a12f5cb2e38 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -164,6 +164,11 @@ class KernelPCA(TransformerMixin, BaseEstimator): The data used to fit the model. If `copy_X=False`, then `X_fit_` is a reference. This attribute is used for the calls to transform. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_digits diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 75b123a118338..3739a66a871e3 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -248,6 +248,11 @@ class LatentDirichletAllocation(TransformerMixin, BaseEstimator): n_batch_iter_ : int Number of iterations of the EM step. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Number of passes over the dataset. diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index c8239147eb6c4..39d38af4c5f5a 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -1179,6 +1179,11 @@ class NMF(TransformerMixin, BaseEstimator): n_iter_ : int Actual number of iterations. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 765320ccdb5a8..afeedeba28edb 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -264,6 +264,11 @@ class PCA(_BasePCA): Equal to the average of (min(n_features, n_samples) - n_components) smallest eigenvalues of the covariance matrix of X. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + See Also -------- KernelPCA : Kernel Principal Component Analysis. diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index 7f280db3a3af6..19ff950228f62 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -88,6 +88,11 @@ class SparsePCA(TransformerMixin, BaseEstimator): Per-feature empirical mean, estimated from the training set. Equal to ``X.mean(axis=0)``. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np @@ -279,6 +284,11 @@ class MiniBatchSparsePCA(SparsePCA): Per-feature empirical mean, estimated from the training set. Equal to ``X.mean(axis=0)``. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 7aa36c59da00e..677c6f1f36fb7 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -84,6 +84,11 @@ class TruncatedSVD(TransformerMixin, BaseEstimator): The singular values are equal to the 2-norms of the ``n_components`` variables in the lower-dimensional space. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.decomposition import TruncatedSVD diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 4d94b19574f53..3cb6cc1712f29 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -278,6 +278,11 @@ class LinearDiscriminantAnalysis(LinearClassifierMixin, classes_ : array-like of shape (n_classes,) Unique class labels. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + See Also -------- QuadraticDiscriminantAnalysis : Quadratic Discriminant Analysis. @@ -732,6 +737,11 @@ class QuadraticDiscriminantAnalysis(ClassifierMixin, BaseEstimator): classes_ : ndarray of shape (n_classes,) Unique class labels. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 575b38aa7d2a8..d78336730fc99 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -75,6 +75,11 @@ class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator): n_outputs_ : int Number of outputs. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + sparse_output_ : bool True if the array returned from predict is to be in sparse CSC format. Is automatically set to True if the input y is passed in sparse format. @@ -425,6 +430,11 @@ class DummyRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): Mean or median or quantile of the training targets or constant value given by the user. 
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
     n_outputs_ : int
         Number of outputs.

diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py
index a4be68ba5e2d6..d63c42d8f5539 100644
--- a/sklearn/ensemble/_bagging.py
+++ b/sklearn/ensemble/_bagging.py
@@ -537,6 +537,11 @@ class BaggingClassifier(ClassifierMixin, BaseBagging):
             Attribute `n_features_` was deprecated in version 1.0 and will be
             removed in 1.2. Use `n_features_in_` instead.

+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
     estimators_ : list of estimators
         The collection of fitted base estimators.

@@ -928,6 +933,11 @@ class BaggingRegressor(RegressorMixin, BaseBagging):
             Attribute `n_features_` was deprecated in version 1.0 and will be
             removed in 1.2. Use `n_features_in_` instead.

+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
     estimators_ : list of estimators
         The collection of fitted sub-estimators.

diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py
index bc29c0362bb3e..ef2de299c27ea 100644
--- a/sklearn/ensemble/_forest.py
+++ b/sklearn/ensemble/_forest.py
@@ -1199,6 +1199,11 @@ class labels (multi-output problem).
             Attribute `n_features_` was deprecated in version 1.0 and will be
             removed in 1.2. Use `n_features_in_` instead.

+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
     n_outputs_ : int
         The number of outputs when ``fit`` is performed.

@@ -1516,6 +1521,11 @@ class RandomForestRegressor(ForestRegressor):
             Attribute `n_features_` was deprecated in version 1.0 and will be
             removed in 1.2. Use `n_features_in_` instead.

+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
     n_outputs_ : int
         The number of outputs when ``fit`` is performed.

@@ -1841,6 +1851,11 @@ class labels (multi-output problem).
             Attribute `n_features_` was deprecated in version 1.0 and will be
             removed in 1.2. Use `n_features_in_` instead.

+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
     n_outputs_ : int
         The number of outputs when ``fit`` is performed.

@@ -2140,6 +2155,11 @@ class ExtraTreesRegressor(ForestRegressor):
             Attribute `n_features_` was deprecated in version 1.0 and will be
             removed in 1.2. Use `n_features_in_` instead.

+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
     n_outputs_ : int
         The number of outputs.

@@ -2368,6 +2388,11 @@ class RandomTreesEmbedding(BaseForest):
             Attribute `n_features_` was deprecated in version 1.0 and will be
             removed in 1.2. Use `n_features_in_` instead.

+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
     n_outputs_ : int
         The number of outputs when ``fit`` is performed.

diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py
index 78fee588ecf4e..496757ee9d605 100644
--- a/sklearn/ensemble/_gb.py
+++ b/sklearn/ensemble/_gb.py
@@ -1052,6 +1052,11 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting):
             Attribute `n_features_` was deprecated in version 1.0 and will be
             removed in 1.2. Use `n_features_in_` instead.

+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
     n_classes_ : int
         The number of classes.
@@ -1604,6 +1609,11 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): Attribute `n_features_` was deprecated in version 1.0 and will be removed in 1.2. Use `n_features_in_` instead. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + max_features_ : int The inferred value of max_features. diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 99eb0d265b100..b33b0652ca5be 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -1030,6 +1030,10 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): is_categorical_ : ndarray, shape (n_features, ) or None Boolean mask for the categorical features. ``None`` if there are no categorical features. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 Examples -------- @@ -1288,6 +1292,10 @@ class HistGradientBoostingClassifier(ClassifierMixin, is_categorical_ : ndarray, shape (n_features, ) or None Boolean mask for the categorical features. ``None`` if there are no categorical features. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 Examples -------- diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 3d2ac0928bd3f..fb8614ae0528e 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -147,6 +147,11 @@ class IsolationForest(OutlierMixin, BaseBagging): Attribute `n_features_` was deprecated in version 1.0 and will be removed in 1.2. Use `n_features_in_` instead. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Notes ----- The implementation is based on an ensemble of ExtraTreeRegressor. The diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index 1b6689b50fafc..7d146e428a50b 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -359,6 +359,11 @@ class AdaBoostClassifier(ClassifierMixin, BaseWeightBoosting): high cardinality features (many unique values). See :func:`sklearn.inspection.permutation_importance` as an alternative. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + See Also -------- AdaBoostRegressor : An AdaBoost regressor that begins by fitting a @@ -935,6 +940,11 @@ class AdaBoostRegressor(RegressorMixin, BaseWeightBoosting): high cardinality features (many unique values). See :func:`sklearn.inspection.permutation_importance` as an alternative. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.ensemble import AdaBoostRegressor diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 989288dbb4ec7..f74ca0e0ac2e2 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -453,6 +453,11 @@ class SelectPercentile(_BaseFilter): pvalues_ : array-like of shape (n_features,) p-values of feature scores, None if `score_func` returned only scores. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_digits @@ -539,6 +544,11 @@ class SelectKBest(_BaseFilter): pvalues_ : array-like of shape (n_features,) p-values of feature scores, None if `score_func` returned only scores. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_digits @@ -624,6 +634,11 @@ class SelectFpr(_BaseFilter): pvalues_ : array-like of shape (n_features,) p-values of feature scores. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_breast_cancer @@ -698,6 +713,11 @@ class SelectFdr(_BaseFilter): pvalues_ : array-like of shape (n_features,) p-values of feature scores. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + References ---------- https://en.wikipedia.org/wiki/False_discovery_rate @@ -768,6 +788,11 @@ class SelectFwe(_BaseFilter): pvalues_ : array-like of shape (n_features,) p-values of feature scores. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + See Also -------- f_classif : ANOVA F-value between label/feature for classification tasks. @@ -823,6 +848,11 @@ class GenericUnivariateSelect(_BaseFilter): pvalues_ : array-like of shape (n_features,) p-values of feature scores, None if `score_func` returned scores only. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_breast_cancer diff --git a/sklearn/feature_selection/_variance_threshold.py b/sklearn/feature_selection/_variance_threshold.py index 39892876a6478..aabbc44ab8fc8 100644 --- a/sklearn/feature_selection/_variance_threshold.py +++ b/sklearn/feature_selection/_variance_threshold.py @@ -28,6 +28,11 @@ class VarianceThreshold(SelectorMixin, BaseEstimator): variances_ : array, shape (n_features,) Variances of individual features. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Notes ----- Allows NaN in the input. diff --git a/sklearn/gaussian_process/_gpc.py b/sklearn/gaussian_process/_gpc.py index d2b418b131c2f..491c33b9621e8 100644 --- a/sklearn/gaussian_process/_gpc.py +++ b/sklearn/gaussian_process/_gpc.py @@ -576,6 +576,11 @@ def optimizer(obj_func, initial_theta, bounds): n_classes_ : int The number of classes in the training data + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_iris diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index ae9e5c403fcf2..4583e013d06df 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -137,6 +137,11 @@ def optimizer(obj_func, initial_theta, bounds): log_marginal_likelihood_value_ : float The log-marginal-likelihood of ``self.kernel_.theta`` + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import make_friedman2 diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 85303f29c93e9..396b3b95234dc 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -187,6 +187,11 @@ class SimpleImputer(_BaseImputer): Indicator used to add binary indicators for missing values. ``None`` if add_indicator is False. 
+ n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + See Also -------- IterativeImputer : Multivariate imputation of missing values. @@ -604,6 +609,11 @@ class MissingIndicator(TransformerMixin, BaseEstimator): They are computed during ``fit``. For ``features='all'``, it is to ``range(n_features)``. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index f5688fa96d238..3832bd9d35aa0 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -164,6 +164,11 @@ class IterativeImputer(_BaseImputer): Number of iteration rounds that occurred. Will be less than ``self.max_iter`` if early stopping criterion was reached. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_features_with_missing_ : int Number of features with missing values. diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index b9cfe0e1a60a0..f32232512dcde 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -76,6 +76,11 @@ class KNNImputer(_BaseImputer): Indicator used to add binary indicators for missing values. ``None`` if add_indicator is False. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + References ---------- * Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index e7020dea0e970..d6d67fe85e941 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -77,6 +77,11 @@ class PolynomialCountSketch(BaseEstimator, TransformerMixin): Array with random entries in {+1, -1}, used to represent the 2-wise independent hash functions for Count Sketch computation. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.kernel_approximation import PolynomialCountSketch @@ -228,6 +233,10 @@ class RBFSampler(TransformerMixin, BaseEstimator): Random projection directions drawn from the Fourier transform of the RBF kernel. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 Examples -------- @@ -340,6 +349,11 @@ class SkewedChi2Sampler(TransformerMixin, BaseEstimator): Bias term, which will be added to the data. It is uniformly distributed between 0 and 2*pi. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.kernel_approximation import SkewedChi2Sampler @@ -462,6 +476,11 @@ class AdditiveChi2Sampler(TransformerMixin, BaseEstimator): Stored sampling interval. Specified as a parameter if sample_steps not in {1,2,3}. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_digits @@ -690,6 +709,11 @@ class Nystroem(TransformerMixin, BaseEstimator): Normalization matrix needed for embedding. Square root of the kernel matrix on ``components_``. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + Examples -------- >>> from sklearn import datasets, svm diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index e562c22daed2f..2bb0b83763625 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -89,6 +89,11 @@ class KernelRidge(MultiOutputMixin, RegressorMixin, BaseEstimator): kernel == "precomputed" this is instead the precomputed training matrix, of shape (n_samples, n_samples). + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + References ---------- * Kevin P. Murphy diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 09eeced4f3a09..3a55e3b0090c5 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -560,6 +560,11 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel): Independent term in the linear model. Set to 0.0 if `fit_intercept = False`. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + See Also -------- Ridge : Ridge regression addresses some of the diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py index 1d25ac20aa34e..aabd3d2e0f5a2 100644 --- a/sklearn/linear_model/_bayes.py +++ b/sklearn/linear_model/_bayes.py @@ -131,6 +131,11 @@ class BayesianRidge(RegressorMixin, LinearModel): If `normalize=True`, parameter used to scale data to a unit standard deviation. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn import linear_model @@ -491,6 +496,11 @@ class ARDRegression(RegressorMixin, LinearModel): If `normalize=True`, parameter used to scale data to a unit standard deviation. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn import linear_model diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index da50a3a817a38..99517ff6e5bbf 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -666,6 +666,11 @@ class ElasticNet(MultiOutputMixin, RegressorMixin, LinearModel): Given param alpha, the dual gaps at the end of the optimization, same shape as each observation of y. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.linear_model import ElasticNet @@ -993,6 +998,11 @@ class Lasso(ElasticNet): Number of iterations run by the coordinate descent solver to reach the specified tolerance. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn import linear_model @@ -1482,6 +1492,11 @@ class LassoCV(RegressorMixin, LinearModelCV): Number of iterations run by the coordinate descent solver to reach the specified tolerance for the optimal alpha. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.linear_model import LassoCV @@ -1667,6 +1682,11 @@ class ElasticNetCV(RegressorMixin, LinearModelCV): Number of iterations run by the coordinate descent solver to reach the specified tolerance for the optimal alpha. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + Examples -------- >>> from sklearn.linear_model import ElasticNetCV @@ -1848,6 +1868,11 @@ class MultiTaskElasticNet(Lasso): (n_tasks, n_features) Sparse representation of the `coef_`. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn import linear_model @@ -2049,6 +2074,11 @@ class MultiTaskLasso(MultiTaskElasticNet): (n_tasks, n_features) Sparse representation of the `coef_`. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn import linear_model @@ -2228,6 +2258,11 @@ class MultiTaskElasticNetCV(RegressorMixin, LinearModelCV): dual_gap_ : float The dual gap at the end of the optimization for the optimal alpha. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn import linear_model @@ -2407,6 +2442,11 @@ class MultiTaskLassoCV(RegressorMixin, LinearModelCV): dual_gap_ : float The dual gap at the end of the optimization for the optimal alpha. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.linear_model import MultiTaskLassoCV diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 7d98f7734b322..5da65c77cf2f4 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -433,6 +433,11 @@ class PoissonRegressor(GeneralizedLinearRegressor): intercept_ : float Intercept (a.k.a. bias) added to linear predictor. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Actual number of iterations used in the solver. @@ -517,6 +522,11 @@ class GammaRegressor(GeneralizedLinearRegressor): intercept_ : float Intercept (a.k.a. bias) added to linear predictor. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Actual number of iterations used in the solver. @@ -633,6 +643,11 @@ class TweedieRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in the solver. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples ---------- >>> from sklearn import linear_model diff --git a/sklearn/linear_model/_huber.py b/sklearn/linear_model/_huber.py index a8ae066d9ff63..93cdb4ae8b5dc 100644 --- a/sklearn/linear_model/_huber.py +++ b/sklearn/linear_model/_huber.py @@ -179,6 +179,11 @@ class HuberRegressor(LinearModel, RegressorMixin, BaseEstimator): scale_ : float The value by which ``|y - X'w - c|`` is scaled down. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Number of iterations that ``scipy.optimize.minimize(method="L-BFGS-B")`` has run for. diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 3485344b99e02..a1fe31557cbe6 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -899,6 +899,11 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): The number of iterations taken by lars_path to find the grid of alphas for each target. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + Examples -------- >>> from sklearn import linear_model @@ -1157,6 +1162,11 @@ class LassoLars(Lars): The number of iterations taken by lars_path to find the grid of alphas for each target. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn import linear_model @@ -1420,6 +1430,11 @@ class LarsCV(Lars): n_iter_ : array-like or int the number of iterations run by Lars with the optimal alpha. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.linear_model import LarsCV @@ -1654,6 +1669,11 @@ class LassoLarsCV(LarsCV): active_ : list of int Indices of active variables at the end of the path. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.linear_model import LassoLarsCV @@ -1799,6 +1819,10 @@ class LassoLarsIC(LassoLars): chosen. This value is larger by a factor of ``n_samples`` compared to Eqns. 2.15 and 2.16 in (Zou et al, 2007). + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 Examples -------- diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index abca6bb30e71f..c4876486e16de 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -1212,6 +1212,11 @@ class LogisticRegression(LinearClassifierMixin, corresponds to outcome 1 (True) and `-intercept_` corresponds to outcome 0 (False). + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : ndarray of shape (n_classes,) or (1, ) Actual number of iterations for all classes. If binary or multinomial, it returns only 1 element. For liblinear solver, only the maximum @@ -1764,6 +1769,10 @@ class LogisticRegressionCV(LogisticRegression, If ``penalty='elasticnet'``, the shape is ``(n_classes, n_folds, n_cs, n_l1_ratios)`` or ``(1, n_folds, n_cs, n_l1_ratios)``. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 Examples -------- diff --git a/sklearn/linear_model/_omp.py b/sklearn/linear_model/_omp.py index c362fd4d73469..d61f8ba82a20c 100644 --- a/sklearn/linear_model/_omp.py +++ b/sklearn/linear_model/_omp.py @@ -592,6 +592,11 @@ class OrthogonalMatchingPursuit(MultiOutputMixin, RegressorMixin, LinearModel): `n_nonzero_coefs` is None and `tol` is None this value is either set to 10% of `n_features` or 1, whichever is greater. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.linear_model import OrthogonalMatchingPursuit @@ -835,6 +840,11 @@ class OrthogonalMatchingPursuitCV(RegressorMixin, LinearModel): Number of active features across every target for the model refit with the best hyperparameters got by cross-validating across all folds. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + Examples -------- >>> from sklearn.linear_model import OrthogonalMatchingPursuitCV diff --git a/sklearn/linear_model/_passive_aggressive.py b/sklearn/linear_model/_passive_aggressive.py index 678061be3c691..3a0a82debcc7b 100644 --- a/sklearn/linear_model/_passive_aggressive.py +++ b/sklearn/linear_model/_passive_aggressive.py @@ -120,6 +120,11 @@ class PassiveAggressiveClassifier(BaseSGDClassifier): intercept_ : array, shape = [1] if n_classes == 2 else [n_classes] Constants in decision function. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int The actual number of iterations to reach the stopping criterion. For multiclass fits, it is the maximum over every binary fit. @@ -354,6 +359,11 @@ class PassiveAggressiveRegressor(BaseSGDRegressor): intercept_ : array, shape = [1] if n_classes == 2 else [n_classes] Constants in decision function. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int The actual number of iterations to reach the stopping criterion. diff --git a/sklearn/linear_model/_perceptron.py b/sklearn/linear_model/_perceptron.py index b2bb145b904c8..632996cd00c48 100644 --- a/sklearn/linear_model/_perceptron.py +++ b/sklearn/linear_model/_perceptron.py @@ -117,6 +117,11 @@ class Perceptron(BaseSGDClassifier): The function that determines the loss, or difference between the output of the algorithm and the target values. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int The actual number of iterations to reach the stopping criterion. For multiclass fits, it is the maximum over every binary fit. diff --git a/sklearn/linear_model/_quantile.py b/sklearn/linear_model/_quantile.py index bf8fea4552c9d..a39f48a804ffc 100644 --- a/sklearn/linear_model/_quantile.py +++ b/sklearn/linear_model/_quantile.py @@ -59,6 +59,11 @@ class QuantileRegressor(LinearModel, RegressorMixin, BaseEstimator): intercept_ : float The intercept of the model, aka bias term. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int The actual number of iterations performed by the solver. diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index f53785cfe0ced..5ee5b1e2fa502 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -192,6 +192,11 @@ class RANSACRegressor(MetaEstimatorMixin, RegressorMixin, .. versionadded:: 0.19 + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.linear_model import RANSACRegressor diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 433e0c4313efc..d82aca05fee7c 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -717,6 +717,11 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): .. versionadded:: 0.17 + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + See Also -------- RidgeClassifier : Ridge classifier. @@ -877,6 +882,11 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): classes_ : ndarray of shape (n_classes,) The classes labels. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + See Also -------- Ridge : Ridge regression. @@ -1793,6 +1803,11 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): .. 
versionadded:: 0.23 + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_diabetes @@ -1908,6 +1923,11 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): classes_ : ndarray of shape (n_classes,) The classes labels. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_breast_cancer diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 78565178706a8..eb84c06ac93b3 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -934,6 +934,11 @@ class SGDClassifier(BaseSGDClassifier): Number of weight updates performed during training. Same as ``(n_iter_ * n_samples)``. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + See Also -------- sklearn.svm.LinearSVC : Linear support vector classification. @@ -1538,6 +1543,11 @@ class SGDRegressor(BaseSGDRegressor): Number of weight updates performed during training. Same as ``(n_iter_ * n_samples)``. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np @@ -1693,6 +1703,11 @@ class SGDOneClassSVM(BaseSGD, OutlierMixin): loss_function_ : concrete ``LossFunction`` + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/linear_model/_theil_sen.py b/sklearn/linear_model/_theil_sen.py index 4c75613c28a9b..c14b6979ef4d9 100644 --- a/sklearn/linear_model/_theil_sen.py +++ b/sklearn/linear_model/_theil_sen.py @@ -272,6 +272,11 @@ class TheilSenRegressor(RegressorMixin, LinearModel): Number of combinations taken into account from 'n choose k', where n is the number of samples and k is the number of subsamples. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.linear_model import TheilSenRegressor diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index 63be19c1c287d..4cf3b1885d2d0 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -104,6 +104,11 @@ class Isomap(TransformerMixin, BaseEstimator): dist_matrix_ : array-like, shape (n_samples, n_samples) Stores the geodesic distance matrix of training data. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.datasets import load_digits diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index 0fcd5f543c4d0..17e829270f1a7 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -603,6 +603,11 @@ class LocallyLinearEmbedding(TransformerMixin, reconstruction_error_ : float Reconstruction error associated with `embedding_` + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + nbrs_ : NearestNeighbors object Stores nearest neighbors instance, including BallTree or KDtree if applicable. 
diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index d92ab67767fa3..f833f24f981a3 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -343,6 +343,11 @@ class MDS(BaseEstimator): - or constructs a dissimilarity matrix from data using Euclidean distances. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int The number of iterations corresponding to the best stress. diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index 49e64401b6c00..01bdf06b92ed0 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -440,6 +440,11 @@ class SpectralEmbedding(BaseEstimator): affinity_matrix_ : ndarray of shape (n_samples, n_samples) Affinity_matrix constructed from samples or precomputed. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_neighbors_ : int Number of nearest neighbors effectively used. diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index 8e42d48f4ef07..7142909ae292c 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -627,6 +627,11 @@ class TSNE(BaseEstimator): kl_divergence_ : float Kullback-Leibler divergence after optimization. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Number of iterations run. diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 2f5c465d6cf41..81c70945b894e 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -560,6 +560,13 @@ class HalvingGridSearchCV(BaseSuccessiveHalving): This is present only if ``refit`` is not False. + multimetric_ : bool + Whether or not the scorers compute several metrics. + + classes_ : ndarray of shape (n_classes,) + The classes labels. This is present only if ``refit`` is specified and + the underlying estimator is a classifier. + See Also -------- :class:`HalvingRandomSearchCV`: @@ -850,6 +857,13 @@ class HalvingRandomSearchCV(BaseSuccessiveHalving): This is present only if ``refit`` is not False. + multimetric_ : bool + Whether or not the scorers compute several metrics. + + classes_ : ndarray of shape (n_classes,) + The classes labels. This is present only if ``refit`` is specified and + the underlying estimator is a classifier. + See Also -------- :class:`HalvingGridSearchCV`: diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 7e936ac3a0c8e..7c46a771a2fd4 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -150,10 +150,15 @@ class GaussianNB(_BaseNB): probability of each class. classes_ : ndarray of shape (n_classes,) - class labels known to the classifier + class labels known to the classifier. epsilon_ : float - absolute additive value to variances + absolute additive value to variances. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 sigma_ : ndarray of shape (n_classes, n_features) Variance of each feature per class. @@ -168,7 +173,7 @@ class labels known to the classifier .. versionadded:: 1.0 theta_ : ndarray of shape (n_classes, n_features) - mean of each feature per class + mean of each feature per class. 
Examples -------- @@ -767,6 +772,11 @@ class MultinomialNB(_BaseDiscreteNB): Attribute `n_features_` was deprecated in version 1.0 and will be removed in 1.2. Use `n_features_in_` instead. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np @@ -896,6 +906,11 @@ class ComplementNB(_BaseDiscreteNB): Attribute `n_features_` was deprecated in version 1.0 and will be removed in 1.2. Use `n_features_in_` instead. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np @@ -1016,6 +1031,11 @@ class BernoulliNB(_BaseDiscreteNB): Attribute `n_features_` was deprecated in version 1.0 and will be removed in 1.2. Use `n_features_in_` instead. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np @@ -1157,6 +1177,11 @@ class CategoricalNB(_BaseDiscreteNB): Attribute `n_features_` was deprecated in version 1.0 and will be removed in 1.2. Use `n_features_in_` instead. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_categories_ : ndarray of shape (n_features,), dtype=np.int64 Number of categories for each feature. This value is inferred from the data or set by the minimum number of categories. diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 1fd1fb01c9762..76dd3db7444ab 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -101,6 +101,11 @@ class KNeighborsClassifier(KNeighborsMixin, `p` parameter value if the `effective_metric_` attribute is set to 'minkowski'. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_samples_fit_ : int Number of samples in the fitted data. @@ -365,6 +370,11 @@ class RadiusNeighborsClassifier(RadiusNeighborsMixin, `p` parameter value if the `effective_metric_` attribute is set to 'minkowski'. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_samples_fit_ : int Number of samples in the fitted data. diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index 7676d42d62c18..247aef31ba2f7 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -286,6 +286,11 @@ class KNeighborsTransformer(KNeighborsMixin, `p` parameter value if the `effective_metric_` attribute is set to 'minkowski'. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_samples_fit_ : int Number of samples in the fitted data. @@ -468,6 +473,11 @@ class RadiusNeighborsTransformer(RadiusNeighborsMixin, `p` parameter value if the `effective_metric_` attribute is set to 'minkowski'. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_samples_fit_ : int Number of samples in the fitted data. diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 816b023e0f23e..1ebd713b16e69 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -71,6 +71,11 @@ class KernelDensity(BaseEstimator): Attributes ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + tree_ : ``BinaryTree`` instance The tree algorithm for fast generalized N-point problems. 
diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py index 941b9de781f9a..7b87076516687 100644 --- a/sklearn/neighbors/_lof.py +++ b/sklearn/neighbors/_lof.py @@ -157,6 +157,11 @@ class LocalOutlierFactor(KNeighborsMixin, effective_metric_params_ : dict The effective additional keyword arguments for the metric function. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_samples_fit_ : int It is the number of samples in the fitted data. diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index 5951b66ea7dbf..a3701a28909e8 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -121,6 +121,11 @@ class NeighborhoodComponentsAnalysis(TransformerMixin, BaseEstimator): components_ : ndarray of shape (n_components, n_features) The linear transformation learned during fitting. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Counts the number of iterations performed by the optimizer. diff --git a/sklearn/neighbors/_nearest_centroid.py b/sklearn/neighbors/_nearest_centroid.py index c5f6a612b0395..4908465d7fafd 100644 --- a/sklearn/neighbors/_nearest_centroid.py +++ b/sklearn/neighbors/_nearest_centroid.py @@ -55,6 +55,11 @@ class NearestCentroid(ClassifierMixin, BaseEstimator): classes_ : array of shape (n_classes,) The unique classes labels. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.neighbors import NearestCentroid diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index be60abcc64cb5..64a4e3df8fcae 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -106,6 +106,11 @@ class KNeighborsRegressor(KNeighborsMixin, `p` parameter value if the `effective_metric_` attribute is set to 'minkowski'. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_samples_fit_ : int Number of samples in the fitted data. @@ -313,6 +318,11 @@ class RadiusNeighborsRegressor(RadiusNeighborsMixin, `p` parameter value if the `effective_metric_` attribute is set to 'minkowski'. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_samples_fit_ : int Number of samples in the fitted data. diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index 0f14c56e8bac2..df452ff4ff1fa 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -72,6 +72,11 @@ class NearestNeighbors(KNeighborsMixin, effective_metric_params_ : dict Parameters for the metric used to compute distances to neighbors. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_samples_fit_ : int Number of samples in the fitted data. diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index 72120ad369275..e6c1ba340a7b3 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -886,6 +886,11 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): The ith element in the list represents the bias vector corresponding to layer i + 1. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int The number of iterations the solver has run. 
@@ -1310,6 +1315,11 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): The ith element in the list represents the bias vector corresponding to layer i + 1. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int The number of iterations the solver has run. diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index b69a2c496a2c9..42a9eb81e30cd 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -85,6 +85,11 @@ class BernoulliRBM(TransformerMixin, BaseEstimator): where batch_size in the number of examples per minibatch and n_components is the number of hidden units. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 393693fc87d2d..82e6d5d85ec19 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -307,6 +307,11 @@ class MinMaxScaler(TransformerMixin, BaseEstimator): .. versionadded:: 0.17 *data_range_* + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_samples_seen_ : int The number of samples processed by the estimator. It will be reset on new calls to fit, but increments across @@ -657,6 +662,11 @@ class StandardScaler(TransformerMixin, BaseEstimator): The variance for each feature in the training set. Used to compute `scale_`. Equal to ``None`` when ``with_std=False``. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_samples_seen_ : int or ndarray of shape (n_features,) The number of samples processed by the estimator for each feature. If there are no missing samples, the ``n_samples_seen`` will be an @@ -990,6 +1000,11 @@ class MaxAbsScaler(TransformerMixin, BaseEstimator): max_abs_ : ndarray of shape (n_features,) Per feature maximum absolute value. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_samples_seen_ : int The number of samples processed by the estimator. Will be reset on new calls to fit, but increments across ``partial_fit`` calls. @@ -1300,6 +1315,11 @@ class RobustScaler(TransformerMixin, BaseEstimator): .. versionadded:: 0.17 *scale_* attribute. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.preprocessing import RobustScaler @@ -1701,19 +1721,12 @@ class Normalizer(TransformerMixin, BaseEstimator): copy (if the input is already a numpy array or a scipy.sparse CSR matrix). - Examples - -------- - >>> from sklearn.preprocessing import Normalizer - >>> X = [[4, 1, 2, 2], - ... [1, 3, 9, 3], - ... [5, 7, 5, 1]] - >>> transformer = Normalizer().fit(X) # fit does nothing. - >>> transformer - Normalizer() - >>> transformer.transform(X) - array([[0.8, 0.2, 0.4, 0.4], - [0.1, 0.3, 0.9, 0.3], - [0.5, 0.7, 0.5, 0.1]]) + Attributes + ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 Notes ----- @@ -1727,6 +1740,20 @@ class Normalizer(TransformerMixin, BaseEstimator): See Also -------- normalize : Equivalent function without the estimator API. + + Examples + -------- + >>> from sklearn.preprocessing import Normalizer + >>> X = [[4, 1, 2, 2], + ... [1, 3, 9, 3], + ... [5, 7, 5, 1]] + >>> transformer = Normalizer().fit(X) # fit does nothing. 
+ >>> transformer + Normalizer() + >>> transformer.transform(X) + array([[0.8, 0.2, 0.4, 0.4], + [0.1, 0.3, 0.9, 0.3], + [0.5, 0.7, 0.5, 0.1]]) """ def __init__(self, norm='l2', *, copy=True): @@ -1856,6 +1883,13 @@ class Binarizer(TransformerMixin, BaseEstimator): set to False to perform inplace binarization and avoid a copy (if the input is already a numpy array or a scipy.sparse CSR matrix). + Attributes + ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> from sklearn.preprocessing import Binarizer @@ -1972,6 +2006,11 @@ class KernelCenterer(TransformerMixin, BaseEstimator): K_fit_all_ : float Average of kernel matrix. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + References ---------- .. [1] `Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller. @@ -2199,6 +2238,11 @@ class QuantileTransformer(TransformerMixin, BaseEstimator): references_ : ndarray of shape (n_quantiles, ) Quantiles of references. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np @@ -2724,6 +2768,11 @@ class PowerTransformer(TransformerMixin, BaseEstimator): lambdas_ : ndarray of float of shape (n_features,) The parameters of the power transformation for the selected features. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + Examples -------- >>> import numpy as np diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index d7565ff2fb4b3..327c6211d66f2 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -64,13 +64,18 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): Attributes ---------- + bin_edges_ : ndarray of ndarray of shape (n_features,) + The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )`` + Ignored features will have empty arrays. + n_bins_ : ndarray of shape (n_features,), dtype=np.int_ Number of bins per feature. Bins whose width are too small (i.e., <= 1e-8) are removed with a warning. - bin_edges_ : ndarray of ndarray of shape (n_features,) - The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )`` - Ignored features will have empty arrays. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 See Also -------- diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 930e85c783711..6c520354b379d 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -53,26 +53,6 @@ class PolynomialFeatures(TransformerMixin, BaseEstimator): .. versionadded:: 0.21 - Examples - -------- - >>> import numpy as np - >>> from sklearn.preprocessing import PolynomialFeatures - >>> X = np.arange(6).reshape(3, 2) - >>> X - array([[0, 1], - [2, 3], - [4, 5]]) - >>> poly = PolynomialFeatures(2) - >>> poly.fit_transform(X) - array([[ 1., 0., 1., 0., 0., 1.], - [ 1., 2., 3., 4., 6., 9.], - [ 1., 4., 5., 16., 20., 25.]]) - >>> poly = PolynomialFeatures(interaction_only=True) - >>> poly.fit_transform(X) - array([[ 1., 0., 1., 0.], - [ 1., 2., 3., 6.], - [ 1., 4., 5., 20.]]) - Attributes ---------- powers_ : ndarray of shape (n_output_features, n_input_features) @@ -81,6 +61,11 @@ class PolynomialFeatures(TransformerMixin, BaseEstimator): n_input_features_ : int The total number of input features. 
+ n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_output_features_ : int The total number of polynomial output features. The number of output features is computed by iterating over all suitably sized combinations @@ -99,6 +84,26 @@ class PolynomialFeatures(TransformerMixin, BaseEstimator): See :ref:`examples/linear_model/plot_polynomial_interpolation.py ` + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import PolynomialFeatures + >>> X = np.arange(6).reshape(3, 2) + >>> X + array([[0, 1], + [2, 3], + [4, 5]]) + >>> poly = PolynomialFeatures(2) + >>> poly.fit_transform(X) + array([[ 1., 0., 1., 0., 0., 1.], + [ 1., 2., 3., 4., 6., 9.], + [ 1., 4., 5., 16., 20., 25.]]) + >>> poly = PolynomialFeatures(interaction_only=True) + >>> poly.fit_transform(X) + array([[ 1., 0., 1., 0.], + [ 1., 2., 3., 6.], + [ 1., 4., 5., 20.]]) """ def __init__(self, degree=2, *, interaction_only=False, include_bias=True, order='C'): diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index e89dfab9310ab..944b6b7acb149 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -350,6 +350,11 @@ class LabelPropagation(BaseLabelPropagation): transduction_ : ndarray of shape (n_samples) Label assigned to each item via the transduction. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Number of iterations run. @@ -463,6 +468,11 @@ class LabelSpreading(BaseLabelPropagation): transduction_ : ndarray of shape (n_samples,) Label assigned to each item via the transduction. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Number of iterations run. diff --git a/sklearn/semi_supervised/_self_training.py b/sklearn/semi_supervised/_self_training.py index 54fa9ba45e1b8..761909903e8b0 100644 --- a/sklearn/semi_supervised/_self_training.py +++ b/sklearn/semi_supervised/_self_training.py @@ -86,6 +86,11 @@ class SelfTrainingClassifier(MetaEstimatorMixin, BaseEstimator): When a sample has iteration -1, the sample was not labeled in any iteration. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int The number of rounds of self-training, that is the number of times the base estimator is fitted on relabeled variants of the training set. diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 050855c25c06a..8946e77ef905f 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -114,6 +114,11 @@ class LinearSVC(LinearClassifierMixin, classes_ : ndarray of shape (n_classes,) The unique classes labels. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Maximum number of iterations run across all classes. @@ -331,6 +336,11 @@ class LinearSVR(RegressorMixin, LinearModel): intercept_ : ndarray of shape (1) if n_classes == 2 else (n_classes) Constants in decision function. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_iter_ : int Maximum number of iterations run across all classes. @@ -583,6 +593,11 @@ class SVC(BaseSVC): intercept_ : ndarray of shape (n_classes * (n_classes - 1) / 2,) Constants in decision function. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + support_ : ndarray of shape (n_SV) Indices of support vectors. @@ -803,6 +818,11 @@ class NuSVC(BaseSVC): intercept_ : ndarray of shape (n_classes * (n_classes - 1) / 2,) Constants in decision function. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + support_ : ndarray of shape (n_SV,) Indices of support vectors. @@ -981,6 +1001,11 @@ class SVR(RegressorMixin, BaseLibSVM): intercept_ : ndarray of shape (1,) Constants in decision function. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_support_ : ndarray of shape (n_classes,), dtype=int32 Number of support vectors for each class. @@ -1133,6 +1158,11 @@ class NuSVR(RegressorMixin, BaseLibSVM): intercept_ : ndarray of shape (1,) Constants in decision function. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_support_ : ndarray of shape (n_classes,), dtype=int32 Number of support vectors for each class. @@ -1281,6 +1311,11 @@ class OneClassSVM(OutlierMixin, BaseLibSVM): intercept_ : ndarray of shape (1,) Constant in the decision function. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_support_ : ndarray of shape (n_classes,), dtype=int32 Number of support vectors for each class. diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 85d8ad0cf6a36..ae9a29622c4aa 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -176,30 +176,8 @@ def _construct_searchcv_instance(SearchCV): N_FEATURES_MODULES_TO_IGNORE = { - 'compose', - 'covariance', - 'decomposition', - 'discriminant_analysis', - 'dummy', - 'ensemble', - 'feature_selection', - 'gaussian_process', - 'impute', - 'isotonic', - 'kernel_approximation', - 'kernel_ridge', - 'linear_model', - 'manifold', 'model_selection', 'multioutput', - 'naive_bayes', - 'neighbors', - 'neural_network', - 'preprocessing', - 'random_projection', - 'semi_supervised', - 'svm', - 'tree' } @@ -212,22 +190,28 @@ def test_fit_docstring_attributes(name, Estimator): doc = docscrape.ClassDoc(Estimator) attributes = doc['Attributes'] - IGNORED = {'ClassifierChain', 'ColumnTransformer', - 'CountVectorizer', 'DictVectorizer', - 'GaussianRandomProjection', - 'MultiOutputClassifier', 'MultiOutputRegressor', - 'NoSampleWeightWrapper', 'RFE', 'RFECV', - 'RegressorChain', 'SelectFromModel', - 'SparseCoder', 'SparseRandomProjection', - 'SpectralBiclustering', 'StackingClassifier', - 'StackingRegressor', 'TfidfVectorizer', 'VotingClassifier', - 'VotingRegressor', 'SequentialFeatureSelector', - 'HalvingGridSearchCV', 'HalvingRandomSearchCV'} + IGNORED = { + 'ClassifierChain', + 'CountVectorizer', 'DictVectorizer', + 'GaussianRandomProjection', + 'MultiOutputClassifier', 'MultiOutputRegressor', + 'NoSampleWeightWrapper', 'RFE', 'RFECV', + 'RegressorChain', 'SelectFromModel', + 'SparseCoder', 'SparseRandomProjection', + 'SpectralBiclustering', 'StackingClassifier', + 'StackingRegressor', 'TfidfVectorizer', 'VotingClassifier', + 'VotingRegressor', 'SequentialFeatureSelector', + } if Estimator.__name__ in IGNORED or Estimator.__name__.startswith('_'): pytest.skip("Estimator cannot be fit easily to test fit attributes") - if Estimator.__name__ in ("RandomizedSearchCV", "GridSearchCV"): + if Estimator.__name__ in ( + "HalvingRandomSearchCV", + "RandomizedSearchCV", + "HalvingGridSearchCV", + "GridSearchCV", + 
): est = _construct_searchcv_instance(Estimator) else: est = _construct_instance(Estimator) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index a79a850f3b7c7..ba5bf2873bf18 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -796,6 +796,11 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): n_features_ : int The number of features when ``fit`` is performed. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_outputs_ : int The number of outputs when ``fit`` is performed. @@ -1161,6 +1166,11 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): n_features_ : int The number of features when ``fit`` is performed. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_outputs_ : int The number of outputs when ``fit`` is performed. @@ -1477,6 +1487,11 @@ class ExtraTreeClassifier(DecisionTreeClassifier): n_features_ : int The number of features when ``fit`` is performed. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_outputs_ : int The number of outputs when ``fit`` is performed. @@ -1699,6 +1714,11 @@ class ExtraTreeRegressor(DecisionTreeRegressor): n_features_ : int The number of features when ``fit`` is performed. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + feature_importances_ : ndarray of shape (n_features,) Return impurity-based feature importances (the higher, the more important the feature). From bf380eb4a9f67a7dc80528d11f8e239680144a65 Mon Sep 17 00:00:00 2001 From: amrcode Date: Wed, 23 Sep 2020 12:23:22 -0400 Subject: [PATCH 469/478] Update _supervised.py --- sklearn/metrics/cluster/_supervised.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 7814e7ba50e1c..38119449ff487 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -735,10 +735,10 @@ def mutual_info_score(labels_true, labels_pred, *, contingency=None): Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets. + A clustering of the data into disjoint subsets (U). labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets. + A clustering of the data into disjoint subsets (V). contingency : {ndarray, sparse matrix} of shape \ (n_classes_true, n_classes_pred), default=None @@ -749,7 +749,7 @@ def mutual_info_score(labels_true, labels_pred, *, contingency=None): Returns ------- mi : float - Mutual information, a non-negative value + Mutual information, a non-negative value, measured in nats using the natural logarithm Notes ----- @@ -823,10 +823,10 @@ def adjusted_mutual_info_score(labels_true, labels_pred, *, Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets. + A clustering of the data into disjoint subsets (U). labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets. + A clustering of the data into disjoint subsets (V). average_method : str, default='arithmetic' How to compute the normalizer in the denominator. 
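The wording changes in this commit pin down the units of the mutual information scores. A small doctest-style sketch of that claim (assuming the existing ``sklearn.metrics`` implementation): for a labeling compared with itself, the mutual information equals the entropy of the labeling, which here is ln(2) nats rather than 1 bit because the natural logarithm is used.

    >>> import numpy as np
    >>> from sklearn.metrics import mutual_info_score
    >>> labels = [0, 0, 1, 1]
    >>> mi = mutual_info_score(labels, labels)  # MI of a labeling with itself is its entropy
    >>> bool(np.isclose(mi, np.log(2)))
    True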
Possible options @@ -843,7 +843,8 @@ def adjusted_mutual_info_score(labels_true, labels_pred, *, ami: float (upperlimited by 1.0) The AMI returns a value of 1 when the two partitions are identical (ie perfectly matched). Random partitions (independent labellings) have - an expected AMI around 0 on average hence can be negative. + an expected AMI around 0 on average hence can be negative. The value is + in adjusted nats (based on the natural logarithm). See Also -------- @@ -959,7 +960,8 @@ def normalized_mutual_info_score(labels_true, labels_pred, *, Returns ------- nmi : float - score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling + score between 0.0 and 1.0 in normalized nats (based on the natural + logarithm). 1.0 stands for perfectly complete labeling. See Also -------- From 7421170c067faebcb30ef55df438c5f1044f88dd Mon Sep 17 00:00:00 2001 From: amrcode Date: Tue, 20 Oct 2020 08:31:48 -0400 Subject: [PATCH 470/478] Update _supervised.py Fix lint issues in doc updates. --- sklearn/metrics/cluster/_supervised.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 38119449ff487..a4e349649a968 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -749,7 +749,8 @@ def mutual_info_score(labels_true, labels_pred, *, contingency=None): Returns ------- mi : float - Mutual information, a non-negative value, measured in nats using the natural logarithm + Mutual information, a non-negative value, measured in nats using the + natural logarithm Notes ----- @@ -960,7 +961,7 @@ def normalized_mutual_info_score(labels_true, labels_pred, *, Returns ------- nmi : float - score between 0.0 and 1.0 in normalized nats (based on the natural + score between 0.0 and 1.0 in normalized nats (based on the natural logarithm). 1.0 stands for perfectly complete labeling. See Also From 7cb0dbecd4477f8b690e333bd2c3e623f40d2828 Mon Sep 17 00:00:00 2001 From: amrcode Date: Wed, 21 Oct 2020 08:42:19 -0400 Subject: [PATCH 471/478] Update sklearn/metrics/cluster/_supervised.py to add a period in the return value doc Co-authored-by: Chiara Marmo --- sklearn/metrics/cluster/_supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index a4e349649a968..0de0d3a57657f 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -750,7 +750,7 @@ def mutual_info_score(labels_true, labels_pred, *, contingency=None): ------- mi : float Mutual information, a non-negative value, measured in nats using the - natural logarithm + natural logarithm. Notes ----- From 14df87ff74bb792507e463c2a21189957960b113 Mon Sep 17 00:00:00 2001 From: ahagen Date: Tue, 5 Jan 2021 09:02:32 -0500 Subject: [PATCH 472/478] Updates from review --- sklearn/metrics/cluster/_supervised.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 0de0d3a57657f..4eb56c1caaccb 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -735,10 +735,10 @@ def mutual_info_score(labels_true, labels_pred, *, contingency=None): Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets (U). 
+ A clustering of the data into disjoint subsets, called $U$ in the above formula. labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets (V). + A clustering of the data into disjoint subsets, called $V$ in the above formula. contingency : {ndarray, sparse matrix} of shape \ (n_classes_true, n_classes_pred), default=None @@ -824,10 +824,10 @@ def adjusted_mutual_info_score(labels_true, labels_pred, *, Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets (U). + A clustering of the data into disjoint subsets, called $U$ in the above formula. labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets (V). + A clustering of the data into disjoint subsets, called $V$ in the above formula. average_method : str, default='arithmetic' How to compute the normalizer in the denominator. Possible options @@ -961,7 +961,7 @@ def normalized_mutual_info_score(labels_true, labels_pred, *, Returns ------- nmi : float - score between 0.0 and 1.0 in normalized nats (based on the natural + Score between 0.0 and 1.0 in normalized nats (based on the natural logarithm). 1.0 stands for perfectly complete labeling. See Also From 6c1c7879f3ae94ec17a1c718d5ad7a127291fd37 Mon Sep 17 00:00:00 2001 From: ahagen Date: Tue, 5 Jan 2021 12:15:50 -0500 Subject: [PATCH 473/478] Fix line lengths --- sklearn/metrics/cluster/_supervised.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 4eb56c1caaccb..095675e691b8b 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -735,10 +735,12 @@ def mutual_info_score(labels_true, labels_pred, *, contingency=None): Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets, called $U$ in the above formula. + A clustering of the data into disjoint subsets, called $U$ in the + above formula. labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets, called $V$ in the above formula. + A clustering of the data into disjoint subsets, called $V$ in the + above formula. contingency : {ndarray, sparse matrix} of shape \ (n_classes_true, n_classes_pred), default=None @@ -824,10 +826,12 @@ def adjusted_mutual_info_score(labels_true, labels_pred, *, Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets, called $U$ in the above formula. + A clustering of the data into disjoint subsets, called $U$ in the + above formula. labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets, called $V$ in the above formula. + A clustering of the data into disjoint subsets, called $V$ in the + above formula. average_method : str, default='arithmetic' How to compute the normalizer in the denominator. 
Possible options From b6e1275397ef5a54443dadf4f80f8f8afdeaad23 Mon Sep 17 00:00:00 2001 From: amrcode Date: Tue, 2 Feb 2021 08:58:57 -0500 Subject: [PATCH 474/478] Update _supervised.py Line length adjustment --- sklearn/metrics/cluster/_supervised.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 095675e691b8b..0865f5af82e1d 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -710,8 +710,8 @@ def v_measure_score(labels_true, labels_pred, *, beta=1.0): def mutual_info_score(labels_true, labels_pred, *, contingency=None): """Mutual Information between two clusterings. - The Mutual Information is a measure of the similarity between two labels of - the same data. Where :math:`|U_i|` is the number of the samples + The Mutual Information is a measure of the similarity between two labels + of the same data. Where :math:`|U_i|` is the number of the samples in cluster :math:`U_i` and :math:`|V_j|` is the number of the samples in cluster :math:`V_j`, the Mutual Information between clusterings :math:`U` and :math:`V` is given as: From 85aa9ec4cc101db6e5a34611633b3c93d78204c9 Mon Sep 17 00:00:00 2001 From: ahagen Date: Tue, 2 Feb 2021 09:49:35 -0500 Subject: [PATCH 475/478] Remove trailing whitespace --- sklearn/metrics/cluster/_supervised.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 0865f5af82e1d..d7695f52e522e 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -710,7 +710,7 @@ def v_measure_score(labels_true, labels_pred, *, beta=1.0): def mutual_info_score(labels_true, labels_pred, *, contingency=None): """Mutual Information between two clusterings. - The Mutual Information is a measure of the similarity between two labels + The Mutual Information is a measure of the similarity between two labels of the same data. Where :math:`|U_i|` is the number of the samples in cluster :math:`U_i` and :math:`|V_j|` is the number of the samples in cluster :math:`V_j`, the Mutual Information From e8ff40438fc212ab0f288601ab26abae9399c040 Mon Sep 17 00:00:00 2001 From: amrcode Date: Fri, 11 Jun 2021 08:14:43 -0400 Subject: [PATCH 476/478] Update math notation --- sklearn/metrics/cluster/_supervised.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index d7695f52e522e..390e4cd279cf9 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -725,21 +725,22 @@ def mutual_info_score(labels_true, labels_pred, *, contingency=None): a permutation of the class or cluster label values won't change the score value in any way. - This metric is furthermore symmetric: switching ``label_true`` with - ``label_pred`` will return the same score value. This can be useful to - measure the agreement of two independent label assignments strategies - on the same dataset when the real ground truth is not known. + This metric is furthermore symmetric: switching :math:`U` (i.e + ``label_true``) with :math:`V` (i.e. ``label_pred``) will return the + same score value. This can be useful to measure the agreement of two + independent label assignments strategies on the same dataset when the + real ground truth is not known. Read more in the :ref:`User Guide `. 
Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets, called $U$ in the + A clustering of the data into disjoint subsets, called :math:`U` in the above formula. labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets, called $V$ in the + A clustering of the data into disjoint subsets, called :math:`V` in the above formula. contingency : {ndarray, sparse matrix} of shape \ @@ -813,10 +814,10 @@ def adjusted_mutual_info_score(labels_true, labels_pred, *, a permutation of the class or cluster label values won't change the score value in any way. - This metric is furthermore symmetric: switching ``label_true`` with - ``label_pred`` will return the same score value. This can be useful to - measure the agreement of two independent label assignments strategies - on the same dataset when the real ground truth is not known. + This metric is furthermore symmetric: switching :math:`U` (``label_true``) + with :math:`V` (``labels_pred``) will return the same score value. This can + be useful to measure the agreement of two independent label assignments + strategies on the same dataset when the real ground truth is not known. Be mindful that this function is an order of magnitude slower than other metrics, such as the Adjusted Rand Index. @@ -826,11 +827,11 @@ def adjusted_mutual_info_score(labels_true, labels_pred, *, Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets, called $U$ in the + A clustering of the data into disjoint subsets, called :math:`U` in the above formula. labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets, called $V$ in the + A clustering of the data into disjoint subsets, called :math:`V` in the above formula. average_method : str, default='arithmetic' From bdc8ba9083f9dcfe4bc2eade5a50e3a01eb069f6 Mon Sep 17 00:00:00 2001 From: ahagen Date: Fri, 11 Jun 2021 08:19:49 -0400 Subject: [PATCH 477/478] Fix line lengths --- sklearn/metrics/cluster/_supervised.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 390e4cd279cf9..cf27012a3bbd7 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -736,12 +736,12 @@ def mutual_info_score(labels_true, labels_pred, *, contingency=None): Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets, called :math:`U` in the - above formula. + A clustering of the data into disjoint subsets, called :math:`U` in + the above formula. labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets, called :math:`V` in the - above formula. + A clustering of the data into disjoint subsets, called :math:`V` in + the above formula. contingency : {ndarray, sparse matrix} of shape \ (n_classes_true, n_classes_pred), default=None @@ -827,12 +827,12 @@ def adjusted_mutual_info_score(labels_true, labels_pred, *, Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets, called :math:`U` in the - above formula. + A clustering of the data into disjoint subsets, called :math:`U` in + the above formula. labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets, called :math:`V` in the - above formula. 
+ A clustering of the data into disjoint subsets, called :math:`V` in + the above formula. average_method : str, default='arithmetic' How to compute the normalizer in the denominator. Possible options From 26e2064d6739795331072ef002aee44d000d7de6 Mon Sep 17 00:00:00 2001 From: ahagen Date: Fri, 11 Jun 2021 08:59:22 -0400 Subject: [PATCH 478/478] Remove trailing whitespace --- sklearn/metrics/cluster/_supervised.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index cf27012a3bbd7..636ba3e189394 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -725,10 +725,10 @@ def mutual_info_score(labels_true, labels_pred, *, contingency=None): a permutation of the class or cluster label values won't change the score value in any way. - This metric is furthermore symmetric: switching :math:`U` (i.e - ``label_true``) with :math:`V` (i.e. ``label_pred``) will return the - same score value. This can be useful to measure the agreement of two - independent label assignments strategies on the same dataset when the + This metric is furthermore symmetric: switching :math:`U` (i.e + ``label_true``) with :math:`V` (i.e. ``label_pred``) will return the + same score value. This can be useful to measure the agreement of two + independent label assignments strategies on the same dataset when the real ground truth is not known. Read more in the :ref:`User Guide `. @@ -736,11 +736,11 @@ def mutual_info_score(labels_true, labels_pred, *, contingency=None): Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets, called :math:`U` in + A clustering of the data into disjoint subsets, called :math:`U` in the above formula. labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets, called :math:`V` in + A clustering of the data into disjoint subsets, called :math:`V` in the above formula. contingency : {ndarray, sparse matrix} of shape \ @@ -814,9 +814,9 @@ def adjusted_mutual_info_score(labels_true, labels_pred, *, a permutation of the class or cluster label values won't change the score value in any way. - This metric is furthermore symmetric: switching :math:`U` (``label_true``) - with :math:`V` (``labels_pred``) will return the same score value. This can - be useful to measure the agreement of two independent label assignments + This metric is furthermore symmetric: switching :math:`U` (``label_true``) + with :math:`V` (``labels_pred``) will return the same score value. This can + be useful to measure the agreement of two independent label assignments strategies on the same dataset when the real ground truth is not known. Be mindful that this function is an order of magnitude slower than other @@ -827,11 +827,11 @@ def adjusted_mutual_info_score(labels_true, labels_pred, *, Parameters ---------- labels_true : int array, shape = [n_samples] - A clustering of the data into disjoint subsets, called :math:`U` in + A clustering of the data into disjoint subsets, called :math:`U` in the above formula. labels_pred : int array-like of shape (n_samples,) - A clustering of the data into disjoint subsets, called :math:`V` in + A clustering of the data into disjoint subsets, called :math:`V` in the above formula. average_method : str, default='arithmetic'
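The symmetry these docstrings now state explicitly can be checked directly. A doctest-style sketch (again assuming the existing ``sklearn.metrics`` functions), where the prediction is the same partition as the ground truth with the labels permuted:

    >>> import numpy as np
    >>> from sklearn.metrics import adjusted_mutual_info_score
    >>> labels_true = [0, 0, 1, 1, 2, 2]
    >>> labels_pred = [1, 1, 0, 0, 2, 2]  # same partition, labels permuted
    >>> bool(np.isclose(adjusted_mutual_info_score(labels_true, labels_pred),
    ...                 adjusted_mutual_info_score(labels_pred, labels_true)))
    True
    >>> bool(np.isclose(adjusted_mutual_info_score(labels_true, labels_pred), 1.0))
    True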